diff --git a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
index adfb96041c5c0..d19495c3abad3 100644
--- a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
@@ -3479,6 +3479,37 @@ bool TargetLowering::SimplifyDemandedVectorElts(
     break;
   }
+  case ISD::LOAD: {
+    auto *Ld = cast<LoadSDNode>(Op);
+    if (!ISD::isNormalLoad(Ld) || !Ld->isSimple())
+      break;
+
+    // TODO: Handle arbitrary vector extract for isMask
+    if (DemandedElts.popcount() != 1)
+      break;
+
+    EVT VT = Ld->getValueType(0);
+    if (TLO.LegalOperations() &&
+        !isOperationLegalOrCustom(ISD::INSERT_VECTOR_ELT, VT))
+      break;
+
+    EVT EltVT = VT.getVectorElementType();
+    SDLoc DL(Ld);
+
+    unsigned Idx = DemandedElts.countTrailingZeros();
+
+    SDValue IdxVal = TLO.DAG.getVectorIdxConstant(Idx, DL);
+    SDValue Scalarized =
+        scalarizeExtractedVectorLoad(EltVT, DL, VT, IdxVal, Ld, TLO.DAG);
+    if (!Scalarized)
+      break;
+
+    TLO.DAG.ReplaceAllUsesOfValueWith(SDValue(Ld, 1), Scalarized.getValue(1));
+
+    SDValue Insert = TLO.DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, VT,
+                                     TLO.DAG.getUNDEF(VT), Scalarized, IdxVal);
+    return TLO.CombineTo(Op, Insert);
+  }
   case ISD::VECTOR_SHUFFLE: {
     SDValue LHS = Op.getOperand(0);
     SDValue RHS = Op.getOperand(1);
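Note: in effect, when exactly one lane of a simple, non-extending vector load is demanded, the load is rescalarized. A scalar load of just that element is emitted, users of the original load's chain result are rewired to the new load's chain (the `ReplaceAllUsesOfValueWith` call above), and the scalar is inserted back into an undef vector so the node keeps its type. A minimal before/after sketch in IR terms — the combine itself runs on SelectionDAG nodes, so the "after" form is only conceptual:

```llvm
; Before: the full vector is loaded although only lane 1 is used.
define i64 @extract_lane(ptr %p) {
  %vec = load <2 x i64>, ptr %p, align 8
  %elt = extractelement <2 x i64> %vec, i32 1
  ret i64 %elt
}

; After (conceptually): only the demanded lane is loaded, at its byte offset.
define i64 @extract_lane_scalar(ptr %p) {
  %addr = getelementptr inbounds i8, ptr %p, i64 8
  %elt = load i64, ptr %addr, align 8
  ret i64 %elt
}
```

This is what the test updates below are checking, e.g. a plain `ldr h0, [x0, #30]` in place of a predicated SVE load plus lane move.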
diff --git a/llvm/test/CodeGen/AArch64/arm64-big-endian-bitconverts.ll b/llvm/test/CodeGen/AArch64/arm64-big-endian-bitconverts.ll
index f5aa4c666a568..e9a4a83a40683 100644
--- a/llvm/test/CodeGen/AArch64/arm64-big-endian-bitconverts.ll
+++ b/llvm/test/CodeGen/AArch64/arm64-big-endian-bitconverts.ll
@@ -30,7 +30,7 @@ define void @test_i64_v2f32(ptr %p, ptr %q) {
 ; CHECK: ld1 { v{{[0-9]+}}.2s }
 ; CHECK: rev64 v{{[0-9]+}}.2s
 ; CHECK: str
-  %1 = load <2 x float>, ptr %p
+  %1 = load volatile <2 x float>, ptr %p
   %2 = fadd <2 x float> %1, %1
   %3 = bitcast <2 x float> %2 to i64
   %4 = add i64 %3, %3
@@ -43,7 +43,7 @@ define void @test_i64_v2i32(ptr %p, ptr %q) {
 ; CHECK: ld1 { v{{[0-9]+}}.2s }
 ; CHECK: rev64 v{{[0-9]+}}.2s
 ; CHECK: str
-  %1 = load <2 x i32>, ptr %p
+  %1 = load volatile <2 x i32>, ptr %p
   %2 = add <2 x i32> %1, %1
   %3 = bitcast <2 x i32> %2 to i64
   %4 = add i64 %3, %3
@@ -121,7 +121,7 @@ define void @test_f64_v2f32(ptr %p, ptr %q) {
 ; CHECK: ld1 { v{{[0-9]+}}.2s }
 ; CHECK: rev64 v{{[0-9]+}}.2s
 ; CHECK: str
-  %1 = load <2 x float>, ptr %p
+  %1 = load volatile <2 x float>, ptr %p
   %2 = fadd <2 x float> %1, %1
   %3 = bitcast <2 x float> %2 to double
   %4 = fadd double %3, %3
@@ -134,7 +134,7 @@ define void @test_f64_v2i32(ptr %p, ptr %q) {
 ; CHECK: ld1 { v{{[0-9]+}}.2s }
 ; CHECK: rev64 v{{[0-9]+}}.2s
 ; CHECK: str
-  %1 = load <2 x i32>, ptr %p
+  %1 = load volatile <2 x i32>, ptr %p
   %2 = add <2 x i32> %1, %1
   %3 = bitcast <2 x i32> %2 to double
   %4 = fadd double %3, %3
@@ -213,7 +213,7 @@ define void @test_v1i64_v2f32(ptr %p, ptr %q) {
 ; CHECK: ld1 { v{{[0-9]+}}.2s }
 ; CHECK: rev64 v{{[0-9]+}}.2s
 ; CHECK: str
-  %1 = load <2 x float>, ptr %p
+  %1 = load volatile <2 x float>, ptr %p
   %2 = fadd <2 x float> %1, %1
   %3 = bitcast <2 x float> %2 to <1 x i64>
   %4 = add <1 x i64> %3, %3
@@ -226,7 +226,7 @@ define void @test_v1i64_v2i32(ptr %p, ptr %q) {
 ; CHECK: ld1 { v{{[0-9]+}}.2s }
 ; CHECK: rev64 v{{[0-9]+}}.2s
 ; CHECK: str
-  %1 = load <2 x i32>, ptr %p
+  %1 = load volatile <2 x i32>, ptr %p
   %2 = add <2 x i32> %1, %1
   %3 = bitcast <2 x i32> %2 to <1 x i64>
   %4 = add <1 x i64> %3, %3
@@ -318,7 +318,7 @@ define void @test_v2f32_v1i64(ptr %p, ptr %q) {
 define void @test_v2f32_v2i32(ptr %p, ptr %q) {
 ; CHECK: ld1 { v{{[0-9]+}}.2s }
 ; CHECK: st1 { v{{[0-9]+}}.2s }
-  %1 = load <2 x i32>, ptr %p
+  %1 = load volatile <2 x i32>, ptr %p
   %2 = add <2 x i32> %1, %1
   %3 = bitcast <2 x i32> %2 to <2 x float>
   %4 = fadd <2 x float> %3, %3
@@ -410,7 +410,7 @@ define void @test_v2i32_v1i64(ptr %p, ptr %q) {
 define void @test_v2i32_v2f32(ptr %p, ptr %q) {
 ; CHECK: ld1 { v{{[0-9]+}}.2s }
 ; CHECK: st1 { v{{[0-9]+}}.2s }
-  %1 = load <2 x float>, ptr %p
+  %1 = load volatile <2 x float>, ptr %p
   %2 = fadd <2 x float> %1, %1
   %3 = bitcast <2 x float> %2 to <2 x i32>
   %4 = add <2 x i32> %3, %3
@@ -488,7 +488,7 @@ define void @test_v4i16_v2f32(ptr %p, ptr %q) {
 ; CHECK: ld1 { v{{[0-9]+}}.2s }
 ; CHECK: rev32 v{{[0-9]+}}.4h
 ; CHECK: st1 { v{{[0-9]+}}.4h }
-  %1 = load <2 x float>, ptr %p
+  %1 = load volatile <2 x float>, ptr %p
   %2 = fadd <2 x float> %1, %1
   %3 = bitcast <2 x float> %2 to <4 x i16>
   %4 = add <4 x i16> %3, %3
@@ -501,7 +501,7 @@ define void @test_v4i16_v2i32(ptr %p, ptr %q) {
 ; CHECK: ld1 { v{{[0-9]+}}.2s }
 ; CHECK: rev32 v{{[0-9]+}}.4h
 ; CHECK: st1 { v{{[0-9]+}}.4h }
-  %1 = load <2 x i32>, ptr %p
+  %1 = load volatile <2 x i32>, ptr %p
   %2 = add <2 x i32> %1, %1
   %3 = bitcast <2 x i32> %2 to <4 x i16>
   %4 = add <4 x i16> %3, %3
@@ -587,7 +587,7 @@ define void @test_v4f16_v2f32(ptr %p, ptr %q) {
 ; CHECK: fadd
 ; CHECK-NOT: rev
 ; CHECK: st1 { v{{[0-9]+}}.4h }
-  %1 = load <2 x float>, ptr %p
+  %1 = load volatile <2 x float>, ptr %p
   %2 = fadd <2 x float> %1, %1
   %3 = bitcast <2 x float> %2 to <4 x half>
   %4 = fadd <4 x half> %3, %3
@@ -602,7 +602,7 @@ define void @test_v4f16_v2i32(ptr %p, ptr %q) {
 ; CHECK: fadd
 ; CHECK-NOT: rev
 ; CHECK: st1 { v{{[0-9]+}}.4h }
-  %1 = load <2 x i32>, ptr %p
+  %1 = load volatile <2 x i32>, ptr %p
   %2 = add <2 x i32> %1, %1
   %3 = bitcast <2 x i32> %2 to <4 x half>
   %4 = fadd <4 x half> %3, %3
@@ -682,7 +682,7 @@ define void @test_v8i8_v2f32(ptr %p, ptr %q) {
 ; CHECK: ld1 { v{{[0-9]+}}.2s }
 ; CHECK: rev32 v{{[0-9]+}}.8b
 ; CHECK: st1 { v{{[0-9]+}}.8b }
-  %1 = load <2 x float>, ptr %p
+  %1 = load volatile <2 x float>, ptr %p
   %2 = fadd <2 x float> %1, %1
   %3 = bitcast <2 x float> %2 to <8 x i8>
   %4 = add <8 x i8> %3, %3
@@ -695,7 +695,7 @@ define void @test_v8i8_v2i32(ptr %p, ptr %q) {
 ; CHECK: ld1 { v{{[0-9]+}}.2s }
 ; CHECK: rev32 v{{[0-9]+}}.8b
 ; CHECK: st1 { v{{[0-9]+}}.8b }
-  %1 = load <2 x i32>, ptr %p
+  %1 = load volatile <2 x i32>, ptr %p
   %2 = add <2 x i32> %1, %1
   %3 = bitcast <2 x i32> %2 to <8 x i8>
   %4 = add <8 x i8> %3, %3
@@ -721,7 +721,7 @@ define void @test_f128_v2f64(ptr %p, ptr %q) {
 ; CHECK: ld1 { v{{[0-9]+}}.2d }
 ; CHECK: ext
 ; CHECK: str
-  %1 = load <2 x double>, ptr %p
+  %1 = load volatile <2 x double>, ptr %p
   %2 = fadd <2 x double> %1, %1
   %3 = bitcast <2 x double> %2 to fp128
   %4 = fadd fp128 %3, %3
@@ -734,7 +734,7 @@ define void @test_f128_v2i64(ptr %p, ptr %q) {
 ; CHECK: ld1 { v{{[0-9]+}}.2d }
 ; CHECK: ext
 ; CHECK: str
-  %1 = load <2 x i64>, ptr %p
+  %1 = load volatile <2 x i64>, ptr %p
   %2 = add <2 x i64> %1, %1
   %3 = bitcast <2 x i64> %2 to fp128
   %4 = fadd fp128 %3, %3
@@ -816,7 +816,7 @@ define void @test_v2f64_f128(ptr %p, ptr %q) {
 define void @test_v2f64_v2i64(ptr %p, ptr %q) {
 ; CHECK: ld1 { v{{[0-9]+}}.2d }
 ; CHECK: st1 { v{{[0-9]+}}.2d }
-  %1 = load <2 x i64>, ptr %p
+  %1 = load volatile <2 x i64>, ptr %p
   %2 = add <2 x i64> %1, %1
   %3 = bitcast <2 x i64> %2 to <2 x double>
   %4 = fadd <2 x double> %3, %3
@@ -895,7 +895,7 @@ define void @test_v2i64_f128(ptr %p, ptr %q) {
 define void @test_v2i64_v2f64(ptr %p, ptr %q) {
 ; CHECK: ld1 { v{{[0-9]+}}.2d }
 ; CHECK: st1 { v{{[0-9]+}}.2d }
-  %1 = load <2 x double>, ptr %p
+  %1 = load volatile <2 x double>, ptr %p
   %2 = fadd <2 x double> %1, %1
   %3 = bitcast <2 x double> %2 to <2 x i64>
   %4 = add <2 x i64> %3, %3
@@ -979,7 +979,7 @@ define void @test_v4f32_v2f64(ptr %p, ptr %q) {
 ; CHECK: rev64 v{{[0-9]+}}.4s
 ; CHECK-NOT: rev
 ; CHECK: st1 { v{{[0-9]+}}.4s }
-  %1 = load <2 x double>, ptr %p
+  %1 = load volatile <2 x double>, ptr %p
   %2 = fadd <2 x double> %1, %1
   %3 = bitcast <2 x double> %2 to <4 x float>
   %4 = fadd <4 x float> %3, %3
@@ -994,7 +994,7 @@ define void @test_v4f32_v2i64(ptr %p, ptr %q) {
 ; CHECK: fadd
 ; CHECK-NOT: rev
 ; CHECK: st1 { v{{[0-9]+}}.4s }
-  %1 = load <2 x i64>, ptr %p
+  %1 = load volatile <2 x i64>, ptr %p
   %2 = add <2 x i64> %1, %1
   %3 = bitcast <2 x i64> %2 to <4 x float>
   %4 = fadd <4 x float> %3, %3
@@ -1062,7 +1062,7 @@ define void @test_v4i32_v2f64(ptr %p, ptr %q) {
 ; CHECK: ld1 { v{{[0-9]+}}.2d }
 ; CHECK: rev64 v{{[0-9]+}}.4s
 ; CHECK: st1 { v{{[0-9]+}}.4s }
-  %1 = load <2 x double>, ptr %p
+  %1 = load volatile <2 x double>, ptr %p
   %2 = fadd <2 x double> %1, %1
   %3 = bitcast <2 x double> %2 to <4 x i32>
   %4 = add <4 x i32> %3, %3
@@ -1075,7 +1075,7 @@ define void @test_v4i32_v2i64(ptr %p, ptr %q) {
 ; CHECK: ld1 { v{{[0-9]+}}.2d }
 ; CHECK: rev64 v{{[0-9]+}}.4s
 ; CHECK: st1 { v{{[0-9]+}}.4s }
-  %1 = load <2 x i64>, ptr %p
+  %1 = load volatile <2 x i64>, ptr %p
   %2 = add <2 x i64> %1, %1
   %3 = bitcast <2 x i64> %2 to <4 x i32>
   %4 = add <4 x i32> %3, %3
@@ -1141,7 +1141,7 @@ define void @test_v8i16_v2f64(ptr %p, ptr %q) {
 ; CHECK: ld1 { v{{[0-9]+}}.2d }
 ; CHECK: rev64 v{{[0-9]+}}.8h
 ; CHECK: st1 { v{{[0-9]+}}.8h }
-  %1 = load <2 x double>, ptr %p
+  %1 = load volatile <2 x double>, ptr %p
   %2 = fadd <2 x double> %1, %1
   %3 = bitcast <2 x double> %2 to <8 x i16>
   %4 = add <8 x i16> %3, %3
@@ -1154,7 +1154,7 @@ define void @test_v8i16_v2i64(ptr %p, ptr %q) {
 ; CHECK: ld1 { v{{[0-9]+}}.2d }
 ; CHECK: rev64 v{{[0-9]+}}.8h
 ; CHECK: st1 { v{{[0-9]+}}.8h }
-  %1 = load <2 x i64>, ptr %p
+  %1 = load volatile <2 x i64>, ptr %p
   %2 = add <2 x i64> %1, %1
   %3 = bitcast <2 x i64> %2 to <8 x i16>
   %4 = add <8 x i16> %3, %3
@@ -1234,7 +1234,7 @@ define void @test_v16i8_v2f64(ptr %p, ptr %q) {
 ; CHECK: ld1 { v{{[0-9]+}}.2d }
 ; CHECK: rev64 v{{[0-9]+}}.16b
 ; CHECK: st1 { v{{[0-9]+}}.16b }
-  %1 = load <2 x double>, ptr %p
+  %1 = load volatile <2 x double>, ptr %p
   %2 = fadd <2 x double> %1, %1
   %3 = bitcast <2 x double> %2 to <16 x i8>
   %4 = add <16 x i8> %3, %3
@@ -1247,7 +1247,7 @@ define void @test_v16i8_v2i64(ptr %p, ptr %q) {
 ; CHECK: ld1 { v{{[0-9]+}}.2d }
 ; CHECK: rev64 v{{[0-9]+}}.16b
 ; CHECK: st1 { v{{[0-9]+}}.16b }
-  %1 = load <2 x i64>, ptr %p
+  %1 = load volatile <2 x i64>, ptr %p
   %2 = add <2 x i64> %1, %1
   %3 = bitcast <2 x i64> %2 to <16 x i8>
   %4 = add <16 x i8> %3, %3
@@ -1315,7 +1315,7 @@ define %struct.struct1 @test_v4f16_struct(ptr %ret) {
 entry:
 ; CHECK: ld1 { {{v[0-9]+}}.4h }
 ; CHECK-NOT: rev
-  %0 = load <4 x half>, ptr %ret, align 2
+  %0 = load volatile <4 x half>, ptr %ret, align 2
   %1 = extractelement <4 x half> %0, i32 0
   %.fca.0.insert = insertvalue %struct.struct1 undef, half %1, 0
   ret %struct.struct1 %.fca.0.insert
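The `volatile` markings above are test-only adjustments: with the new combine, the DAG can now fold or narrow some of these loads (only part of the vector, or only a bitcast scalar view of it, ends up demanded), which would erase the `ld1`/`rev` sequences these big-endian CHECK lines exist to verify. Marking the load volatile pins the original full-width vector load. A minimal sketch of the idea (hypothetical function, not from the test file):

```llvm
; volatile forbids the combine from shrinking or folding the access,
; so the <2 x float> load survives instruction selection intact.
define float @keep_full_vector_load(ptr %p) {
  %v = load volatile <2 x float>, ptr %p
  %e = extractelement <2 x float> %v, i32 0
  ret float %e
}
```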
diff --git a/llvm/test/CodeGen/AArch64/dag-ReplaceAllUsesOfValuesWith.ll b/llvm/test/CodeGen/AArch64/dag-ReplaceAllUsesOfValuesWith.ll
index d76e817e62a49..ce657aa1f0b5b 100644
--- a/llvm/test/CodeGen/AArch64/dag-ReplaceAllUsesOfValuesWith.ll
+++ b/llvm/test/CodeGen/AArch64/dag-ReplaceAllUsesOfValuesWith.ll
@@ -27,10 +27,7 @@
 define i64 @g(ptr %p) {
 ; CHECK-LABEL: g:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    ldr x8, [x0, #8]
-; CHECK-NEXT:    add x9, x8, x8
-; CHECK-NEXT:    add x8, x9, x8
-; CHECK-NEXT:    sub x0, x8, x8
+; CHECK-NEXT:    mov x0, xzr
 ; CHECK-NEXT:    ret
   %vec = load <2 x i64>, ptr %p, align 1
   %elt = extractelement <2 x i64> %vec, i32 1
diff --git a/llvm/test/CodeGen/AArch64/fcmp.ll b/llvm/test/CodeGen/AArch64/fcmp.ll
index 66f26fc9d8597..d39e537edb786 100644
--- a/llvm/test/CodeGen/AArch64/fcmp.ll
+++ b/llvm/test/CodeGen/AArch64/fcmp.ll
@@ -679,28 +679,27 @@ define <3 x double> @v3f128_double(<3 x fp128> %a, <3 x fp128> %b, <3 x double>
 ; CHECK-SD-NEXT:    .cfi_def_cfa_offset 160
 ; CHECK-SD-NEXT:    .cfi_offset w30, -16
 ; CHECK-SD-NEXT:    stp q2, q5, [sp, #112] // 32-byte Folded Spill
+; CHECK-SD-NEXT:    add x8, sp, #176
 ; CHECK-SD-NEXT:    // kill: def $d6 killed $d6 def $q6
 ; CHECK-SD-NEXT:    // kill: def $d7 killed $d7 def $q7
-; CHECK-SD-NEXT:    ldr d5, [sp, #184]
-; CHECK-SD-NEXT:    str q3, [sp, #64] // 16-byte Folded Spill
-; CHECK-SD-NEXT:    ldp d3, d2, [sp, #168]
+; CHECK-SD-NEXT:    str q3, [sp, #32] // 16-byte Folded Spill
+; CHECK-SD-NEXT:    ldp d3, d2, [sp, #160]
 ; CHECK-SD-NEXT:    mov v6.d[1], v7.d[0]
 ; CHECK-SD-NEXT:    str q0, [sp, #16] // 16-byte Folded Spill
 ; CHECK-SD-NEXT:    mov v0.16b, v1.16b
 ; CHECK-SD-NEXT:    mov v1.16b, v4.16b
-; CHECK-SD-NEXT:    str q5, [sp, #96] // 16-byte Folded Spill
-; CHECK-SD-NEXT:    ldr d5, [sp, #160]
-; CHECK-SD-NEXT:    mov v3.d[1], v2.d[0]
-; CHECK-SD-NEXT:    str q5, [sp, #80] // 16-byte Folded Spill
-; CHECK-SD-NEXT:    stp q6, q3, [sp, #32] // 32-byte Folded Spill
+; CHECK-SD-NEXT:    ld1 { v2.d }[1], [x8]
+; CHECK-SD-NEXT:    stp q6, q3, [sp, #80] // 32-byte Folded Spill
+; CHECK-SD-NEXT:    str q2, [sp, #48] // 16-byte Folded Spill
+; CHECK-SD-NEXT:    ldr d2, [sp, #184]
+; CHECK-SD-NEXT:    str q2, [sp, #64] // 16-byte Folded Spill
 ; CHECK-SD-NEXT:    bl __lttf2
 ; CHECK-SD-NEXT:    cmp w0, #0
-; CHECK-SD-NEXT:    ldr q1, [sp, #64] // 16-byte Folded Reload
 ; CHECK-SD-NEXT:    cset w8, lt
 ; CHECK-SD-NEXT:    sbfx x8, x8, #0, #1
 ; CHECK-SD-NEXT:    fmov d0, x8
 ; CHECK-SD-NEXT:    str q0, [sp] // 16-byte Folded Spill
-; CHECK-SD-NEXT:    ldr q0, [sp, #16] // 16-byte Folded Reload
+; CHECK-SD-NEXT:    ldp q0, q1, [sp, #16] // 32-byte Folded Reload
 ; CHECK-SD-NEXT:    bl __lttf2
 ; CHECK-SD-NEXT:    cmp w0, #0
 ; CHECK-SD-NEXT:    ldr q0, [sp] // 16-byte Folded Reload
@@ -708,19 +707,19 @@ define <3 x double> @v3f128_double(<3 x fp128> %a, <3 x fp128> %b, <3 x double>
 ; CHECK-SD-NEXT:    sbfx x8, x8, #0, #1
 ; CHECK-SD-NEXT:    fmov d1, x8
 ; CHECK-SD-NEXT:    mov v1.d[1], v0.d[0]
-; CHECK-SD-NEXT:    str q1, [sp, #64] // 16-byte Folded Spill
+; CHECK-SD-NEXT:    str q1, [sp, #32] // 16-byte Folded Spill
 ; CHECK-SD-NEXT:    ldp q0, q1, [sp, #112] // 32-byte Folded Reload
 ; CHECK-SD-NEXT:    bl __lttf2
-; CHECK-SD-NEXT:    ldp q1, q0, [sp, #32] // 32-byte Folded Reload
+; CHECK-SD-NEXT:    ldp q0, q3, [sp, #80] // 32-byte Folded Reload
 ; CHECK-SD-NEXT:    cmp w0, #0
-; CHECK-SD-NEXT:    ldp q2, q4, [sp, #64] // 32-byte Folded Reload
+; CHECK-SD-NEXT:    ldp q2, q1, [sp, #32] // 32-byte Folded Reload
 ; CHECK-SD-NEXT:    cset w8, lt
 ; CHECK-SD-NEXT:    sbfx x8, x8, #0, #1
-; CHECK-SD-NEXT:    ldr q3, [sp, #96] // 16-byte Folded Reload
+; CHECK-SD-NEXT:    ldr q4, [sp, #64] // 16-byte Folded Reload
 ; CHECK-SD-NEXT:    ldr x30, [sp, #144] // 8-byte Folded Reload
-; CHECK-SD-NEXT:    bit v0.16b, v1.16b, v2.16b
+; CHECK-SD-NEXT:    bif v0.16b, v1.16b, v2.16b
 ; CHECK-SD-NEXT:    fmov d2, x8
-; CHECK-SD-NEXT:    bsl v2.16b, v4.16b, v3.16b
+; CHECK-SD-NEXT:    bsl v2.16b, v3.16b, v4.16b
 ; CHECK-SD-NEXT:    ext v1.16b, v0.16b, v0.16b, #8
 ; CHECK-SD-NEXT:    // kill: def $d0 killed $d0 killed $q0
 ; CHECK-SD-NEXT:    // kill: def $d2 killed $d2 killed $q2
@@ -815,20 +814,20 @@ define <3 x double> @v3f64_double(<3 x double> %a, <3 x double> %b, <3 x double>
 ; CHECK-SD-NEXT:    // kill: def $d1 killed $d1 def $q1
 ; CHECK-SD-NEXT:    // kill: def $d6 killed $d6 def $q6
 ; CHECK-SD-NEXT:    // kill: def $d7 killed $d7 def $q7
+; CHECK-SD-NEXT:    add x8, sp, #16
 ; CHECK-SD-NEXT:    // kill: def $d2 killed $d2 def $q2
 ; CHECK-SD-NEXT:    // kill: def $d5 killed $d5 def $q5
-; CHECK-SD-NEXT:    ldr d16, [sp, #24]
-; CHECK-SD-NEXT:    ldr d17, [sp]
 ; CHECK-SD-NEXT:    mov v3.d[1], v4.d[0]
 ; CHECK-SD-NEXT:    mov v0.d[1], v1.d[0]
 ; CHECK-SD-NEXT:    mov v6.d[1], v7.d[0]
-; CHECK-SD-NEXT:    ldp d1, d4, [sp, #8]
 ; CHECK-SD-NEXT:    fcmgt v2.2d, v5.2d, v2.2d
-; CHECK-SD-NEXT:    mov v1.d[1], v4.d[0]
 ; CHECK-SD-NEXT:    fcmgt v0.2d, v3.2d, v0.2d
-; CHECK-SD-NEXT:    bsl v2.16b, v17.16b, v16.16b
-; CHECK-SD-NEXT:    // kill: def $d2 killed $d2 killed $q2
+; CHECK-SD-NEXT:    ldp d3, d1, [sp]
+; CHECK-SD-NEXT:    ld1 { v1.d }[1], [x8]
 ; CHECK-SD-NEXT:    bsl v0.16b, v6.16b, v1.16b
+; CHECK-SD-NEXT:    ldr d1, [sp, #24]
+; CHECK-SD-NEXT:    bsl v2.16b, v3.16b, v1.16b
+; CHECK-SD-NEXT:    // kill: def $d2 killed $d2 killed $q2
 ; CHECK-SD-NEXT:    ext v1.16b, v0.16b, v0.16b, #8
 ; CHECK-SD-NEXT:    // kill: def $d0 killed $d0 killed $q0
 ; CHECK-SD-NEXT:    // kill: def $d1 killed $d1 killed $q1
diff --git a/llvm/test/CodeGen/AArch64/fmlal-loreg.ll b/llvm/test/CodeGen/AArch64/fmlal-loreg.ll
index 31ead890ba8ac..ed22243eeef45 100644
--- a/llvm/test/CodeGen/AArch64/fmlal-loreg.ll
+++ b/llvm/test/CodeGen/AArch64/fmlal-loreg.ll
@@ -45,11 +45,11 @@ define void @loop(ptr %out_tile, ptr %lhs_panel, ptr %rhs_panel, i32 noundef %K,
 ; CHECK-NEXT:    mov w8, w3
 ; CHECK-NEXT:  .LBB1_1: // %for.body
 ; CHECK-NEXT:    // =>This Inner Loop Header: Depth=1
-; CHECK-NEXT:    ldr q2, [x1], #2
+; CHECK-NEXT:    ldr q2, [x2], #2
 ; CHECK-NEXT:    subs x8, x8, #1
-; CHECK-NEXT:    ldr q3, [x2], #2
-; CHECK-NEXT:    fmlal v0.4s, v3.4h, v2.h[0]
-; CHECK-NEXT:    fmlal2 v1.4s, v3.4h, v2.h[0]
+; CHECK-NEXT:    ld1r { v3.8h }, [x1], #2
+; CHECK-NEXT:    fmlal v0.4s, v2.4h, v3.4h
+; CHECK-NEXT:    fmlal2 v1.4s, v2.4h, v3.4h
 ; CHECK-NEXT:    b.ne .LBB1_1
 ; CHECK-NEXT:  // %bb.2: // %for.cond.cleanup
 ; CHECK-NEXT:    stp q0, q1, [x0]
diff --git a/llvm/test/CodeGen/AArch64/icmp.ll b/llvm/test/CodeGen/AArch64/icmp.ll
index e284795760c5c..f586647439d25 100644
--- a/llvm/test/CodeGen/AArch64/icmp.ll
+++ b/llvm/test/CodeGen/AArch64/icmp.ll
@@ -1123,30 +1123,29 @@ entry:
 define <3 x i64> @v3i64_i64(<3 x i64> %a, <3 x i64> %b, <3 x i64> %d, <3 x i64> %e) {
 ; CHECK-SD-LABEL: v3i64_i64:
 ; CHECK-SD:       // %bb.0: // %entry
-; CHECK-SD-NEXT:    // kill: def $d4 killed $d4 def $q4
 ; CHECK-SD-NEXT:    // kill: def $d3 killed $d3 def $q3
-; CHECK-SD-NEXT:    // kill: def $d1 killed $d1 def $q1
 ; CHECK-SD-NEXT:    // kill: def $d0 killed $d0 def $q0
+; CHECK-SD-NEXT:    // kill: def $d4 killed $d4 def $q4
+; CHECK-SD-NEXT:    // kill: def $d1 killed $d1 def $q1
 ; CHECK-SD-NEXT:    // kill: def $d6 killed $d6 def $q6
 ; CHECK-SD-NEXT:    // kill: def $d7 killed $d7 def $q7
+; CHECK-SD-NEXT:    add x8, sp, #16
 ; CHECK-SD-NEXT:    // kill: def $d5 killed $d5 def $q5
 ; CHECK-SD-NEXT:    // kill: def $d2 killed $d2 def $q2
-; CHECK-SD-NEXT:    ldr d16, [sp, #24]
-; CHECK-SD-NEXT:    ldr d17, [sp]
 ; CHECK-SD-NEXT:    mov v3.d[1], v4.d[0]
 ; CHECK-SD-NEXT:    mov v0.d[1], v1.d[0]
 ; CHECK-SD-NEXT:    mov v6.d[1], v7.d[0]
-; CHECK-SD-NEXT:    ldp d1, d4, [sp, #8]
-; CHECK-SD-NEXT:    mov v1.d[1], v4.d[0]
+; CHECK-SD-NEXT:    ldp d4, d1, [sp]
+; CHECK-SD-NEXT:    ld1 { v1.d }[1], [x8]
 ; CHECK-SD-NEXT:    cmgt v0.2d, v3.2d, v0.2d
 ; CHECK-SD-NEXT:    bsl v0.16b, v6.16b, v1.16b
 ; CHECK-SD-NEXT:    cmgt v1.2d, v5.2d, v2.2d
-; CHECK-SD-NEXT:    mov v2.16b, v1.16b
+; CHECK-SD-NEXT:    ldr d2, [sp, #24]
+; CHECK-SD-NEXT:    bit v2.16b, v4.16b, v1.16b
 ; CHECK-SD-NEXT:    ext v1.16b, v0.16b, v0.16b, #8
 ; CHECK-SD-NEXT:    // kill: def $d0 killed $d0 killed $q0
-; CHECK-SD-NEXT:    // kill: def $d1 killed $d1 killed $q1
-; CHECK-SD-NEXT:    bsl v2.16b, v17.16b, v16.16b
 ; CHECK-SD-NEXT:    // kill: def $d2 killed $d2 killed $q2
+; CHECK-SD-NEXT:    // kill: def $d1 killed $d1 killed $q1
 ; CHECK-SD-NEXT:    ret
 ;
 ; CHECK-GI-LABEL: v3i64_i64:
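The fcmp.ll and icmp.ll hunks above show the INSERT_VECTOR_ELT side of the combine: a two-lane value assembled from stack slots is now built with `ld1 { v1.d }[1], [x8]`, i.e. a load directly into a vector lane, instead of separate d-register loads plus a lane move. A small standalone sketch of the pattern (assumed function, not part of these tests):

```llvm
; A scalar loaded from memory and inserted into lane 1 selects to
; "ld1 { v0.d }[1], [x0]" on AArch64.
define <2 x i64> @insert_lane_from_mem(<2 x i64> %v, ptr %slot) {
  %s = load i64, ptr %slot, align 8
  %r = insertelement <2 x i64> %v, i64 %s, i32 1
  ret <2 x i64> %r
}
```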
diff --git a/llvm/test/CodeGen/AArch64/sve-fixed-length-extract-vector-elt.ll b/llvm/test/CodeGen/AArch64/sve-fixed-length-extract-vector-elt.ll
index ad4efeaf39247..1e6427c4cd495 100644
--- a/llvm/test/CodeGen/AArch64/sve-fixed-length-extract-vector-elt.ll
+++ b/llvm/test/CodeGen/AArch64/sve-fixed-length-extract-vector-elt.ll
@@ -33,10 +33,7 @@ define half @extractelement_v8f16(<8 x half> %op1) vscale_range(2,0) #0 {
 define half @extractelement_v16f16(ptr %a) vscale_range(2,0) #0 {
 ; CHECK-LABEL: extractelement_v16f16:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    ptrue p0.h, vl16
-; CHECK-NEXT:    ld1h { z0.h }, p0/z, [x0]
-; CHECK-NEXT:    mov z0.h, z0.h[15]
-; CHECK-NEXT:    // kill: def $h0 killed $h0 killed $z0
+; CHECK-NEXT:    ldr h0, [x0, #30]
 ; CHECK-NEXT:    ret
   %op1 = load <16 x half>, ptr %a
   %r = extractelement <16 x half> %op1, i64 15
@@ -44,22 +41,10 @@ define half @extractelement_v16f16(ptr %a) vscale_range(2,0) #0 {
 }
 
 define half @extractelement_v32f16(ptr %a) #0 {
-; VBITS_GE_256-LABEL: extractelement_v32f16:
-; VBITS_GE_256:       // %bb.0:
-; VBITS_GE_256-NEXT:    ptrue p0.h, vl16
-; VBITS_GE_256-NEXT:    mov x8, #16 // =0x10
-; VBITS_GE_256-NEXT:    ld1h { z0.h }, p0/z, [x0, x8, lsl #1]
-; VBITS_GE_256-NEXT:    mov z0.h, z0.h[15]
-; VBITS_GE_256-NEXT:    // kill: def $h0 killed $h0 killed $z0
-; VBITS_GE_256-NEXT:    ret
-;
-; VBITS_GE_512-LABEL: extractelement_v32f16:
-; VBITS_GE_512:       // %bb.0:
-; VBITS_GE_512-NEXT:    ptrue p0.h, vl32
-; VBITS_GE_512-NEXT:    ld1h { z0.h }, p0/z, [x0]
-; VBITS_GE_512-NEXT:    mov z0.h, z0.h[31]
-; VBITS_GE_512-NEXT:    // kill: def $h0 killed $h0 killed $z0
-; VBITS_GE_512-NEXT:    ret
+; CHECK-LABEL: extractelement_v32f16:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ldr h0, [x0, #62]
+; CHECK-NEXT:    ret
   %op1 = load <32 x half>, ptr %a
   %r = extractelement <32 x half> %op1, i64 31
   ret half %r
@@ -68,11 +53,7 @@ define half @extractelement_v32f16(ptr %a) #0 {
 define half @extractelement_v64f16(ptr %a) vscale_range(8,0) #0 {
 ; CHECK-LABEL: extractelement_v64f16:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    ptrue p0.h, vl64
-; CHECK-NEXT:    mov w8, #63 // =0x3f
-; CHECK-NEXT:    ld1h { z0.h }, p0/z, [x0]
-; CHECK-NEXT:    whilels p0.h, xzr, x8
-; CHECK-NEXT:    lastb h0, p0, z0.h
+; CHECK-NEXT:    ldr h0, [x0, #126]
 ; CHECK-NEXT:    ret
   %op1 = load <64 x half>, ptr %a
   %r = extractelement <64 x half> %op1, i64 63
@@ -82,11 +63,7 @@ define half @extractelement_v64f16(ptr %a) vscale_range(8,0) #0 {
 define half @extractelement_v128f16(ptr %a) vscale_range(16,0) #0 {
 ; CHECK-LABEL: extractelement_v128f16:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    ptrue p0.h, vl128
-; CHECK-NEXT:    mov w8, #127 // =0x7f
-; CHECK-NEXT:    ld1h { z0.h }, p0/z, [x0]
-; CHECK-NEXT:    whilels p0.h, xzr, x8
-; CHECK-NEXT:    lastb h0, p0, z0.h
+; CHECK-NEXT:    ldr h0, [x0, #254]
 ; CHECK-NEXT:    ret
   %op1 = load <128 x half>, ptr %a
   %r = extractelement <128 x half> %op1, i64 127
@@ -117,10 +94,7 @@ define float @extractelement_v4f32(<4 x float> %op1) vscale_range(2,0) #0 {
 define float @extractelement_v8f32(ptr %a) vscale_range(2,0) #0 {
 ; CHECK-LABEL: extractelement_v8f32:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    ptrue p0.s, vl8
-; CHECK-NEXT:    ld1w { z0.s }, p0/z, [x0]
-; CHECK-NEXT:    mov z0.s, z0.s[7]
-; CHECK-NEXT:    // kill: def $s0 killed $s0 killed $z0
+; CHECK-NEXT:    ldr s0, [x0, #28]
 ; CHECK-NEXT:    ret
   %op1 = load <8 x float>, ptr %a
   %r = extractelement <8 x float> %op1, i64 7
@@ -128,22 +102,10 @@ define float @extractelement_v8f32(ptr %a) vscale_range(2,0) #0 {
 }
 
 define float @extractelement_v16f32(ptr %a) #0 {
-; VBITS_GE_256-LABEL: extractelement_v16f32:
-; VBITS_GE_256:       // %bb.0:
-; VBITS_GE_256-NEXT:    ptrue p0.s, vl8
-; VBITS_GE_256-NEXT:    mov x8, #8 // =0x8
-; VBITS_GE_256-NEXT:    ld1w { z0.s }, p0/z, [x0, x8, lsl #2]
-; VBITS_GE_256-NEXT:    mov z0.s, z0.s[7]
-; VBITS_GE_256-NEXT:    // kill: def $s0 killed $s0 killed $z0
-; VBITS_GE_256-NEXT:    ret
-;
-; VBITS_GE_512-LABEL: extractelement_v16f32:
-; VBITS_GE_512:       // %bb.0:
-; VBITS_GE_512-NEXT:    ptrue p0.s, vl16
-; VBITS_GE_512-NEXT:    ld1w { z0.s }, p0/z, [x0]
-; VBITS_GE_512-NEXT:    mov z0.s, z0.s[15]
-; VBITS_GE_512-NEXT:    // kill: def $s0 killed $s0 killed $z0
-; VBITS_GE_512-NEXT:    ret
+; CHECK-LABEL: extractelement_v16f32:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ldr s0, [x0, #60]
+; CHECK-NEXT:    ret
   %op1 = load <16 x float>, ptr %a
   %r = extractelement <16 x float> %op1, i64 15
   ret float %r
@@ -152,11 +114,7 @@ define float @extractelement_v16f32(ptr %a) #0 {
 define float @extractelement_v32f32(ptr %a) vscale_range(8,0) #0 {
 ; CHECK-LABEL: extractelement_v32f32:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    ptrue p0.s, vl32
-; CHECK-NEXT:    mov w8, #31 // =0x1f
-; CHECK-NEXT:    ld1w { z0.s }, p0/z, [x0]
-; CHECK-NEXT:    whilels p0.s, xzr, x8
-; CHECK-NEXT:    lastb s0, p0, z0.s
+; CHECK-NEXT:    ldr s0, [x0, #124]
 ; CHECK-NEXT:    ret
   %op1 = load <32 x float>, ptr %a
   %r = extractelement <32 x float> %op1, i64 31
@@ -166,11 +124,7 @@ define float @extractelement_v32f32(ptr %a) vscale_range(8,0) #0 {
 define float @extractelement_v64f32(ptr %a) vscale_range(16,0) #0 {
 ; CHECK-LABEL: extractelement_v64f32:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    ptrue p0.s, vl64
-; CHECK-NEXT:    mov w8, #63 // =0x3f
-; CHECK-NEXT:    ld1w { z0.s }, p0/z, [x0]
-; CHECK-NEXT:    whilels p0.s, xzr, x8
-; CHECK-NEXT:    lastb s0, p0, z0.s
+; CHECK-NEXT:    ldr s0, [x0, #252]
 ; CHECK-NEXT:    ret
   %op1 = load <64 x float>, ptr %a
   %r = extractelement <64 x float> %op1, i64 63
@@ -199,10 +153,7 @@ define double @extractelement_v2f64(<2 x double> %op1) vscale_range(2,0) #0 {
 define double @extractelement_v4f64(ptr %a) vscale_range(2,0) #0 {
 ; CHECK-LABEL: extractelement_v4f64:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    ptrue p0.d, vl4
-; CHECK-NEXT:    ld1d { z0.d }, p0/z, [x0]
-; CHECK-NEXT:    mov z0.d, z0.d[3]
-; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
+; CHECK-NEXT:    ldr d0, [x0, #24]
 ; CHECK-NEXT:    ret
   %op1 = load <4 x double>, ptr %a
   %r = extractelement <4 x double> %op1, i64 3
@@ -210,22 +161,10 @@ define double @extractelement_v4f64(ptr %a) vscale_range(2,0) #0 {
 }
 
 define double @extractelement_v8f64(ptr %a) #0 {
-; VBITS_GE_256-LABEL: extractelement_v8f64:
-; VBITS_GE_256:       // %bb.0:
-; VBITS_GE_256-NEXT:    ptrue p0.d, vl4
-; VBITS_GE_256-NEXT:    mov x8, #4 // =0x4
-; VBITS_GE_256-NEXT:    ld1d { z0.d }, p0/z, [x0, x8, lsl #3]
-; VBITS_GE_256-NEXT:    mov z0.d, z0.d[3]
-; VBITS_GE_256-NEXT:    // kill: def $d0 killed $d0 killed $z0
-; VBITS_GE_256-NEXT:    ret
-;
-; VBITS_GE_512-LABEL: extractelement_v8f64:
-; VBITS_GE_512:       // %bb.0:
-; VBITS_GE_512-NEXT:    ptrue p0.d, vl8
-; VBITS_GE_512-NEXT:    ld1d { z0.d }, p0/z, [x0]
-; VBITS_GE_512-NEXT:    mov z0.d, z0.d[7]
-; VBITS_GE_512-NEXT:    // kill: def $d0 killed $d0 killed $z0
-; VBITS_GE_512-NEXT:    ret
+; CHECK-LABEL: extractelement_v8f64:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ldr d0, [x0, #56]
+; CHECK-NEXT:    ret
   %op1 = load <8 x double>, ptr %a
   %r = extractelement <8 x double> %op1, i64 7
   ret double %r
@@ -234,11 +173,7 @@ define double @extractelement_v8f64(ptr %a) #0 {
 define double @extractelement_v16f64(ptr %a) vscale_range(8,0) #0 {
 ; CHECK-LABEL: extractelement_v16f64:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    ptrue p0.d, vl16
-; CHECK-NEXT:    mov w8, #15 // =0xf
-; CHECK-NEXT:    ld1d { z0.d }, p0/z, [x0]
-; CHECK-NEXT:    whilels p0.d, xzr, x8
-; CHECK-NEXT:    lastb d0, p0, z0.d
+; CHECK-NEXT:    ldr d0, [x0, #120]
 ; CHECK-NEXT:    ret
   %op1 = load <16 x double>, ptr %a
   %r = extractelement <16 x double> %op1, i64 15
@@ -248,11 +183,7 @@ define double @extractelement_v16f64(ptr %a) vscale_range(8,0) #0 {
 define double @extractelement_v32f64(ptr %a) vscale_range(16,0) #0 {
 ; CHECK-LABEL: extractelement_v32f64:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    ptrue p0.d, vl32
-; CHECK-NEXT:    mov w8, #31 // =0x1f
-; CHECK-NEXT:    ld1d { z0.d }, p0/z, [x0]
-; CHECK-NEXT:    whilels p0.d, xzr, x8
-; CHECK-NEXT:    lastb d0, p0, z0.d
+; CHECK-NEXT:    ldr d0, [x0, #248]
 ; CHECK-NEXT:    ret
   %op1 = load <32 x double>, ptr %a
   %r = extractelement <32 x double> %op1, i64 31
@@ -260,3 +191,6 @@ define double @extractelement_v32f64(ptr %a) vscale_range(16,0) #0 {
 }
 
 attributes #0 = { "target-features"="+sve" }
+;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
+; VBITS_GE_256: {{.*}}
+; VBITS_GE_512: {{.*}}
diff --git a/llvm/test/CodeGen/AArch64/sve-fixed-length-masked-gather.ll b/llvm/test/CodeGen/AArch64/sve-fixed-length-masked-gather.ll
index 27e95489f8ad7..5233d292c4eaf 100644
--- a/llvm/test/CodeGen/AArch64/sve-fixed-length-masked-gather.ll
+++ b/llvm/test/CodeGen/AArch64/sve-fixed-length-masked-gather.ll
@@ -438,8 +438,7 @@ define void @masked_gather_v32i32(ptr %a, ptr %b) vscale_range(16,0) #0 {
 define void @masked_gather_v1i64(ptr %a, ptr %b) vscale_range(2,0) #0 {
 ; CHECK-LABEL: masked_gather_v1i64:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    ldr d0, [x0]
-; CHECK-NEXT:    fmov x8, d0
+; CHECK-NEXT:    ldr x8, [x0]
 ; CHECK-NEXT:    // implicit-def: $d0
 ; CHECK-NEXT:    cbnz x8, .LBB15_2
 ; CHECK-NEXT:  // %bb.1: // %cond.load
diff --git a/llvm/test/CodeGen/AArch64/sve-fixed-length-masked-scatter.ll b/llvm/test/CodeGen/AArch64/sve-fixed-length-masked-scatter.ll
index e3e06dcdf17f3..5af3a88c711bd 100644
--- a/llvm/test/CodeGen/AArch64/sve-fixed-length-masked-scatter.ll
+++ b/llvm/test/CodeGen/AArch64/sve-fixed-length-masked-scatter.ll
@@ -415,13 +415,13 @@ define void @masked_scatter_v32i32(ptr %a, ptr %b) vscale_range(16,0) #0 {
 define void @masked_scatter_v1i64(ptr %a, ptr %b) vscale_range(2,0) #0 {
 ; CHECK-LABEL: masked_scatter_v1i64:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    ldr d0, [x0]
-; CHECK-NEXT:    fmov x8, d0
+; CHECK-NEXT:    ldr x8, [x0]
 ; CHECK-NEXT:    cbnz x8, .LBB15_2
 ; CHECK-NEXT:  // %bb.1: // %cond.store
-; CHECK-NEXT:    ldr d1, [x1]
-; CHECK-NEXT:    fmov x8, d1
-; CHECK-NEXT:    str d0, [x8]
+; CHECK-NEXT:    ldr d0, [x1]
+; CHECK-NEXT:    fmov x9, d0
+; CHECK-NEXT:    fmov d0, x8
+; CHECK-NEXT:    str d0, [x9]
 ; CHECK-NEXT:  .LBB15_2: // %else
 ; CHECK-NEXT:    ret
   %vals = load <1 x i64>, ptr %a
diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-extract-vector-elt.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-extract-vector-elt.ll
index cf308e6c4395f..f0e5fa6e03090 100644
--- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-extract-vector-elt.ll
+++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-extract-vector-elt.ll
@@ -71,18 +71,12 @@ define half @extractelement_v8f16(<8 x half> %op1) {
 define half @extractelement_v16f16(ptr %a) {
 ; CHECK-LABEL: extractelement_v16f16:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    ldr q0, [x0, #16]
-; CHECK-NEXT:    mov z0.h, z0.h[7]
-; CHECK-NEXT:    // kill: def $h0 killed $h0 killed $z0
+; CHECK-NEXT:    ldr h0, [x0, #30]
 ; CHECK-NEXT:    ret
 ;
 ; NONEON-NOSVE-LABEL: extractelement_v16f16:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldr q0, [x0, #16]
-; NONEON-NOSVE-NEXT:    str q0, [sp, #-16]!
-; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
-; NONEON-NOSVE-NEXT:    ldr h0, [sp, #14]
-; NONEON-NOSVE-NEXT:    add sp, sp, #16
+; NONEON-NOSVE-NEXT:    ldr h0, [x0, #30]
 ; NONEON-NOSVE-NEXT:    ret
   %op1 = load <16 x half>, ptr %a
   %r = extractelement <16 x half> %op1, i64 15
@@ -131,18 +125,12 @@ define float @extractelement_v4f32(<4 x float> %op1) {
 define float @extractelement_v8f32(ptr %a) {
 ; CHECK-LABEL: extractelement_v8f32:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    ldr q0, [x0, #16]
-; CHECK-NEXT:    mov z0.s, z0.s[3]
-; CHECK-NEXT:    // kill: def $s0 killed $s0 killed $z0
+; CHECK-NEXT:    ldr s0, [x0, #28]
 ; CHECK-NEXT:    ret
 ;
 ; NONEON-NOSVE-LABEL: extractelement_v8f32:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldr q0, [x0, #16]
-; NONEON-NOSVE-NEXT:    str q0, [sp, #-16]!
-; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
-; NONEON-NOSVE-NEXT:    ldr s0, [sp, #12]
-; NONEON-NOSVE-NEXT:    add sp, sp, #16
+; NONEON-NOSVE-NEXT:    ldr s0, [x0, #28]
 ; NONEON-NOSVE-NEXT:    ret
   %op1 = load <8 x float>, ptr %a
   %r = extractelement <8 x float> %op1, i64 7
@@ -182,18 +170,12 @@ define double @extractelement_v2f64(<2 x double> %op1) {
 define double @extractelement_v4f64(ptr %a) {
 ; CHECK-LABEL: extractelement_v4f64:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    ldr q0, [x0, #16]
-; CHECK-NEXT:    mov z0.d, z0.d[1]
-; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
+; CHECK-NEXT:    ldr d0, [x0, #24]
 ; CHECK-NEXT:    ret
 ;
 ; NONEON-NOSVE-LABEL: extractelement_v4f64:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    ldr q0, [x0, #16]
-; NONEON-NOSVE-NEXT:    str q0, [sp, #-16]!
-; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
-; NONEON-NOSVE-NEXT:    ldr d0, [sp, #8]
-; NONEON-NOSVE-NEXT:    add sp, sp, #16
+; NONEON-NOSVE-NEXT:    ldr d0, [x0, #24]
 ; NONEON-NOSVE-NEXT:    ret
   %op1 = load <4 x double>, ptr %a
   %r = extractelement <4 x double> %op1, i64 3
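All of the SVE fixed-length extracts above now collapse to a single scalar load at `lane * element-size` bytes: lane 15 of `<16 x half>` sits at 15 * 2 = 30 bytes (`ldr h0, [x0, #30]`), lane 63 of `<64 x float>` at 63 * 4 = 252 (`ldr s0, [x0, #252]`), and lane 31 of `<32 x double>` at 31 * 8 = 248 (`ldr d0, [x0, #248]`). The pattern, taken directly from the tests:

```llvm
; Compiles to "ldr h0, [x0, #30]" with this patch: only lane 15 is demanded,
; so the wide predicated SVE load is replaced by one halfword load.
define half @extractelement_v16f16(ptr %a) {
  %op1 = load <16 x half>, ptr %a
  %r = extractelement <16 x half> %op1, i64 15
  ret half %r
}
```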
diff --git a/llvm/test/CodeGen/AMDGPU/fcopysign.f32.ll b/llvm/test/CodeGen/AMDGPU/fcopysign.f32.ll
index fab45c9dc3bc3..6ffe8c5fab29f 100644
--- a/llvm/test/CodeGen/AMDGPU/fcopysign.f32.ll
+++ b/llvm/test/CodeGen/AMDGPU/fcopysign.f32.ll
@@ -880,44 +880,43 @@ define amdgpu_kernel void @s_test_copysign_f32_fptrunc_f64(ptr addrspace(1) %out
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x9
 ; SI-NEXT:    s_load_dword s6, s[4:5], 0xb
-; SI-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0xd
-; SI-NEXT:    s_waitcnt lgkmcnt(0)
-; SI-NEXT:    s_brev_b32 s4, -2
+; SI-NEXT:    s_load_dword s4, s[4:5], 0xe
+; SI-NEXT:    s_brev_b32 s5, -2
 ; SI-NEXT:    s_mov_b32 s3, 0xf000
 ; SI-NEXT:    s_mov_b32 s2, -1
+; SI-NEXT:    s_waitcnt lgkmcnt(0)
 ; SI-NEXT:    v_mov_b32_e32 v0, s6
-; SI-NEXT:    v_mov_b32_e32 v1, s5
-; SI-NEXT:    v_bfi_b32 v0, s4, v0, v1
+; SI-NEXT:    v_mov_b32_e32 v1, s4
+; SI-NEXT:    v_bfi_b32 v0, s5, v0, v1
 ; SI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
 ; SI-NEXT:    s_endpgm
 ;
 ; VI-LABEL: s_test_copysign_f32_fptrunc_f64:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_load_dword s6, s[4:5], 0x2c
-; VI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x34
-; VI-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x24
+; VI-NEXT:    s_load_dword s2, s[4:5], 0x2c
+; VI-NEXT:    s_load_dword s3, s[4:5], 0x38
+; VI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
+; VI-NEXT:    s_brev_b32 s4, -2
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
-; VI-NEXT:    s_brev_b32 s0, -2
-; VI-NEXT:    v_mov_b32_e32 v0, s6
-; VI-NEXT:    v_mov_b32_e32 v1, s1
-; VI-NEXT:    v_bfi_b32 v2, s0, v0, v1
 ; VI-NEXT:    v_mov_b32_e32 v0, s2
 ; VI-NEXT:    v_mov_b32_e32 v1, s3
+; VI-NEXT:    v_bfi_b32 v2, s4, v0, v1
+; VI-NEXT:    v_mov_b32_e32 v0, s0
+; VI-NEXT:    v_mov_b32_e32 v1, s1
 ; VI-NEXT:    flat_store_dword v[0:1], v2
 ; VI-NEXT:    s_endpgm
 ;
 ; GFX11-LABEL: s_test_copysign_f32_fptrunc_f64:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_load_b64 s[0:1], s[4:5], 0x34
-; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-NEXT:    s_clause 0x1
-; GFX11-NEXT:    s_load_b32 s0, s[4:5], 0x2c
-; GFX11-NEXT:    s_load_b64 s[2:3], s[4:5], 0x24
-; GFX11-NEXT:    v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s1
+; GFX11-NEXT:    s_clause 0x2
+; GFX11-NEXT:    s_load_b32 s2, s[4:5], 0x38
+; GFX11-NEXT:    s_load_b32 s3, s[4:5], 0x2c
+; GFX11-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-NEXT:    v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s2
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT:    v_bfi_b32 v0, 0x7fffffff, s0, v0
-; GFX11-NEXT:    global_store_b32 v1, v0, s[2:3]
+; GFX11-NEXT:    v_bfi_b32 v0, 0x7fffffff, s3, v0
+; GFX11-NEXT:    global_store_b32 v1, v0, s[0:1]
 ; GFX11-NEXT:    s_endpgm
 %sign.trunc = fptrunc double %sign to float
 %result = call float @llvm.copysign.f32(float %mag, float %sign.trunc)
diff --git a/llvm/test/CodeGen/AMDGPU/fcopysign.f64.ll b/llvm/test/CodeGen/AMDGPU/fcopysign.f64.ll
index 5f75a2f29a026..ad126bb22b583 100644
--- a/llvm/test/CodeGen/AMDGPU/fcopysign.f64.ll
+++ b/llvm/test/CodeGen/AMDGPU/fcopysign.f64.ll
@@ -13,14 +13,14 @@ define amdgpu_kernel void @s_test_copysign_f64(ptr addrspace(1) %out, [8 x i32],
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x9
 ; SI-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x13
-; SI-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x1d
-; SI-NEXT:    s_waitcnt lgkmcnt(0)
-; SI-NEXT:    s_brev_b32 s4, -2
+; SI-NEXT:    s_load_dword s4, s[4:5], 0x1e
+; SI-NEXT:    s_brev_b32 s5, -2
 ; SI-NEXT:    s_mov_b32 s3, 0xf000
 ; SI-NEXT:    s_mov_b32 s2, -1
+; SI-NEXT:    s_waitcnt lgkmcnt(0)
 ; SI-NEXT:    v_mov_b32_e32 v0, s7
-; SI-NEXT:    v_mov_b32_e32 v1, s5
-; SI-NEXT:    v_bfi_b32 v1, s4, v0, v1
+; SI-NEXT:    v_mov_b32_e32 v1, s4
+; SI-NEXT:    v_bfi_b32 v1, s5, v0, v1
 ; SI-NEXT:    v_mov_b32_e32 v0, s6
 ; SI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
 ; SI-NEXT:    s_endpgm
@@ -28,32 +28,32 @@ define amdgpu_kernel void @s_test_copysign_f64(ptr addrspace(1) %out, [8 x i32],
 ; VI-LABEL: s_test_copysign_f64:
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x4c
-; VI-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x74
-; VI-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x24
+; VI-NEXT:    s_load_dword s6, s[4:5], 0x78
+; VI-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x24
+; VI-NEXT:    s_brev_b32 s4, -2
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
-; VI-NEXT:    s_brev_b32 s2, -2
 ; VI-NEXT:    v_mov_b32_e32 v0, s1
-; VI-NEXT:    v_mov_b32_e32 v1, s3
-; VI-NEXT:    v_mov_b32_e32 v2, s4
-; VI-NEXT:    v_bfi_b32 v1, s2, v0, v1
+; VI-NEXT:    v_mov_b32_e32 v1, s6
+; VI-NEXT:    v_mov_b32_e32 v2, s2
+; VI-NEXT:    v_bfi_b32 v1, s4, v0, v1
 ; VI-NEXT:    v_mov_b32_e32 v0, s0
-; VI-NEXT:    v_mov_b32_e32 v3, s5
+; VI-NEXT:    v_mov_b32_e32 v3, s3
 ; VI-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
 ; VI-NEXT:    s_endpgm
 ;
 ; GFX11-LABEL: s_test_copysign_f64:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_clause 0x2
-; GFX11-NEXT:    s_load_b64 s[0:1], s[4:5], 0x74
-; GFX11-NEXT:    s_load_b64 s[2:3], s[4:5], 0x4c
-; GFX11-NEXT:    s_load_b64 s[4:5], s[4:5], 0x24
+; GFX11-NEXT:    s_load_b32 s6, s[4:5], 0x78
+; GFX11-NEXT:    s_load_b64 s[0:1], s[4:5], 0x4c
+; GFX11-NEXT:    s_load_b64 s[2:3], s[4:5], 0x24
 ; GFX11-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-NEXT:    v_mov_b32_e32 v0, s1
+; GFX11-NEXT:    v_mov_b32_e32 v0, s6
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT:    v_bfi_b32 v1, 0x7fffffff, s3, v0
-; GFX11-NEXT:    v_mov_b32_e32 v0, s2
-; GFX11-NEXT:    global_store_b64 v2, v[0:1], s[4:5]
+; GFX11-NEXT:    v_bfi_b32 v1, 0x7fffffff, s1, v0
+; GFX11-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-NEXT:    global_store_b64 v2, v[0:1], s[2:3]
 ; GFX11-NEXT:    s_endpgm
 %result = call double @llvm.copysign.f64(double %mag, double %sign)
 store double %result, ptr addrspace(1) %out, align 8
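On AMDGPU the same demanded-bits/elements narrowing shows up in the kernel-argument loads: `llvm.copysign` only reads the sign operand's most significant word, so the two-dword load of the f64 `%sign` shrinks to a single dword at +4 bytes (`0x74` to `0x78` in the VI checks above, `0x1d` dwords to `0x1e` on SI). The source pattern, as in the test (the real test inserts `[8 x i32]` padding between arguments, elided here):

```llvm
; Only bit 63 of %sign is consumed (merged via v_bfi_b32 with mask
; 0x7fffffff), so just the high 32 bits of the sign argument need to be
; fetched from the kernarg segment.
define amdgpu_kernel void @copysign_narrowed_sign(ptr addrspace(1) %out, double %mag, double %sign) {
  %result = call double @llvm.copysign.f64(double %mag, double %sign)
  store double %result, ptr addrspace(1) %out, align 8
  ret void
}
declare double @llvm.copysign.f64(double, double)
```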
diff --git a/llvm/test/CodeGen/AMDGPU/greedy-reverse-local-assignment.ll b/llvm/test/CodeGen/AMDGPU/greedy-reverse-local-assignment.ll
index 6c921441c972d..ea2fe9d620893 100644
--- a/llvm/test/CodeGen/AMDGPU/greedy-reverse-local-assignment.ll
+++ b/llvm/test/CodeGen/AMDGPU/greedy-reverse-local-assignment.ll
@@ -19,31 +19,30 @@ define <4 x half> @shuffle_v4f16_234u(ptr addrspace(1) %arg0, ptr addrspace(1) %
 ; FORWARDXNACK-LABEL: shuffle_v4f16_234u:
 ; FORWARDXNACK:       ; %bb.0:
 ; FORWARDXNACK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; FORWARDXNACK-NEXT:    global_load_dword v6, v[0:1], off offset:4
-; FORWARDXNACK-NEXT:    global_load_dwordx2 v[4:5], v[2:3], off
+; FORWARDXNACK-NEXT:    global_load_dword v4, v[0:1], off offset:4
+; FORWARDXNACK-NEXT:    global_load_dword v5, v[2:3], off
 ; FORWARDXNACK-NEXT:    s_waitcnt vmcnt(1)
-; FORWARDXNACK-NEXT:    v_mov_b32_e32 v0, v6
+; FORWARDXNACK-NEXT:    v_mov_b32_e32 v0, v4
 ; FORWARDXNACK-NEXT:    s_waitcnt vmcnt(0)
-; FORWARDXNACK-NEXT:    v_mov_b32_e32 v1, v4
+; FORWARDXNACK-NEXT:    v_mov_b32_e32 v1, v5
 ; FORWARDXNACK-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; REVERSEXNACK-LABEL: shuffle_v4f16_234u:
 ; REVERSEXNACK:       ; %bb.0:
 ; REVERSEXNACK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; REVERSEXNACK-NEXT:    v_mov_b32_e32 v6, v1
-; REVERSEXNACK-NEXT:    v_mov_b32_e32 v5, v0
-; REVERSEXNACK-NEXT:    v_mov_b32_e32 v4, v3
-; REVERSEXNACK-NEXT:    v_mov_b32_e32 v3, v2
-; REVERSEXNACK-NEXT:    global_load_dword v0, v[5:6], off offset:4
-; REVERSEXNACK-NEXT:    global_load_dwordx2 v[1:2], v[3:4], off
+; REVERSEXNACK-NEXT:    global_load_dword v5, v[0:1], off offset:4
+; REVERSEXNACK-NEXT:    global_load_dword v4, v[2:3], off
+; REVERSEXNACK-NEXT:    s_waitcnt vmcnt(1)
+; REVERSEXNACK-NEXT:    v_mov_b32_e32 v0, v5
 ; REVERSEXNACK-NEXT:    s_waitcnt vmcnt(0)
+; REVERSEXNACK-NEXT:    v_mov_b32_e32 v1, v4
 ; REVERSEXNACK-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; NOXNACK-LABEL: shuffle_v4f16_234u:
 ; NOXNACK:       ; %bb.0:
 ; NOXNACK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; NOXNACK-NEXT:    global_load_dword v0, v[0:1], off offset:4
-; NOXNACK-NEXT:    global_load_dwordx2 v[1:2], v[2:3], off
+; NOXNACK-NEXT:    global_load_dword v1, v[2:3], off
 ; NOXNACK-NEXT:    s_waitcnt vmcnt(0)
 ; NOXNACK-NEXT:    s_setpc_b64 s[30:31]
   %val0 = load <4 x half>, ptr addrspace(1) %arg0
diff --git a/llvm/test/CodeGen/AMDGPU/identical-subrange-spill-infloop.ll b/llvm/test/CodeGen/AMDGPU/identical-subrange-spill-infloop.ll
index 5dff660912e40..a656ce2fa9d71 100644
--- a/llvm/test/CodeGen/AMDGPU/identical-subrange-spill-infloop.ll
+++ b/llvm/test/CodeGen/AMDGPU/identical-subrange-spill-infloop.ll
@@ -34,7 +34,7 @@ define void @main(i1 %arg) #0 {
 ; CHECK-NEXT:    s_load_dwordx16 s[36:51], s[4:5], 0x0
 ; CHECK-NEXT:    ; implicit-def: $vgpr7 : SGPR spill to VGPR lane
 ; CHECK-NEXT:    s_mov_b64 s[4:5], 0
-; CHECK-NEXT:    s_load_dwordx4 s[28:31], s[4:5], 0x0
+; CHECK-NEXT:    s_load_dword s22, s[4:5], 0x0
 ; CHECK-NEXT:    s_movk_i32 s20, 0x130
 ; CHECK-NEXT:    s_mov_b32 s21, s24
 ; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
@@ -55,7 +55,7 @@ define void @main(i1 %arg) #0 {
 ; CHECK-NEXT:    s_mov_b32 s20, 0
 ; CHECK-NEXT:    v_mov_b32_e32 v1, 0
 ; CHECK-NEXT:    v_writelane_b32 v7, s49, 13
-; CHECK-NEXT:    v_mov_b32_e32 v2, s28
+; CHECK-NEXT:    v_mov_b32_e32 v2, s22
 ; CHECK-NEXT:    v_mov_b32_e32 v3, v1
 ; CHECK-NEXT:    s_mov_b32 s21, s20
 ; CHECK-NEXT:    s_mov_b32 s22, s20
@@ -318,8 +318,8 @@ define void @main(i1 %arg) #0 {
 ; CHECK-NEXT:    s_mov_b64 s[16:17], s[40:41]
 ; CHECK-NEXT:    s_mov_b64 s[18:19], s[42:43]
 ; CHECK-NEXT:    ; kill: killed $sgpr12_sgpr13_sgpr14_sgpr15_sgpr16_sgpr17_sgpr18_sgpr19
-; CHECK-NEXT:    ; kill: killed $sgpr8_sgpr9_sgpr10 killed $sgpr11
 ; CHECK-NEXT:    ; kill: killed $sgpr52_sgpr53_sgpr54_sgpr55_sgpr56_sgpr57_sgpr58_sgpr59
+; CHECK-NEXT:    ; kill: killed $sgpr8_sgpr9_sgpr10 killed $sgpr11
 ; CHECK-NEXT:    s_waitcnt vmcnt(0)
 ; CHECK-NEXT:    v_sub_f32_e32 v1, v4, v3
 ; CHECK-NEXT:    v_mul_f32_e32 v0, v1, v0
diff --git a/llvm/test/CodeGen/AMDGPU/implicit-kernarg-backend-usage.ll b/llvm/test/CodeGen/AMDGPU/implicit-kernarg-backend-usage.ll
index 3d27b5fe7f30b..5b96fb06afbbf 100644
--- a/llvm/test/CodeGen/AMDGPU/implicit-kernarg-backend-usage.ll
+++ b/llvm/test/CodeGen/AMDGPU/implicit-kernarg-backend-usage.ll
@@ -112,10 +112,10 @@ define amdgpu_kernel void @addrspacecast(ptr addrspace(5) %ptr.private, ptr addr
 define amdgpu_kernel void @llvm_amdgcn_is_shared(ptr %ptr) {
 ; GFX8V4-LABEL: llvm_amdgcn_is_shared:
 ; GFX8V4:       ; %bb.0:
-; GFX8V4-NEXT:    s_load_dword s0, s[6:7], 0x40
-; GFX8V4-NEXT:    s_load_dword s1, s[8:9], 0x4
+; GFX8V4-NEXT:    s_load_dword s0, s[8:9], 0x4
+; GFX8V4-NEXT:    s_load_dword s1, s[6:7], 0x40
 ; GFX8V4-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX8V4-NEXT:    s_cmp_eq_u32 s1, s0
+; GFX8V4-NEXT:    s_cmp_eq_u32 s0, s1
 ; GFX8V4-NEXT:    s_cselect_b64 s[0:1], -1, 0
 ; GFX8V4-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s[0:1]
 ; GFX8V4-NEXT:    flat_store_dword v[0:1], v0
@@ -124,10 +124,10 @@ define amdgpu_kernel void @llvm_amdgcn_is_shared(ptr %ptr) {
 ;
 ; GFX8V5-LABEL: llvm_amdgcn_is_shared:
 ; GFX8V5:       ; %bb.0:
-; GFX8V5-NEXT:    s_load_dword s0, s[8:9], 0xcc
-; GFX8V5-NEXT:    s_load_dword s1, s[8:9], 0x4
+; GFX8V5-NEXT:    s_load_dword s0, s[8:9], 0x4
+; GFX8V5-NEXT:    s_load_dword s1, s[8:9], 0xcc
 ; GFX8V5-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX8V5-NEXT:    s_cmp_eq_u32 s1, s0
+; GFX8V5-NEXT:    s_cmp_eq_u32 s0, s1
 ; GFX8V5-NEXT:    s_cselect_b64 s[0:1], -1, 0
 ; GFX8V5-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s[0:1]
 ; GFX8V5-NEXT:    flat_store_dword v[0:1], v0
@@ -166,10 +166,10 @@ define amdgpu_kernel void @llvm_amdgcn_is_shared(ptr %ptr) {
 define amdgpu_kernel void @llvm_amdgcn_is_private(ptr %ptr) {
 ; GFX8V4-LABEL: llvm_amdgcn_is_private:
 ; GFX8V4:       ; %bb.0:
-; GFX8V4-NEXT:    s_load_dword s0, s[6:7], 0x44
-; GFX8V4-NEXT:    s_load_dword s1, s[8:9], 0x4
+; GFX8V4-NEXT:    s_load_dword s0, s[8:9], 0x4
+; GFX8V4-NEXT:    s_load_dword s1, s[6:7], 0x44
 ; GFX8V4-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX8V4-NEXT:    s_cmp_eq_u32 s1, s0
+; GFX8V4-NEXT:    s_cmp_eq_u32 s0, s1
 ; GFX8V4-NEXT:    s_cselect_b64 s[0:1], -1, 0
 ; GFX8V4-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s[0:1]
 ; GFX8V4-NEXT:    flat_store_dword v[0:1], v0
@@ -178,10 +178,10 @@ define amdgpu_kernel void @llvm_amdgcn_is_private(ptr %ptr) {
 ;
 ; GFX8V5-LABEL: llvm_amdgcn_is_private:
 ; GFX8V5:       ; %bb.0:
-; GFX8V5-NEXT:    s_load_dword s0, s[8:9], 0xc8
-; GFX8V5-NEXT:    s_load_dword s1, s[8:9], 0x4
+; GFX8V5-NEXT:    s_load_dword s0, s[8:9], 0x4
+; GFX8V5-NEXT:    s_load_dword s1, s[8:9], 0xc8
 ; GFX8V5-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX8V5-NEXT:    s_cmp_eq_u32 s1, s0
+; GFX8V5-NEXT:    s_cmp_eq_u32 s0, s1
 ; GFX8V5-NEXT:    s_cselect_b64 s[0:1], -1, 0
 ; GFX8V5-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s[0:1]
 ; GFX8V5-NEXT:    flat_store_dword v[0:1], v0
diff --git a/llvm/test/CodeGen/AMDGPU/shader-addr64-nonuniform.ll b/llvm/test/CodeGen/AMDGPU/shader-addr64-nonuniform.ll
index 363d568f9c11c..f06175d1adaec 100644
--- a/llvm/test/CodeGen/AMDGPU/shader-addr64-nonuniform.ll
+++ b/llvm/test/CodeGen/AMDGPU/shader-addr64-nonuniform.ll
@@ -16,7 +16,7 @@ define amdgpu_ps float @nonuniform_uniform(i32 %arg18) {
 .entry:
   %tmp31 = sext i32 %arg18 to i64
   %tmp32 = getelementptr [6 x <3 x float>], ptr addrspace(1) @indexable, i64 0, i64 %tmp31
-  %tmp33 = load <3 x float>, ptr addrspace(1) %tmp32, align 16
+  %tmp33 = load volatile <3 x float>, ptr addrspace(1) %tmp32, align 16
   %tmp34 = extractelement <3 x float> %tmp33, i32 0
   ret float %tmp34
 }
@@ -31,7 +31,7 @@ define amdgpu_ps float @uniform_nonuniform(i32 inreg %offset, i32 %arg18) {
   %tmp1 = zext i32 %arg18 to i64
   %tmp2 = inttoptr i64 %tmp1 to ptr addrspace(1)
   %tmp32 = getelementptr [6 x <3 x float>], ptr addrspace(1) %tmp2, i32 0, i32 %offset
-  %tmp33 = load <3 x float>, ptr addrspace(1) %tmp32, align 16
+  %tmp33 = load volatile <3 x float>, ptr addrspace(1) %tmp32, align 16
   %tmp34 = extractelement <3 x float> %tmp33, i32 0
   ret float %tmp34
 }
@@ -46,7 +46,7 @@ define amdgpu_ps float @const_nonuniform(i32 %arg18) {
   %tmp1 = zext i32 %arg18 to i64
   %tmp2 = inttoptr i64 %tmp1 to ptr addrspace(1)
   %tmp32 = getelementptr [6 x <3 x float>], ptr addrspace(1) %tmp2, i32 0, i32 1
-  %tmp33 = load <3 x float>, ptr addrspace(1) %tmp32, align 16
+  %tmp33 = load volatile <3 x float>, ptr addrspace(1) %tmp32, align 16
   %tmp34 = extractelement <3 x float> %tmp33, i32 0
   ret float %tmp34
 }
@@ -61,7 +61,7 @@ define amdgpu_ps float @nonuniform_nonuniform(i32 %offset, i32 %arg18) {
   %tmp1 = zext i32 %arg18 to i64
   %tmp2 = inttoptr i64 %tmp1 to ptr addrspace(1)
   %tmp32 = getelementptr [6 x <3 x float>], ptr addrspace(1) %tmp2, i32 0, i32 %offset
-  %tmp33 = load <3 x float>, ptr addrspace(1) %tmp32, align 16
+  %tmp33 = load volatile <3 x float>, ptr addrspace(1) %tmp32, align 16
   %tmp34 = extractelement <3 x float> %tmp33, i32 0
   ret float %tmp34
 }
diff --git a/llvm/test/CodeGen/AMDGPU/trunc.ll b/llvm/test/CodeGen/AMDGPU/trunc.ll
index db802732e987b..87a21a46eaff5 100644
--- a/llvm/test/CodeGen/AMDGPU/trunc.ll
+++ b/llvm/test/CodeGen/AMDGPU/trunc.ll
@@ -93,8 +93,8 @@ define amdgpu_kernel void @sgpr_trunc_i32_to_i1(ptr addrspace(1) %out, i32 %a) {
 }
 
 ; GCN-LABEL: {{^}}s_trunc_i64_to_i1:
-; SI: s_load_dwordx2 s[[[SLO:[0-9]+]]:{{[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0x13
-; VI: s_load_dwordx2 s[[[SLO:[0-9]+]]:{{[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0x4c
+; SI: s_load_dword s[[SLO:[0-9]+]], {{s\[[0-9]+:[0-9]+\]}}, 0x13
+; VI: s_load_dword s[[SLO:[0-9]+]], {{s\[[0-9]+:[0-9]+\]}}, 0x4c
 ; GCN: s_bitcmp1_b32 s[[SLO]], 0
 ; GCN: s_cselect_b32 {{s[0-9]+}}, 63, -12
 define amdgpu_kernel void @s_trunc_i64_to_i1(ptr addrspace(1) %out, [8 x i32], i64 %x) {
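The trunc.ll update is the scalar analogue on the kernel arguments: truncating i64 to i1 only demands the low 32-bit word, so the `s_load_dwordx2` of the i64 argument becomes a single `s_load_dword` feeding `s_bitcmp1_b32`. A sketch of the source pattern (the real test carries an extra `[8 x i32]` padding argument, elided here; the 63/-12 constants match its `s_cselect_b32` check):

```llvm
; Only bit 0 of %x survives the truncation, so loading the low dword of
; the kernel argument is enough.
define amdgpu_kernel void @s_trunc_i64_to_i1(ptr addrspace(1) %out, i64 %x) {
  %trunc = trunc i64 %x to i1
  %sel = select i1 %trunc, i32 63, i32 -12
  store i32 %sel, ptr addrspace(1) %out
  ret void
}
```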
%val1 = shufflevector <2 x i8> %val0, <2 x i8> poison, <2 x i32> @@ -39,37 +40,37 @@ entry: define <4 x i8> @shuffle_v4i8_rebroadcast(ptr addrspace(1) %arg0) { ; GFX9-LABEL: shuffle_v4i8_rebroadcast: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: global_load_dword v0, v[0:1], off -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_lshrrev_b32_e32 v0, 8, v0 -; GFX9-NEXT: v_mov_b32_e32 v1, v0 -; GFX9-NEXT: v_mov_b32_e32 v2, v0 -; GFX9-NEXT: v_mov_b32_e32 v3, v0 -; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: global_load_dword v0, v[0:1], off +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_lshrrev_b32_e32 v0, 8, v0 +; GFX9-NEXT: v_mov_b32_e32 v1, v0 +; GFX9-NEXT: v_mov_b32_e32 v2, v0 +; GFX9-NEXT: v_mov_b32_e32 v3, v0 +; GFX9-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: shuffle_v4i8_rebroadcast: ; GFX10: ; %bb.0: ; %entry -; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: global_load_dword v0, v[0:1], off -; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_lshrrev_b32_e32 v0, 8, v0 -; GFX10-NEXT: v_mov_b32_e32 v1, v0 -; GFX10-NEXT: v_mov_b32_e32 v2, v0 -; GFX10-NEXT: v_mov_b32_e32 v3, v0 -; GFX10-NEXT: s_setpc_b64 s[30:31] +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: global_load_dword v0, v[0:1], off +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: v_lshrrev_b32_e32 v0, 8, v0 +; GFX10-NEXT: v_mov_b32_e32 v1, v0 +; GFX10-NEXT: v_mov_b32_e32 v2, v0 +; GFX10-NEXT: v_mov_b32_e32 v3, v0 +; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: shuffle_v4i8_rebroadcast: ; GFX11: ; %bb.0: ; %entry -; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: global_load_b32 v0, v[0:1], off -; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_lshrrev_b32_e32 v0, 8, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_mov_b32_e32 v1, v0 -; GFX11-NEXT: v_mov_b32_e32 v2, v0 -; GFX11-NEXT: v_mov_b32_e32 v3, v0 -; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: global_load_b32 v0, v[0:1], off +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_lshrrev_b32_e32 v0, 8, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_mov_b32_e32 v1, v0 +; GFX11-NEXT: v_mov_b32_e32 v2, v0 +; GFX11-NEXT: v_mov_b32_e32 v3, v0 +; GFX11-NEXT: s_setpc_b64 s[30:31] entry: %val0 = load <4 x i8>, ptr addrspace(1) %arg0 %val1 = shufflevector <4 x i8> %val0, <4 x i8> poison, <4 x i32> @@ -79,49 +80,49 @@ entry: define <8 x i8> @shuffle_v8i8_rebroadcast(ptr addrspace(1) %arg0) { ; GFX9-LABEL: shuffle_v8i8_rebroadcast: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: global_load_dword v0, v[0:1], off -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_lshrrev_b32_e32 v0, 8, v0 -; GFX9-NEXT: v_mov_b32_e32 v1, v0 -; GFX9-NEXT: v_mov_b32_e32 v2, v0 -; GFX9-NEXT: v_mov_b32_e32 v3, v0 -; GFX9-NEXT: v_mov_b32_e32 v4, v0 -; GFX9-NEXT: v_mov_b32_e32 v5, v0 -; GFX9-NEXT: v_mov_b32_e32 v6, v0 -; GFX9-NEXT: v_mov_b32_e32 v7, v0 -; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: global_load_dword v0, v[0:1], off +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_lshrrev_b32_e32 v0, 8, v0 +; GFX9-NEXT: v_mov_b32_e32 v1, v0 +; GFX9-NEXT: v_mov_b32_e32 v2, v0 +; GFX9-NEXT: v_mov_b32_e32 v3, v0 +; GFX9-NEXT: v_mov_b32_e32 v4, v0 +; GFX9-NEXT: v_mov_b32_e32 v5, v0 +; GFX9-NEXT: v_mov_b32_e32 v6, v0 +; GFX9-NEXT: v_mov_b32_e32 v7, v0 +; 
GFX9-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: shuffle_v8i8_rebroadcast: ; GFX10: ; %bb.0: ; %entry -; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: global_load_dword v0, v[0:1], off -; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_lshrrev_b32_e32 v0, 8, v0 -; GFX10-NEXT: v_mov_b32_e32 v1, v0 -; GFX10-NEXT: v_mov_b32_e32 v2, v0 -; GFX10-NEXT: v_mov_b32_e32 v3, v0 -; GFX10-NEXT: v_mov_b32_e32 v4, v0 -; GFX10-NEXT: v_mov_b32_e32 v5, v0 -; GFX10-NEXT: v_mov_b32_e32 v6, v0 -; GFX10-NEXT: v_mov_b32_e32 v7, v0 -; GFX10-NEXT: s_setpc_b64 s[30:31] +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: global_load_dword v0, v[0:1], off +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: v_lshrrev_b32_e32 v0, 8, v0 +; GFX10-NEXT: v_mov_b32_e32 v1, v0 +; GFX10-NEXT: v_mov_b32_e32 v2, v0 +; GFX10-NEXT: v_mov_b32_e32 v3, v0 +; GFX10-NEXT: v_mov_b32_e32 v4, v0 +; GFX10-NEXT: v_mov_b32_e32 v5, v0 +; GFX10-NEXT: v_mov_b32_e32 v6, v0 +; GFX10-NEXT: v_mov_b32_e32 v7, v0 +; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: shuffle_v8i8_rebroadcast: ; GFX11: ; %bb.0: ; %entry -; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: global_load_b32 v0, v[0:1], off -; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_lshrrev_b32_e32 v0, 8, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_mov_b32_e32 v1, v0 -; GFX11-NEXT: v_mov_b32_e32 v2, v0 -; GFX11-NEXT: v_mov_b32_e32 v3, v0 -; GFX11-NEXT: v_mov_b32_e32 v4, v0 -; GFX11-NEXT: v_mov_b32_e32 v5, v0 -; GFX11-NEXT: v_mov_b32_e32 v6, v0 -; GFX11-NEXT: v_mov_b32_e32 v7, v0 -; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: global_load_b32 v0, v[0:1], off +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_lshrrev_b32_e32 v0, 8, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_mov_b32_e32 v1, v0 +; GFX11-NEXT: v_mov_b32_e32 v2, v0 +; GFX11-NEXT: v_mov_b32_e32 v3, v0 +; GFX11-NEXT: v_mov_b32_e32 v4, v0 +; GFX11-NEXT: v_mov_b32_e32 v5, v0 +; GFX11-NEXT: v_mov_b32_e32 v6, v0 +; GFX11-NEXT: v_mov_b32_e32 v7, v0 +; GFX11-NEXT: s_setpc_b64 s[30:31] entry: %val0 = load <8 x i8>, ptr addrspace(1) %arg0 %val1 = shufflevector <8 x i8> %val0, <8 x i8> poison, <8 x i32> @@ -131,73 +132,73 @@ entry: define <16 x i8> @shuffle_v16i8_rebroadcast(ptr addrspace(1) %arg0) { ; GFX9-LABEL: shuffle_v16i8_rebroadcast: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: global_load_dword v0, v[0:1], off -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_lshrrev_b32_e32 v0, 8, v0 -; GFX9-NEXT: v_mov_b32_e32 v1, v0 -; GFX9-NEXT: v_mov_b32_e32 v2, v0 -; GFX9-NEXT: v_mov_b32_e32 v3, v0 -; GFX9-NEXT: v_mov_b32_e32 v4, v0 -; GFX9-NEXT: v_mov_b32_e32 v5, v0 -; GFX9-NEXT: v_mov_b32_e32 v6, v0 -; GFX9-NEXT: v_mov_b32_e32 v7, v0 -; GFX9-NEXT: v_mov_b32_e32 v8, v0 -; GFX9-NEXT: v_mov_b32_e32 v9, v0 -; GFX9-NEXT: v_mov_b32_e32 v10, v0 -; GFX9-NEXT: v_mov_b32_e32 v11, v0 -; GFX9-NEXT: v_mov_b32_e32 v12, v0 -; GFX9-NEXT: v_mov_b32_e32 v13, v0 -; GFX9-NEXT: v_mov_b32_e32 v14, v0 -; GFX9-NEXT: v_mov_b32_e32 v15, v0 -; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: global_load_dword v0, v[0:1], off +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_lshrrev_b32_e32 v0, 8, v0 +; GFX9-NEXT: v_mov_b32_e32 v1, v0 +; GFX9-NEXT: v_mov_b32_e32 v2, v0 +; GFX9-NEXT: v_mov_b32_e32 v3, v0 +; GFX9-NEXT: v_mov_b32_e32 v4, v0 +; GFX9-NEXT: v_mov_b32_e32 v5, v0 +; GFX9-NEXT: v_mov_b32_e32 v6, v0 +; 
GFX9-NEXT: v_mov_b32_e32 v7, v0 +; GFX9-NEXT: v_mov_b32_e32 v8, v0 +; GFX9-NEXT: v_mov_b32_e32 v9, v0 +; GFX9-NEXT: v_mov_b32_e32 v10, v0 +; GFX9-NEXT: v_mov_b32_e32 v11, v0 +; GFX9-NEXT: v_mov_b32_e32 v12, v0 +; GFX9-NEXT: v_mov_b32_e32 v13, v0 +; GFX9-NEXT: v_mov_b32_e32 v14, v0 +; GFX9-NEXT: v_mov_b32_e32 v15, v0 +; GFX9-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: shuffle_v16i8_rebroadcast: ; GFX10: ; %bb.0: ; %entry -; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: global_load_dword v0, v[0:1], off -; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_lshrrev_b32_e32 v0, 8, v0 -; GFX10-NEXT: v_mov_b32_e32 v1, v0 -; GFX10-NEXT: v_mov_b32_e32 v2, v0 -; GFX10-NEXT: v_mov_b32_e32 v3, v0 -; GFX10-NEXT: v_mov_b32_e32 v4, v0 -; GFX10-NEXT: v_mov_b32_e32 v5, v0 -; GFX10-NEXT: v_mov_b32_e32 v6, v0 -; GFX10-NEXT: v_mov_b32_e32 v7, v0 -; GFX10-NEXT: v_mov_b32_e32 v8, v0 -; GFX10-NEXT: v_mov_b32_e32 v9, v0 -; GFX10-NEXT: v_mov_b32_e32 v10, v0 -; GFX10-NEXT: v_mov_b32_e32 v11, v0 -; GFX10-NEXT: v_mov_b32_e32 v12, v0 -; GFX10-NEXT: v_mov_b32_e32 v13, v0 -; GFX10-NEXT: v_mov_b32_e32 v14, v0 -; GFX10-NEXT: v_mov_b32_e32 v15, v0 -; GFX10-NEXT: s_setpc_b64 s[30:31] +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: global_load_dword v0, v[0:1], off +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: v_lshrrev_b32_e32 v0, 8, v0 +; GFX10-NEXT: v_mov_b32_e32 v1, v0 +; GFX10-NEXT: v_mov_b32_e32 v2, v0 +; GFX10-NEXT: v_mov_b32_e32 v3, v0 +; GFX10-NEXT: v_mov_b32_e32 v4, v0 +; GFX10-NEXT: v_mov_b32_e32 v5, v0 +; GFX10-NEXT: v_mov_b32_e32 v6, v0 +; GFX10-NEXT: v_mov_b32_e32 v7, v0 +; GFX10-NEXT: v_mov_b32_e32 v8, v0 +; GFX10-NEXT: v_mov_b32_e32 v9, v0 +; GFX10-NEXT: v_mov_b32_e32 v10, v0 +; GFX10-NEXT: v_mov_b32_e32 v11, v0 +; GFX10-NEXT: v_mov_b32_e32 v12, v0 +; GFX10-NEXT: v_mov_b32_e32 v13, v0 +; GFX10-NEXT: v_mov_b32_e32 v14, v0 +; GFX10-NEXT: v_mov_b32_e32 v15, v0 +; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: shuffle_v16i8_rebroadcast: ; GFX11: ; %bb.0: ; %entry -; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: global_load_b32 v0, v[0:1], off -; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_lshrrev_b32_e32 v0, 8, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_mov_b32_e32 v1, v0 -; GFX11-NEXT: v_mov_b32_e32 v2, v0 -; GFX11-NEXT: v_mov_b32_e32 v3, v0 -; GFX11-NEXT: v_mov_b32_e32 v4, v0 -; GFX11-NEXT: v_mov_b32_e32 v5, v0 -; GFX11-NEXT: v_mov_b32_e32 v6, v0 -; GFX11-NEXT: v_mov_b32_e32 v7, v0 -; GFX11-NEXT: v_mov_b32_e32 v8, v0 -; GFX11-NEXT: v_mov_b32_e32 v9, v0 -; GFX11-NEXT: v_mov_b32_e32 v10, v0 -; GFX11-NEXT: v_mov_b32_e32 v11, v0 -; GFX11-NEXT: v_mov_b32_e32 v12, v0 -; GFX11-NEXT: v_mov_b32_e32 v13, v0 -; GFX11-NEXT: v_mov_b32_e32 v14, v0 -; GFX11-NEXT: v_mov_b32_e32 v15, v0 -; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: global_load_b32 v0, v[0:1], off +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_lshrrev_b32_e32 v0, 8, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_mov_b32_e32 v1, v0 +; GFX11-NEXT: v_mov_b32_e32 v2, v0 +; GFX11-NEXT: v_mov_b32_e32 v3, v0 +; GFX11-NEXT: v_mov_b32_e32 v4, v0 +; GFX11-NEXT: v_mov_b32_e32 v5, v0 +; GFX11-NEXT: v_mov_b32_e32 v6, v0 +; GFX11-NEXT: v_mov_b32_e32 v7, v0 +; GFX11-NEXT: v_mov_b32_e32 v8, v0 +; GFX11-NEXT: v_mov_b32_e32 v9, v0 +; GFX11-NEXT: v_mov_b32_e32 v10, v0 +; GFX11-NEXT: v_mov_b32_e32 v11, v0 +; GFX11-NEXT: v_mov_b32_e32 v12, v0 +; GFX11-NEXT: v_mov_b32_e32 v13, v0 +; GFX11-NEXT: 
+; GFX11-NEXT: v_mov_b32_e32 v14, v0
+; GFX11-NEXT: v_mov_b32_e32 v15, v0
+; GFX11-NEXT: s_setpc_b64 s[30:31]
entry:
%val0 = load <16 x i8>, ptr addrspace(1) %arg0
%val1 = shufflevector <16 x i8> %val0, <16 x i8> poison, <16 x i32> <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
@@ -207,121 +208,121 @@ entry:
define <32 x i8> @shuffle_v32i8_rebroadcast(ptr addrspace(1) %arg0) {
; GFX9-LABEL: shuffle_v32i8_rebroadcast:
; GFX9: ; %bb.0: ; %entry
-; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: global_load_dword v0, v[0:1], off
-; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: v_lshrrev_b32_e32 v0, 8, v0
-; GFX9-NEXT: v_mov_b32_e32 v1, v0
-; GFX9-NEXT: v_mov_b32_e32 v2, v0
-; GFX9-NEXT: v_mov_b32_e32 v3, v0
-; GFX9-NEXT: v_mov_b32_e32 v4, v0
-; GFX9-NEXT: v_mov_b32_e32 v5, v0
-; GFX9-NEXT: v_mov_b32_e32 v6, v0
-; GFX9-NEXT: v_mov_b32_e32 v7, v0
-; GFX9-NEXT: v_mov_b32_e32 v8, v0
-; GFX9-NEXT: v_mov_b32_e32 v9, v0
-; GFX9-NEXT: v_mov_b32_e32 v10, v0
-; GFX9-NEXT: v_mov_b32_e32 v11, v0
-; GFX9-NEXT: v_mov_b32_e32 v12, v0
-; GFX9-NEXT: v_mov_b32_e32 v13, v0
-; GFX9-NEXT: v_mov_b32_e32 v14, v0
-; GFX9-NEXT: v_mov_b32_e32 v15, v0
-; GFX9-NEXT: v_mov_b32_e32 v16, v0
-; GFX9-NEXT: v_mov_b32_e32 v17, v0
-; GFX9-NEXT: v_mov_b32_e32 v18, v0
-; GFX9-NEXT: v_mov_b32_e32 v19, v0
-; GFX9-NEXT: v_mov_b32_e32 v20, v0
-; GFX9-NEXT: v_mov_b32_e32 v21, v0
-; GFX9-NEXT: v_mov_b32_e32 v22, v0
-; GFX9-NEXT: v_mov_b32_e32 v23, v0
-; GFX9-NEXT: v_mov_b32_e32 v24, v0
-; GFX9-NEXT: v_mov_b32_e32 v25, v0
-; GFX9-NEXT: v_mov_b32_e32 v26, v0
-; GFX9-NEXT: v_mov_b32_e32 v27, v0
-; GFX9-NEXT: v_mov_b32_e32 v28, v0
-; GFX9-NEXT: v_mov_b32_e32 v29, v0
-; GFX9-NEXT: v_mov_b32_e32 v30, v0
-; GFX9-NEXT: v_mov_b32_e32 v31, v0
-; GFX9-NEXT: s_setpc_b64 s[30:31]
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: global_load_dword v0, v[0:1], off
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_lshrrev_b32_e32 v0, 8, v0
+; GFX9-NEXT: v_mov_b32_e32 v1, v0
+; GFX9-NEXT: v_mov_b32_e32 v2, v0
+; GFX9-NEXT: v_mov_b32_e32 v3, v0
+; GFX9-NEXT: v_mov_b32_e32 v4, v0
+; GFX9-NEXT: v_mov_b32_e32 v5, v0
+; GFX9-NEXT: v_mov_b32_e32 v6, v0
+; GFX9-NEXT: v_mov_b32_e32 v7, v0
+; GFX9-NEXT: v_mov_b32_e32 v8, v0
+; GFX9-NEXT: v_mov_b32_e32 v9, v0
+; GFX9-NEXT: v_mov_b32_e32 v10, v0
+; GFX9-NEXT: v_mov_b32_e32 v11, v0
+; GFX9-NEXT: v_mov_b32_e32 v12, v0
+; GFX9-NEXT: v_mov_b32_e32 v13, v0
+; GFX9-NEXT: v_mov_b32_e32 v14, v0
+; GFX9-NEXT: v_mov_b32_e32 v15, v0
+; GFX9-NEXT: v_mov_b32_e32 v16, v0
+; GFX9-NEXT: v_mov_b32_e32 v17, v0
+; GFX9-NEXT: v_mov_b32_e32 v18, v0
+; GFX9-NEXT: v_mov_b32_e32 v19, v0
+; GFX9-NEXT: v_mov_b32_e32 v20, v0
+; GFX9-NEXT: v_mov_b32_e32 v21, v0
+; GFX9-NEXT: v_mov_b32_e32 v22, v0
+; GFX9-NEXT: v_mov_b32_e32 v23, v0
+; GFX9-NEXT: v_mov_b32_e32 v24, v0
+; GFX9-NEXT: v_mov_b32_e32 v25, v0
+; GFX9-NEXT: v_mov_b32_e32 v26, v0
+; GFX9-NEXT: v_mov_b32_e32 v27, v0
+; GFX9-NEXT: v_mov_b32_e32 v28, v0
+; GFX9-NEXT: v_mov_b32_e32 v29, v0
+; GFX9-NEXT: v_mov_b32_e32 v30, v0
+; GFX9-NEXT: v_mov_b32_e32 v31, v0
+; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: shuffle_v32i8_rebroadcast:
; GFX10: ; %bb.0: ; %entry
-; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT: global_load_dword v0, v[0:1], off
-; GFX10-NEXT: s_waitcnt vmcnt(0)
-; GFX10-NEXT: v_lshrrev_b32_e32 v0, 8, v0
-; GFX10-NEXT: v_mov_b32_e32 v1, v0
-; GFX10-NEXT: v_mov_b32_e32 v2, v0
-; GFX10-NEXT: v_mov_b32_e32 v3, v0
-; GFX10-NEXT: v_mov_b32_e32 v4, v0
-; GFX10-NEXT: v_mov_b32_e32 v5, v0
-; GFX10-NEXT: v_mov_b32_e32 v6, v0
-; GFX10-NEXT: v_mov_b32_e32 v7, v0
-; GFX10-NEXT: v_mov_b32_e32 v8, v0
-; GFX10-NEXT: v_mov_b32_e32 v9, v0
-; GFX10-NEXT: v_mov_b32_e32 v10, v0
-; GFX10-NEXT: v_mov_b32_e32 v11, v0
-; GFX10-NEXT: v_mov_b32_e32 v12, v0
-; GFX10-NEXT: v_mov_b32_e32 v13, v0
-; GFX10-NEXT: v_mov_b32_e32 v14, v0
-; GFX10-NEXT: v_mov_b32_e32 v15, v0
-; GFX10-NEXT: v_mov_b32_e32 v16, v0
-; GFX10-NEXT: v_mov_b32_e32 v17, v0
-; GFX10-NEXT: v_mov_b32_e32 v18, v0
-; GFX10-NEXT: v_mov_b32_e32 v19, v0
-; GFX10-NEXT: v_mov_b32_e32 v20, v0
-; GFX10-NEXT: v_mov_b32_e32 v21, v0
-; GFX10-NEXT: v_mov_b32_e32 v22, v0
-; GFX10-NEXT: v_mov_b32_e32 v23, v0
-; GFX10-NEXT: v_mov_b32_e32 v24, v0
-; GFX10-NEXT: v_mov_b32_e32 v25, v0
-; GFX10-NEXT: v_mov_b32_e32 v26, v0
-; GFX10-NEXT: v_mov_b32_e32 v27, v0
-; GFX10-NEXT: v_mov_b32_e32 v28, v0
-; GFX10-NEXT: v_mov_b32_e32 v29, v0
-; GFX10-NEXT: v_mov_b32_e32 v30, v0
-; GFX10-NEXT: v_mov_b32_e32 v31, v0
-; GFX10-NEXT: s_setpc_b64 s[30:31]
+; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT: global_load_dword v0, v[0:1], off
+; GFX10-NEXT: s_waitcnt vmcnt(0)
+; GFX10-NEXT: v_lshrrev_b32_e32 v0, 8, v0
+; GFX10-NEXT: v_mov_b32_e32 v1, v0
+; GFX10-NEXT: v_mov_b32_e32 v2, v0
+; GFX10-NEXT: v_mov_b32_e32 v3, v0
+; GFX10-NEXT: v_mov_b32_e32 v4, v0
+; GFX10-NEXT: v_mov_b32_e32 v5, v0
+; GFX10-NEXT: v_mov_b32_e32 v6, v0
+; GFX10-NEXT: v_mov_b32_e32 v7, v0
+; GFX10-NEXT: v_mov_b32_e32 v8, v0
+; GFX10-NEXT: v_mov_b32_e32 v9, v0
+; GFX10-NEXT: v_mov_b32_e32 v10, v0
+; GFX10-NEXT: v_mov_b32_e32 v11, v0
+; GFX10-NEXT: v_mov_b32_e32 v12, v0
+; GFX10-NEXT: v_mov_b32_e32 v13, v0
+; GFX10-NEXT: v_mov_b32_e32 v14, v0
+; GFX10-NEXT: v_mov_b32_e32 v15, v0
+; GFX10-NEXT: v_mov_b32_e32 v16, v0
+; GFX10-NEXT: v_mov_b32_e32 v17, v0
+; GFX10-NEXT: v_mov_b32_e32 v18, v0
+; GFX10-NEXT: v_mov_b32_e32 v19, v0
+; GFX10-NEXT: v_mov_b32_e32 v20, v0
+; GFX10-NEXT: v_mov_b32_e32 v21, v0
+; GFX10-NEXT: v_mov_b32_e32 v22, v0
+; GFX10-NEXT: v_mov_b32_e32 v23, v0
+; GFX10-NEXT: v_mov_b32_e32 v24, v0
+; GFX10-NEXT: v_mov_b32_e32 v25, v0
+; GFX10-NEXT: v_mov_b32_e32 v26, v0
+; GFX10-NEXT: v_mov_b32_e32 v27, v0
+; GFX10-NEXT: v_mov_b32_e32 v28, v0
+; GFX10-NEXT: v_mov_b32_e32 v29, v0
+; GFX10-NEXT: v_mov_b32_e32 v30, v0
+; GFX10-NEXT: v_mov_b32_e32 v31, v0
+; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: shuffle_v32i8_rebroadcast:
; GFX11: ; %bb.0: ; %entry
-; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: global_load_b32 v0, v[0:1], off
-; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: v_lshrrev_b32_e32 v0, 8, v0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_mov_b32_e32 v1, v0
-; GFX11-NEXT: v_mov_b32_e32 v2, v0
-; GFX11-NEXT: v_mov_b32_e32 v3, v0
-; GFX11-NEXT: v_mov_b32_e32 v4, v0
-; GFX11-NEXT: v_mov_b32_e32 v5, v0
-; GFX11-NEXT: v_mov_b32_e32 v6, v0
-; GFX11-NEXT: v_mov_b32_e32 v7, v0
-; GFX11-NEXT: v_mov_b32_e32 v8, v0
-; GFX11-NEXT: v_mov_b32_e32 v9, v0
-; GFX11-NEXT: v_mov_b32_e32 v10, v0
-; GFX11-NEXT: v_mov_b32_e32 v11, v0
-; GFX11-NEXT: v_mov_b32_e32 v12, v0
-; GFX11-NEXT: v_mov_b32_e32 v13, v0
-; GFX11-NEXT: v_mov_b32_e32 v14, v0
-; GFX11-NEXT: v_mov_b32_e32 v15, v0
-; GFX11-NEXT: v_mov_b32_e32 v16, v0
-; GFX11-NEXT: v_mov_b32_e32 v17, v0
-; GFX11-NEXT: v_mov_b32_e32 v18, v0
-; GFX11-NEXT: v_mov_b32_e32 v19, v0
-; GFX11-NEXT: v_mov_b32_e32 v20, v0
-; GFX11-NEXT: v_mov_b32_e32 v21, v0
-; GFX11-NEXT: v_mov_b32_e32 v22, v0
-; GFX11-NEXT: v_mov_b32_e32 v23, v0
-; GFX11-NEXT: v_mov_b32_e32 v24, v0
-; GFX11-NEXT: v_mov_b32_e32 v25, v0
-; GFX11-NEXT: v_mov_b32_e32 v26, v0
-; GFX11-NEXT: v_mov_b32_e32 v27, v0
-; GFX11-NEXT: v_mov_b32_e32 v28, v0
-; GFX11-NEXT: v_mov_b32_e32 v29, v0
-; GFX11-NEXT: v_mov_b32_e32 v30, v0
-; GFX11-NEXT: v_mov_b32_e32 v31, v0
-; GFX11-NEXT: s_setpc_b64 s[30:31]
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: global_load_b32 v0, v[0:1], off
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: v_lshrrev_b32_e32 v0, 8, v0
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT: v_mov_b32_e32 v1, v0
+; GFX11-NEXT: v_mov_b32_e32 v2, v0
+; GFX11-NEXT: v_mov_b32_e32 v3, v0
+; GFX11-NEXT: v_mov_b32_e32 v4, v0
+; GFX11-NEXT: v_mov_b32_e32 v5, v0
+; GFX11-NEXT: v_mov_b32_e32 v6, v0
+; GFX11-NEXT: v_mov_b32_e32 v7, v0
+; GFX11-NEXT: v_mov_b32_e32 v8, v0
+; GFX11-NEXT: v_mov_b32_e32 v9, v0
+; GFX11-NEXT: v_mov_b32_e32 v10, v0
+; GFX11-NEXT: v_mov_b32_e32 v11, v0
+; GFX11-NEXT: v_mov_b32_e32 v12, v0
+; GFX11-NEXT: v_mov_b32_e32 v13, v0
+; GFX11-NEXT: v_mov_b32_e32 v14, v0
+; GFX11-NEXT: v_mov_b32_e32 v15, v0
+; GFX11-NEXT: v_mov_b32_e32 v16, v0
+; GFX11-NEXT: v_mov_b32_e32 v17, v0
+; GFX11-NEXT: v_mov_b32_e32 v18, v0
+; GFX11-NEXT: v_mov_b32_e32 v19, v0
+; GFX11-NEXT: v_mov_b32_e32 v20, v0
+; GFX11-NEXT: v_mov_b32_e32 v21, v0
+; GFX11-NEXT: v_mov_b32_e32 v22, v0
+; GFX11-NEXT: v_mov_b32_e32 v23, v0
+; GFX11-NEXT: v_mov_b32_e32 v24, v0
+; GFX11-NEXT: v_mov_b32_e32 v25, v0
+; GFX11-NEXT: v_mov_b32_e32 v26, v0
+; GFX11-NEXT: v_mov_b32_e32 v27, v0
+; GFX11-NEXT: v_mov_b32_e32 v28, v0
+; GFX11-NEXT: v_mov_b32_e32 v29, v0
+; GFX11-NEXT: v_mov_b32_e32 v30, v0
+; GFX11-NEXT: v_mov_b32_e32 v31, v0
+; GFX11-NEXT: s_setpc_b64 s[30:31]
entry:
%val0 = load <32 x i8>, ptr addrspace(1) %arg0
%val1 = shufflevector <32 x i8> %val0, <32 x i8> poison, <32 x i32> <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
@@ -331,28 +332,28 @@ entry:
define <2 x i16> @shuffle_v2i16_rebroadcast(ptr addrspace(1) %arg0) {
; GFX9-LABEL: shuffle_v2i16_rebroadcast:
; GFX9: ; %bb.0: ; %entry
-; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: global_load_dword v0, v[0:1], off
-; GFX9-NEXT: s_mov_b32 s4, 0x7060302
-; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: v_perm_b32 v0, v0, v0, s4
-; GFX9-NEXT: s_setpc_b64 s[30:31]
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: global_load_dword v0, v[0:1], off
+; GFX9-NEXT: s_mov_b32 s4, 0x7060302
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_perm_b32 v0, v0, v0, s4
+; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: shuffle_v2i16_rebroadcast:
; GFX10: ; %bb.0: ; %entry
-; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT: global_load_dword v0, v[0:1], off
-; GFX10-NEXT: s_waitcnt vmcnt(0)
-; GFX10-NEXT: v_perm_b32 v0, v0, v0, 0x7060302
-; GFX10-NEXT: s_setpc_b64 s[30:31]
+; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT: global_load_dword v0, v[0:1], off
+; GFX10-NEXT: s_waitcnt vmcnt(0)
+; GFX10-NEXT: v_perm_b32 v0, v0, v0, 0x7060302
+; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: shuffle_v2i16_rebroadcast:
; GFX11: ; %bb.0: ; %entry
-; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: global_load_b32 v0, v[0:1], off
-; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: v_perm_b32 v0, v0, v0, 0x7060302
-; GFX11-NEXT: s_setpc_b64 s[30:31]
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: global_load_b32 v0, v[0:1], off
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: v_perm_b32 v0, v0, v0, 0x7060302
+; GFX11-NEXT: s_setpc_b64 s[30:31]
entry:
%val0 = load <2 x i16>, ptr addrspace(1) %arg0
%val1 = shufflevector <2 x i16> %val0, <2 x i16> poison, <2 x i32> <i32 1, i32 1>
@@ -362,32 +363,32 @@ entry:
define <4 x i16> @shuffle_v4i16_rebroadcast(ptr addrspace(1) %arg0) {
; GFX9-LABEL: shuffle_v4i16_rebroadcast:
; GFX9: ; %bb.0: ; %entry
-; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: global_load_dword v0, v[0:1], off
-; GFX9-NEXT: s_mov_b32 s4, 0x7060302
-; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: v_perm_b32 v0, v0, v0, s4
-; GFX9-NEXT: v_mov_b32_e32 v1, v0
-; GFX9-NEXT: s_setpc_b64 s[30:31]
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: global_load_dword v0, v[0:1], off
+; GFX9-NEXT: s_mov_b32 s4, 0x7060302
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_perm_b32 v0, v0, v0, s4
+; GFX9-NEXT: v_mov_b32_e32 v1, v0
+; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: shuffle_v4i16_rebroadcast:
; GFX10: ; %bb.0: ; %entry
-; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT: global_load_dword v0, v[0:1], off
-; GFX10-NEXT: s_waitcnt vmcnt(0)
-; GFX10-NEXT: v_perm_b32 v0, v0, v0, 0x7060302
-; GFX10-NEXT: v_mov_b32_e32 v1, v0
-; GFX10-NEXT: s_setpc_b64 s[30:31]
+; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT: global_load_dword v0, v[0:1], off
+; GFX10-NEXT: s_waitcnt vmcnt(0)
+; GFX10-NEXT: v_perm_b32 v0, v0, v0, 0x7060302
+; GFX10-NEXT: v_mov_b32_e32 v1, v0
+; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: shuffle_v4i16_rebroadcast:
; GFX11: ; %bb.0: ; %entry
-; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: global_load_b32 v0, v[0:1], off
-; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: v_perm_b32 v0, v0, v0, 0x7060302
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_mov_b32_e32 v1, v0
-; GFX11-NEXT: s_setpc_b64 s[30:31]
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: global_load_b32 v0, v[0:1], off
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: v_perm_b32 v0, v0, v0, 0x7060302
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT: v_mov_b32_e32 v1, v0
+; GFX11-NEXT: s_setpc_b64 s[30:31]
entry:
%val0 = load <4 x i16>, ptr addrspace(1) %arg0
%val1 = shufflevector <4 x i16> %val0, <4 x i16> poison, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
@@ -397,38 +398,38 @@ entry:
define <8 x i16> @shuffle_v8i16_rebroadcast(ptr addrspace(1) %arg0) {
; GFX9-LABEL: shuffle_v8i16_rebroadcast:
; GFX9: ; %bb.0: ; %entry
-; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: global_load_dword v0, v[0:1], off
-; GFX9-NEXT: s_mov_b32 s4, 0x7060302
-; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: v_perm_b32 v0, v0, v0, s4
-; GFX9-NEXT: v_mov_b32_e32 v1, v0
-; GFX9-NEXT: v_mov_b32_e32 v2, v0
-; GFX9-NEXT: v_mov_b32_e32 v3, v0
-; GFX9-NEXT: s_setpc_b64 s[30:31]
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: global_load_dword v0, v[0:1], off
+; GFX9-NEXT: s_mov_b32 s4, 0x7060302
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_perm_b32 v0, v0, v0, s4
+; GFX9-NEXT: v_mov_b32_e32 v1, v0
+; GFX9-NEXT: v_mov_b32_e32 v2, v0
+; GFX9-NEXT: v_mov_b32_e32 v3, v0
+; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: shuffle_v8i16_rebroadcast:
; GFX10: ; %bb.0: ; %entry
-; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT: global_load_dword v0, v[0:1], off
-; GFX10-NEXT: s_waitcnt vmcnt(0)
-; GFX10-NEXT: v_perm_b32 v0, v0, v0, 0x7060302
-; GFX10-NEXT: v_mov_b32_e32 v1, v0
-; GFX10-NEXT: v_mov_b32_e32 v2, v0
-; GFX10-NEXT: v_mov_b32_e32 v3, v0
-; GFX10-NEXT: s_setpc_b64 s[30:31]
+; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT: global_load_dword v0, v[0:1], off
+; GFX10-NEXT: s_waitcnt vmcnt(0)
+; GFX10-NEXT: v_perm_b32 v0, v0, v0, 0x7060302
+; GFX10-NEXT: v_mov_b32_e32 v1, v0
+; GFX10-NEXT: v_mov_b32_e32 v2, v0
+; GFX10-NEXT: v_mov_b32_e32 v3, v0
+; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: shuffle_v8i16_rebroadcast:
; GFX11: ; %bb.0: ; %entry
-; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: global_load_b32 v0, v[0:1], off
-; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: v_perm_b32 v0, v0, v0, 0x7060302
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_mov_b32_e32 v1, v0
-; GFX11-NEXT: v_mov_b32_e32 v2, v0
-; GFX11-NEXT: v_mov_b32_e32 v3, v0
-; GFX11-NEXT: s_setpc_b64 s[30:31]
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: global_load_b32 v0, v[0:1], off
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: v_perm_b32 v0, v0, v0, 0x7060302
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT: v_mov_b32_e32 v1, v0
+; GFX11-NEXT: v_mov_b32_e32 v2, v0
+; GFX11-NEXT: v_mov_b32_e32 v3, v0
+; GFX11-NEXT: s_setpc_b64 s[30:31]
entry:
%val0 = load <8 x i16>, ptr addrspace(1) %arg0
%val1 = shufflevector <8 x i16> %val0, <8 x i16> poison, <8 x i32> <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
@@ -438,50 +439,50 @@ entry:
define <16 x i16> @shuffle_v16i16_rebroadcast(ptr addrspace(1) %arg0) {
; GFX9-LABEL: shuffle_v16i16_rebroadcast:
; GFX9: ; %bb.0: ; %entry
-; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: global_load_dword v0, v[0:1], off
-; GFX9-NEXT: s_mov_b32 s4, 0x7060302
-; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: v_perm_b32 v0, v0, v0, s4
-; GFX9-NEXT: v_mov_b32_e32 v1, v0
-; GFX9-NEXT: v_mov_b32_e32 v2, v0
-; GFX9-NEXT: v_mov_b32_e32 v3, v0
-; GFX9-NEXT: v_mov_b32_e32 v4, v0
-; GFX9-NEXT: v_mov_b32_e32 v5, v0
-; GFX9-NEXT: v_mov_b32_e32 v6, v0
-; GFX9-NEXT: v_mov_b32_e32 v7, v0
-; GFX9-NEXT: s_setpc_b64 s[30:31]
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: global_load_dword v0, v[0:1], off
+; GFX9-NEXT: s_mov_b32 s4, 0x7060302
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_perm_b32 v0, v0, v0, s4
+; GFX9-NEXT: v_mov_b32_e32 v1, v0
+; GFX9-NEXT: v_mov_b32_e32 v2, v0
+; GFX9-NEXT: v_mov_b32_e32 v3, v0
+; GFX9-NEXT: v_mov_b32_e32 v4, v0
+; GFX9-NEXT: v_mov_b32_e32 v5, v0
+; GFX9-NEXT: v_mov_b32_e32 v6, v0
+; GFX9-NEXT: v_mov_b32_e32 v7, v0
+; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: shuffle_v16i16_rebroadcast:
; GFX10: ; %bb.0: ; %entry
-; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT: global_load_dword v0, v[0:1], off
-; GFX10-NEXT: s_waitcnt vmcnt(0)
-; GFX10-NEXT: v_perm_b32 v0, v0, v0, 0x7060302
-; GFX10-NEXT: v_mov_b32_e32 v1, v0
-; GFX10-NEXT: v_mov_b32_e32 v2, v0
-; GFX10-NEXT: v_mov_b32_e32 v3, v0
-; GFX10-NEXT: v_mov_b32_e32 v4, v0
-; GFX10-NEXT: v_mov_b32_e32 v5, v0
-; GFX10-NEXT: v_mov_b32_e32 v6, v0
-; GFX10-NEXT: v_mov_b32_e32 v7, v0
-; GFX10-NEXT: s_setpc_b64 s[30:31]
+; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT: global_load_dword v0, v[0:1], off
+; GFX10-NEXT: s_waitcnt vmcnt(0)
+; GFX10-NEXT: v_perm_b32 v0, v0, v0, 0x7060302
+; GFX10-NEXT: v_mov_b32_e32 v1, v0
+; GFX10-NEXT: v_mov_b32_e32 v2, v0
+; GFX10-NEXT: v_mov_b32_e32 v3, v0
+; GFX10-NEXT: v_mov_b32_e32 v4, v0
+; GFX10-NEXT: v_mov_b32_e32 v5, v0
+; GFX10-NEXT: v_mov_b32_e32 v6, v0
+; GFX10-NEXT: v_mov_b32_e32 v7, v0
+; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: shuffle_v16i16_rebroadcast:
; GFX11: ; %bb.0: ; %entry
-; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: global_load_b32 v0, v[0:1], off
-; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: v_perm_b32 v0, v0, v0, 0x7060302
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_mov_b32_e32 v1, v0
-; GFX11-NEXT: v_mov_b32_e32 v2, v0
-; GFX11-NEXT: v_mov_b32_e32 v3, v0
-; GFX11-NEXT: v_mov_b32_e32 v4, v0
-; GFX11-NEXT: v_mov_b32_e32 v5, v0
-; GFX11-NEXT: v_mov_b32_e32 v6, v0
-; GFX11-NEXT: v_mov_b32_e32 v7, v0
-; GFX11-NEXT: s_setpc_b64 s[30:31]
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: global_load_b32 v0, v[0:1], off
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: v_perm_b32 v0, v0, v0, 0x7060302
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT: v_mov_b32_e32 v1, v0
+; GFX11-NEXT: v_mov_b32_e32 v2, v0
+; GFX11-NEXT: v_mov_b32_e32 v3, v0
+; GFX11-NEXT: v_mov_b32_e32 v4, v0
+; GFX11-NEXT: v_mov_b32_e32 v5, v0
+; GFX11-NEXT: v_mov_b32_e32 v6, v0
+; GFX11-NEXT: v_mov_b32_e32 v7, v0
+; GFX11-NEXT: s_setpc_b64 s[30:31]
entry:
%val0 = load <16 x i16>, ptr addrspace(1) %arg0
%val1 = shufflevector <16 x i16> %val0, <16 x i16> poison, <16 x i32> <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
@@ -491,74 +492,74 @@ entry:
define <32 x i16> @shuffle_v32i16_rebroadcast(ptr addrspace(1) %arg0) {
; GFX9-LABEL: shuffle_v32i16_rebroadcast:
; GFX9: ; %bb.0: ; %entry
-; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: global_load_dword v0, v[0:1], off
-; GFX9-NEXT: s_mov_b32 s4, 0x7060302
-; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: v_perm_b32 v0, v0, v0, s4
-; GFX9-NEXT: v_mov_b32_e32 v1, v0
-; GFX9-NEXT: v_mov_b32_e32 v2, v0
-; GFX9-NEXT: v_mov_b32_e32 v3, v0
-; GFX9-NEXT: v_mov_b32_e32 v4, v0
-; GFX9-NEXT: v_mov_b32_e32 v5, v0
-; GFX9-NEXT: v_mov_b32_e32 v6, v0
-; GFX9-NEXT: v_mov_b32_e32 v7, v0
-; GFX9-NEXT: v_mov_b32_e32 v8, v0
-; GFX9-NEXT: v_mov_b32_e32 v9, v0
-; GFX9-NEXT: v_mov_b32_e32 v10, v0
-; GFX9-NEXT: v_mov_b32_e32 v11, v0
-; GFX9-NEXT: v_mov_b32_e32 v12, v0
-; GFX9-NEXT: v_mov_b32_e32 v13, v0
-; GFX9-NEXT: v_mov_b32_e32 v14, v0
-; GFX9-NEXT: v_mov_b32_e32 v15, v0
-; GFX9-NEXT: s_setpc_b64 s[30:31]
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: global_load_dword v0, v[0:1], off
+; GFX9-NEXT: s_mov_b32 s4, 0x7060302
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_perm_b32 v0, v0, v0, s4
+; GFX9-NEXT: v_mov_b32_e32 v1, v0
+; GFX9-NEXT: v_mov_b32_e32 v2, v0
+; GFX9-NEXT: v_mov_b32_e32 v3, v0
+; GFX9-NEXT: v_mov_b32_e32 v4, v0
+; GFX9-NEXT: v_mov_b32_e32 v5, v0
+; GFX9-NEXT: v_mov_b32_e32 v6, v0
+; GFX9-NEXT: v_mov_b32_e32 v7, v0
+; GFX9-NEXT: v_mov_b32_e32 v8, v0
+; GFX9-NEXT: v_mov_b32_e32 v9, v0
+; GFX9-NEXT: v_mov_b32_e32 v10, v0
+; GFX9-NEXT: v_mov_b32_e32 v11, v0
+; GFX9-NEXT: v_mov_b32_e32 v12, v0
+; GFX9-NEXT: v_mov_b32_e32 v13, v0
+; GFX9-NEXT: v_mov_b32_e32 v14, v0
+; GFX9-NEXT: v_mov_b32_e32 v15, v0
+; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: shuffle_v32i16_rebroadcast:
; GFX10: ; %bb.0: ; %entry
-; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT: global_load_dword v0, v[0:1], off
-; GFX10-NEXT: s_waitcnt vmcnt(0)
-; GFX10-NEXT: v_perm_b32 v0, v0, v0, 0x7060302
-; GFX10-NEXT: v_mov_b32_e32 v1, v0
-; GFX10-NEXT: v_mov_b32_e32 v2, v0
-; GFX10-NEXT: v_mov_b32_e32 v3, v0
-; GFX10-NEXT: v_mov_b32_e32 v4, v0
-; GFX10-NEXT: v_mov_b32_e32 v5, v0
-; GFX10-NEXT: v_mov_b32_e32 v6, v0
-; GFX10-NEXT: v_mov_b32_e32 v7, v0
-; GFX10-NEXT: v_mov_b32_e32 v8, v0
-; GFX10-NEXT: v_mov_b32_e32 v9, v0
-; GFX10-NEXT: v_mov_b32_e32 v10, v0
-; GFX10-NEXT: v_mov_b32_e32 v11, v0
-; GFX10-NEXT: v_mov_b32_e32 v12, v0
-; GFX10-NEXT: v_mov_b32_e32 v13, v0
-; GFX10-NEXT: v_mov_b32_e32 v14, v0
-; GFX10-NEXT: v_mov_b32_e32 v15, v0
-; GFX10-NEXT: s_setpc_b64 s[30:31]
+; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT: global_load_dword v0, v[0:1], off
+; GFX10-NEXT: s_waitcnt vmcnt(0)
+; GFX10-NEXT: v_perm_b32 v0, v0, v0, 0x7060302
+; GFX10-NEXT: v_mov_b32_e32 v1, v0
+; GFX10-NEXT: v_mov_b32_e32 v2, v0
+; GFX10-NEXT: v_mov_b32_e32 v3, v0
+; GFX10-NEXT: v_mov_b32_e32 v4, v0
+; GFX10-NEXT: v_mov_b32_e32 v5, v0
+; GFX10-NEXT: v_mov_b32_e32 v6, v0
+; GFX10-NEXT: v_mov_b32_e32 v7, v0
+; GFX10-NEXT: v_mov_b32_e32 v8, v0
+; GFX10-NEXT: v_mov_b32_e32 v9, v0
+; GFX10-NEXT: v_mov_b32_e32 v10, v0
+; GFX10-NEXT: v_mov_b32_e32 v11, v0
+; GFX10-NEXT: v_mov_b32_e32 v12, v0
+; GFX10-NEXT: v_mov_b32_e32 v13, v0
+; GFX10-NEXT: v_mov_b32_e32 v14, v0
+; GFX10-NEXT: v_mov_b32_e32 v15, v0
+; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: shuffle_v32i16_rebroadcast:
; GFX11: ; %bb.0: ; %entry
-; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: global_load_b32 v0, v[0:1], off
-; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: v_perm_b32 v0, v0, v0, 0x7060302
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_mov_b32_e32 v1, v0
-; GFX11-NEXT: v_mov_b32_e32 v2, v0
-; GFX11-NEXT: v_mov_b32_e32 v3, v0
-; GFX11-NEXT: v_mov_b32_e32 v4, v0
-; GFX11-NEXT: v_mov_b32_e32 v5, v0
-; GFX11-NEXT: v_mov_b32_e32 v6, v0
-; GFX11-NEXT: v_mov_b32_e32 v7, v0
-; GFX11-NEXT: v_mov_b32_e32 v8, v0
-; GFX11-NEXT: v_mov_b32_e32 v9, v0
-; GFX11-NEXT: v_mov_b32_e32 v10, v0
-; GFX11-NEXT: v_mov_b32_e32 v11, v0
-; GFX11-NEXT: v_mov_b32_e32 v12, v0
-; GFX11-NEXT: v_mov_b32_e32 v13, v0
-; GFX11-NEXT: v_mov_b32_e32 v14, v0
-; GFX11-NEXT: v_mov_b32_e32 v15, v0
-; GFX11-NEXT: s_setpc_b64 s[30:31]
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: global_load_b32 v0, v[0:1], off
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: v_perm_b32 v0, v0, v0, 0x7060302
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT: v_mov_b32_e32 v1, v0
+; GFX11-NEXT: v_mov_b32_e32 v2, v0
+; GFX11-NEXT: v_mov_b32_e32 v3, v0
+; GFX11-NEXT: v_mov_b32_e32 v4, v0
+; GFX11-NEXT: v_mov_b32_e32 v5, v0
+; GFX11-NEXT: v_mov_b32_e32 v6, v0
+; GFX11-NEXT: v_mov_b32_e32 v7, v0
+; GFX11-NEXT: v_mov_b32_e32 v8, v0
+; GFX11-NEXT: v_mov_b32_e32 v9, v0
+; GFX11-NEXT: v_mov_b32_e32 v10, v0
+; GFX11-NEXT: v_mov_b32_e32 v11, v0
+; GFX11-NEXT: v_mov_b32_e32 v12, v0
+; GFX11-NEXT: v_mov_b32_e32 v13, v0
+; GFX11-NEXT: v_mov_b32_e32 v14, v0
+; GFX11-NEXT: v_mov_b32_e32 v15, v0
+; GFX11-NEXT: s_setpc_b64 s[30:31]
entry:
%val0 = load <32 x i16>, ptr addrspace(1) %arg0
%val1 = shufflevector <32 x i16> %val0, <32 x i16> poison, <32 x i32> <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
@@ -568,27 +569,27 @@ entry:
define <2 x i32> @shuffle_v2i32_rebroadcast(ptr addrspace(1) %arg0) {
; GFX9-LABEL: shuffle_v2i32_rebroadcast:
; GFX9: ; %bb.0: ; %entry
-; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: global_load_dword v0, v[0:1], off offset:4
-; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: v_mov_b32_e32 v1, v0
-; GFX9-NEXT: s_setpc_b64 s[30:31]
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: global_load_dword v0, v[0:1], off offset:4
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_mov_b32_e32 v1, v0
+; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: shuffle_v2i32_rebroadcast:
; GFX10: ; %bb.0: ; %entry
-; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT: global_load_dword v0, v[0:1], off offset:4
-; GFX10-NEXT: s_waitcnt vmcnt(0)
-; GFX10-NEXT: v_mov_b32_e32 v1, v0
-; GFX10-NEXT: s_setpc_b64 s[30:31]
+; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT: global_load_dword v0, v[0:1], off offset:4
+; GFX10-NEXT: s_waitcnt vmcnt(0)
+; GFX10-NEXT: v_mov_b32_e32 v1, v0
+; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: shuffle_v2i32_rebroadcast:
; GFX11: ; %bb.0: ; %entry
-; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: global_load_b32 v0, v[0:1], off offset:4
-; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: v_mov_b32_e32 v1, v0
-; GFX11-NEXT: s_setpc_b64 s[30:31]
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: global_load_b32 v0, v[0:1], off offset:4
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: v_mov_b32_e32 v1, v0
+; GFX11-NEXT: s_setpc_b64 s[30:31]
entry:
%val0 = load <2 x i32>, ptr addrspace(1) %arg0
%val1 = shufflevector <2 x i32> %val0, <2 x i32> poison, <2 x i32> <i32 1, i32 1>
@@ -598,33 +599,33 @@ entry:
define <4 x i32> @shuffle_v4i32_rebroadcast(ptr addrspace(1) %arg0) {
; GFX9-LABEL: shuffle_v4i32_rebroadcast:
; GFX9: ; %bb.0: ; %entry
-; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: global_load_dword v0, v[0:1], off offset:4
-; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: v_mov_b32_e32 v1, v0
-; GFX9-NEXT: v_mov_b32_e32 v2, v0
-; GFX9-NEXT: v_mov_b32_e32 v3, v0
-; GFX9-NEXT: s_setpc_b64 s[30:31]
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: global_load_dword v0, v[0:1], off offset:4
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_mov_b32_e32 v1, v0
+; GFX9-NEXT: v_mov_b32_e32 v2, v0
+; GFX9-NEXT: v_mov_b32_e32 v3, v0
+; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: shuffle_v4i32_rebroadcast:
; GFX10: ; %bb.0: ; %entry
-; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT: global_load_dword v0, v[0:1], off offset:4
-; GFX10-NEXT: s_waitcnt vmcnt(0)
-; GFX10-NEXT: v_mov_b32_e32 v1, v0
-; GFX10-NEXT: v_mov_b32_e32 v2, v0
-; GFX10-NEXT: v_mov_b32_e32 v3, v0
-; GFX10-NEXT: s_setpc_b64 s[30:31]
+; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT: global_load_dword v0, v[0:1], off offset:4
+; GFX10-NEXT: s_waitcnt vmcnt(0)
+; GFX10-NEXT: v_mov_b32_e32 v1, v0
+; GFX10-NEXT: v_mov_b32_e32 v2, v0
+; GFX10-NEXT: v_mov_b32_e32 v3, v0
+; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: shuffle_v4i32_rebroadcast:
; GFX11: ; %bb.0: ; %entry
-; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: global_load_b32 v0, v[0:1], off offset:4
-; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: v_mov_b32_e32 v1, v0
-; GFX11-NEXT: v_mov_b32_e32 v2, v0
-; GFX11-NEXT: v_mov_b32_e32 v3, v0
-; GFX11-NEXT: s_setpc_b64 s[30:31]
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: global_load_b32 v0, v[0:1], off offset:4
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: v_mov_b32_e32 v1, v0
+; GFX11-NEXT: v_mov_b32_e32 v2, v0
+; GFX11-NEXT: v_mov_b32_e32 v3, v0
+; GFX11-NEXT: s_setpc_b64 s[30:31]
entry:
%val0 = load <4 x i32>, ptr addrspace(1) %arg0
%val1 = shufflevector <4 x i32> %val0, <4 x i32> poison, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
@@ -634,45 +635,45 @@ entry:
define <8 x i32> @shuffle_v8i32_rebroadcast(ptr addrspace(1) %arg0) {
; GFX9-LABEL: shuffle_v8i32_rebroadcast:
; GFX9: ; %bb.0: ; %entry
-; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: global_load_dword v0, v[0:1], off offset:4
-; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: v_mov_b32_e32 v1, v0
-; GFX9-NEXT: v_mov_b32_e32 v2, v0
-; GFX9-NEXT: v_mov_b32_e32 v3, v0
-; GFX9-NEXT: v_mov_b32_e32 v4, v0
-; GFX9-NEXT: v_mov_b32_e32 v5, v0
-; GFX9-NEXT: v_mov_b32_e32 v6, v0
-; GFX9-NEXT: v_mov_b32_e32 v7, v0
-; GFX9-NEXT: s_setpc_b64 s[30:31]
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: global_load_dword v0, v[0:1], off offset:4
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_mov_b32_e32 v1, v0
+; GFX9-NEXT: v_mov_b32_e32 v2, v0
+; GFX9-NEXT: v_mov_b32_e32 v3, v0
+; GFX9-NEXT: v_mov_b32_e32 v4, v0
+; GFX9-NEXT: v_mov_b32_e32 v5, v0
+; GFX9-NEXT: v_mov_b32_e32 v6, v0
+; GFX9-NEXT: v_mov_b32_e32 v7, v0
+; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: shuffle_v8i32_rebroadcast:
; GFX10: ; %bb.0: ; %entry
-; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT: global_load_dword v0, v[0:1], off offset:4
-; GFX10-NEXT: s_waitcnt vmcnt(0)
-; GFX10-NEXT: v_mov_b32_e32 v1, v0
-; GFX10-NEXT: v_mov_b32_e32 v2, v0
-; GFX10-NEXT: v_mov_b32_e32 v3, v0
-; GFX10-NEXT: v_mov_b32_e32 v4, v0
-; GFX10-NEXT: v_mov_b32_e32 v5, v0
-; GFX10-NEXT: v_mov_b32_e32 v6, v0
-; GFX10-NEXT: v_mov_b32_e32 v7, v0
-; GFX10-NEXT: s_setpc_b64 s[30:31]
+; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT: global_load_dword v0, v[0:1], off offset:4
+; GFX10-NEXT: s_waitcnt vmcnt(0)
+; GFX10-NEXT: v_mov_b32_e32 v1, v0
+; GFX10-NEXT: v_mov_b32_e32 v2, v0
+; GFX10-NEXT: v_mov_b32_e32 v3, v0
+; GFX10-NEXT: v_mov_b32_e32 v4, v0
+; GFX10-NEXT: v_mov_b32_e32 v5, v0
+; GFX10-NEXT: v_mov_b32_e32 v6, v0
+; GFX10-NEXT: v_mov_b32_e32 v7, v0
+; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: shuffle_v8i32_rebroadcast:
; GFX11: ; %bb.0: ; %entry
-; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: global_load_b32 v0, v[0:1], off offset:4
-; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: v_mov_b32_e32 v1, v0
-; GFX11-NEXT: v_mov_b32_e32 v2, v0
-; GFX11-NEXT: v_mov_b32_e32 v3, v0
-; GFX11-NEXT: v_mov_b32_e32 v4, v0
-; GFX11-NEXT: v_mov_b32_e32 v5, v0
-; GFX11-NEXT: v_mov_b32_e32 v6, v0
-; GFX11-NEXT: v_mov_b32_e32 v7, v0
-; GFX11-NEXT: s_setpc_b64 s[30:31]
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: global_load_b32 v0, v[0:1], off offset:4
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: v_mov_b32_e32 v1, v0
+; GFX11-NEXT: v_mov_b32_e32 v2, v0
+; GFX11-NEXT: v_mov_b32_e32 v3, v0
+; GFX11-NEXT: v_mov_b32_e32 v4, v0
+; GFX11-NEXT: v_mov_b32_e32 v5, v0
+; GFX11-NEXT: v_mov_b32_e32 v6, v0
+; GFX11-NEXT: v_mov_b32_e32 v7, v0
+; GFX11-NEXT: s_setpc_b64 s[30:31]
entry:
%val0 = load <8 x i32>, ptr addrspace(1) %arg0
%val1 = shufflevector <8 x i32> %val0, <8 x i32> poison, <8 x i32> <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
@@ -682,69 +683,69 @@ entry:
define <16 x i32> @shuffle_v16i32_rebroadcast(ptr addrspace(1) %arg0) {
; GFX9-LABEL: shuffle_v16i32_rebroadcast:
; GFX9: ; %bb.0: ; %entry
-; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: global_load_dword v0, v[0:1], off offset:4
-; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: v_mov_b32_e32 v1, v0
-; GFX9-NEXT: v_mov_b32_e32 v2, v0
-; GFX9-NEXT: v_mov_b32_e32 v3, v0
-; GFX9-NEXT: v_mov_b32_e32 v4, v0
-; GFX9-NEXT: v_mov_b32_e32 v5, v0
-; GFX9-NEXT: v_mov_b32_e32 v6, v0
-; GFX9-NEXT: v_mov_b32_e32 v7, v0
-; GFX9-NEXT: v_mov_b32_e32 v8, v0
-; GFX9-NEXT: v_mov_b32_e32 v9, v0
-; GFX9-NEXT: v_mov_b32_e32 v10, v0
-; GFX9-NEXT: v_mov_b32_e32 v11, v0
-; GFX9-NEXT: v_mov_b32_e32 v12, v0
-; GFX9-NEXT: v_mov_b32_e32 v13, v0
-; GFX9-NEXT: v_mov_b32_e32 v14, v0
-; GFX9-NEXT: v_mov_b32_e32 v15, v0
-; GFX9-NEXT: s_setpc_b64 s[30:31]
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: global_load_dword v0, v[0:1], off offset:4
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_mov_b32_e32 v1, v0
+; GFX9-NEXT: v_mov_b32_e32 v2, v0
+; GFX9-NEXT: v_mov_b32_e32 v3, v0
+; GFX9-NEXT: v_mov_b32_e32 v4, v0
+; GFX9-NEXT: v_mov_b32_e32 v5, v0
+; GFX9-NEXT: v_mov_b32_e32 v6, v0
+; GFX9-NEXT: v_mov_b32_e32 v7, v0
+; GFX9-NEXT: v_mov_b32_e32 v8, v0
+; GFX9-NEXT: v_mov_b32_e32 v9, v0
+; GFX9-NEXT: v_mov_b32_e32 v10, v0
+; GFX9-NEXT: v_mov_b32_e32 v11, v0
+; GFX9-NEXT: v_mov_b32_e32 v12, v0
+; GFX9-NEXT: v_mov_b32_e32 v13, v0
+; GFX9-NEXT: v_mov_b32_e32 v14, v0
+; GFX9-NEXT: v_mov_b32_e32 v15, v0
+; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: shuffle_v16i32_rebroadcast:
; GFX10: ; %bb.0: ; %entry
-; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT: global_load_dword v0, v[0:1], off offset:4
-; GFX10-NEXT: s_waitcnt vmcnt(0)
-; GFX10-NEXT: v_mov_b32_e32 v1, v0
-; GFX10-NEXT: v_mov_b32_e32 v2, v0
-; GFX10-NEXT: v_mov_b32_e32 v3, v0
-; GFX10-NEXT: v_mov_b32_e32 v4, v0
-; GFX10-NEXT: v_mov_b32_e32 v5, v0
-; GFX10-NEXT: v_mov_b32_e32 v6, v0
-; GFX10-NEXT: v_mov_b32_e32 v7, v0
-; GFX10-NEXT: v_mov_b32_e32 v8, v0
-; GFX10-NEXT: v_mov_b32_e32 v9, v0
-; GFX10-NEXT: v_mov_b32_e32 v10, v0
-; GFX10-NEXT: v_mov_b32_e32 v11, v0
-; GFX10-NEXT: v_mov_b32_e32 v12, v0
-; GFX10-NEXT: v_mov_b32_e32 v13, v0
-; GFX10-NEXT: v_mov_b32_e32 v14, v0
-; GFX10-NEXT: v_mov_b32_e32 v15, v0
-; GFX10-NEXT: s_setpc_b64 s[30:31]
+; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT: global_load_dword v0, v[0:1], off offset:4
+; GFX10-NEXT: s_waitcnt vmcnt(0)
+; GFX10-NEXT: v_mov_b32_e32 v1, v0
+; GFX10-NEXT: v_mov_b32_e32 v2, v0
+; GFX10-NEXT: v_mov_b32_e32 v3, v0
+; GFX10-NEXT: v_mov_b32_e32 v4, v0
+; GFX10-NEXT: v_mov_b32_e32 v5, v0
+; GFX10-NEXT: v_mov_b32_e32 v6, v0
+; GFX10-NEXT: v_mov_b32_e32 v7, v0
+; GFX10-NEXT: v_mov_b32_e32 v8, v0
+; GFX10-NEXT: v_mov_b32_e32 v9, v0
+; GFX10-NEXT: v_mov_b32_e32 v10, v0
+; GFX10-NEXT: v_mov_b32_e32 v11, v0
+; GFX10-NEXT: v_mov_b32_e32 v12, v0
+; GFX10-NEXT: v_mov_b32_e32 v13, v0
+; GFX10-NEXT: v_mov_b32_e32 v14, v0
+; GFX10-NEXT: v_mov_b32_e32 v15, v0
+; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: shuffle_v16i32_rebroadcast:
; GFX11: ; %bb.0: ; %entry
-; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: global_load_b32 v0, v[0:1], off offset:4
-; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: v_mov_b32_e32 v1, v0
-; GFX11-NEXT: v_mov_b32_e32 v2, v0
-; GFX11-NEXT: v_mov_b32_e32 v3, v0
-; GFX11-NEXT: v_mov_b32_e32 v4, v0
-; GFX11-NEXT: v_mov_b32_e32 v5, v0
-; GFX11-NEXT: v_mov_b32_e32 v6, v0
-; GFX11-NEXT: v_mov_b32_e32 v7, v0
-; GFX11-NEXT: v_mov_b32_e32 v8, v0
-; GFX11-NEXT: v_mov_b32_e32 v9, v0
-; GFX11-NEXT: v_mov_b32_e32 v10, v0
-; GFX11-NEXT: v_mov_b32_e32 v11, v0
-; GFX11-NEXT: v_mov_b32_e32 v12, v0
-; GFX11-NEXT: v_mov_b32_e32 v13, v0
-; GFX11-NEXT: v_mov_b32_e32 v14, v0
-; GFX11-NEXT: v_mov_b32_e32 v15, v0
-; GFX11-NEXT: s_setpc_b64 s[30:31]
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: global_load_b32 v0, v[0:1], off offset:4
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: v_mov_b32_e32 v1, v0
+; GFX11-NEXT: v_mov_b32_e32 v2, v0
+; GFX11-NEXT: v_mov_b32_e32 v3, v0
+; GFX11-NEXT: v_mov_b32_e32 v4, v0
+; GFX11-NEXT: v_mov_b32_e32 v5, v0
+; GFX11-NEXT: v_mov_b32_e32 v6, v0
+; GFX11-NEXT: v_mov_b32_e32 v7, v0
+; GFX11-NEXT: v_mov_b32_e32 v8, v0
+; GFX11-NEXT: v_mov_b32_e32 v9, v0
+; GFX11-NEXT: v_mov_b32_e32 v10, v0
+; GFX11-NEXT: v_mov_b32_e32 v11, v0
+; GFX11-NEXT: v_mov_b32_e32 v12, v0
+; GFX11-NEXT: v_mov_b32_e32 v13, v0
+; GFX11-NEXT: v_mov_b32_e32 v14, v0
+; GFX11-NEXT: v_mov_b32_e32 v15, v0
+; GFX11-NEXT: s_setpc_b64 s[30:31]
entry:
%val0 = load <16 x i32>, ptr addrspace(1) %arg0
%val1 = shufflevector <16 x i32> %val0, <16 x i32> poison, <16 x i32> <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
@@ -754,117 +755,117 @@ entry:
define <32 x i32> @shuffle_v32i32_rebroadcast(ptr addrspace(1) %arg0) {
; GFX9-LABEL: shuffle_v32i32_rebroadcast:
; GFX9: ; %bb.0: ; %entry
-; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: global_load_dword v0, v[0:1], off offset:4
-; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: v_mov_b32_e32 v1, v0
-; GFX9-NEXT: v_mov_b32_e32 v2, v0
-; GFX9-NEXT: v_mov_b32_e32 v3, v0
-; GFX9-NEXT: v_mov_b32_e32 v4, v0
-; GFX9-NEXT: v_mov_b32_e32 v5, v0
-; GFX9-NEXT: v_mov_b32_e32 v6, v0
-; GFX9-NEXT: v_mov_b32_e32 v7, v0
-; GFX9-NEXT: v_mov_b32_e32 v8, v0
-; GFX9-NEXT: v_mov_b32_e32 v9, v0
-; GFX9-NEXT: v_mov_b32_e32 v10, v0
-; GFX9-NEXT: v_mov_b32_e32 v11, v0
-; GFX9-NEXT: v_mov_b32_e32 v12, v0
-; GFX9-NEXT: v_mov_b32_e32 v13, v0
-; GFX9-NEXT: v_mov_b32_e32 v14, v0
-; GFX9-NEXT: v_mov_b32_e32 v15, v0
-; GFX9-NEXT: v_mov_b32_e32 v16, v0
-; GFX9-NEXT: v_mov_b32_e32 v17, v0
-; GFX9-NEXT: v_mov_b32_e32 v18, v0
-; GFX9-NEXT: v_mov_b32_e32 v19, v0
-; GFX9-NEXT: v_mov_b32_e32 v20, v0
-; GFX9-NEXT: v_mov_b32_e32 v21, v0
-; GFX9-NEXT: v_mov_b32_e32 v22, v0
-; GFX9-NEXT: v_mov_b32_e32 v23, v0
-; GFX9-NEXT: v_mov_b32_e32 v24, v0
-; GFX9-NEXT: v_mov_b32_e32 v25, v0
-; GFX9-NEXT: v_mov_b32_e32 v26, v0
-; GFX9-NEXT: v_mov_b32_e32 v27, v0
-; GFX9-NEXT: v_mov_b32_e32 v28, v0
-; GFX9-NEXT: v_mov_b32_e32 v29, v0
-; GFX9-NEXT: v_mov_b32_e32 v30, v0
-; GFX9-NEXT: v_mov_b32_e32 v31, v0
-; GFX9-NEXT: s_setpc_b64 s[30:31]
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: global_load_dword v0, v[0:1], off offset:4
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_mov_b32_e32 v1, v0
+; GFX9-NEXT: v_mov_b32_e32 v2, v0
+; GFX9-NEXT: v_mov_b32_e32 v3, v0
+; GFX9-NEXT: v_mov_b32_e32 v4, v0
+; GFX9-NEXT: v_mov_b32_e32 v5, v0
+; GFX9-NEXT: v_mov_b32_e32 v6, v0
+; GFX9-NEXT: v_mov_b32_e32 v7, v0
+; GFX9-NEXT: v_mov_b32_e32 v8, v0
+; GFX9-NEXT: v_mov_b32_e32 v9, v0
+; GFX9-NEXT: v_mov_b32_e32 v10, v0
+; GFX9-NEXT: v_mov_b32_e32 v11, v0
+; GFX9-NEXT: v_mov_b32_e32 v12, v0
+; GFX9-NEXT: v_mov_b32_e32 v13, v0
+; GFX9-NEXT: v_mov_b32_e32 v14, v0
+; GFX9-NEXT: v_mov_b32_e32 v15, v0
+; GFX9-NEXT: v_mov_b32_e32 v16, v0
+; GFX9-NEXT: v_mov_b32_e32 v17, v0
+; GFX9-NEXT: v_mov_b32_e32 v18, v0
+; GFX9-NEXT: v_mov_b32_e32 v19, v0
+; GFX9-NEXT: v_mov_b32_e32 v20, v0
+; GFX9-NEXT: v_mov_b32_e32 v21, v0
+; GFX9-NEXT: v_mov_b32_e32 v22, v0
+; GFX9-NEXT: v_mov_b32_e32 v23, v0
+; GFX9-NEXT: v_mov_b32_e32 v24, v0
+; GFX9-NEXT: v_mov_b32_e32 v25, v0
+; GFX9-NEXT: v_mov_b32_e32 v26, v0
+; GFX9-NEXT: v_mov_b32_e32 v27, v0
+; GFX9-NEXT: v_mov_b32_e32 v28, v0
+; GFX9-NEXT: v_mov_b32_e32 v29, v0
+; GFX9-NEXT: v_mov_b32_e32 v30, v0
+; GFX9-NEXT: v_mov_b32_e32 v31, v0
+; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: shuffle_v32i32_rebroadcast:
; GFX10: ; %bb.0: ; %entry
-; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT: global_load_dword v0, v[0:1], off offset:4
-; GFX10-NEXT: s_waitcnt vmcnt(0)
-; GFX10-NEXT: v_mov_b32_e32 v1, v0
-; GFX10-NEXT: v_mov_b32_e32 v2, v0
-; GFX10-NEXT: v_mov_b32_e32 v3, v0
-; GFX10-NEXT: v_mov_b32_e32 v4, v0
-; GFX10-NEXT: v_mov_b32_e32 v5, v0
-; GFX10-NEXT: v_mov_b32_e32 v6, v0
-; GFX10-NEXT: v_mov_b32_e32 v7, v0
-; GFX10-NEXT: v_mov_b32_e32 v8, v0
-; GFX10-NEXT: v_mov_b32_e32 v9, v0
-; GFX10-NEXT: v_mov_b32_e32 v10, v0
-; GFX10-NEXT: v_mov_b32_e32 v11, v0
-; GFX10-NEXT: v_mov_b32_e32 v12, v0
-; GFX10-NEXT: v_mov_b32_e32 v13, v0
-; GFX10-NEXT: v_mov_b32_e32 v14, v0
-; GFX10-NEXT: v_mov_b32_e32 v15, v0
-; GFX10-NEXT: v_mov_b32_e32 v16, v0
-; GFX10-NEXT: v_mov_b32_e32 v17, v0
-; GFX10-NEXT: v_mov_b32_e32 v18, v0
-; GFX10-NEXT: v_mov_b32_e32 v19, v0
-; GFX10-NEXT: v_mov_b32_e32 v20, v0
-; GFX10-NEXT: v_mov_b32_e32 v21, v0
-; GFX10-NEXT: v_mov_b32_e32 v22, v0
-; GFX10-NEXT: v_mov_b32_e32 v23, v0
-; GFX10-NEXT: v_mov_b32_e32 v24, v0
-; GFX10-NEXT: v_mov_b32_e32 v25, v0
-; GFX10-NEXT: v_mov_b32_e32 v26, v0
-; GFX10-NEXT: v_mov_b32_e32 v27, v0
-; GFX10-NEXT: v_mov_b32_e32 v28, v0
-; GFX10-NEXT: v_mov_b32_e32 v29, v0
-; GFX10-NEXT: v_mov_b32_e32 v30, v0
-; GFX10-NEXT: v_mov_b32_e32 v31, v0
-; GFX10-NEXT: s_setpc_b64 s[30:31]
+; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT: global_load_dword v0, v[0:1], off offset:4
+; GFX10-NEXT: s_waitcnt vmcnt(0)
+; GFX10-NEXT: v_mov_b32_e32 v1, v0
+; GFX10-NEXT: v_mov_b32_e32 v2, v0
+; GFX10-NEXT: v_mov_b32_e32 v3, v0
+; GFX10-NEXT: v_mov_b32_e32 v4, v0
+; GFX10-NEXT: v_mov_b32_e32 v5, v0
+; GFX10-NEXT: v_mov_b32_e32 v6, v0
+; GFX10-NEXT: v_mov_b32_e32 v7, v0
+; GFX10-NEXT: v_mov_b32_e32 v8, v0
+; GFX10-NEXT: v_mov_b32_e32 v9, v0
+; GFX10-NEXT: v_mov_b32_e32 v10, v0
+; GFX10-NEXT: v_mov_b32_e32 v11, v0
+; GFX10-NEXT: v_mov_b32_e32 v12, v0
+; GFX10-NEXT: v_mov_b32_e32 v13, v0
+; GFX10-NEXT: v_mov_b32_e32 v14, v0
+; GFX10-NEXT: v_mov_b32_e32 v15, v0
+; GFX10-NEXT: v_mov_b32_e32 v16, v0
+; GFX10-NEXT: v_mov_b32_e32 v17, v0
+; GFX10-NEXT: v_mov_b32_e32 v18, v0
+; GFX10-NEXT: v_mov_b32_e32 v19, v0
+; GFX10-NEXT: v_mov_b32_e32 v20, v0
+; GFX10-NEXT: v_mov_b32_e32 v21, v0
+; GFX10-NEXT: v_mov_b32_e32 v22, v0
+; GFX10-NEXT: v_mov_b32_e32 v23, v0
+; GFX10-NEXT: v_mov_b32_e32 v24, v0
+; GFX10-NEXT: v_mov_b32_e32 v25, v0
+; GFX10-NEXT: v_mov_b32_e32 v26, v0
+; GFX10-NEXT: v_mov_b32_e32 v27, v0
+; GFX10-NEXT: v_mov_b32_e32 v28, v0
+; GFX10-NEXT: v_mov_b32_e32 v29, v0
+; GFX10-NEXT: v_mov_b32_e32 v30, v0
+; GFX10-NEXT: v_mov_b32_e32 v31, v0
+; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: shuffle_v32i32_rebroadcast:
; GFX11: ; %bb.0: ; %entry
-; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: global_load_b32 v0, v[0:1], off offset:4
-; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: v_mov_b32_e32 v1, v0
-; GFX11-NEXT: v_mov_b32_e32 v2, v0
-; GFX11-NEXT: v_mov_b32_e32 v3, v0
-; GFX11-NEXT: v_mov_b32_e32 v4, v0
-; GFX11-NEXT: v_mov_b32_e32 v5, v0
-; GFX11-NEXT: v_mov_b32_e32 v6, v0
-; GFX11-NEXT: v_mov_b32_e32 v7, v0
-; GFX11-NEXT: v_mov_b32_e32 v8, v0
-; GFX11-NEXT: v_mov_b32_e32 v9, v0
-; GFX11-NEXT: v_mov_b32_e32 v10, v0
-; GFX11-NEXT: v_mov_b32_e32 v11, v0
-; GFX11-NEXT: v_mov_b32_e32 v12, v0
-; GFX11-NEXT: v_mov_b32_e32 v13, v0
-; GFX11-NEXT: v_mov_b32_e32 v14, v0
-; GFX11-NEXT: v_mov_b32_e32 v15, v0
-; GFX11-NEXT: v_mov_b32_e32 v16, v0
-; GFX11-NEXT: v_mov_b32_e32 v17, v0
-; GFX11-NEXT: v_mov_b32_e32 v18, v0
-; GFX11-NEXT: v_mov_b32_e32 v19, v0
-; GFX11-NEXT: v_mov_b32_e32 v20, v0
-; GFX11-NEXT: v_mov_b32_e32 v21, v0
-; GFX11-NEXT: v_mov_b32_e32 v22, v0
-; GFX11-NEXT: v_mov_b32_e32 v23, v0
-; GFX11-NEXT: v_mov_b32_e32 v24, v0
-; GFX11-NEXT: v_mov_b32_e32 v25, v0
-; GFX11-NEXT: v_mov_b32_e32 v26, v0
-; GFX11-NEXT: v_mov_b32_e32 v27, v0
-; GFX11-NEXT: v_mov_b32_e32 v28, v0
-; GFX11-NEXT: v_mov_b32_e32 v29, v0
-; GFX11-NEXT: v_mov_b32_e32 v30, v0
-; GFX11-NEXT: v_mov_b32_e32 v31, v0
-; GFX11-NEXT: s_setpc_b64 s[30:31]
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: global_load_b32 v0, v[0:1], off offset:4
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: v_mov_b32_e32 v1, v0
+; GFX11-NEXT: v_mov_b32_e32 v2, v0
+; GFX11-NEXT: v_mov_b32_e32 v3, v0
+; GFX11-NEXT: v_mov_b32_e32 v4, v0
+; GFX11-NEXT: v_mov_b32_e32 v5, v0
+; GFX11-NEXT: v_mov_b32_e32 v6, v0
+; GFX11-NEXT: v_mov_b32_e32 v7, v0
+; GFX11-NEXT: v_mov_b32_e32 v8, v0
+; GFX11-NEXT: v_mov_b32_e32 v9, v0
+; GFX11-NEXT: v_mov_b32_e32 v10, v0
+; GFX11-NEXT: v_mov_b32_e32 v11, v0
+; GFX11-NEXT: v_mov_b32_e32 v12, v0
+; GFX11-NEXT: v_mov_b32_e32 v13, v0
+; GFX11-NEXT: v_mov_b32_e32 v14, v0
+; GFX11-NEXT: v_mov_b32_e32 v15, v0
+; GFX11-NEXT: v_mov_b32_e32 v16, v0
+; GFX11-NEXT: v_mov_b32_e32 v17, v0
+; GFX11-NEXT: v_mov_b32_e32 v18, v0
+; GFX11-NEXT: v_mov_b32_e32 v19, v0
+; GFX11-NEXT: v_mov_b32_e32 v20, v0
+; GFX11-NEXT: v_mov_b32_e32 v21, v0
+; GFX11-NEXT: v_mov_b32_e32 v22, v0
+; GFX11-NEXT: v_mov_b32_e32 v23, v0
+; GFX11-NEXT: v_mov_b32_e32 v24, v0
+; GFX11-NEXT: v_mov_b32_e32 v25, v0
+; GFX11-NEXT: v_mov_b32_e32 v26, v0
+; GFX11-NEXT: v_mov_b32_e32 v27, v0
+; GFX11-NEXT: v_mov_b32_e32 v28, v0
+; GFX11-NEXT: v_mov_b32_e32 v29, v0
+; GFX11-NEXT: v_mov_b32_e32 v30, v0
+; GFX11-NEXT: v_mov_b32_e32 v31, v0
+; GFX11-NEXT: s_setpc_b64 s[30:31]
entry:
%val0 = load <32 x i32>, ptr addrspace(1) %arg0
%val1 = shufflevector <32 x i32> %val0, <32 x i32> poison, <32 x i32> <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
@@ -874,28 +875,28 @@ entry:
define <2 x bfloat> @shuffle_v2bf16_rebroadcast(ptr addrspace(1) %arg0) {
; GFX9-LABEL: shuffle_v2bf16_rebroadcast:
; GFX9: ; %bb.0: ; %entry
-; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: global_load_dword v0, v[0:1], off
-; GFX9-NEXT: s_mov_b32 s4, 0x7060302
-; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: v_perm_b32 v0, v0, v0, s4
-; GFX9-NEXT: s_setpc_b64 s[30:31]
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: global_load_dword v0, v[0:1], off
+; GFX9-NEXT: s_mov_b32 s4, 0x7060302
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_perm_b32 v0, v0, v0, s4
+; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: shuffle_v2bf16_rebroadcast:
; GFX10: ; %bb.0: ; %entry
-; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT: global_load_dword v0, v[0:1], off
-; GFX10-NEXT: s_waitcnt vmcnt(0)
-; GFX10-NEXT: v_perm_b32 v0, v0, v0, 0x7060302
-; GFX10-NEXT: s_setpc_b64 s[30:31]
+; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT: global_load_dword v0, v[0:1], off
+; GFX10-NEXT: s_waitcnt vmcnt(0)
+; GFX10-NEXT: v_perm_b32 v0, v0, v0, 0x7060302
+; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: shuffle_v2bf16_rebroadcast:
; GFX11: ; %bb.0: ; %entry
-; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: global_load_b32 v0, v[0:1], off
-; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: v_perm_b32 v0, v0, v0, 0x7060302
-; GFX11-NEXT: s_setpc_b64 s[30:31]
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: global_load_b32 v0, v[0:1], off
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: v_perm_b32 v0, v0, v0, 0x7060302
+; GFX11-NEXT: s_setpc_b64 s[30:31]
entry:
%val0 = load <2 x bfloat>, ptr addrspace(1) %arg0
%val1 = shufflevector <2 x bfloat> %val0, <2 x bfloat> poison, <2 x i32> <i32 1, i32 1>
@@ -905,31 +906,31 @@ entry:
define <3 x bfloat> @shuffle_v3bf16_rebroadcast(ptr addrspace(1) %arg0) {
; GFX9-LABEL: shuffle_v3bf16_rebroadcast:
; GFX9: ; %bb.0: ; %entry
-; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: global_load_dword v1, v[0:1], off
-; GFX9-NEXT: s_mov_b32 s4, 0x7060302
-; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: v_perm_b32 v0, v1, v1, s4
-; GFX9-NEXT: v_alignbit_b32 v1, s4, v1, 16
-; GFX9-NEXT: s_setpc_b64 s[30:31]
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: global_load_dword v1, v[0:1], off
+; GFX9-NEXT: s_mov_b32 s4, 0x7060302
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_perm_b32 v0, v1, v1, s4
+; GFX9-NEXT: v_alignbit_b32 v1, s4, v1, 16
+; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: shuffle_v3bf16_rebroadcast:
; GFX10: ; %bb.0: ; %entry
-; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT: global_load_dword v1, v[0:1], off
-; GFX10-NEXT: s_waitcnt vmcnt(0)
-; GFX10-NEXT: v_perm_b32 v0, v1, v1, 0x7060302
-; GFX10-NEXT: v_alignbit_b32 v1, s4, v1, 16
-; GFX10-NEXT: s_setpc_b64 s[30:31]
+; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT: global_load_dword v1, v[0:1], off
+; GFX10-NEXT: s_waitcnt vmcnt(0)
+; GFX10-NEXT: v_perm_b32 v0, v1, v1, 0x7060302
+; GFX10-NEXT: v_alignbit_b32 v1, s4, v1, 16
+; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: shuffle_v3bf16_rebroadcast:
; GFX11: ; %bb.0: ; %entry
-; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: global_load_b32 v1, v[0:1], off
-; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: v_perm_b32 v0, v1, v1, 0x7060302
-; GFX11-NEXT: v_alignbit_b32 v1, s0, v1, 16
-; GFX11-NEXT: s_setpc_b64 s[30:31]
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: global_load_b32 v1, v[0:1], off
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: v_perm_b32 v0, v1, v1, 0x7060302
+; GFX11-NEXT: v_alignbit_b32 v1, s0, v1, 16
+; GFX11-NEXT: s_setpc_b64 s[30:31]
entry:
%val0 = load <3 x bfloat>, ptr addrspace(1) %arg0
%val1 = shufflevector <3 x bfloat> %val0, <3 x bfloat> poison, <3 x i32> <i32 1, i32 1, i32 1>
@@ -939,32 +940,32 @@ entry:
define <4 x bfloat> @shuffle_v4bf16_rebroadcast(ptr addrspace(1) %arg0) {
; GFX9-LABEL: shuffle_v4bf16_rebroadcast:
; GFX9: ; %bb.0: ; %entry
-; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: global_load_dword v0, v[0:1], off
-; GFX9-NEXT: s_mov_b32 s4, 0x7060302
-; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: v_perm_b32 v0, v0, v0, s4
-; GFX9-NEXT: v_mov_b32_e32 v1, v0
-; GFX9-NEXT: s_setpc_b64 s[30:31]
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: global_load_dword v0, v[0:1], off
+; GFX9-NEXT: s_mov_b32 s4, 0x7060302
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_perm_b32 v0, v0, v0, s4
+; GFX9-NEXT: v_mov_b32_e32 v1, v0
+; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: shuffle_v4bf16_rebroadcast:
; GFX10: ; %bb.0: ; %entry
-; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT: global_load_dword v0, v[0:1], off
-; GFX10-NEXT: s_waitcnt vmcnt(0)
-; GFX10-NEXT: v_perm_b32 v0, v0, v0, 0x7060302
-; GFX10-NEXT: v_mov_b32_e32 v1, v0
-; GFX10-NEXT: s_setpc_b64 s[30:31]
+; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT: global_load_dword v0, v[0:1], off
+; GFX10-NEXT: s_waitcnt vmcnt(0)
+; GFX10-NEXT: v_perm_b32 v0, v0, v0, 0x7060302
+; GFX10-NEXT: v_mov_b32_e32 v1, v0
+; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: shuffle_v4bf16_rebroadcast:
; GFX11: ; %bb.0: ; %entry
-; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: global_load_b32 v0, v[0:1], off
-; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: v_perm_b32 v0, v0, v0, 0x7060302
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_mov_b32_e32 v1, v0
-; GFX11-NEXT: s_setpc_b64 s[30:31]
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: global_load_b32 v0, v[0:1], off
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: v_perm_b32 v0, v0, v0, 0x7060302
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT: v_mov_b32_e32 v1, v0
+; GFX11-NEXT: s_setpc_b64 s[30:31]
entry:
%val0 = load <4 x bfloat>, ptr addrspace(1) %arg0
%val1 = shufflevector <4 x bfloat> %val0, <4 x bfloat> poison, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
@@ -974,35 +975,35 @@ entry:
define <6 x bfloat> @shuffle_v6bf16_rebroadcast(ptr addrspace(1) %arg0) {
; GFX9-LABEL: shuffle_v6bf16_rebroadcast:
; GFX9: ; %bb.0: ; %entry
-; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: global_load_dword v0, v[0:1], off
-; GFX9-NEXT: s_mov_b32 s4, 0x7060302
-; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: v_perm_b32 v0, v0, v0, s4
-; GFX9-NEXT: v_mov_b32_e32 v1, v0
-; GFX9-NEXT: v_mov_b32_e32 v2, v0
-; GFX9-NEXT: s_setpc_b64 s[30:31]
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: global_load_dword v0, v[0:1], off
+; GFX9-NEXT: s_mov_b32 s4, 0x7060302
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_perm_b32 v0, v0, v0, s4
+; GFX9-NEXT: v_mov_b32_e32 v1, v0
+; GFX9-NEXT: v_mov_b32_e32 v2, v0
+; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: shuffle_v6bf16_rebroadcast:
; GFX10: ; %bb.0: ; %entry
-; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT: global_load_dword v0, v[0:1], off
-; GFX10-NEXT: s_waitcnt vmcnt(0)
-; GFX10-NEXT: v_perm_b32 v0, v0, v0, 0x7060302
-; GFX10-NEXT: v_mov_b32_e32 v1, v0
-; GFX10-NEXT: v_mov_b32_e32 v2, v0
-; GFX10-NEXT: s_setpc_b64 s[30:31]
+; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT: global_load_dword v0, v[0:1], off
+; GFX10-NEXT: s_waitcnt vmcnt(0)
+; GFX10-NEXT: v_perm_b32 v0, v0, v0, 0x7060302
+; GFX10-NEXT: v_mov_b32_e32 v1, v0
+; GFX10-NEXT: v_mov_b32_e32 v2, v0
+; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: shuffle_v6bf16_rebroadcast:
; GFX11: ; %bb.0: ; %entry
-; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: global_load_b32 v0, v[0:1], off
-; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: v_perm_b32 v0, v0, v0, 0x7060302
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_mov_b32_e32 v1, v0
-; GFX11-NEXT: v_mov_b32_e32 v2, v0
-; GFX11-NEXT: s_setpc_b64 s[30:31]
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: global_load_b32 v0, v[0:1], off
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: v_perm_b32 v0, v0, v0, 0x7060302
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT: v_mov_b32_e32 v1, v0
+; GFX11-NEXT: v_mov_b32_e32 v2, v0
+; GFX11-NEXT: s_setpc_b64 s[30:31]
entry:
%val0 = load <6 x bfloat>, ptr addrspace(1) %arg0
%val1 = shufflevector <6 x bfloat> %val0, <6 x bfloat> poison, <6 x i32> <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
@@ -1012,38 +1013,38 @@ entry:
define <8 x bfloat> @shuffle_v8bf16_rebroadcast(ptr addrspace(1) %arg0) {
; GFX9-LABEL: shuffle_v8bf16_rebroadcast:
; GFX9: ; %bb.0: ; %entry
-; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: global_load_dword v0, v[0:1], off
-; GFX9-NEXT: s_mov_b32 s4, 0x7060302
-; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: v_perm_b32 v0, v0, v0, s4
-; GFX9-NEXT: v_mov_b32_e32 v1, v0
-; GFX9-NEXT: v_mov_b32_e32 v2, v0
-; GFX9-NEXT: v_mov_b32_e32 v3, v0
-; GFX9-NEXT: s_setpc_b64 s[30:31]
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: global_load_dword v0, v[0:1], off
+; GFX9-NEXT: s_mov_b32 s4, 0x7060302
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_perm_b32 v0, v0, v0, s4
+; GFX9-NEXT: v_mov_b32_e32 v1, v0
+; GFX9-NEXT: v_mov_b32_e32 v2, v0
+; GFX9-NEXT: v_mov_b32_e32 v3, v0
+; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: shuffle_v8bf16_rebroadcast:
; GFX10: ; %bb.0: ; %entry
-; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT: global_load_dword v0, v[0:1], off
-; GFX10-NEXT: s_waitcnt vmcnt(0)
-; GFX10-NEXT: v_perm_b32 v0, v0, v0, 0x7060302
-; GFX10-NEXT: v_mov_b32_e32 v1, v0
-; GFX10-NEXT: v_mov_b32_e32 v2, v0
-; GFX10-NEXT: v_mov_b32_e32 v3, v0
-; GFX10-NEXT: s_setpc_b64 s[30:31]
+; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT: global_load_dword v0, v[0:1], off
+; GFX10-NEXT: s_waitcnt vmcnt(0)
+; GFX10-NEXT: v_perm_b32 v0, v0, v0, 0x7060302
+; GFX10-NEXT: v_mov_b32_e32 v1, v0
+; GFX10-NEXT: v_mov_b32_e32 v2, v0
+; GFX10-NEXT: v_mov_b32_e32 v3, v0
+; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: shuffle_v8bf16_rebroadcast:
; GFX11: ; %bb.0: ; %entry
-; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: global_load_b32 v0, v[0:1], off
-; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: v_perm_b32 v0, v0, v0, 0x7060302
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_mov_b32_e32 v1, v0
-; GFX11-NEXT: v_mov_b32_e32 v2, v0
-; GFX11-NEXT: v_mov_b32_e32 v3, v0
-; GFX11-NEXT: s_setpc_b64 s[30:31]
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: global_load_b32 v0, v[0:1], off
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: v_perm_b32 v0, v0, v0, 0x7060302
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT: v_mov_b32_e32 v1, v0
+; GFX11-NEXT: v_mov_b32_e32 v2, v0
+; GFX11-NEXT: v_mov_b32_e32 v3, v0
+; GFX11-NEXT: s_setpc_b64 s[30:31]
entry:
%val0 = load <8 x bfloat>, ptr addrspace(1) %arg0
%val1 = shufflevector <8 x bfloat> %val0, <8 x bfloat> poison, <8 x i32> <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
@@ -1053,50 +1054,50 @@ entry:
define <16 x bfloat> @shuffle_v16bf16_rebroadcast(ptr addrspace(1) %arg0) {
; GFX9-LABEL: shuffle_v16bf16_rebroadcast:
; GFX9: ; %bb.0: ; %entry
-; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: global_load_dword v0, v[0:1], off
-; GFX9-NEXT: s_mov_b32 s4, 0x7060302
-; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: v_perm_b32 v0, v0, v0, s4
-; GFX9-NEXT: v_mov_b32_e32 v1, v0
-; GFX9-NEXT: v_mov_b32_e32 v2, v0
-; GFX9-NEXT: v_mov_b32_e32 v3, v0
-; GFX9-NEXT: v_mov_b32_e32 v4, v0
-; GFX9-NEXT: v_mov_b32_e32 v5, v0
-; GFX9-NEXT: v_mov_b32_e32 v6, v0
-; GFX9-NEXT: v_mov_b32_e32 v7, v0
-; GFX9-NEXT: s_setpc_b64 s[30:31]
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: global_load_dword v0, v[0:1], off
+; GFX9-NEXT: s_mov_b32 s4, 0x7060302
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_perm_b32 v0, v0, v0, s4
+; GFX9-NEXT: v_mov_b32_e32 v1, v0
+; GFX9-NEXT: v_mov_b32_e32 v2, v0
+; GFX9-NEXT: v_mov_b32_e32 v3, v0
+; GFX9-NEXT: v_mov_b32_e32 v4, v0
+; GFX9-NEXT: v_mov_b32_e32 v5, v0
+; GFX9-NEXT: v_mov_b32_e32 v6, v0
+; GFX9-NEXT: v_mov_b32_e32 v7, v0
+; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: shuffle_v16bf16_rebroadcast:
; GFX10: ; %bb.0: ; %entry
-; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT: global_load_dword v0, v[0:1], off
-; GFX10-NEXT: s_waitcnt vmcnt(0)
-; GFX10-NEXT: v_perm_b32 v0, v0, v0, 0x7060302
-; GFX10-NEXT: v_mov_b32_e32 v1, v0
-; GFX10-NEXT: v_mov_b32_e32 v2, v0
-; GFX10-NEXT: v_mov_b32_e32 v3, v0
-; GFX10-NEXT: v_mov_b32_e32 v4, v0
-; GFX10-NEXT: v_mov_b32_e32 v5, v0
-; GFX10-NEXT: v_mov_b32_e32 v6, v0
-; GFX10-NEXT: v_mov_b32_e32 v7, v0
-; GFX10-NEXT: s_setpc_b64 s[30:31]
+; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT: global_load_dword v0, v[0:1], off
+; GFX10-NEXT: s_waitcnt vmcnt(0)
+; GFX10-NEXT: v_perm_b32 v0, v0, v0, 0x7060302
+; GFX10-NEXT: v_mov_b32_e32 v1, v0
+; GFX10-NEXT: v_mov_b32_e32 v2, v0
+; GFX10-NEXT: v_mov_b32_e32 v3, v0
+; GFX10-NEXT: v_mov_b32_e32 v4, v0
+; GFX10-NEXT: v_mov_b32_e32 v5, v0
+; GFX10-NEXT: v_mov_b32_e32 v6, v0
+; GFX10-NEXT: v_mov_b32_e32 v7, v0
+; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: shuffle_v16bf16_rebroadcast:
; GFX11: ; %bb.0: ; %entry
-; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: global_load_b32 v0, v[0:1], off
-; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: v_perm_b32 v0, v0, v0, 0x7060302
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_mov_b32_e32 v1, v0
-; GFX11-NEXT: v_mov_b32_e32 v2, v0
-; GFX11-NEXT: v_mov_b32_e32 v3, v0
-; GFX11-NEXT: v_mov_b32_e32 v4, v0
-; GFX11-NEXT: v_mov_b32_e32 v5, v0
-; GFX11-NEXT: v_mov_b32_e32 v6, v0
-; GFX11-NEXT: v_mov_b32_e32 v7, v0
-; GFX11-NEXT: s_setpc_b64 s[30:31]
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: global_load_b32 v0, v[0:1], off
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: v_perm_b32 v0, v0, v0, 0x7060302
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT: v_mov_b32_e32 v1, v0
+; GFX11-NEXT: v_mov_b32_e32 v2, v0
+; GFX11-NEXT: v_mov_b32_e32 v3, v0
+; GFX11-NEXT: v_mov_b32_e32 v4, v0
+; GFX11-NEXT: v_mov_b32_e32 v5, v0
+; GFX11-NEXT: v_mov_b32_e32 v6, v0
+; GFX11-NEXT: v_mov_b32_e32 v7, v0
+; GFX11-NEXT: s_setpc_b64 s[30:31]
entry:
%val0 = load <16 x bfloat>, ptr addrspace(1) %arg0
%val1 = shufflevector <16 x bfloat> %val0, <16 x bfloat> poison, <16 x i32> <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
@@ -1106,74 +1107,74 @@ entry:
define <32 x bfloat> @shuffle_v32bf16_rebroadcast(ptr addrspace(1) %arg0) {
; GFX9-LABEL: shuffle_v32bf16_rebroadcast:
; GFX9: ; %bb.0: ; %entry
-; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: global_load_dword v0, v[0:1], off
-; GFX9-NEXT: s_mov_b32 s4, 0x7060302
-; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: v_perm_b32 v0, v0, v0, s4
-; GFX9-NEXT: v_mov_b32_e32 v1, v0
-; GFX9-NEXT: v_mov_b32_e32 v2, v0
-; GFX9-NEXT: v_mov_b32_e32 v3, v0
-; GFX9-NEXT: v_mov_b32_e32 v4, v0
-; GFX9-NEXT: v_mov_b32_e32 v5, v0
-; GFX9-NEXT: v_mov_b32_e32 v6, v0
-; GFX9-NEXT: v_mov_b32_e32 v7, v0
-; GFX9-NEXT: v_mov_b32_e32 v8, v0
-; GFX9-NEXT: v_mov_b32_e32 v9, v0
-; GFX9-NEXT: v_mov_b32_e32 v10, v0
-; GFX9-NEXT: v_mov_b32_e32 v11, v0
-; GFX9-NEXT: v_mov_b32_e32 v12, v0
-; GFX9-NEXT: v_mov_b32_e32 v13, v0
-; GFX9-NEXT: v_mov_b32_e32 v14, v0
-; GFX9-NEXT: v_mov_b32_e32 v15, v0
-; GFX9-NEXT: s_setpc_b64 s[30:31]
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: global_load_dword v0, v[0:1], off
+; GFX9-NEXT: s_mov_b32 s4, 0x7060302
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_perm_b32 v0, v0, v0, s4
+; GFX9-NEXT: v_mov_b32_e32 v1, v0
+; GFX9-NEXT: v_mov_b32_e32 v2, v0
+; GFX9-NEXT: v_mov_b32_e32 v3, v0
+; GFX9-NEXT: v_mov_b32_e32 v4, v0
+; GFX9-NEXT: v_mov_b32_e32 v5, v0
+; GFX9-NEXT: v_mov_b32_e32 v6, v0
+; GFX9-NEXT: v_mov_b32_e32 v7, v0
+; GFX9-NEXT: v_mov_b32_e32 v8, v0
+; GFX9-NEXT: v_mov_b32_e32 v9, v0
+; GFX9-NEXT: v_mov_b32_e32 v10, v0
+; GFX9-NEXT: v_mov_b32_e32 v11, v0
+; GFX9-NEXT: v_mov_b32_e32 v12, v0
+; GFX9-NEXT: v_mov_b32_e32 v13, v0
+; GFX9-NEXT: v_mov_b32_e32 v14, v0
+; GFX9-NEXT: v_mov_b32_e32 v15, v0
+; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: shuffle_v32bf16_rebroadcast:
; GFX10: ; %bb.0: ; %entry
-; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT: global_load_dword v0, v[0:1], off
-; GFX10-NEXT: s_waitcnt vmcnt(0)
-; GFX10-NEXT: v_perm_b32 v0, v0, v0, 0x7060302
-; GFX10-NEXT: v_mov_b32_e32 v1, v0
-; GFX10-NEXT: v_mov_b32_e32 v2, v0
-; GFX10-NEXT: v_mov_b32_e32 v3, v0
-; GFX10-NEXT: v_mov_b32_e32 v4, v0
v5, v0 -; GFX10-NEXT: v_mov_b32_e32 v6, v0 -; GFX10-NEXT: v_mov_b32_e32 v7, v0 -; GFX10-NEXT: v_mov_b32_e32 v8, v0 -; GFX10-NEXT: v_mov_b32_e32 v9, v0 -; GFX10-NEXT: v_mov_b32_e32 v10, v0 -; GFX10-NEXT: v_mov_b32_e32 v11, v0 -; GFX10-NEXT: v_mov_b32_e32 v12, v0 -; GFX10-NEXT: v_mov_b32_e32 v13, v0 -; GFX10-NEXT: v_mov_b32_e32 v14, v0 -; GFX10-NEXT: v_mov_b32_e32 v15, v0 -; GFX10-NEXT: s_setpc_b64 s[30:31] +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: global_load_dword v0, v[0:1], off +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: v_perm_b32 v0, v0, v0, 0x7060302 +; GFX10-NEXT: v_mov_b32_e32 v1, v0 +; GFX10-NEXT: v_mov_b32_e32 v2, v0 +; GFX10-NEXT: v_mov_b32_e32 v3, v0 +; GFX10-NEXT: v_mov_b32_e32 v4, v0 +; GFX10-NEXT: v_mov_b32_e32 v5, v0 +; GFX10-NEXT: v_mov_b32_e32 v6, v0 +; GFX10-NEXT: v_mov_b32_e32 v7, v0 +; GFX10-NEXT: v_mov_b32_e32 v8, v0 +; GFX10-NEXT: v_mov_b32_e32 v9, v0 +; GFX10-NEXT: v_mov_b32_e32 v10, v0 +; GFX10-NEXT: v_mov_b32_e32 v11, v0 +; GFX10-NEXT: v_mov_b32_e32 v12, v0 +; GFX10-NEXT: v_mov_b32_e32 v13, v0 +; GFX10-NEXT: v_mov_b32_e32 v14, v0 +; GFX10-NEXT: v_mov_b32_e32 v15, v0 +; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: shuffle_v32bf16_rebroadcast: ; GFX11: ; %bb.0: ; %entry -; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: global_load_b32 v0, v[0:1], off -; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_perm_b32 v0, v0, v0, 0x7060302 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_mov_b32_e32 v1, v0 -; GFX11-NEXT: v_mov_b32_e32 v2, v0 -; GFX11-NEXT: v_mov_b32_e32 v3, v0 -; GFX11-NEXT: v_mov_b32_e32 v4, v0 -; GFX11-NEXT: v_mov_b32_e32 v5, v0 -; GFX11-NEXT: v_mov_b32_e32 v6, v0 -; GFX11-NEXT: v_mov_b32_e32 v7, v0 -; GFX11-NEXT: v_mov_b32_e32 v8, v0 -; GFX11-NEXT: v_mov_b32_e32 v9, v0 -; GFX11-NEXT: v_mov_b32_e32 v10, v0 -; GFX11-NEXT: v_mov_b32_e32 v11, v0 -; GFX11-NEXT: v_mov_b32_e32 v12, v0 -; GFX11-NEXT: v_mov_b32_e32 v13, v0 -; GFX11-NEXT: v_mov_b32_e32 v14, v0 -; GFX11-NEXT: v_mov_b32_e32 v15, v0 -; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: global_load_b32 v0, v[0:1], off +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_perm_b32 v0, v0, v0, 0x7060302 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_mov_b32_e32 v1, v0 +; GFX11-NEXT: v_mov_b32_e32 v2, v0 +; GFX11-NEXT: v_mov_b32_e32 v3, v0 +; GFX11-NEXT: v_mov_b32_e32 v4, v0 +; GFX11-NEXT: v_mov_b32_e32 v5, v0 +; GFX11-NEXT: v_mov_b32_e32 v6, v0 +; GFX11-NEXT: v_mov_b32_e32 v7, v0 +; GFX11-NEXT: v_mov_b32_e32 v8, v0 +; GFX11-NEXT: v_mov_b32_e32 v9, v0 +; GFX11-NEXT: v_mov_b32_e32 v10, v0 +; GFX11-NEXT: v_mov_b32_e32 v11, v0 +; GFX11-NEXT: v_mov_b32_e32 v12, v0 +; GFX11-NEXT: v_mov_b32_e32 v13, v0 +; GFX11-NEXT: v_mov_b32_e32 v14, v0 +; GFX11-NEXT: v_mov_b32_e32 v15, v0 +; GFX11-NEXT: s_setpc_b64 s[30:31] entry: %val0 = load <32 x bfloat>, ptr addrspace(1) %arg0 %val1 = shufflevector <32 x bfloat> %val0, <32 x bfloat> poison, <32 x i32> @@ -1183,28 +1184,28 @@ entry: define <2 x half> @shuffle_v2f16_rebroadcast(ptr addrspace(1) %arg0) { ; GFX9-LABEL: shuffle_v2f16_rebroadcast: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: global_load_dword v0, v[0:1], off -; GFX9-NEXT: s_mov_b32 s4, 0x7060302 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_perm_b32 v0, v0, v0, s4 -; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: global_load_dword v0, v[0:1], off +; GFX9-NEXT: s_mov_b32 
s4, 0x7060302 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_perm_b32 v0, v0, v0, s4 +; GFX9-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: shuffle_v2f16_rebroadcast: ; GFX10: ; %bb.0: ; %entry -; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: global_load_dword v0, v[0:1], off -; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_perm_b32 v0, v0, v0, 0x7060302 -; GFX10-NEXT: s_setpc_b64 s[30:31] +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: global_load_dword v0, v[0:1], off +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: v_perm_b32 v0, v0, v0, 0x7060302 +; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: shuffle_v2f16_rebroadcast: ; GFX11: ; %bb.0: ; %entry -; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: global_load_b32 v0, v[0:1], off -; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_perm_b32 v0, v0, v0, 0x7060302 -; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: global_load_b32 v0, v[0:1], off +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_perm_b32 v0, v0, v0, 0x7060302 +; GFX11-NEXT: s_setpc_b64 s[30:31] entry: %val0 = load <2 x half>, ptr addrspace(1) %arg0 %val1 = shufflevector <2 x half> %val0, <2 x half> poison, <2 x i32> @@ -1214,31 +1215,31 @@ entry: define <3 x half> @shuffle_v3f16_rebroadcast(ptr addrspace(1) %arg0) { ; GFX9-LABEL: shuffle_v3f16_rebroadcast: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: global_load_dword v1, v[0:1], off -; GFX9-NEXT: s_mov_b32 s4, 0x7060302 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_perm_b32 v0, v1, v1, s4 -; GFX9-NEXT: v_alignbit_b32 v1, s4, v1, 16 -; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: global_load_dword v1, v[0:1], off +; GFX9-NEXT: s_mov_b32 s4, 0x7060302 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_perm_b32 v0, v1, v1, s4 +; GFX9-NEXT: v_alignbit_b32 v1, s4, v1, 16 +; GFX9-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: shuffle_v3f16_rebroadcast: ; GFX10: ; %bb.0: ; %entry -; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: global_load_dword v1, v[0:1], off -; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_perm_b32 v0, v1, v1, 0x7060302 -; GFX10-NEXT: v_alignbit_b32 v1, s4, v1, 16 -; GFX10-NEXT: s_setpc_b64 s[30:31] +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: global_load_dword v1, v[0:1], off +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: v_perm_b32 v0, v1, v1, 0x7060302 +; GFX10-NEXT: v_alignbit_b32 v1, s4, v1, 16 +; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: shuffle_v3f16_rebroadcast: ; GFX11: ; %bb.0: ; %entry -; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: global_load_b32 v1, v[0:1], off -; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_perm_b32 v0, v1, v1, 0x7060302 -; GFX11-NEXT: v_alignbit_b32 v1, s0, v1, 16 -; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: global_load_b32 v1, v[0:1], off +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_perm_b32 v0, v1, v1, 0x7060302 +; GFX11-NEXT: v_alignbit_b32 v1, s0, v1, 16 +; GFX11-NEXT: s_setpc_b64 s[30:31] entry: %val0 = load <3 x half>, ptr addrspace(1) %arg0 %val1 = shufflevector <3 x half> %val0, <3 x half> poison, <3 x i32> @@ -1248,32 +1249,32 @@ entry: define <4 x half> @shuffle_v4f16_rebroadcast(ptr addrspace(1) %arg0) { ; GFX9-LABEL: shuffle_v4f16_rebroadcast: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_waitcnt 
vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: global_load_dword v0, v[0:1], off -; GFX9-NEXT: s_mov_b32 s4, 0x7060302 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_perm_b32 v0, v0, v0, s4 -; GFX9-NEXT: v_mov_b32_e32 v1, v0 -; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: global_load_dword v0, v[0:1], off +; GFX9-NEXT: s_mov_b32 s4, 0x7060302 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_perm_b32 v0, v0, v0, s4 +; GFX9-NEXT: v_mov_b32_e32 v1, v0 +; GFX9-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: shuffle_v4f16_rebroadcast: ; GFX10: ; %bb.0: ; %entry -; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: global_load_dword v0, v[0:1], off -; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_perm_b32 v0, v0, v0, 0x7060302 -; GFX10-NEXT: v_mov_b32_e32 v1, v0 -; GFX10-NEXT: s_setpc_b64 s[30:31] +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: global_load_dword v0, v[0:1], off +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: v_perm_b32 v0, v0, v0, 0x7060302 +; GFX10-NEXT: v_mov_b32_e32 v1, v0 +; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: shuffle_v4f16_rebroadcast: ; GFX11: ; %bb.0: ; %entry -; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: global_load_b32 v0, v[0:1], off -; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_perm_b32 v0, v0, v0, 0x7060302 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_mov_b32_e32 v1, v0 -; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: global_load_b32 v0, v[0:1], off +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_perm_b32 v0, v0, v0, 0x7060302 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_mov_b32_e32 v1, v0 +; GFX11-NEXT: s_setpc_b64 s[30:31] entry: %val0 = load <4 x half>, ptr addrspace(1) %arg0 %val1 = shufflevector <4 x half> %val0, <4 x half> poison, <4 x i32> @@ -1283,35 +1284,35 @@ entry: define <6 x half> @shuffle_v6f16_rebroadcast(ptr addrspace(1) %arg0) { ; GFX9-LABEL: shuffle_v6f16_rebroadcast: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: global_load_dword v0, v[0:1], off -; GFX9-NEXT: s_mov_b32 s4, 0x7060302 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_perm_b32 v0, v0, v0, s4 -; GFX9-NEXT: v_mov_b32_e32 v1, v0 -; GFX9-NEXT: v_mov_b32_e32 v2, v0 -; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: global_load_dword v0, v[0:1], off +; GFX9-NEXT: s_mov_b32 s4, 0x7060302 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_perm_b32 v0, v0, v0, s4 +; GFX9-NEXT: v_mov_b32_e32 v1, v0 +; GFX9-NEXT: v_mov_b32_e32 v2, v0 +; GFX9-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: shuffle_v6f16_rebroadcast: ; GFX10: ; %bb.0: ; %entry -; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: global_load_dword v0, v[0:1], off -; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_perm_b32 v0, v0, v0, 0x7060302 -; GFX10-NEXT: v_mov_b32_e32 v1, v0 -; GFX10-NEXT: v_mov_b32_e32 v2, v0 -; GFX10-NEXT: s_setpc_b64 s[30:31] +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: global_load_dword v0, v[0:1], off +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: v_perm_b32 v0, v0, v0, 0x7060302 +; GFX10-NEXT: v_mov_b32_e32 v1, v0 +; GFX10-NEXT: v_mov_b32_e32 v2, v0 +; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: shuffle_v6f16_rebroadcast: ; GFX11: ; %bb.0: ; %entry -; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: global_load_b32 v0, 
v[0:1], off -; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_perm_b32 v0, v0, v0, 0x7060302 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_mov_b32_e32 v1, v0 -; GFX11-NEXT: v_mov_b32_e32 v2, v0 -; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: global_load_b32 v0, v[0:1], off +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_perm_b32 v0, v0, v0, 0x7060302 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_mov_b32_e32 v1, v0 +; GFX11-NEXT: v_mov_b32_e32 v2, v0 +; GFX11-NEXT: s_setpc_b64 s[30:31] entry: %val0 = load <6 x half>, ptr addrspace(1) %arg0 %val1 = shufflevector <6 x half> %val0, <6 x half> poison, <6 x i32> @@ -1321,38 +1322,38 @@ entry: define <8 x half> @shuffle_v8f16_rebroadcast(ptr addrspace(1) %arg0) { ; GFX9-LABEL: shuffle_v8f16_rebroadcast: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: global_load_dword v0, v[0:1], off -; GFX9-NEXT: s_mov_b32 s4, 0x7060302 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_perm_b32 v0, v0, v0, s4 -; GFX9-NEXT: v_mov_b32_e32 v1, v0 -; GFX9-NEXT: v_mov_b32_e32 v2, v0 -; GFX9-NEXT: v_mov_b32_e32 v3, v0 -; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: global_load_dword v0, v[0:1], off +; GFX9-NEXT: s_mov_b32 s4, 0x7060302 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_perm_b32 v0, v0, v0, s4 +; GFX9-NEXT: v_mov_b32_e32 v1, v0 +; GFX9-NEXT: v_mov_b32_e32 v2, v0 +; GFX9-NEXT: v_mov_b32_e32 v3, v0 +; GFX9-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: shuffle_v8f16_rebroadcast: ; GFX10: ; %bb.0: ; %entry -; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: global_load_dword v0, v[0:1], off -; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_perm_b32 v0, v0, v0, 0x7060302 -; GFX10-NEXT: v_mov_b32_e32 v1, v0 -; GFX10-NEXT: v_mov_b32_e32 v2, v0 -; GFX10-NEXT: v_mov_b32_e32 v3, v0 -; GFX10-NEXT: s_setpc_b64 s[30:31] +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: global_load_dword v0, v[0:1], off +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: v_perm_b32 v0, v0, v0, 0x7060302 +; GFX10-NEXT: v_mov_b32_e32 v1, v0 +; GFX10-NEXT: v_mov_b32_e32 v2, v0 +; GFX10-NEXT: v_mov_b32_e32 v3, v0 +; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: shuffle_v8f16_rebroadcast: ; GFX11: ; %bb.0: ; %entry -; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: global_load_b32 v0, v[0:1], off -; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_perm_b32 v0, v0, v0, 0x7060302 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_mov_b32_e32 v1, v0 -; GFX11-NEXT: v_mov_b32_e32 v2, v0 -; GFX11-NEXT: v_mov_b32_e32 v3, v0 -; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: global_load_b32 v0, v[0:1], off +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_perm_b32 v0, v0, v0, 0x7060302 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_mov_b32_e32 v1, v0 +; GFX11-NEXT: v_mov_b32_e32 v2, v0 +; GFX11-NEXT: v_mov_b32_e32 v3, v0 +; GFX11-NEXT: s_setpc_b64 s[30:31] entry: %val0 = load <8 x half>, ptr addrspace(1) %arg0 %val1 = shufflevector <8 x half> %val0, <8 x half> poison, <8 x i32> @@ -1362,50 +1363,50 @@ entry: define <16 x half> @shuffle_v16f16_rebroadcast(ptr addrspace(1) %arg0) { ; GFX9-LABEL: shuffle_v16f16_rebroadcast: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: global_load_dword v0, v[0:1], off -; GFX9-NEXT: 
s_mov_b32 s4, 0x7060302 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_perm_b32 v0, v0, v0, s4 -; GFX9-NEXT: v_mov_b32_e32 v1, v0 -; GFX9-NEXT: v_mov_b32_e32 v2, v0 -; GFX9-NEXT: v_mov_b32_e32 v3, v0 -; GFX9-NEXT: v_mov_b32_e32 v4, v0 -; GFX9-NEXT: v_mov_b32_e32 v5, v0 -; GFX9-NEXT: v_mov_b32_e32 v6, v0 -; GFX9-NEXT: v_mov_b32_e32 v7, v0 -; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: global_load_dword v0, v[0:1], off +; GFX9-NEXT: s_mov_b32 s4, 0x7060302 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_perm_b32 v0, v0, v0, s4 +; GFX9-NEXT: v_mov_b32_e32 v1, v0 +; GFX9-NEXT: v_mov_b32_e32 v2, v0 +; GFX9-NEXT: v_mov_b32_e32 v3, v0 +; GFX9-NEXT: v_mov_b32_e32 v4, v0 +; GFX9-NEXT: v_mov_b32_e32 v5, v0 +; GFX9-NEXT: v_mov_b32_e32 v6, v0 +; GFX9-NEXT: v_mov_b32_e32 v7, v0 +; GFX9-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: shuffle_v16f16_rebroadcast: ; GFX10: ; %bb.0: ; %entry -; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: global_load_dword v0, v[0:1], off -; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_perm_b32 v0, v0, v0, 0x7060302 -; GFX10-NEXT: v_mov_b32_e32 v1, v0 -; GFX10-NEXT: v_mov_b32_e32 v2, v0 -; GFX10-NEXT: v_mov_b32_e32 v3, v0 -; GFX10-NEXT: v_mov_b32_e32 v4, v0 -; GFX10-NEXT: v_mov_b32_e32 v5, v0 -; GFX10-NEXT: v_mov_b32_e32 v6, v0 -; GFX10-NEXT: v_mov_b32_e32 v7, v0 -; GFX10-NEXT: s_setpc_b64 s[30:31] +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: global_load_dword v0, v[0:1], off +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: v_perm_b32 v0, v0, v0, 0x7060302 +; GFX10-NEXT: v_mov_b32_e32 v1, v0 +; GFX10-NEXT: v_mov_b32_e32 v2, v0 +; GFX10-NEXT: v_mov_b32_e32 v3, v0 +; GFX10-NEXT: v_mov_b32_e32 v4, v0 +; GFX10-NEXT: v_mov_b32_e32 v5, v0 +; GFX10-NEXT: v_mov_b32_e32 v6, v0 +; GFX10-NEXT: v_mov_b32_e32 v7, v0 +; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: shuffle_v16f16_rebroadcast: ; GFX11: ; %bb.0: ; %entry -; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: global_load_b32 v0, v[0:1], off -; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_perm_b32 v0, v0, v0, 0x7060302 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_mov_b32_e32 v1, v0 -; GFX11-NEXT: v_mov_b32_e32 v2, v0 -; GFX11-NEXT: v_mov_b32_e32 v3, v0 -; GFX11-NEXT: v_mov_b32_e32 v4, v0 -; GFX11-NEXT: v_mov_b32_e32 v5, v0 -; GFX11-NEXT: v_mov_b32_e32 v6, v0 -; GFX11-NEXT: v_mov_b32_e32 v7, v0 -; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: global_load_b32 v0, v[0:1], off +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_perm_b32 v0, v0, v0, 0x7060302 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_mov_b32_e32 v1, v0 +; GFX11-NEXT: v_mov_b32_e32 v2, v0 +; GFX11-NEXT: v_mov_b32_e32 v3, v0 +; GFX11-NEXT: v_mov_b32_e32 v4, v0 +; GFX11-NEXT: v_mov_b32_e32 v5, v0 +; GFX11-NEXT: v_mov_b32_e32 v6, v0 +; GFX11-NEXT: v_mov_b32_e32 v7, v0 +; GFX11-NEXT: s_setpc_b64 s[30:31] entry: %val0 = load <16 x half>, ptr addrspace(1) %arg0 %val1 = shufflevector <16 x half> %val0, <16 x half> poison, <16 x i32> @@ -1415,74 +1416,74 @@ entry: define <32 x half> @shuffle_v32f16_rebroadcast(ptr addrspace(1) %arg0) { ; GFX9-LABEL: shuffle_v32f16_rebroadcast: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: global_load_dword v0, v[0:1], off -; GFX9-NEXT: s_mov_b32 s4, 0x7060302 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_perm_b32 v0, v0, v0, s4 -; GFX9-NEXT: v_mov_b32_e32 v1, v0 -; 
GFX9-NEXT: v_mov_b32_e32 v2, v0 -; GFX9-NEXT: v_mov_b32_e32 v3, v0 -; GFX9-NEXT: v_mov_b32_e32 v4, v0 -; GFX9-NEXT: v_mov_b32_e32 v5, v0 -; GFX9-NEXT: v_mov_b32_e32 v6, v0 -; GFX9-NEXT: v_mov_b32_e32 v7, v0 -; GFX9-NEXT: v_mov_b32_e32 v8, v0 -; GFX9-NEXT: v_mov_b32_e32 v9, v0 -; GFX9-NEXT: v_mov_b32_e32 v10, v0 -; GFX9-NEXT: v_mov_b32_e32 v11, v0 -; GFX9-NEXT: v_mov_b32_e32 v12, v0 -; GFX9-NEXT: v_mov_b32_e32 v13, v0 -; GFX9-NEXT: v_mov_b32_e32 v14, v0 -; GFX9-NEXT: v_mov_b32_e32 v15, v0 -; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: global_load_dword v0, v[0:1], off +; GFX9-NEXT: s_mov_b32 s4, 0x7060302 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_perm_b32 v0, v0, v0, s4 +; GFX9-NEXT: v_mov_b32_e32 v1, v0 +; GFX9-NEXT: v_mov_b32_e32 v2, v0 +; GFX9-NEXT: v_mov_b32_e32 v3, v0 +; GFX9-NEXT: v_mov_b32_e32 v4, v0 +; GFX9-NEXT: v_mov_b32_e32 v5, v0 +; GFX9-NEXT: v_mov_b32_e32 v6, v0 +; GFX9-NEXT: v_mov_b32_e32 v7, v0 +; GFX9-NEXT: v_mov_b32_e32 v8, v0 +; GFX9-NEXT: v_mov_b32_e32 v9, v0 +; GFX9-NEXT: v_mov_b32_e32 v10, v0 +; GFX9-NEXT: v_mov_b32_e32 v11, v0 +; GFX9-NEXT: v_mov_b32_e32 v12, v0 +; GFX9-NEXT: v_mov_b32_e32 v13, v0 +; GFX9-NEXT: v_mov_b32_e32 v14, v0 +; GFX9-NEXT: v_mov_b32_e32 v15, v0 +; GFX9-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: shuffle_v32f16_rebroadcast: ; GFX10: ; %bb.0: ; %entry -; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: global_load_dword v0, v[0:1], off -; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_perm_b32 v0, v0, v0, 0x7060302 -; GFX10-NEXT: v_mov_b32_e32 v1, v0 -; GFX10-NEXT: v_mov_b32_e32 v2, v0 -; GFX10-NEXT: v_mov_b32_e32 v3, v0 -; GFX10-NEXT: v_mov_b32_e32 v4, v0 -; GFX10-NEXT: v_mov_b32_e32 v5, v0 -; GFX10-NEXT: v_mov_b32_e32 v6, v0 -; GFX10-NEXT: v_mov_b32_e32 v7, v0 -; GFX10-NEXT: v_mov_b32_e32 v8, v0 -; GFX10-NEXT: v_mov_b32_e32 v9, v0 -; GFX10-NEXT: v_mov_b32_e32 v10, v0 -; GFX10-NEXT: v_mov_b32_e32 v11, v0 -; GFX10-NEXT: v_mov_b32_e32 v12, v0 -; GFX10-NEXT: v_mov_b32_e32 v13, v0 -; GFX10-NEXT: v_mov_b32_e32 v14, v0 -; GFX10-NEXT: v_mov_b32_e32 v15, v0 -; GFX10-NEXT: s_setpc_b64 s[30:31] +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: global_load_dword v0, v[0:1], off +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: v_perm_b32 v0, v0, v0, 0x7060302 +; GFX10-NEXT: v_mov_b32_e32 v1, v0 +; GFX10-NEXT: v_mov_b32_e32 v2, v0 +; GFX10-NEXT: v_mov_b32_e32 v3, v0 +; GFX10-NEXT: v_mov_b32_e32 v4, v0 +; GFX10-NEXT: v_mov_b32_e32 v5, v0 +; GFX10-NEXT: v_mov_b32_e32 v6, v0 +; GFX10-NEXT: v_mov_b32_e32 v7, v0 +; GFX10-NEXT: v_mov_b32_e32 v8, v0 +; GFX10-NEXT: v_mov_b32_e32 v9, v0 +; GFX10-NEXT: v_mov_b32_e32 v10, v0 +; GFX10-NEXT: v_mov_b32_e32 v11, v0 +; GFX10-NEXT: v_mov_b32_e32 v12, v0 +; GFX10-NEXT: v_mov_b32_e32 v13, v0 +; GFX10-NEXT: v_mov_b32_e32 v14, v0 +; GFX10-NEXT: v_mov_b32_e32 v15, v0 +; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: shuffle_v32f16_rebroadcast: ; GFX11: ; %bb.0: ; %entry -; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: global_load_b32 v0, v[0:1], off -; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_perm_b32 v0, v0, v0, 0x7060302 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_mov_b32_e32 v1, v0 -; GFX11-NEXT: v_mov_b32_e32 v2, v0 -; GFX11-NEXT: v_mov_b32_e32 v3, v0 -; GFX11-NEXT: v_mov_b32_e32 v4, v0 -; GFX11-NEXT: v_mov_b32_e32 v5, v0 -; GFX11-NEXT: v_mov_b32_e32 v6, v0 -; GFX11-NEXT: v_mov_b32_e32 v7, v0 -; GFX11-NEXT: v_mov_b32_e32 v8, v0 -; GFX11-NEXT: v_mov_b32_e32 v9, v0 -; 
GFX11-NEXT: v_mov_b32_e32 v10, v0 -; GFX11-NEXT: v_mov_b32_e32 v11, v0 -; GFX11-NEXT: v_mov_b32_e32 v12, v0 -; GFX11-NEXT: v_mov_b32_e32 v13, v0 -; GFX11-NEXT: v_mov_b32_e32 v14, v0 -; GFX11-NEXT: v_mov_b32_e32 v15, v0 -; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: global_load_b32 v0, v[0:1], off +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_perm_b32 v0, v0, v0, 0x7060302 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_mov_b32_e32 v1, v0 +; GFX11-NEXT: v_mov_b32_e32 v2, v0 +; GFX11-NEXT: v_mov_b32_e32 v3, v0 +; GFX11-NEXT: v_mov_b32_e32 v4, v0 +; GFX11-NEXT: v_mov_b32_e32 v5, v0 +; GFX11-NEXT: v_mov_b32_e32 v6, v0 +; GFX11-NEXT: v_mov_b32_e32 v7, v0 +; GFX11-NEXT: v_mov_b32_e32 v8, v0 +; GFX11-NEXT: v_mov_b32_e32 v9, v0 +; GFX11-NEXT: v_mov_b32_e32 v10, v0 +; GFX11-NEXT: v_mov_b32_e32 v11, v0 +; GFX11-NEXT: v_mov_b32_e32 v12, v0 +; GFX11-NEXT: v_mov_b32_e32 v13, v0 +; GFX11-NEXT: v_mov_b32_e32 v14, v0 +; GFX11-NEXT: v_mov_b32_e32 v15, v0 +; GFX11-NEXT: s_setpc_b64 s[30:31] entry: %val0 = load <32 x half>, ptr addrspace(1) %arg0 %val1 = shufflevector <32 x half> %val0, <32 x half> poison, <32 x i32> @@ -1492,27 +1493,27 @@ entry: define <2 x float> @shuffle_v2f32_rebroadcast(ptr addrspace(1) %arg0) { ; GFX9-LABEL: shuffle_v2f32_rebroadcast: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: global_load_dwordx2 v[0:1], v[0:1], off -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, v1 -; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: global_load_dword v0, v[0:1], off offset:4 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v1, v0 +; GFX9-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: shuffle_v2f32_rebroadcast: ; GFX10: ; %bb.0: ; %entry -; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: global_load_dwordx2 v[0:1], v[0:1], off -; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v0, v1 -; GFX10-NEXT: s_setpc_b64 s[30:31] +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: global_load_dword v0, v[0:1], off offset:4 +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: v_mov_b32_e32 v1, v0 +; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: shuffle_v2f32_rebroadcast: ; GFX11: ; %bb.0: ; %entry -; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: global_load_b64 v[0:1], v[0:1], off -; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_mov_b32_e32 v0, v1 -; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: global_load_b32 v0, v[0:1], off offset:4 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_mov_b32_e32 v1, v0 +; GFX11-NEXT: s_setpc_b64 s[30:31] entry: %val0 = load <2 x float>, ptr addrspace(1) %arg0 %val1 = shufflevector <2 x float> %val0, <2 x float> poison, <2 x i32> @@ -1522,30 +1523,30 @@ entry: define <3 x float> @shuffle_v3f32_rebroadcast(ptr addrspace(1) %arg0) { ; GFX9-LABEL: shuffle_v3f32_rebroadcast: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: global_load_dwordx3 v[0:2], v[0:1], off -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, v1 -; GFX9-NEXT: v_mov_b32_e32 v2, v1 -; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: global_load_dword v0, v[0:1], off offset:4 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v1, v0 +; GFX9-NEXT: v_mov_b32_e32 v2, v0 
+; GFX9-NEXT: s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: shuffle_v3f32_rebroadcast:
 ; GFX10: ; %bb.0: ; %entry
-; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT: global_load_dwordx3 v[0:2], v[0:1], off
-; GFX10-NEXT: s_waitcnt vmcnt(0)
-; GFX10-NEXT: v_mov_b32_e32 v0, v1
-; GFX10-NEXT: v_mov_b32_e32 v2, v1
-; GFX10-NEXT: s_setpc_b64 s[30:31]
+; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT: global_load_dword v0, v[0:1], off offset:4
+; GFX10-NEXT: s_waitcnt vmcnt(0)
+; GFX10-NEXT: v_mov_b32_e32 v1, v0
+; GFX10-NEXT: v_mov_b32_e32 v2, v0
+; GFX10-NEXT: s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: shuffle_v3f32_rebroadcast:
 ; GFX11: ; %bb.0: ; %entry
-; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: global_load_b96 v[0:2], v[0:1], off
-; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: v_mov_b32_e32 v0, v1
-; GFX11-NEXT: v_mov_b32_e32 v2, v1
-; GFX11-NEXT: s_setpc_b64 s[30:31]
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: global_load_b32 v0, v[0:1], off offset:4
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: v_mov_b32_e32 v1, v0
+; GFX11-NEXT: v_mov_b32_e32 v2, v0
+; GFX11-NEXT: s_setpc_b64 s[30:31]
 entry:
   %val0 = load <3 x float>, ptr addrspace(1) %arg0
   %val1 = shufflevector <3 x float> %val0, <3 x float> poison, <3 x i32>
@@ -1555,33 +1556,33 @@ entry:
 define <4 x float> @shuffle_v4f32_rebroadcast(ptr addrspace(1) %arg0) {
 ; GFX9-LABEL: shuffle_v4f32_rebroadcast:
 ; GFX9: ; %bb.0: ; %entry
-; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: global_load_dwordx4 v[0:3], v[0:1], off
-; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: v_mov_b32_e32 v0, v1
-; GFX9-NEXT: v_mov_b32_e32 v2, v1
-; GFX9-NEXT: v_mov_b32_e32 v3, v1
-; GFX9-NEXT: s_setpc_b64 s[30:31]
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: global_load_dword v0, v[0:1], off offset:4
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_mov_b32_e32 v1, v0
+; GFX9-NEXT: v_mov_b32_e32 v2, v0
+; GFX9-NEXT: v_mov_b32_e32 v3, v0
+; GFX9-NEXT: s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: shuffle_v4f32_rebroadcast:
 ; GFX10: ; %bb.0: ; %entry
-; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT: global_load_dwordx4 v[0:3], v[0:1], off
-; GFX10-NEXT: s_waitcnt vmcnt(0)
-; GFX10-NEXT: v_mov_b32_e32 v0, v1
-; GFX10-NEXT: v_mov_b32_e32 v2, v1
-; GFX10-NEXT: v_mov_b32_e32 v3, v1
-; GFX10-NEXT: s_setpc_b64 s[30:31]
+; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT: global_load_dword v0, v[0:1], off offset:4
+; GFX10-NEXT: s_waitcnt vmcnt(0)
+; GFX10-NEXT: v_mov_b32_e32 v1, v0
+; GFX10-NEXT: v_mov_b32_e32 v2, v0
+; GFX10-NEXT: v_mov_b32_e32 v3, v0
+; GFX10-NEXT: s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: shuffle_v4f32_rebroadcast:
 ; GFX11: ; %bb.0: ; %entry
-; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: global_load_b128 v[0:3], v[0:1], off
-; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: v_mov_b32_e32 v0, v1
-; GFX11-NEXT: v_mov_b32_e32 v2, v1
-; GFX11-NEXT: v_mov_b32_e32 v3, v1
-; GFX11-NEXT: s_setpc_b64 s[30:31]
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: global_load_b32 v0, v[0:1], off offset:4
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: v_mov_b32_e32 v1, v0
+; GFX11-NEXT: v_mov_b32_e32 v2, v0
+; GFX11-NEXT: v_mov_b32_e32 v3, v0
+; GFX11-NEXT: s_setpc_b64 s[30:31]
 entry:
   %val0 = load <4 x float>, ptr addrspace(1) %arg0
   %val1 = shufflevector <4 x float> %val0, <4 x float> poison, <4 x i32>
@@ -1591,39 +1592,39 @@ entry:
 define <6 x float> @shuffle_v6f32_rebroadcast(ptr addrspace(1) %arg0) {
 ; GFX9-LABEL: shuffle_v6f32_rebroadcast:
 ; GFX9: ; %bb.0: ; %entry
-; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: global_load_dwordx4 v[0:3], v[0:1], off
-; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: v_mov_b32_e32 v0, v1
-; GFX9-NEXT: v_mov_b32_e32 v2, v1
-; GFX9-NEXT: v_mov_b32_e32 v3, v1
-; GFX9-NEXT: v_mov_b32_e32 v4, v1
-; GFX9-NEXT: v_mov_b32_e32 v5, v1
-; GFX9-NEXT: s_setpc_b64 s[30:31]
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: global_load_dwordx4 v[0:3], v[0:1], off
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_mov_b32_e32 v0, v1
+; GFX9-NEXT: v_mov_b32_e32 v2, v1
+; GFX9-NEXT: v_mov_b32_e32 v3, v1
+; GFX9-NEXT: v_mov_b32_e32 v4, v1
+; GFX9-NEXT: v_mov_b32_e32 v5, v1
+; GFX9-NEXT: s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: shuffle_v6f32_rebroadcast:
 ; GFX10: ; %bb.0: ; %entry
-; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT: global_load_dwordx4 v[0:3], v[0:1], off
-; GFX10-NEXT: s_waitcnt vmcnt(0)
-; GFX10-NEXT: v_mov_b32_e32 v0, v1
-; GFX10-NEXT: v_mov_b32_e32 v2, v1
-; GFX10-NEXT: v_mov_b32_e32 v3, v1
-; GFX10-NEXT: v_mov_b32_e32 v4, v1
-; GFX10-NEXT: v_mov_b32_e32 v5, v1
-; GFX10-NEXT: s_setpc_b64 s[30:31]
+; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT: global_load_dwordx4 v[0:3], v[0:1], off
+; GFX10-NEXT: s_waitcnt vmcnt(0)
+; GFX10-NEXT: v_mov_b32_e32 v0, v1
+; GFX10-NEXT: v_mov_b32_e32 v2, v1
+; GFX10-NEXT: v_mov_b32_e32 v3, v1
+; GFX10-NEXT: v_mov_b32_e32 v4, v1
+; GFX10-NEXT: v_mov_b32_e32 v5, v1
+; GFX10-NEXT: s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: shuffle_v6f32_rebroadcast:
 ; GFX11: ; %bb.0: ; %entry
-; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: global_load_b128 v[0:3], v[0:1], off
-; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: v_mov_b32_e32 v0, v1
-; GFX11-NEXT: v_mov_b32_e32 v2, v1
-; GFX11-NEXT: v_mov_b32_e32 v3, v1
-; GFX11-NEXT: v_mov_b32_e32 v4, v1
-; GFX11-NEXT: v_mov_b32_e32 v5, v1
-; GFX11-NEXT: s_setpc_b64 s[30:31]
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: global_load_b128 v[0:3], v[0:1], off
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: v_mov_b32_e32 v0, v1
+; GFX11-NEXT: v_mov_b32_e32 v2, v1
+; GFX11-NEXT: v_mov_b32_e32 v3, v1
+; GFX11-NEXT: v_mov_b32_e32 v4, v1
+; GFX11-NEXT: v_mov_b32_e32 v5, v1
+; GFX11-NEXT: s_setpc_b64 s[30:31]
 entry:
   %val0 = load <6 x float>, ptr addrspace(1) %arg0
   %val1 = shufflevector <6 x float> %val0, <6 x float> poison, <6 x i32>
@@ -1633,45 +1634,45 @@ entry:
 define <8 x float> @shuffle_v8f32_rebroadcast(ptr addrspace(1) %arg0) {
 ; GFX9-LABEL: shuffle_v8f32_rebroadcast:
 ; GFX9: ; %bb.0: ; %entry
-; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: global_load_dwordx4 v[0:3], v[0:1], off
-; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: v_mov_b32_e32 v0, v1
-; GFX9-NEXT: v_mov_b32_e32 v2, v1
-; GFX9-NEXT: v_mov_b32_e32 v3, v1
-; GFX9-NEXT: v_mov_b32_e32 v4, v1
-; GFX9-NEXT: v_mov_b32_e32 v5, v1
-; GFX9-NEXT: v_mov_b32_e32 v6, v1
-; GFX9-NEXT: v_mov_b32_e32 v7, v1
-; GFX9-NEXT: s_setpc_b64 s[30:31]
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: global_load_dwordx4 v[0:3], v[0:1], off
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_mov_b32_e32 v0, v1
+; GFX9-NEXT: v_mov_b32_e32 v2, v1
+; GFX9-NEXT: v_mov_b32_e32 v3, v1
+; GFX9-NEXT: v_mov_b32_e32 v4, v1
+; GFX9-NEXT: v_mov_b32_e32 v5, v1
+; GFX9-NEXT: v_mov_b32_e32 v6, v1
+; GFX9-NEXT: v_mov_b32_e32 v7, v1
+; GFX9-NEXT: s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: shuffle_v8f32_rebroadcast:
 ; GFX10: ; %bb.0: ; %entry
-; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT: global_load_dwordx4 v[0:3], v[0:1], off
-; GFX10-NEXT: s_waitcnt vmcnt(0)
-; GFX10-NEXT: v_mov_b32_e32 v0, v1
-; GFX10-NEXT: v_mov_b32_e32 v2, v1
-; GFX10-NEXT: v_mov_b32_e32 v3, v1
-; GFX10-NEXT: v_mov_b32_e32 v4, v1
-; GFX10-NEXT: v_mov_b32_e32 v5, v1
-; GFX10-NEXT: v_mov_b32_e32 v6, v1
-; GFX10-NEXT: v_mov_b32_e32 v7, v1
-; GFX10-NEXT: s_setpc_b64 s[30:31]
+; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT: global_load_dwordx4 v[0:3], v[0:1], off
+; GFX10-NEXT: s_waitcnt vmcnt(0)
+; GFX10-NEXT: v_mov_b32_e32 v0, v1
+; GFX10-NEXT: v_mov_b32_e32 v2, v1
+; GFX10-NEXT: v_mov_b32_e32 v3, v1
+; GFX10-NEXT: v_mov_b32_e32 v4, v1
+; GFX10-NEXT: v_mov_b32_e32 v5, v1
+; GFX10-NEXT: v_mov_b32_e32 v6, v1
+; GFX10-NEXT: v_mov_b32_e32 v7, v1
+; GFX10-NEXT: s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: shuffle_v8f32_rebroadcast:
 ; GFX11: ; %bb.0: ; %entry
-; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: global_load_b128 v[0:3], v[0:1], off
-; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: v_mov_b32_e32 v0, v1
-; GFX11-NEXT: v_mov_b32_e32 v2, v1
-; GFX11-NEXT: v_mov_b32_e32 v3, v1
-; GFX11-NEXT: v_mov_b32_e32 v4, v1
-; GFX11-NEXT: v_mov_b32_e32 v5, v1
-; GFX11-NEXT: v_mov_b32_e32 v6, v1
-; GFX11-NEXT: v_mov_b32_e32 v7, v1
-; GFX11-NEXT: s_setpc_b64 s[30:31]
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: global_load_b128 v[0:3], v[0:1], off
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: v_mov_b32_e32 v0, v1
+; GFX11-NEXT: v_mov_b32_e32 v2, v1
+; GFX11-NEXT: v_mov_b32_e32 v3, v1
+; GFX11-NEXT: v_mov_b32_e32 v4, v1
+; GFX11-NEXT: v_mov_b32_e32 v5, v1
+; GFX11-NEXT: v_mov_b32_e32 v6, v1
+; GFX11-NEXT: v_mov_b32_e32 v7, v1
+; GFX11-NEXT: s_setpc_b64 s[30:31]
 entry:
   %val0 = load <8 x float>, ptr addrspace(1) %arg0
   %val1 = shufflevector <8 x float> %val0, <8 x float> poison, <8 x i32>
@@ -1681,69 +1682,69 @@ entry:
 define <16 x float> @shuffle_v16f32_rebroadcast(ptr addrspace(1) %arg0) {
 ; GFX9-LABEL: shuffle_v16f32_rebroadcast:
 ; GFX9: ; %bb.0: ; %entry
-; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: global_load_dwordx4 v[0:3], v[0:1], off
-; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: v_mov_b32_e32 v0, v1
-; GFX9-NEXT: v_mov_b32_e32 v2, v1
-; GFX9-NEXT: v_mov_b32_e32 v3, v1
-; GFX9-NEXT: v_mov_b32_e32 v4, v1
-; GFX9-NEXT: v_mov_b32_e32 v5, v1
-; GFX9-NEXT: v_mov_b32_e32 v6, v1
-; GFX9-NEXT: v_mov_b32_e32 v7, v1
-; GFX9-NEXT: v_mov_b32_e32 v8, v1
-; GFX9-NEXT: v_mov_b32_e32 v9, v1
-; GFX9-NEXT: v_mov_b32_e32 v10, v1
-; GFX9-NEXT: v_mov_b32_e32 v11, v1
-; GFX9-NEXT: v_mov_b32_e32 v12, v1
-; GFX9-NEXT: v_mov_b32_e32 v13, v1
-; GFX9-NEXT: v_mov_b32_e32 v14, v1
-; GFX9-NEXT: v_mov_b32_e32 v15, v1
-; GFX9-NEXT: s_setpc_b64 s[30:31]
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: global_load_dwordx4 v[0:3], v[0:1], off
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_mov_b32_e32 v0, v1
+; GFX9-NEXT: v_mov_b32_e32 v2, v1
+; GFX9-NEXT: v_mov_b32_e32 v3, v1
+; GFX9-NEXT: v_mov_b32_e32 v4, v1
+; GFX9-NEXT: v_mov_b32_e32 v5, v1
+; GFX9-NEXT: v_mov_b32_e32 v6, v1
+; GFX9-NEXT: v_mov_b32_e32 v7, v1
+; GFX9-NEXT: v_mov_b32_e32 v8, v1
+; GFX9-NEXT: v_mov_b32_e32 v9, v1
+; GFX9-NEXT: v_mov_b32_e32 v10, v1
+; GFX9-NEXT: v_mov_b32_e32 v11, v1
+; GFX9-NEXT: v_mov_b32_e32 v12, v1
+; GFX9-NEXT: v_mov_b32_e32 v13, v1
+; GFX9-NEXT: v_mov_b32_e32 v14, v1
+; GFX9-NEXT: v_mov_b32_e32 v15, v1
+; GFX9-NEXT: s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: shuffle_v16f32_rebroadcast:
 ; GFX10: ; %bb.0: ; %entry
-; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT: global_load_dwordx4 v[0:3], v[0:1], off
-; GFX10-NEXT: s_waitcnt vmcnt(0)
-; GFX10-NEXT: v_mov_b32_e32 v0, v1
-; GFX10-NEXT: v_mov_b32_e32 v2, v1
-; GFX10-NEXT: v_mov_b32_e32 v3, v1
-; GFX10-NEXT: v_mov_b32_e32 v4, v1
-; GFX10-NEXT: v_mov_b32_e32 v5, v1
-; GFX10-NEXT: v_mov_b32_e32 v6, v1
-; GFX10-NEXT: v_mov_b32_e32 v7, v1
-; GFX10-NEXT: v_mov_b32_e32 v8, v1
-; GFX10-NEXT: v_mov_b32_e32 v9, v1
-; GFX10-NEXT: v_mov_b32_e32 v10, v1
-; GFX10-NEXT: v_mov_b32_e32 v11, v1
-; GFX10-NEXT: v_mov_b32_e32 v12, v1
-; GFX10-NEXT: v_mov_b32_e32 v13, v1
-; GFX10-NEXT: v_mov_b32_e32 v14, v1
-; GFX10-NEXT: v_mov_b32_e32 v15, v1
-; GFX10-NEXT: s_setpc_b64 s[30:31]
+; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT: global_load_dwordx4 v[0:3], v[0:1], off
+; GFX10-NEXT: s_waitcnt vmcnt(0)
+; GFX10-NEXT: v_mov_b32_e32 v0, v1
+; GFX10-NEXT: v_mov_b32_e32 v2, v1
+; GFX10-NEXT: v_mov_b32_e32 v3, v1
+; GFX10-NEXT: v_mov_b32_e32 v4, v1
+; GFX10-NEXT: v_mov_b32_e32 v5, v1
+; GFX10-NEXT: v_mov_b32_e32 v6, v1
+; GFX10-NEXT: v_mov_b32_e32 v7, v1
+; GFX10-NEXT: v_mov_b32_e32 v8, v1
+; GFX10-NEXT: v_mov_b32_e32 v9, v1
+; GFX10-NEXT: v_mov_b32_e32 v10, v1
+; GFX10-NEXT: v_mov_b32_e32 v11, v1
+; GFX10-NEXT: v_mov_b32_e32 v12, v1
+; GFX10-NEXT: v_mov_b32_e32 v13, v1
+; GFX10-NEXT: v_mov_b32_e32 v14, v1
+; GFX10-NEXT: v_mov_b32_e32 v15, v1
+; GFX10-NEXT: s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: shuffle_v16f32_rebroadcast:
 ; GFX11: ; %bb.0: ; %entry
-; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: global_load_b128 v[0:3], v[0:1], off
-; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: v_mov_b32_e32 v0, v1
-; GFX11-NEXT: v_mov_b32_e32 v2, v1
-; GFX11-NEXT: v_mov_b32_e32 v3, v1
-; GFX11-NEXT: v_mov_b32_e32 v4, v1
-; GFX11-NEXT: v_mov_b32_e32 v5, v1
-; GFX11-NEXT: v_mov_b32_e32 v6, v1
-; GFX11-NEXT: v_mov_b32_e32 v7, v1
-; GFX11-NEXT: v_mov_b32_e32 v8, v1
-; GFX11-NEXT: v_mov_b32_e32 v9, v1
-; GFX11-NEXT: v_mov_b32_e32 v10, v1
-; GFX11-NEXT: v_mov_b32_e32 v11, v1
-; GFX11-NEXT: v_mov_b32_e32 v12, v1
-; GFX11-NEXT: v_mov_b32_e32 v13, v1
-; GFX11-NEXT: v_mov_b32_e32 v14, v1
-; GFX11-NEXT: v_mov_b32_e32 v15, v1
-; GFX11-NEXT: s_setpc_b64 s[30:31]
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: global_load_b128 v[0:3], v[0:1], off
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: v_mov_b32_e32 v0, v1
+; GFX11-NEXT: v_mov_b32_e32 v2, v1
+; GFX11-NEXT: v_mov_b32_e32 v3, v1
+; GFX11-NEXT: v_mov_b32_e32 v4, v1
+; GFX11-NEXT: v_mov_b32_e32 v5, v1
+; GFX11-NEXT: v_mov_b32_e32 v6, v1
+; GFX11-NEXT: v_mov_b32_e32 v7, v1
+; GFX11-NEXT: v_mov_b32_e32 v8, v1
+; GFX11-NEXT: v_mov_b32_e32 v9, v1
+; GFX11-NEXT: v_mov_b32_e32 v10, v1
+; GFX11-NEXT: v_mov_b32_e32 v11, v1
+; GFX11-NEXT: v_mov_b32_e32 v12, v1
+; GFX11-NEXT: v_mov_b32_e32 v13, v1
+; GFX11-NEXT: v_mov_b32_e32 v14, v1
+; GFX11-NEXT: v_mov_b32_e32 v15, v1
+; GFX11-NEXT: s_setpc_b64 s[30:31]
 entry:
   %val0 = load <16 x float>, ptr addrspace(1) %arg0
   %val1 = shufflevector <16 x float> %val0, <16 x float> poison, <16 x i32>
@@ -1753,117 +1754,117 @@ entry:
 define <32 x float> @shuffle_v32f32_rebroadcast(ptr addrspace(1) %arg0) {
 ; GFX9-LABEL: shuffle_v32f32_rebroadcast:
 ; GFX9: ; %bb.0: ; %entry
-; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: global_load_dwordx4 v[0:3], v[0:1], off
-; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: v_mov_b32_e32 v0, v1
-; GFX9-NEXT: v_mov_b32_e32 v2, v1
-; GFX9-NEXT: v_mov_b32_e32 v3, v1
-; GFX9-NEXT: v_mov_b32_e32 v4, v1
-; GFX9-NEXT: v_mov_b32_e32 v5, v1
-; GFX9-NEXT: v_mov_b32_e32 v6, v1
-; GFX9-NEXT: v_mov_b32_e32 v7, v1
-; GFX9-NEXT: v_mov_b32_e32 v8, v1
-; GFX9-NEXT: v_mov_b32_e32 v9, v1
-; GFX9-NEXT: v_mov_b32_e32 v10, v1
-; GFX9-NEXT: v_mov_b32_e32 v11, v1
-; GFX9-NEXT: v_mov_b32_e32 v12, v1
-; GFX9-NEXT: v_mov_b32_e32 v13, v1
-; GFX9-NEXT: v_mov_b32_e32 v14, v1
-; GFX9-NEXT: v_mov_b32_e32 v15, v1
-; GFX9-NEXT: v_mov_b32_e32 v16, v1
-; GFX9-NEXT: v_mov_b32_e32 v17, v1
-; GFX9-NEXT: v_mov_b32_e32 v18, v1
-; GFX9-NEXT: v_mov_b32_e32 v19, v1
-; GFX9-NEXT: v_mov_b32_e32 v20, v1
-; GFX9-NEXT: v_mov_b32_e32 v21, v1
-; GFX9-NEXT: v_mov_b32_e32 v22, v1
-; GFX9-NEXT: v_mov_b32_e32 v23, v1
-; GFX9-NEXT: v_mov_b32_e32 v24, v1
-; GFX9-NEXT: v_mov_b32_e32 v25, v1
-; GFX9-NEXT: v_mov_b32_e32 v26, v1
-; GFX9-NEXT: v_mov_b32_e32 v27, v1
-; GFX9-NEXT: v_mov_b32_e32 v28, v1
-; GFX9-NEXT: v_mov_b32_e32 v29, v1
-; GFX9-NEXT: v_mov_b32_e32 v30, v1
-; GFX9-NEXT: v_mov_b32_e32 v31, v1
-; GFX9-NEXT: s_setpc_b64 s[30:31]
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: global_load_dwordx4 v[0:3], v[0:1], off
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_mov_b32_e32 v0, v1
+; GFX9-NEXT: v_mov_b32_e32 v2, v1
+; GFX9-NEXT: v_mov_b32_e32 v3, v1
+; GFX9-NEXT: v_mov_b32_e32 v4, v1
+; GFX9-NEXT: v_mov_b32_e32 v5, v1
+; GFX9-NEXT: v_mov_b32_e32 v6, v1
+; GFX9-NEXT: v_mov_b32_e32 v7, v1
+; GFX9-NEXT: v_mov_b32_e32 v8, v1
+; GFX9-NEXT: v_mov_b32_e32 v9, v1
+; GFX9-NEXT: v_mov_b32_e32 v10, v1
+; GFX9-NEXT: v_mov_b32_e32 v11, v1
+; GFX9-NEXT: v_mov_b32_e32 v12, v1
+; GFX9-NEXT: v_mov_b32_e32 v13, v1
+; GFX9-NEXT: v_mov_b32_e32 v14, v1
+; GFX9-NEXT: v_mov_b32_e32 v15, v1
+; GFX9-NEXT: v_mov_b32_e32 v16, v1
+; GFX9-NEXT: v_mov_b32_e32 v17, v1
+; GFX9-NEXT: v_mov_b32_e32 v18, v1
+; GFX9-NEXT: v_mov_b32_e32 v19, v1
+; GFX9-NEXT: v_mov_b32_e32 v20, v1
+; GFX9-NEXT: v_mov_b32_e32 v21, v1
+; GFX9-NEXT: v_mov_b32_e32 v22, v1
+; GFX9-NEXT: v_mov_b32_e32 v23, v1
+; GFX9-NEXT: v_mov_b32_e32 v24, v1
+; GFX9-NEXT: v_mov_b32_e32 v25, v1
+; GFX9-NEXT: v_mov_b32_e32 v26, v1
+; GFX9-NEXT: v_mov_b32_e32 v27, v1
+; GFX9-NEXT: v_mov_b32_e32 v28, v1
+; GFX9-NEXT: v_mov_b32_e32 v29, v1
+; GFX9-NEXT: v_mov_b32_e32 v30, v1
+; GFX9-NEXT: v_mov_b32_e32 v31, v1
+; GFX9-NEXT: s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: shuffle_v32f32_rebroadcast:
 ; GFX10: ; %bb.0: ; %entry
-; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT: global_load_dwordx4 v[0:3], v[0:1], off
-; GFX10-NEXT: s_waitcnt vmcnt(0)
-; GFX10-NEXT: v_mov_b32_e32 v0, v1
-; GFX10-NEXT: v_mov_b32_e32 v2, v1
-; GFX10-NEXT: v_mov_b32_e32 v3, v1
-; GFX10-NEXT: v_mov_b32_e32 v4, v1
-; GFX10-NEXT: v_mov_b32_e32 v5, v1
-; GFX10-NEXT: v_mov_b32_e32 v6, v1
-; GFX10-NEXT: v_mov_b32_e32 v7, v1
-; GFX10-NEXT: v_mov_b32_e32 v8, v1
-; GFX10-NEXT: v_mov_b32_e32 v9, v1
-; GFX10-NEXT: v_mov_b32_e32 v10, v1
-; GFX10-NEXT: v_mov_b32_e32 v11, v1
-; GFX10-NEXT: v_mov_b32_e32 v12, v1
-; GFX10-NEXT: v_mov_b32_e32 v13, v1
-; GFX10-NEXT: v_mov_b32_e32 v14, v1
-; GFX10-NEXT: v_mov_b32_e32 v15, v1
-; GFX10-NEXT: v_mov_b32_e32 v16, v1
-; GFX10-NEXT: v_mov_b32_e32 v17, v1
-; GFX10-NEXT: v_mov_b32_e32 v18, v1
-; GFX10-NEXT: v_mov_b32_e32 v19, v1
-; GFX10-NEXT: v_mov_b32_e32 v20, v1
-; GFX10-NEXT: v_mov_b32_e32 v21, v1
-; GFX10-NEXT: v_mov_b32_e32 v22, v1
-; GFX10-NEXT: v_mov_b32_e32 v23, v1
-; GFX10-NEXT: v_mov_b32_e32 v24, v1
-; GFX10-NEXT: v_mov_b32_e32 v25, v1
-; GFX10-NEXT: v_mov_b32_e32 v26, v1
-; GFX10-NEXT: v_mov_b32_e32 v27, v1
-; GFX10-NEXT: v_mov_b32_e32 v28, v1
-; GFX10-NEXT: v_mov_b32_e32 v29, v1
-; GFX10-NEXT: v_mov_b32_e32 v30, v1
-; GFX10-NEXT: v_mov_b32_e32 v31, v1
-; GFX10-NEXT: s_setpc_b64 s[30:31]
+; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT: global_load_dwordx4 v[0:3], v[0:1], off
+; GFX10-NEXT: s_waitcnt vmcnt(0)
+; GFX10-NEXT: v_mov_b32_e32 v0, v1
+; GFX10-NEXT: v_mov_b32_e32 v2, v1
+; GFX10-NEXT: v_mov_b32_e32 v3, v1
+; GFX10-NEXT: v_mov_b32_e32 v4, v1
+; GFX10-NEXT: v_mov_b32_e32 v5, v1
+; GFX10-NEXT: v_mov_b32_e32 v6, v1
+; GFX10-NEXT: v_mov_b32_e32 v7, v1
+; GFX10-NEXT: v_mov_b32_e32 v8, v1
+; GFX10-NEXT: v_mov_b32_e32 v9, v1
+; GFX10-NEXT: v_mov_b32_e32 v10, v1
+; GFX10-NEXT: v_mov_b32_e32 v11, v1
+; GFX10-NEXT: v_mov_b32_e32 v12, v1
+; GFX10-NEXT: v_mov_b32_e32 v13, v1
+; GFX10-NEXT: v_mov_b32_e32 v14, v1
+; GFX10-NEXT: v_mov_b32_e32 v15, v1
+; GFX10-NEXT: v_mov_b32_e32 v16, v1
+; GFX10-NEXT: v_mov_b32_e32 v17, v1
+; GFX10-NEXT: v_mov_b32_e32 v18, v1
+; GFX10-NEXT: v_mov_b32_e32 v19, v1
+; GFX10-NEXT: v_mov_b32_e32 v20, v1
+; GFX10-NEXT: v_mov_b32_e32 v21, v1
+; GFX10-NEXT: v_mov_b32_e32 v22, v1
+; GFX10-NEXT: v_mov_b32_e32 v23, v1
+; GFX10-NEXT: v_mov_b32_e32 v24, v1
+; GFX10-NEXT: v_mov_b32_e32 v25, v1
+; GFX10-NEXT: v_mov_b32_e32 v26, v1
+; GFX10-NEXT: v_mov_b32_e32 v27, v1
+; GFX10-NEXT: v_mov_b32_e32 v28, v1
+; GFX10-NEXT: v_mov_b32_e32 v29, v1
+; GFX10-NEXT: v_mov_b32_e32 v30, v1
+; GFX10-NEXT: v_mov_b32_e32 v31, v1
+; GFX10-NEXT: s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: shuffle_v32f32_rebroadcast:
 ; GFX11: ; %bb.0: ; %entry
-; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: global_load_b128 v[0:3], v[0:1], off
-; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: v_mov_b32_e32 v0, v1
-; GFX11-NEXT: v_mov_b32_e32 v2, v1
-; GFX11-NEXT: v_mov_b32_e32 v3, v1
-; GFX11-NEXT: v_mov_b32_e32 v4, v1
-; GFX11-NEXT: v_mov_b32_e32 v5, v1
-; GFX11-NEXT: v_mov_b32_e32 v6, v1
-; GFX11-NEXT: v_mov_b32_e32 v7, v1
-; GFX11-NEXT: v_mov_b32_e32 v8, v1
-; GFX11-NEXT: v_mov_b32_e32 v9, v1
-; GFX11-NEXT: v_mov_b32_e32 v10, v1
-; GFX11-NEXT: v_mov_b32_e32 v11, v1
-; GFX11-NEXT: v_mov_b32_e32 v12, v1
-; GFX11-NEXT: v_mov_b32_e32 v13, v1
-; GFX11-NEXT: v_mov_b32_e32 v14, v1
-; GFX11-NEXT: v_mov_b32_e32 v15, v1
-; GFX11-NEXT: v_mov_b32_e32 v16, v1
-; GFX11-NEXT: v_mov_b32_e32 v17, v1
-; GFX11-NEXT: v_mov_b32_e32 v18, v1
-; GFX11-NEXT: v_mov_b32_e32 v19, v1
-; GFX11-NEXT: v_mov_b32_e32 v20, v1
-; GFX11-NEXT: v_mov_b32_e32 v21, v1
-; GFX11-NEXT: v_mov_b32_e32 v22, v1
-; GFX11-NEXT: v_mov_b32_e32 v23, v1
-; GFX11-NEXT: v_mov_b32_e32 v24, v1
-; GFX11-NEXT: v_mov_b32_e32 v25, v1
-; GFX11-NEXT: v_mov_b32_e32 v26, v1
-; GFX11-NEXT: v_mov_b32_e32 v27, v1
-; GFX11-NEXT: v_mov_b32_e32 v28, v1
-; GFX11-NEXT: v_mov_b32_e32 v29, v1
-; GFX11-NEXT: v_mov_b32_e32 v30, v1
-; GFX11-NEXT: v_mov_b32_e32 v31, v1
-; GFX11-NEXT: s_setpc_b64 s[30:31]
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: global_load_b128 v[0:3], v[0:1], off
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: v_mov_b32_e32 v0, v1
+; GFX11-NEXT: v_mov_b32_e32 v2, v1
+; GFX11-NEXT: v_mov_b32_e32 v3, v1
+; GFX11-NEXT: v_mov_b32_e32 v4, v1
+; GFX11-NEXT: v_mov_b32_e32 v5, v1
+; GFX11-NEXT: v_mov_b32_e32 v6, v1
+; GFX11-NEXT: v_mov_b32_e32 v7, v1
+; GFX11-NEXT: v_mov_b32_e32 v8, v1
+; GFX11-NEXT: v_mov_b32_e32 v9, v1
+; GFX11-NEXT: v_mov_b32_e32 v10, v1
+; GFX11-NEXT: v_mov_b32_e32 v11, v1
+; GFX11-NEXT: v_mov_b32_e32 v12, v1
+; GFX11-NEXT: v_mov_b32_e32 v13, v1
+; GFX11-NEXT: v_mov_b32_e32 v14, v1
+; GFX11-NEXT: v_mov_b32_e32 v15, v1
+; GFX11-NEXT: v_mov_b32_e32 v16, v1
+; GFX11-NEXT: v_mov_b32_e32 v17, v1
+; GFX11-NEXT: v_mov_b32_e32 v18, v1
+; GFX11-NEXT: v_mov_b32_e32 v19, v1
+; GFX11-NEXT: v_mov_b32_e32 v20, v1
+; GFX11-NEXT: v_mov_b32_e32 v21, v1
+; GFX11-NEXT: v_mov_b32_e32 v22, v1
+; GFX11-NEXT: v_mov_b32_e32 v23, v1
+; GFX11-NEXT: v_mov_b32_e32 v24, v1
+; GFX11-NEXT: v_mov_b32_e32 v25, v1
+; GFX11-NEXT: v_mov_b32_e32 v26, v1
+; GFX11-NEXT: v_mov_b32_e32 v27, v1
+; GFX11-NEXT: v_mov_b32_e32 v28, v1
+; GFX11-NEXT: v_mov_b32_e32 v29, v1
+; GFX11-NEXT: v_mov_b32_e32 v30, v1
+; GFX11-NEXT: v_mov_b32_e32 v31, v1
+; GFX11-NEXT: s_setpc_b64 s[30:31]
 entry:
   %val0 = load <32 x float>, ptr addrspace(1) %arg0
   %val1 = shufflevector <32 x float> %val0, <32 x float> poison, <32 x i32>
diff --git a/llvm/test/CodeGen/AMDGPU/vector_shuffle.packed.ll b/llvm/test/CodeGen/AMDGPU/vector_shuffle.packed.ll
index b85bd4c634668..a00ee9ac0d5a9 100644
--- a/llvm/test/CodeGen/AMDGPU/vector_shuffle.packed.ll
+++ b/llvm/test/CodeGen/AMDGPU/vector_shuffle.packed.ll
@@ -32,44 +32,33 @@ define <4 x half> @shuffle_v4f16_23uu(ptr addrspace(1) %arg0, ptr addrspace(1) %
 }
 define <4 x half> @shuffle_v4f16_234u(ptr addrspace(1) %arg0, ptr addrspace(1) %arg1) {
-; GX900-LABEL: shuffle_v4f16_234u:
-; GX900: ; %bb.0:
-; GX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GX900-NEXT: global_load_dword v6, v[0:1], off offset:4
-; GX900-NEXT: global_load_dwordx2 v[4:5], v[2:3], off
-; GX900-NEXT: s_waitcnt vmcnt(1)
-; GX900-NEXT: v_mov_b32_e32 v0, v6
-; GX900-NEXT: s_waitcnt vmcnt(0)
-; GX900-NEXT: v_mov_b32_e32 v1, v4
-; GX900-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX940-LABEL: shuffle_v4f16_234u:
-; GFX940: ; %bb.0:
-; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX940-NEXT: global_load_dword v4, v[0:1], off offset:4
-; GFX940-NEXT: global_load_dwordx2 v[6:7], v[2:3], off
-; GFX940-NEXT: s_waitcnt vmcnt(1)
-; GFX940-NEXT: v_mov_b32_e32 v0, v4
-; GFX940-NEXT: s_waitcnt vmcnt(0)
-; GFX940-NEXT: v_mov_b32_e32 v1, v6
-; GFX940-NEXT: s_setpc_b64 s[30:31]
+; GFX9-LABEL: shuffle_v4f16_234u:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: global_load_dword v4, v[0:1], off offset:4
+; GFX9-NEXT: global_load_dword v5, v[2:3], off
+; GFX9-NEXT: s_waitcnt vmcnt(1)
+; GFX9-NEXT: v_mov_b32_e32 v0, v4
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_mov_b32_e32 v1, v5
+; GFX9-NEXT: s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: shuffle_v4f16_234u:
 ; GFX10: ; %bb.0:
 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT: global_load_dword v6, v[0:1], off offset:4
-; GFX10-NEXT: global_load_dwordx2 v[4:5], v[2:3], off
+; GFX10-NEXT: global_load_dword v4, v[0:1], off offset:4
+; GFX10-NEXT: global_load_dword v5, v[2:3], off
 ; GFX10-NEXT: s_waitcnt vmcnt(1)
-; GFX10-NEXT: v_mov_b32_e32 v0, v6
+; GFX10-NEXT: v_mov_b32_e32 v0, v4
 ; GFX10-NEXT: s_waitcnt vmcnt(0)
-; GFX10-NEXT: v_mov_b32_e32 v1, v4
+; GFX10-NEXT: v_mov_b32_e32 v1, v5
 ; GFX10-NEXT: s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: shuffle_v4f16_234u:
 ; GFX11: ; %bb.0:
 ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT: global_load_b32 v0, v[0:1], off offset:4
-; GFX11-NEXT: global_load_b64 v[1:2], v[2:3], off
+; GFX11-NEXT: global_load_b32 v1, v[2:3], off
 ; GFX11-NEXT: s_waitcnt vmcnt(0)
 ; GFX11-NEXT: s_setpc_b64 s[30:31]
 %val0 = load <4 x half>, ptr addrspace(1) %arg0
@@ -320,47 +309,43 @@ define <4 x half> @shuffle_v4f16_357u(ptr addrspace(1) %arg0, ptr addrspace(1) %
 ; GX900-LABEL: shuffle_v4f16_357u:
 ; GX900: ; %bb.0:
 ; GX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GX900-NEXT: global_load_dwordx2 v[4:5], v[2:3], off
 ; GX900-NEXT: global_load_dword v6, v[0:1], off offset:4
+; GX900-NEXT: global_load_dwordx2 v[4:5], v[2:3], off
 ; GX900-NEXT: s_mov_b32 s4, 0x7060302
-; GX900-NEXT: s_waitcnt vmcnt(1)
-; GX900-NEXT: v_alignbit_b32 v1, s4, v5, 16
 ; GX900-NEXT: s_waitcnt vmcnt(0)
 ; GX900-NEXT: v_perm_b32 v0, v4, v6, s4
+; GX900-NEXT: v_alignbit_b32 v1, s4, v5, 16
 ; GX900-NEXT: s_setpc_b64 s[30:31]
 ;
 ; GFX940-LABEL: shuffle_v4f16_357u:
 ; GFX940: ; %bb.0:
 ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX940-NEXT: global_load_dwordx2 v[4:5], v[2:3], off
 ; GFX940-NEXT: global_load_dword v6, v[0:1], off offset:4
+; GFX940-NEXT: global_load_dwordx2 v[4:5], v[2:3], off
 ; GFX940-NEXT: s_mov_b32 s0, 0x7060302
-; GFX940-NEXT: s_waitcnt vmcnt(1)
-; GFX940-NEXT: v_alignbit_b32 v1, s0, v5, 16
 ; GFX940-NEXT: s_waitcnt vmcnt(0)
 ; GFX940-NEXT: v_perm_b32 v0, v4, v6, s0
+; GFX940-NEXT: v_alignbit_b32 v1, s0, v5, 16
 ; GFX940-NEXT: s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: shuffle_v4f16_357u:
 ; GFX10: ; %bb.0:
 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT: global_load_dwordx2 v[4:5], v[2:3], off
 ; GFX10-NEXT: global_load_dword v6, v[0:1], off offset:4
-; GFX10-NEXT: s_waitcnt vmcnt(1)
-; GFX10-NEXT: v_alignbit_b32 v1, s4, v5, 16
+; GFX10-NEXT: global_load_dwordx2 v[4:5], v[2:3], off
 ; GFX10-NEXT: s_waitcnt vmcnt(0)
 ; GFX10-NEXT: v_perm_b32 v0, v4, v6, 0x7060302
+; GFX10-NEXT: v_alignbit_b32 v1, s4, v5, 16
 ; GFX10-NEXT: s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: shuffle_v4f16_357u:
 ; GFX11: ; %bb.0:
 ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: global_load_b64 v[2:3], v[2:3], off
-; GFX11-NEXT: global_load_b32 v0, v[0:1], off offset:4
-; GFX11-NEXT: s_waitcnt vmcnt(1)
-; GFX11-NEXT: v_alignbit_b32 v1, s0, v3, 16
+; GFX11-NEXT: global_load_b32 v4, v[0:1], off offset:4
+; GFX11-NEXT: global_load_b64 v[0:1], v[2:3], off
 ; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: v_perm_b32 v0, v2, v0, 0x7060302
+; GFX11-NEXT: v_perm_b32 v0, v0, v4, 0x7060302
+; GFX11-NEXT: v_alignbit_b32 v1, s0, v1, 16
 ; GFX11-NEXT: s_setpc_b64 s[30:31]
 %val0 = load <4 x half>, ptr addrspace(1) %arg0
 %val1 = load <4 x half>, ptr addrspace(1) %arg1
@@ -1018,34 +1003,31 @@ define <4 x half> @shuffle_v4f16_3456(ptr addrspace(1) %arg0, ptr addrspace(1) %
 ; GFX9-LABEL: shuffle_v4f16_3456:
 ; GFX9: ; %bb.0:
 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: global_load_dwordx2 v[4:5], v[2:3], off
 ; GFX9-NEXT: global_load_dword v6, v[0:1], off offset:4
-; GFX9-NEXT: s_waitcnt vmcnt(1)
-; GFX9-NEXT: v_alignbit_b32 v1, v5, v4, 16
+; GFX9-NEXT: global_load_dwordx2 v[4:5], v[2:3], off
 ; GFX9-NEXT: s_waitcnt vmcnt(0)
 ; GFX9-NEXT: v_alignbit_b32 v0, v4, v6, 16
+; GFX9-NEXT: v_alignbit_b32 v1, v5, v4, 16
 ; GFX9-NEXT: s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: shuffle_v4f16_3456:
 ; GFX10: ; %bb.0:
 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT: global_load_dwordx2 v[4:5], v[2:3], off
 ; GFX10-NEXT: global_load_dword v6, v[0:1], off offset:4
-; GFX10-NEXT: s_waitcnt vmcnt(1)
-; GFX10-NEXT: v_alignbit_b32 v1, v5, v4, 16
+; GFX10-NEXT: global_load_dwordx2 v[4:5], v[2:3], off
 ; GFX10-NEXT: s_waitcnt vmcnt(0)
 ; GFX10-NEXT: v_alignbit_b32 v0, v4, v6, 16
+; GFX10-NEXT: v_alignbit_b32 v1, v5, v4, 16
 ; GFX10-NEXT: s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: shuffle_v4f16_3456:
 ; GFX11: ; %bb.0:
 ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: global_load_b64 v[2:3], v[2:3], off
 ; GFX11-NEXT: global_load_b32 v0, v[0:1], off offset:4
-; GFX11-NEXT: s_waitcnt vmcnt(1)
-; GFX11-NEXT: v_alignbit_b32 v1, v3, v2, 16
+; GFX11-NEXT: global_load_b64 v[1:2], v[2:3], off
 ; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: v_alignbit_b32 v0, v2, v0, 16
+; GFX11-NEXT: v_alignbit_b32 v0, v1, v0, 16
+; GFX11-NEXT: v_alignbit_b32 v1, v2, v1, 16
 ; GFX11-NEXT: s_setpc_b64 s[30:31]
 %val0 = load <4 x half>, ptr addrspace(1) %arg0
 %val1 = load <4 x half>, ptr addrspace(1) %arg1
@@ -1057,12 +1039,11 @@ define <4 x half> @shuffle_v4f16_5634(ptr addrspace(1) %arg0, ptr addrspace(1) %
 ; GFX9-LABEL: shuffle_v4f16_5634:
 ; GFX9: ; %bb.0:
 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: global_load_dwordx2 v[4:5], v[2:3], off
 ; GFX9-NEXT: global_load_dword v6, v[0:1], off offset:4
-; GFX9-NEXT: s_waitcnt vmcnt(1)
-; GFX9-NEXT: v_alignbit_b32 v0, v5, v4, 16
+; GFX9-NEXT: global_load_dwordx2 v[4:5], v[2:3], off
 ; GFX9-NEXT: s_waitcnt vmcnt(0)
 ; GFX9-NEXT: v_alignbit_b32 v1, v4, v6, 16
+; GFX9-NEXT: v_alignbit_b32 v0, v5, v4, 16
 ; GFX9-NEXT: s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: shuffle_v4f16_5634:
@@ -1233,7 +1214,7 @@ define <4 x half> @shuffle_v4f16_0000(ptr addrspace(1) %arg0, ptr addrspace(1) %
 ; GX900-LABEL: shuffle_v4f16_0000:
 ; GX900: ; %bb.0:
 ; GX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GX900-NEXT: global_load_dwordx2 v[0:1], v[0:1], off
+; GX900-NEXT: global_load_dword v0, v[0:1], off
 ; GX900-NEXT: s_mov_b32 s4, 0x5040100
 ; GX900-NEXT: s_waitcnt vmcnt(0)
 ; GX900-NEXT: v_perm_b32 v0, v0, v0, s4
@@ -1243,7 +1224,7 @@ define <4 x half> @shuffle_v4f16_0000(ptr addrspace(1) %arg0, ptr addrspace(1) %
 ; GFX940-LABEL: shuffle_v4f16_0000:
 ; GFX940: ; %bb.0:
 ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX940-NEXT: global_load_dwordx2 v[0:1], v[0:1], off
+; GFX940-NEXT: global_load_dword v0, v[0:1], off
 ; GFX940-NEXT: s_mov_b32 s0, 0x5040100
 ; GFX940-NEXT: s_waitcnt vmcnt(0)
 ; GFX940-NEXT: v_perm_b32 v0, v0, v0, s0
@@ -1253,7 +1234,7 @@ define <4 x half> @shuffle_v4f16_0000(ptr addrspace(1) %arg0, ptr addrspace(1) %
 ; GFX10-LABEL: shuffle_v4f16_0000:
 ; GFX10: ; %bb.0:
 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT: global_load_dwordx2 v[0:1], v[0:1], off
+; GFX10-NEXT: global_load_dword v0, v[0:1], off
 ; GFX10-NEXT: s_waitcnt vmcnt(0)
 ; GFX10-NEXT: v_perm_b32 v0, v0, v0, 0x5040100
 ; GFX10-NEXT: v_mov_b32_e32 v1, v0
@@ -1262,7 +1243,7 @@ define <4 x half> @shuffle_v4f16_0000(ptr addrspace(1) %arg0, ptr addrspace(1) %
 ; GFX11-LABEL: shuffle_v4f16_0000:
 ; GFX11: ; %bb.0:
 ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: global_load_b64 v[0:1], v[0:1], off
+; GFX11-NEXT: global_load_b32 v0, v[0:1], off
 ; GFX11-NEXT: s_waitcnt vmcnt(0)
 ; GFX11-NEXT: v_perm_b32 v0, v0, v0, 0x5040100
 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
@@ -1905,43 +1886,39 @@ define <4 x half> @shuffle_v4f16_0456(ptr addrspace(1) %arg0, ptr addrspace(1) %
 ; GX900-LABEL: shuffle_v4f16_0456:
 ; GX900: ; %bb.0:
 ; GX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GX900-NEXT: global_load_dwordx2 v[4:5], v[0:1], off
-; GX900-NEXT: global_load_dwordx2 v[5:6], v[2:3], off
+; GX900-NEXT: global_load_dword v6, v[0:1], off
+; GX900-NEXT: global_load_dwordx2 v[4:5], v[2:3], off
 ; GX900-NEXT: s_mov_b32 s4, 0x5040100
-; GX900-NEXT: ; kill: killed $vgpr0 killed $vgpr1
-; GX900-NEXT: ; kill: killed $vgpr2 killed $vgpr3
 ; GX900-NEXT: s_waitcnt vmcnt(0)
-; GX900-NEXT: v_perm_b32 v0, v5, v4, s4
-; GX900-NEXT: v_alignbit_b32 v1, v6, v5, 16
+; GX900-NEXT: v_perm_b32 v0, v4, v6, s4
+; GX900-NEXT: v_alignbit_b32 v1, v5, v4, 16
 ; GX900-NEXT: s_setpc_b64 s[30:31]
 ;
 ; GFX940-LABEL: shuffle_v4f16_0456:
 ; GFX940: ; %bb.0:
 ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX940-NEXT: global_load_dwordx2 v[4:5], v[0:1], off
-; GFX940-NEXT: global_load_dwordx2 v[6:7], v[2:3], off
+; GFX940-NEXT: global_load_dword v6, v[0:1], off
+; GFX940-NEXT: global_load_dwordx2 v[4:5], v[2:3], off
 ; GFX940-NEXT: s_mov_b32 s0, 0x5040100
 ; GFX940-NEXT: s_waitcnt vmcnt(0)
-; GFX940-NEXT: v_perm_b32 v0, v6, v4, s0
-; GFX940-NEXT: v_alignbit_b32 v1, v7, v6, 16
+; GFX940-NEXT: v_perm_b32 v0, v4, v6, s0
+; GFX940-NEXT: v_alignbit_b32 v1, v5, v4, 16
 ; GFX940-NEXT: s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: shuffle_v4f16_0456:
 ; GFX10: ; %bb.0:
 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT: global_load_dwordx2 v[4:5], v[0:1], off
-; GFX10-NEXT: global_load_dwordx2 v[5:6], v[2:3], off
-; GFX10-NEXT: ; kill: killed $vgpr0 killed $vgpr1
-; GFX10-NEXT: ; kill: killed $vgpr2 killed $vgpr3
+; GFX10-NEXT: global_load_dword v6, v[0:1], off
+; GFX10-NEXT: global_load_dwordx2 v[4:5], v[2:3], off
 ; GFX10-NEXT: s_waitcnt vmcnt(0)
-; GFX10-NEXT: v_perm_b32 v0, v5, v4, 0x5040100
-; GFX10-NEXT: v_alignbit_b32 v1, v6, v5, 16
+; GFX10-NEXT: v_perm_b32 v0, v4, v6, 0x5040100
+; GFX10-NEXT: v_alignbit_b32 v1, v5, v4, 16
 ; GFX10-NEXT: s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: shuffle_v4f16_0456:
 ; GFX11: ; %bb.0:
 ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: global_load_b64 v[0:1], v[0:1], off
+; GFX11-NEXT: global_load_b32 v0, v[0:1], off
 ; GFX11-NEXT: global_load_b64 v[1:2], v[2:3], off
 ; GFX11-NEXT: s_waitcnt vmcnt(0)
 ; GFX11-NEXT: v_perm_b32 v0, v1, v0, 0x5040100
@@ -3177,44 +3154,33 @@ define <4 x bfloat> @shuffle_v4bf16_23uu(ptr addrspace(1
 }
 define <4 x bfloat> @shuffle_v4bf16_234u(ptr addrspace(1) %arg0, ptr addrspace(1) %arg1) {
-; GX900-LABEL: shuffle_v4bf16_234u:
-; GX900: ; %bb.0:
-; GX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GX900-NEXT: global_load_dword v6, v[0:1], off offset:4
-; GX900-NEXT: global_load_dwordx2 v[4:5], v[2:3], off
-; GX900-NEXT: s_waitcnt vmcnt(1)
-; GX900-NEXT: v_mov_b32_e32 v0, v6
-; GX900-NEXT: s_waitcnt vmcnt(0)
-; GX900-NEXT: v_mov_b32_e32 v1, v4
-; GX900-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX940-LABEL: shuffle_v4bf16_234u:
-; GFX940: ; %bb.0:
-; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX940-NEXT: global_load_dword v4, v[0:1], off offset:4
-; GFX940-NEXT: global_load_dwordx2 v[6:7], v[2:3], off
-; GFX940-NEXT: s_waitcnt vmcnt(1)
-; GFX940-NEXT: v_mov_b32_e32 v0, v4
-; GFX940-NEXT: s_waitcnt vmcnt(0)
-; GFX940-NEXT: v_mov_b32_e32 v1, v6
-; GFX940-NEXT: s_setpc_b64 s[30:31]
+; GFX9-LABEL: shuffle_v4bf16_234u:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: global_load_dword v4, v[0:1], off offset:4
+; GFX9-NEXT: global_load_dword v5, v[2:3], off
+; GFX9-NEXT: s_waitcnt vmcnt(1)
+; GFX9-NEXT: v_mov_b32_e32 v0, v4
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_mov_b32_e32 v1, v5
+; GFX9-NEXT: s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: shuffle_v4bf16_234u:
 ; GFX10: ; %bb.0:
 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT: global_load_dword v6, v[0:1], off offset:4
-; GFX10-NEXT: global_load_dwordx2 v[4:5], v[2:3], off
+; GFX10-NEXT: global_load_dword v4, v[0:1], off offset:4
+; GFX10-NEXT: global_load_dword v5, v[2:3], off
 ; GFX10-NEXT: s_waitcnt vmcnt(1)
-; GFX10-NEXT: v_mov_b32_e32 v0, v6
+; GFX10-NEXT: v_mov_b32_e32 v0, v4
 ; GFX10-NEXT: s_waitcnt vmcnt(0)
-; GFX10-NEXT: v_mov_b32_e32 v1, v4
+; GFX10-NEXT: v_mov_b32_e32 v1, v5
 ; GFX10-NEXT: s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: shuffle_v4bf16_234u:
 ; GFX11: ; %bb.0:
 ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT: global_load_b32 v0, v[0:1], off offset:4
-; GFX11-NEXT: global_load_b64 v[1:2], v[2:3], off
+; GFX11-NEXT: global_load_b32 v1, v[2:3], off
 ; GFX11-NEXT: s_waitcnt vmcnt(0)
 ; GFX11-NEXT: s_setpc_b64 s[30:31]
 %val0 = load <4 x bfloat>, ptr addrspace(1) %arg0
@@ -3465,47 +3431,43 @@ define <4 x bfloat> @shuffle_v4bf16_357u(ptr addrspace(1
 ; GX900-LABEL: shuffle_v4bf16_357u:
 ; GX900: ; %bb.0:
 ; GX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GX900-NEXT: global_load_dwordx2 v[4:5], v[2:3], off
 ; GX900-NEXT: global_load_dword v6, v[0:1], off offset:4
+; GX900-NEXT: global_load_dwordx2 v[4:5], v[2:3], off
 ; GX900-NEXT: s_mov_b32 s4, 0x7060302
-; GX900-NEXT: s_waitcnt vmcnt(1)
-; GX900-NEXT: v_alignbit_b32 v1, s4, v5, 16
 ; GX900-NEXT: s_waitcnt vmcnt(0)
 ; GX900-NEXT: v_perm_b32 v0, v4, v6, s4
+; GX900-NEXT: v_alignbit_b32 v1, s4, v5, 16
 ; GX900-NEXT: s_setpc_b64 s[30:31]
 ;
 ; GFX940-LABEL: shuffle_v4bf16_357u:
 ; GFX940: ; %bb.0:
 ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX940-NEXT: global_load_dwordx2 v[4:5], v[2:3], off
 ; GFX940-NEXT: global_load_dword v6, v[0:1], off offset:4
+; GFX940-NEXT: global_load_dwordx2 v[4:5], v[2:3], off
 ; GFX940-NEXT: s_mov_b32 s0, 0x7060302
-; GFX940-NEXT: s_waitcnt vmcnt(1)
-; GFX940-NEXT: v_alignbit_b32 v1, s0, v5, 16
 ; GFX940-NEXT: s_waitcnt vmcnt(0)
 ; GFX940-NEXT: v_perm_b32 v0, v4, v6, s0
+; GFX940-NEXT: v_alignbit_b32 v1, s0, v5, 16
 ; GFX940-NEXT: s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: shuffle_v4bf16_357u:
 ; GFX10: ; %bb.0:
 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT: global_load_dwordx2 v[4:5], v[2:3], off
 ; GFX10-NEXT: global_load_dword v6, v[0:1], off offset:4
-; GFX10-NEXT: s_waitcnt vmcnt(1)
-; GFX10-NEXT: v_alignbit_b32 v1, s4, v5, 16
+; GFX10-NEXT: global_load_dwordx2 v[4:5], v[2:3], off
 ; GFX10-NEXT: s_waitcnt vmcnt(0)
 ; GFX10-NEXT: v_perm_b32 v0, v4, v6, 0x7060302
+; GFX10-NEXT: v_alignbit_b32 v1, s4, v5, 16
 ; GFX10-NEXT: s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: shuffle_v4bf16_357u:
 ; GFX11: ; %bb.0:
 ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: global_load_b64 v[2:3], v[2:3], off
-; GFX11-NEXT: global_load_b32 v0, v[0:1], off offset:4
-; GFX11-NEXT: s_waitcnt vmcnt(1)
-; GFX11-NEXT: v_alignbit_b32 v1, s0, v3, 16
+; GFX11-NEXT: global_load_b32 v4, v[0:1], off offset:4
+; GFX11-NEXT: global_load_b64 v[0:1], v[2:3], off
 ; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: v_perm_b32 v0, v2, v0, 0x7060302
+; GFX11-NEXT: v_perm_b32 v0, v0, v4, 0x7060302
+; GFX11-NEXT: v_alignbit_b32 v1, s0, v1, 16
 ; GFX11-NEXT: s_setpc_b64 s[30:31]
 %val0 = load <4 x bfloat>, ptr addrspace(1) %arg0
 %val1 = load <4 x bfloat>, ptr addrspace(1) %arg1
@@ -4163,34 +4125,31 @@ define <4 x bfloat> @shuffle_v4bf16_3456(ptr addrspace(1
 ; GFX9-LABEL: shuffle_v4bf16_3456:
 ; GFX9: ; %bb.0:
 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: global_load_dwordx2 v[4:5], v[2:3], off
 ; GFX9-NEXT: global_load_dword v6, v[0:1], off offset:4
-; GFX9-NEXT: s_waitcnt vmcnt(1)
-; GFX9-NEXT: v_alignbit_b32 v1, v5, v4, 16
+; GFX9-NEXT: global_load_dwordx2 v[4:5], v[2:3], off
 ; GFX9-NEXT: s_waitcnt vmcnt(0)
 ; GFX9-NEXT: v_alignbit_b32 v0, v4, v6, 16
+; GFX9-NEXT: v_alignbit_b32 v1, v5, v4, 16
 ; GFX9-NEXT: s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: shuffle_v4bf16_3456:
 ; GFX10: ; %bb.0:
 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT: global_load_dwordx2 v[4:5], v[2:3], off
 ; GFX10-NEXT: global_load_dword v6, v[0:1], off offset:4
-; GFX10-NEXT: s_waitcnt vmcnt(1)
-; GFX10-NEXT: v_alignbit_b32 v1, v5, v4, 16
+; GFX10-NEXT: global_load_dwordx2 v[4:5], v[2:3], off
 ; GFX10-NEXT: s_waitcnt vmcnt(0)
 ; GFX10-NEXT: v_alignbit_b32 v0, v4, v6, 16
+; GFX10-NEXT: v_alignbit_b32 v1, v5, v4, 16
 ; GFX10-NEXT: s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: shuffle_v4bf16_3456:
 ; GFX11: ; %bb.0:
 ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: global_load_b64 v[2:3], v[2:3], off
 ; GFX11-NEXT: global_load_b32 v0, v[0:1], off offset:4
-; GFX11-NEXT: s_waitcnt vmcnt(1)
-; GFX11-NEXT: v_alignbit_b32 v1, v3, v2, 16
+; GFX11-NEXT: global_load_b64 v[1:2], v[2:3], off
 ; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: v_alignbit_b32 v0, v2, v0, 16
+; GFX11-NEXT: v_alignbit_b32 v0, v1, v0, 16
+; GFX11-NEXT: v_alignbit_b32 v1, v2, v1, 16
 ; GFX11-NEXT: s_setpc_b64 s[30:31]
 %val0 = load <4 x bfloat>, ptr addrspace(1) %arg0
 %val1 = load <4 x bfloat>, ptr addrspace(1) %arg1
@@ -4202,12 +4161,11 @@ define <4 x bfloat> @shuffle_v4bf16_5634(ptr addrspace(1
 ; GFX9-LABEL: shuffle_v4bf16_5634:
 ; GFX9: ; %bb.0:
 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: global_load_dwordx2 v[4:5], v[2:3], off
 ; GFX9-NEXT: global_load_dword v6, v[0:1], off offset:4
-; GFX9-NEXT: s_waitcnt vmcnt(1)
-; GFX9-NEXT: v_alignbit_b32 v0, v5, v4, 16
+; GFX9-NEXT: global_load_dwordx2 v[4:5], v[2:3], off
 ; GFX9-NEXT: s_waitcnt vmcnt(0)
 ; GFX9-NEXT: v_alignbit_b32 v1, v4, v6, 16
+; GFX9-NEXT: v_alignbit_b32 v0, v5, v4, 16
 ; GFX9-NEXT: s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: shuffle_v4bf16_5634:
@@ -4293,7 +4251,7 @@ define <4 x bfloat> @shuffle_v4bf16_0000(ptr addrspace(1
 ; GX900-LABEL: shuffle_v4bf16_0000:
 ; GX900: ; %bb.0:
 ; GX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GX900-NEXT: global_load_dwordx2 v[0:1], v[0:1], off
+; GX900-NEXT: global_load_dword v0, v[0:1], off
 ; GX900-NEXT: s_mov_b32 s4, 0x5040100
 ; GX900-NEXT: s_waitcnt vmcnt(0)
 ; GX900-NEXT: v_perm_b32 v0, v0, v0, s4
@@ -4303,7 +4261,7 @@ define <4 x bfloat> @shuffle_v4bf16_0000(ptr addrspace(1
 ; GFX940-LABEL: shuffle_v4bf16_0000:
 ; GFX940: ; %bb.0:
 ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX940-NEXT: global_load_dwordx2 v[0:1], v[0:1], off
+; GFX940-NEXT: global_load_dword v0, v[0:1], off
 ; GFX940-NEXT: s_mov_b32 s0, 0x5040100
 ; GFX940-NEXT: s_waitcnt vmcnt(0)
 ; GFX940-NEXT: v_perm_b32 v0, v0, v0, s0
@@ -4313,7 +4271,7 @@ define <4 x bfloat> @shuffle_v4bf16_0000(ptr addrspace(1
 ; GFX10-LABEL: shuffle_v4bf16_0000:
 ; GFX10: ; %bb.0:
 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT: global_load_dwordx2 v[0:1], v[0:1], off
+; GFX10-NEXT: global_load_dword v0, v[0:1], off
 ; GFX10-NEXT: s_waitcnt vmcnt(0)
 ; GFX10-NEXT: v_perm_b32 v0, v0, v0, 0x5040100
 ; GFX10-NEXT: v_mov_b32_e32 v1, v0
@@ -4322,7 +4280,7 @@ define <4 x bfloat> @shuffle_v4bf16_0000(ptr addrspace(1
 ; GFX11-LABEL: shuffle_v4bf16_0000:
 ; GFX11: ; %bb.0:
 ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: global_load_b64 v[0:1], v[0:1], off
+; GFX11-NEXT: global_load_b32 v0, v[0:1], off
 ; GFX11-NEXT: s_waitcnt vmcnt(0)
 ; GFX11-NEXT: v_perm_b32 v0, v0, v0, 0x5040100
 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
@@
-5235,43 +5193,39 @@ define <4 x bfloat> @shuffle_v4bf16_0456(ptr addrspace(1) %arg0, ptr addrspace(1 ; GX900-LABEL: shuffle_v4bf16_0456: ; GX900: ; %bb.0: ; GX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GX900-NEXT: global_load_dwordx2 v[4:5], v[0:1], off -; GX900-NEXT: global_load_dwordx2 v[5:6], v[2:3], off +; GX900-NEXT: global_load_dword v6, v[0:1], off +; GX900-NEXT: global_load_dwordx2 v[4:5], v[2:3], off ; GX900-NEXT: s_mov_b32 s4, 0x5040100 -; GX900-NEXT: ; kill: killed $vgpr0 killed $vgpr1 -; GX900-NEXT: ; kill: killed $vgpr2 killed $vgpr3 ; GX900-NEXT: s_waitcnt vmcnt(0) -; GX900-NEXT: v_perm_b32 v0, v5, v4, s4 -; GX900-NEXT: v_alignbit_b32 v1, v6, v5, 16 +; GX900-NEXT: v_perm_b32 v0, v4, v6, s4 +; GX900-NEXT: v_alignbit_b32 v1, v5, v4, 16 ; GX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: shuffle_v4bf16_0456: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: global_load_dwordx2 v[4:5], v[0:1], off -; GFX940-NEXT: global_load_dwordx2 v[6:7], v[2:3], off +; GFX940-NEXT: global_load_dword v6, v[0:1], off +; GFX940-NEXT: global_load_dwordx2 v[4:5], v[2:3], off ; GFX940-NEXT: s_mov_b32 s0, 0x5040100 ; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: v_perm_b32 v0, v6, v4, s0 -; GFX940-NEXT: v_alignbit_b32 v1, v7, v6, 16 +; GFX940-NEXT: v_perm_b32 v0, v4, v6, s0 +; GFX940-NEXT: v_alignbit_b32 v1, v5, v4, 16 ; GFX940-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: shuffle_v4bf16_0456: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: global_load_dwordx2 v[4:5], v[0:1], off -; GFX10-NEXT: global_load_dwordx2 v[5:6], v[2:3], off -; GFX10-NEXT: ; kill: killed $vgpr0 killed $vgpr1 -; GFX10-NEXT: ; kill: killed $vgpr2 killed $vgpr3 +; GFX10-NEXT: global_load_dword v6, v[0:1], off +; GFX10-NEXT: global_load_dwordx2 v[4:5], v[2:3], off ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_perm_b32 v0, v5, v4, 0x5040100 -; GFX10-NEXT: v_alignbit_b32 v1, v6, v5, 16 +; GFX10-NEXT: v_perm_b32 v0, v4, v6, 0x5040100 +; GFX10-NEXT: v_alignbit_b32 v1, v5, v4, 16 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: shuffle_v4bf16_0456: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: global_load_b64 v[0:1], v[0:1], off +; GFX11-NEXT: global_load_b32 v0, v[0:1], off ; GFX11-NEXT: global_load_b64 v[1:2], v[2:3], off ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_perm_b32 v0, v1, v0, 0x5040100 diff --git a/llvm/test/CodeGen/ARM/crash-on-pow2-shufflevector.ll b/llvm/test/CodeGen/ARM/crash-on-pow2-shufflevector.ll index 8186f6c9b42fb..695b0a796eb43 100644 --- a/llvm/test/CodeGen/ARM/crash-on-pow2-shufflevector.ll +++ b/llvm/test/CodeGen/ARM/crash-on-pow2-shufflevector.ll @@ -8,9 +8,10 @@ define i32 @foo(ptr %descs, i32 %num, i32 %cw) local_unnamed_addr #0 { ; CHECK-LABEL: foo: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: vldr d16, [r0, #32] +; CHECK-NEXT: add r0, r0, #32 +; CHECK-NEXT: vld1.32 {d16[1]}, [r0:32] ; CHECK-NEXT: vadd.i32 d16, d16, d16 -; CHECK-NEXT: vmov.32 r0, d16[0] +; CHECK-NEXT: vmov.32 r0, d16[1] ; CHECK-NEXT: bx lr entry: %wide.vec = load <16 x i32>, ptr %descs, align 4 diff --git a/llvm/test/CodeGen/ARM/vector-promotion.ll b/llvm/test/CodeGen/ARM/vector-promotion.ll index 344014ad80449..c3889ccfec7db 100644 --- a/llvm/test/CodeGen/ARM/vector-promotion.ll +++ b/llvm/test/CodeGen/ARM/vector-promotion.ll @@ -44,7 +44,7 @@ define void @unsupportedInstructionForPromotion(ptr %addr1, i32 %in2, ptr %dest) ; IR-BOTH-LABEL: @unsupportedChainInDifferentBBs -; IR-BOTH: 
[[LOAD:%[a-zA-Z_0-9-]+]] = load <2 x i32>, ptr %addr1 +; IR-BOTH: [[LOAD:%[a-zA-Z_0-9-]+]] = load volatile <2 x i32>, ptr %addr1 ; IR-BOTH-NEXT: [[EXTRACT:%[a-zA-Z_0-9-]+]] = extractelement <2 x i32> [[LOAD]], i32 0 ; IR-BOTH-NEXT: br i1 %bool, label %bb2, label %end ; BB2 @@ -58,10 +58,10 @@ define void @unsupportedInstructionForPromotion(ptr %addr1, i32 %in2, ptr %dest) ; ASM: bx define void @unsupportedChainInDifferentBBs(ptr %addr1, ptr %dest, i1 %bool) { bb1: - %in1 = load <2 x i32>, ptr %addr1, align 8 + %in1 = load volatile <2 x i32>, ptr %addr1, align 8 %extract = extractelement <2 x i32> %in1, i32 0 br i1 %bool, label %bb2, label %end -bb2: +bb2: %out = or i32 %extract, 1 store i32 %out, ptr %dest, align 4 br label %end @@ -150,7 +150,7 @@ define void @udivCase(ptr %addr1, ptr %dest) { ; IR-STRESS-NEXT: [[RES:%[a-zA-Z_0-9-]+]] = extractelement <2 x i32> [[DIV]], i32 1 ; ; IR-BOTH-NEXT: store i32 [[RES]], ptr %dest -; IR-BOTH-NEXT: ret +; IR-BOTH-NEXT: ret define void @uremCase(ptr %addr1, ptr %dest) { %in1 = load <2 x i32>, ptr %addr1, align 8 %extract = extractelement <2 x i32> %in1, i32 1 @@ -169,7 +169,7 @@ define void @uremCase(ptr %addr1, ptr %dest) { ; IR-STRESS-NEXT: [[RES:%[a-zA-Z_0-9-]+]] = extractelement <2 x i32> [[DIV]], i32 1 ; ; IR-BOTH-NEXT: store i32 [[RES]], ptr %dest -; IR-BOTH-NEXT: ret +; IR-BOTH-NEXT: ret define void @sdivCase(ptr %addr1, ptr %dest) { %in1 = load <2 x i32>, ptr %addr1, align 8 %extract = extractelement <2 x i32> %in1, i32 1 @@ -188,7 +188,7 @@ define void @sdivCase(ptr %addr1, ptr %dest) { ; IR-STRESS-NEXT: [[RES:%[a-zA-Z_0-9-]+]] = extractelement <2 x i32> [[DIV]], i32 1 ; ; IR-BOTH-NEXT: store i32 [[RES]], ptr %dest -; IR-BOTH-NEXT: ret +; IR-BOTH-NEXT: ret define void @sremCase(ptr %addr1, ptr %dest) { %in1 = load <2 x i32>, ptr %addr1, align 8 %extract = extractelement <2 x i32> %in1, i32 1 @@ -199,7 +199,7 @@ define void @sremCase(ptr %addr1, ptr %dest) { ; IR-BOTH-LABEL: @fdivCase ; IR-BOTH: [[LOAD:%[a-zA-Z_0-9-]+]] = load <2 x float>, ptr %addr1 -; Scalar version: +; Scalar version: ; IR-NORMAL-NEXT: [[EXTRACT:%[a-zA-Z_0-9-]+]] = extractelement <2 x float> [[LOAD]], i32 1 ; IR-NORMAL-NEXT: [[RES:%[a-zA-Z_0-9-]+]] = fdiv float [[EXTRACT]], 7.0 ; Vector version: @@ -209,7 +209,7 @@ define void @sremCase(ptr %addr1, ptr %dest) { ; IR-BOTH-NEXT: store float [[RES]], ptr %dest ; IR-BOTH-NEXT: ret define void @fdivCase(ptr %addr1, ptr %dest) { - %in1 = load <2 x float>, ptr %addr1, align 8 + %in1 = load <2 x float>, ptr %addr1, align 8 %extract = extractelement <2 x float> %in1, i32 1 %out = fdiv float %extract, 7.0 store float %out, ptr %dest, align 4 @@ -218,7 +218,7 @@ define void @fdivCase(ptr %addr1, ptr %dest) { ; IR-BOTH-LABEL: @fremCase ; IR-BOTH: [[LOAD:%[a-zA-Z_0-9-]+]] = load <2 x float>, ptr %addr1 -; Scalar version: +; Scalar version: ; IR-NORMAL-NEXT: [[EXTRACT:%[a-zA-Z_0-9-]+]] = extractelement <2 x float> [[LOAD]], i32 1 ; IR-NORMAL-NEXT: [[RES:%[a-zA-Z_0-9-]+]] = frem float [[EXTRACT]], 7.0 ; Vector version: @@ -228,7 +228,7 @@ define void @fdivCase(ptr %addr1, ptr %dest) { ; IR-BOTH-NEXT: store float [[RES]], ptr %dest ; IR-BOTH-NEXT: ret define void @fremCase(ptr %addr1, ptr %dest) { - %in1 = load <2 x float>, ptr %addr1, align 8 + %in1 = load <2 x float>, ptr %addr1, align 8 %extract = extractelement <2 x float> %in1, i32 1 %out = frem float %extract, 7.0 store float %out, ptr %dest, align 4 @@ -272,7 +272,7 @@ define void @undefRemCase(ptr %addr1, ptr %dest) { ; flag is set. 
; IR-BOTH-LABEL: @undefConstantFRemCaseWithFastMath ; IR-BOTH: [[LOAD:%[a-zA-Z_0-9-]+]] = load <2 x float>, ptr %addr1 -; Scalar version: +; Scalar version: ; IR-NORMAL-NEXT: [[EXTRACT:%[a-zA-Z_0-9-]+]] = extractelement <2 x float> [[LOAD]], i32 1 ; IR-NORMAL-NEXT: [[RES:%[a-zA-Z_0-9-]+]] = frem nnan float [[EXTRACT]], 7.0 ; Vector version: @@ -282,7 +282,7 @@ define void @undefRemCase(ptr %addr1, ptr %dest) { ; IR-BOTH-NEXT: store float [[RES]], ptr %dest ; IR-BOTH-NEXT: ret define void @undefConstantFRemCaseWithFastMath(ptr %addr1, ptr %dest) { - %in1 = load <2 x float>, ptr %addr1, align 8 + %in1 = load <2 x float>, ptr %addr1, align 8 %extract = extractelement <2 x float> %in1, i32 1 %out = frem nnan float %extract, 7.0 store float %out, ptr %dest, align 4 @@ -293,7 +293,7 @@ define void @undefConstantFRemCaseWithFastMath(ptr %addr1, ptr %dest) { ; flag is set. ; IR-BOTH-LABEL: @undefVectorFRemCaseWithFastMath ; IR-BOTH: [[LOAD:%[a-zA-Z_0-9-]+]] = load <2 x float>, ptr %addr1 -; Scalar version: +; Scalar version: ; IR-NORMAL-NEXT: [[EXTRACT:%[a-zA-Z_0-9-]+]] = extractelement <2 x float> [[LOAD]], i32 1 ; IR-NORMAL-NEXT: [[RES:%[a-zA-Z_0-9-]+]] = frem nnan float 7.000000e+00, [[EXTRACT]] ; Vector version: @@ -303,7 +303,7 @@ define void @undefConstantFRemCaseWithFastMath(ptr %addr1, ptr %dest) { ; IR-BOTH-NEXT: store float [[RES]], ptr %dest ; IR-BOTH-NEXT: ret define void @undefVectorFRemCaseWithFastMath(ptr %addr1, ptr %dest) { - %in1 = load <2 x float>, ptr %addr1, align 8 + %in1 = load <2 x float>, ptr %addr1, align 8 %extract = extractelement <2 x float> %in1, i32 1 %out = frem nnan float 7.0, %extract store float %out, ptr %dest, align 4 @@ -315,7 +315,7 @@ define void @undefVectorFRemCaseWithFastMath(ptr %addr1, ptr %dest) { ; not promote on armv7. 
; IR-BOTH-LABEL: @simpleOneInstructionPromotionFloat ; IR-BOTH: [[LOAD:%[a-zA-Z_0-9-]+]] = load <2 x float>, ptr %addr1 -; Scalar version: +; Scalar version: ; IR-NORMAL-NEXT: [[EXTRACT:%[a-zA-Z_0-9-]+]] = extractelement <2 x float> [[LOAD]], i32 1 ; IR-NORMAL-NEXT: [[RES:%[a-zA-Z_0-9-]+]] = fadd float [[EXTRACT]], 1.0 ; Vector version: diff --git a/llvm/test/CodeGen/ARM/vext.ll b/llvm/test/CodeGen/ARM/vext.ll index 46f778d3c2c91..6c545823c466d 100644 --- a/llvm/test/CodeGen/ARM/vext.ll +++ b/llvm/test/CodeGen/ARM/vext.ll @@ -76,9 +76,10 @@ define <4 x i16> @test_vextd16(ptr %A, ptr %B) nounwind { define <4 x i32> @test_vextq32(ptr %A, ptr %B) nounwind { ; CHECK-LABEL: test_vextq32: ; CHECK: @ %bb.0: -; CHECK-NEXT: vld1.64 {d16, d17}, [r1] -; CHECK-NEXT: vld1.64 {d18, d19}, [r0] -; CHECK-NEXT: vext.32 q8, q9, q8, #3 +; CHECK-NEXT: add r0, r0, #12 +; CHECK-NEXT: vld1.64 {d18, d19}, [r1] +; CHECK-NEXT: vld1.32 {d17[1]}, [r0:32] +; CHECK-NEXT: vext.32 q8, q8, q9, #3 ; CHECK-NEXT: vmov r0, r1, d16 ; CHECK-NEXT: vmov r2, r3, d17 ; CHECK-NEXT: mov pc, lr diff --git a/llvm/test/CodeGen/ARM/vuzp.ll b/llvm/test/CodeGen/ARM/vuzp.ll index 7e1dfba34db2e..0c162036e21d8 100644 --- a/llvm/test/CodeGen/ARM/vuzp.ll +++ b/llvm/test/CodeGen/ARM/vuzp.ll @@ -285,13 +285,13 @@ entry: define <4 x i32> @vuzp_lower_shufflemask_zeroed(ptr %A, ptr %B) { ; CHECK-LABEL: vuzp_lower_shufflemask_zeroed: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: vldr d17, [r0] -; CHECK-NEXT: vorr d18, d17, d17 -; CHECK-NEXT: vldr d16, [r1] -; CHECK-NEXT: vdup.32 d17, d17[0] -; CHECK-NEXT: vtrn.32 d18, d16 -; CHECK-NEXT: vmov r0, r1, d17 -; CHECK-NEXT: vmov r2, r3, d16 +; CHECK-NEXT: vldr d16, [r0] +; CHECK-NEXT: add r0, r1, #4 +; CHECK-NEXT: vld1.32 {d17[1]}, [r0:32] +; CHECK-NEXT: vdup.32 d18, d16[0] +; CHECK-NEXT: vmov r0, r1, d18 +; CHECK-NEXT: vtrn.32 d16, d17 +; CHECK-NEXT: vmov r2, r3, d17 ; CHECK-NEXT: mov pc, lr entry: %tmp1 = load <2 x i32>, ptr %A diff --git a/llvm/test/CodeGen/Mips/cconv/vector.ll b/llvm/test/CodeGen/Mips/cconv/vector.ll index 383e5ef19cebf..09e2cb89be618 100644 --- a/llvm/test/CodeGen/Mips/cconv/vector.ll +++ b/llvm/test/CodeGen/Mips/cconv/vector.ll @@ -5513,28 +5513,28 @@ define float @mixed_i8(<2 x float> %a, i8 %b, <2 x float> %c) { ; MIPS32R5-NEXT: jr $ra ; MIPS32R5-NEXT: nop ; -; MIPS64R5-LABEL: mixed_i8: -; MIPS64R5: # %bb.0: # %entry -; MIPS64R5-NEXT: daddiu $sp, $sp, -48 -; MIPS64R5-NEXT: .cfi_def_cfa_offset 48 -; MIPS64R5-NEXT: sll $1, $5, 0 -; MIPS64R5-NEXT: andi $1, $1, 255 -; MIPS64R5-NEXT: mtc1 $1, $f0 -; MIPS64R5-NEXT: cvt.s.w $f0, $f0 -; MIPS64R5-NEXT: swc1 $f0, 36($sp) -; MIPS64R5-NEXT: swc1 $f0, 32($sp) -; MIPS64R5-NEXT: sd $4, 0($sp) -; MIPS64R5-NEXT: ld.w $w0, 0($sp) -; MIPS64R5-NEXT: ld.w $w1, 32($sp) -; MIPS64R5-NEXT: fadd.w $w0, $w1, $w0 -; MIPS64R5-NEXT: sd $6, 16($sp) -; MIPS64R5-NEXT: ld.w $w1, 16($sp) -; MIPS64R5-NEXT: fadd.w $w0, $w0, $w1 -; MIPS64R5-NEXT: splati.w $w1, $w0[1] -; MIPS64R5-NEXT: add.s $f0, $f0, $f1 -; MIPS64R5-NEXT: daddiu $sp, $sp, 48 -; MIPS64R5-NEXT: jr $ra -; MIPS64R5-NEXT: nop +; MIPS64R5EB-LABEL: mixed_i8: +; MIPS64R5EB: # %bb.0: # %entry +; MIPS64R5EB-NEXT: daddiu $sp, $sp, -48 +; MIPS64R5EB-NEXT: .cfi_def_cfa_offset 48 +; MIPS64R5EB-NEXT: sll $1, $5, 0 +; MIPS64R5EB-NEXT: andi $1, $1, 255 +; MIPS64R5EB-NEXT: mtc1 $1, $f0 +; MIPS64R5EB-NEXT: cvt.s.w $f0, $f0 +; MIPS64R5EB-NEXT: swc1 $f0, 36($sp) +; MIPS64R5EB-NEXT: swc1 $f0, 32($sp) +; MIPS64R5EB-NEXT: insert.d $w0[0], $4 +; MIPS64R5EB-NEXT: shf.w $w0, $w0, 177 +; MIPS64R5EB-NEXT: ld.w $w1, 32($sp) +; 
MIPS64R5EB-NEXT: fadd.w $w0, $w1, $w0 +; MIPS64R5EB-NEXT: insert.d $w1[0], $6 +; MIPS64R5EB-NEXT: shf.w $w1, $w1, 177 +; MIPS64R5EB-NEXT: fadd.w $w0, $w0, $w1 +; MIPS64R5EB-NEXT: splati.w $w1, $w0[1] +; MIPS64R5EB-NEXT: add.s $f0, $f0, $f1 +; MIPS64R5EB-NEXT: daddiu $sp, $sp, 48 +; MIPS64R5EB-NEXT: jr $ra +; MIPS64R5EB-NEXT: nop ; ; MIPS64EL-LABEL: mixed_i8: ; MIPS64EL: # %bb.0: # %entry @@ -5559,6 +5559,27 @@ define float @mixed_i8(<2 x float> %a, i8 %b, <2 x float> %c) { ; MIPS64EL-NEXT: add.s $f0, $f1, $f0 ; MIPS64EL-NEXT: jr $ra ; MIPS64EL-NEXT: nop +; +; MIPS64R5EL-LABEL: mixed_i8: +; MIPS64R5EL: # %bb.0: # %entry +; MIPS64R5EL-NEXT: daddiu $sp, $sp, -48 +; MIPS64R5EL-NEXT: .cfi_def_cfa_offset 48 +; MIPS64R5EL-NEXT: sll $1, $5, 0 +; MIPS64R5EL-NEXT: andi $1, $1, 255 +; MIPS64R5EL-NEXT: mtc1 $1, $f0 +; MIPS64R5EL-NEXT: cvt.s.w $f0, $f0 +; MIPS64R5EL-NEXT: swc1 $f0, 36($sp) +; MIPS64R5EL-NEXT: swc1 $f0, 32($sp) +; MIPS64R5EL-NEXT: insert.d $w0[0], $4 +; MIPS64R5EL-NEXT: ld.w $w1, 32($sp) +; MIPS64R5EL-NEXT: fadd.w $w0, $w1, $w0 +; MIPS64R5EL-NEXT: insert.d $w1[0], $6 +; MIPS64R5EL-NEXT: fadd.w $w0, $w0, $w1 +; MIPS64R5EL-NEXT: splati.w $w1, $w0[1] +; MIPS64R5EL-NEXT: add.s $f0, $f0, $f1 +; MIPS64R5EL-NEXT: daddiu $sp, $sp, 48 +; MIPS64R5EL-NEXT: jr $ra +; MIPS64R5EL-NEXT: nop entry: %0 = zext i8 %b to i32 %1 = uitofp i32 %0 to float diff --git a/llvm/test/CodeGen/Mips/msa/basic_operations.ll b/llvm/test/CodeGen/Mips/msa/basic_operations.ll index 4fc3f57aa002d..7c8c31c0ec181 100644 --- a/llvm/test/CodeGen/Mips/msa/basic_operations.ll +++ b/llvm/test/CodeGen/Mips/msa/basic_operations.ll @@ -1014,34 +1014,54 @@ define i32 @extract_sext_v4i32() nounwind { ; O32: # %bb.0: ; O32-NEXT: lui $2, %hi(_gp_disp) ; O32-NEXT: addiu $2, $2, %lo(_gp_disp) +; O32-NEXT: addiu $sp, $sp, -32 +; O32-NEXT: sw $ra, 28($sp) # 4-byte Folded Spill +; O32-NEXT: sw $fp, 24($sp) # 4-byte Folded Spill +; O32-NEXT: move $fp, $sp +; O32-NEXT: addiu $1, $zero, -16 +; O32-NEXT: and $sp, $sp, $1 ; O32-NEXT: addu $1, $2, $25 ; O32-NEXT: lw $1, %got(v4i32)($1) -; O32-NEXT: ld.w $w0, 0($1) +; O32-NEXT: lw $1, 4($1) +; O32-NEXT: sw $1, 4($sp) +; O32-NEXT: ld.w $w0, 0($sp) ; O32-NEXT: addv.w $w0, $w0, $w0 -; O32-NEXT: jr $ra ; O32-NEXT: copy_s.w $2, $w0[1] +; O32-NEXT: move $sp, $fp +; O32-NEXT: lw $fp, 24($sp) # 4-byte Folded Reload +; O32-NEXT: lw $ra, 28($sp) # 4-byte Folded Reload +; O32-NEXT: jr $ra +; O32-NEXT: addiu $sp, $sp, 32 ; ; N32-LABEL: extract_sext_v4i32: ; N32: # %bb.0: +; N32-NEXT: addiu $sp, $sp, -16 ; N32-NEXT: lui $1, %hi(%neg(%gp_rel(extract_sext_v4i32))) ; N32-NEXT: addu $1, $1, $25 ; N32-NEXT: addiu $1, $1, %lo(%neg(%gp_rel(extract_sext_v4i32))) ; N32-NEXT: lw $1, %got_disp(v4i32)($1) -; N32-NEXT: ld.w $w0, 0($1) +; N32-NEXT: lw $1, 4($1) +; N32-NEXT: sw $1, 4($sp) +; N32-NEXT: ld.w $w0, 0($sp) ; N32-NEXT: addv.w $w0, $w0, $w0 -; N32-NEXT: jr $ra ; N32-NEXT: copy_s.w $2, $w0[1] +; N32-NEXT: jr $ra +; N32-NEXT: addiu $sp, $sp, 16 ; ; N64-LABEL: extract_sext_v4i32: ; N64: # %bb.0: +; N64-NEXT: daddiu $sp, $sp, -16 ; N64-NEXT: lui $1, %hi(%neg(%gp_rel(extract_sext_v4i32))) ; N64-NEXT: daddu $1, $1, $25 ; N64-NEXT: daddiu $1, $1, %lo(%neg(%gp_rel(extract_sext_v4i32))) ; N64-NEXT: ld $1, %got_disp(v4i32)($1) -; N64-NEXT: ld.w $w0, 0($1) +; N64-NEXT: lw $1, 4($1) +; N64-NEXT: sw $1, 4($sp) +; N64-NEXT: ld.w $w0, 0($sp) ; N64-NEXT: addv.w $w0, $w0, $w0 -; N64-NEXT: jr $ra ; N64-NEXT: copy_s.w $2, $w0[1] +; N64-NEXT: jr $ra +; N64-NEXT: daddiu $sp, $sp, 16 %1 = load <4 x i32>, ptr @v4i32 %2 = add <4 x i32> %1, 
%1 %3 = extractelement <4 x i32> %2, i32 1 @@ -1076,25 +1096,33 @@ define i64 @extract_sext_v2i64() nounwind { ; ; N32-LABEL: extract_sext_v2i64: ; N32: # %bb.0: +; N32-NEXT: addiu $sp, $sp, -16 ; N32-NEXT: lui $1, %hi(%neg(%gp_rel(extract_sext_v2i64))) ; N32-NEXT: addu $1, $1, $25 ; N32-NEXT: addiu $1, $1, %lo(%neg(%gp_rel(extract_sext_v2i64))) ; N32-NEXT: lw $1, %got_disp(v2i64)($1) -; N32-NEXT: ld.d $w0, 0($1) +; N32-NEXT: ld $1, 8($1) +; N32-NEXT: sd $1, 8($sp) +; N32-NEXT: ld.d $w0, 0($sp) ; N32-NEXT: addv.d $w0, $w0, $w0 -; N32-NEXT: jr $ra ; N32-NEXT: copy_s.d $2, $w0[1] +; N32-NEXT: jr $ra +; N32-NEXT: addiu $sp, $sp, 16 ; ; N64-LABEL: extract_sext_v2i64: ; N64: # %bb.0: +; N64-NEXT: daddiu $sp, $sp, -16 ; N64-NEXT: lui $1, %hi(%neg(%gp_rel(extract_sext_v2i64))) ; N64-NEXT: daddu $1, $1, $25 ; N64-NEXT: daddiu $1, $1, %lo(%neg(%gp_rel(extract_sext_v2i64))) ; N64-NEXT: ld $1, %got_disp(v2i64)($1) -; N64-NEXT: ld.d $w0, 0($1) +; N64-NEXT: ld $1, 8($1) +; N64-NEXT: sd $1, 8($sp) +; N64-NEXT: ld.d $w0, 0($sp) ; N64-NEXT: addv.d $w0, $w0, $w0 -; N64-NEXT: jr $ra ; N64-NEXT: copy_s.d $2, $w0[1] +; N64-NEXT: jr $ra +; N64-NEXT: daddiu $sp, $sp, 16 %1 = load <2 x i64>, ptr @v2i64 %2 = add <2 x i64> %1, %1 %3 = extractelement <2 x i64> %2, i32 1 @@ -1186,34 +1214,54 @@ define i32 @extract_zext_v4i32() nounwind { ; O32: # %bb.0: ; O32-NEXT: lui $2, %hi(_gp_disp) ; O32-NEXT: addiu $2, $2, %lo(_gp_disp) +; O32-NEXT: addiu $sp, $sp, -32 +; O32-NEXT: sw $ra, 28($sp) # 4-byte Folded Spill +; O32-NEXT: sw $fp, 24($sp) # 4-byte Folded Spill +; O32-NEXT: move $fp, $sp +; O32-NEXT: addiu $1, $zero, -16 +; O32-NEXT: and $sp, $sp, $1 ; O32-NEXT: addu $1, $2, $25 ; O32-NEXT: lw $1, %got(v4i32)($1) -; O32-NEXT: ld.w $w0, 0($1) +; O32-NEXT: lw $1, 4($1) +; O32-NEXT: sw $1, 4($sp) +; O32-NEXT: ld.w $w0, 0($sp) ; O32-NEXT: addv.w $w0, $w0, $w0 -; O32-NEXT: jr $ra ; O32-NEXT: copy_s.w $2, $w0[1] +; O32-NEXT: move $sp, $fp +; O32-NEXT: lw $fp, 24($sp) # 4-byte Folded Reload +; O32-NEXT: lw $ra, 28($sp) # 4-byte Folded Reload +; O32-NEXT: jr $ra +; O32-NEXT: addiu $sp, $sp, 32 ; ; N32-LABEL: extract_zext_v4i32: ; N32: # %bb.0: +; N32-NEXT: addiu $sp, $sp, -16 ; N32-NEXT: lui $1, %hi(%neg(%gp_rel(extract_zext_v4i32))) ; N32-NEXT: addu $1, $1, $25 ; N32-NEXT: addiu $1, $1, %lo(%neg(%gp_rel(extract_zext_v4i32))) ; N32-NEXT: lw $1, %got_disp(v4i32)($1) -; N32-NEXT: ld.w $w0, 0($1) +; N32-NEXT: lw $1, 4($1) +; N32-NEXT: sw $1, 4($sp) +; N32-NEXT: ld.w $w0, 0($sp) ; N32-NEXT: addv.w $w0, $w0, $w0 -; N32-NEXT: jr $ra ; N32-NEXT: copy_s.w $2, $w0[1] +; N32-NEXT: jr $ra +; N32-NEXT: addiu $sp, $sp, 16 ; ; N64-LABEL: extract_zext_v4i32: ; N64: # %bb.0: +; N64-NEXT: daddiu $sp, $sp, -16 ; N64-NEXT: lui $1, %hi(%neg(%gp_rel(extract_zext_v4i32))) ; N64-NEXT: daddu $1, $1, $25 ; N64-NEXT: daddiu $1, $1, %lo(%neg(%gp_rel(extract_zext_v4i32))) ; N64-NEXT: ld $1, %got_disp(v4i32)($1) -; N64-NEXT: ld.w $w0, 0($1) +; N64-NEXT: lw $1, 4($1) +; N64-NEXT: sw $1, 4($sp) +; N64-NEXT: ld.w $w0, 0($sp) ; N64-NEXT: addv.w $w0, $w0, $w0 -; N64-NEXT: jr $ra ; N64-NEXT: copy_s.w $2, $w0[1] +; N64-NEXT: jr $ra +; N64-NEXT: daddiu $sp, $sp, 16 %1 = load <4 x i32>, ptr @v4i32 %2 = add <4 x i32> %1, %1 %3 = extractelement <4 x i32> %2, i32 1 @@ -1248,25 +1296,33 @@ define i64 @extract_zext_v2i64() nounwind { ; ; N32-LABEL: extract_zext_v2i64: ; N32: # %bb.0: +; N32-NEXT: addiu $sp, $sp, -16 ; N32-NEXT: lui $1, %hi(%neg(%gp_rel(extract_zext_v2i64))) ; N32-NEXT: addu $1, $1, $25 ; N32-NEXT: addiu $1, $1, %lo(%neg(%gp_rel(extract_zext_v2i64))) ; 
N32-NEXT: lw $1, %got_disp(v2i64)($1) -; N32-NEXT: ld.d $w0, 0($1) +; N32-NEXT: ld $1, 8($1) +; N32-NEXT: sd $1, 8($sp) +; N32-NEXT: ld.d $w0, 0($sp) ; N32-NEXT: addv.d $w0, $w0, $w0 -; N32-NEXT: jr $ra ; N32-NEXT: copy_s.d $2, $w0[1] +; N32-NEXT: jr $ra +; N32-NEXT: addiu $sp, $sp, 16 ; ; N64-LABEL: extract_zext_v2i64: ; N64: # %bb.0: +; N64-NEXT: daddiu $sp, $sp, -16 ; N64-NEXT: lui $1, %hi(%neg(%gp_rel(extract_zext_v2i64))) ; N64-NEXT: daddu $1, $1, $25 ; N64-NEXT: daddiu $1, $1, %lo(%neg(%gp_rel(extract_zext_v2i64))) ; N64-NEXT: ld $1, %got_disp(v2i64)($1) -; N64-NEXT: ld.d $w0, 0($1) +; N64-NEXT: ld $1, 8($1) +; N64-NEXT: sd $1, 8($sp) +; N64-NEXT: ld.d $w0, 0($sp) ; N64-NEXT: addv.d $w0, $w0, $w0 -; N64-NEXT: jr $ra ; N64-NEXT: copy_s.d $2, $w0[1] +; N64-NEXT: jr $ra +; N64-NEXT: daddiu $sp, $sp, 16 %1 = load <2 x i64>, ptr @v2i64 %2 = add <2 x i64> %1, %1 %3 = extractelement <2 x i64> %2, i32 1 diff --git a/llvm/test/CodeGen/NVPTX/i128.ll b/llvm/test/CodeGen/NVPTX/i128.ll index ca1b5fdabbf8f..3a82a8abd20c6 100644 --- a/llvm/test/CodeGen/NVPTX/i128.ll +++ b/llvm/test/CodeGen/NVPTX/i128.ll @@ -298,13 +298,13 @@ define i128 @srem_i128_pow2k(i128 %lhs) { define i128 @urem_i128_pow2k(i128 %lhs) { ; CHECK-LABEL: urem_i128_pow2k( ; CHECK: { -; CHECK-NEXT: .reg .b64 %rd<5>; +; CHECK-NEXT: .reg .b64 %rd<4>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.v2.u64 {%rd1, %rd2}, [urem_i128_pow2k_param_0]; -; CHECK-NEXT: and.b64 %rd3, %rd1, 8589934591; -; CHECK-NEXT: mov.b64 %rd4, 0; -; CHECK-NEXT: st.param.v2.b64 [func_retval0], {%rd3, %rd4}; +; CHECK-NEXT: ld.param.u64 %rd1, [urem_i128_pow2k_param_0]; +; CHECK-NEXT: and.b64 %rd2, %rd1, 8589934591; +; CHECK-NEXT: mov.b64 %rd3, 0; +; CHECK-NEXT: st.param.v2.b64 [func_retval0], {%rd2, %rd3}; ; CHECK-NEXT: ret; %div = urem i128 %lhs, 8589934592 ret i128 %div diff --git a/llvm/test/CodeGen/NVPTX/i8x4-instructions.ll b/llvm/test/CodeGen/NVPTX/i8x4-instructions.ll index c2f166770a7ad..e1079814a8e7a 100644 --- a/llvm/test/CodeGen/NVPTX/i8x4-instructions.ll +++ b/llvm/test/CodeGen/NVPTX/i8x4-instructions.ll @@ -268,19 +268,19 @@ define <4 x i8> @test_smax(<4 x i8> %a, <4 x i8> %b) #0 { ; CHECK-NEXT: bfe.s32 %r9, %r2, 24, 8; ; CHECK-NEXT: bfe.s32 %r10, %r1, 24, 8; ; CHECK-NEXT: setp.gt.s32 %p4, %r10, %r9; -; CHECK-NEXT: bfe.u32 %r11, %r1, 0, 8; -; CHECK-NEXT: bfe.u32 %r12, %r1, 8, 8; -; CHECK-NEXT: bfe.u32 %r13, %r1, 16, 8; -; CHECK-NEXT: bfe.u32 %r14, %r1, 24, 8; -; CHECK-NEXT: bfe.u32 %r15, %r2, 24, 8; -; CHECK-NEXT: selp.b32 %r16, %r14, %r15, %p4; -; CHECK-NEXT: bfe.u32 %r17, %r2, 16, 8; -; CHECK-NEXT: selp.b32 %r18, %r13, %r17, %p3; +; CHECK-NEXT: bfe.u32 %r11, %r2, 0, 8; +; CHECK-NEXT: bfe.u32 %r12, %r2, 8, 8; +; CHECK-NEXT: bfe.u32 %r13, %r2, 16, 8; +; CHECK-NEXT: bfe.u32 %r14, %r2, 24, 8; +; CHECK-NEXT: bfe.u32 %r15, %r1, 24, 8; +; CHECK-NEXT: selp.b32 %r16, %r15, %r14, %p4; +; CHECK-NEXT: bfe.u32 %r17, %r1, 16, 8; +; CHECK-NEXT: selp.b32 %r18, %r17, %r13, %p3; ; CHECK-NEXT: prmt.b32 %r19, %r18, %r16, 0x3340U; -; CHECK-NEXT: bfe.u32 %r20, %r2, 8, 8; -; CHECK-NEXT: selp.b32 %r21, %r12, %r20, %p2; -; CHECK-NEXT: bfe.u32 %r22, %r2, 0, 8; -; CHECK-NEXT: selp.b32 %r23, %r11, %r22, %p1; +; CHECK-NEXT: bfe.u32 %r20, %r1, 8, 8; +; CHECK-NEXT: selp.b32 %r21, %r20, %r12, %p2; +; CHECK-NEXT: bfe.u32 %r22, %r1, 0, 8; +; CHECK-NEXT: selp.b32 %r23, %r22, %r11, %p1; ; CHECK-NEXT: prmt.b32 %r24, %r23, %r21, 0x3340U; ; CHECK-NEXT: prmt.b32 %r25, %r24, %r19, 0x5410U; ; CHECK-NEXT: st.param.b32 [func_retval0], %r25; @@ -346,19 +346,19 @@ define <4 x 
i8> @test_smin(<4 x i8> %a, <4 x i8> %b) #0 { ; CHECK-NEXT: bfe.s32 %r9, %r2, 24, 8; ; CHECK-NEXT: bfe.s32 %r10, %r1, 24, 8; ; CHECK-NEXT: setp.le.s32 %p4, %r10, %r9; -; CHECK-NEXT: bfe.u32 %r11, %r1, 0, 8; -; CHECK-NEXT: bfe.u32 %r12, %r1, 8, 8; -; CHECK-NEXT: bfe.u32 %r13, %r1, 16, 8; -; CHECK-NEXT: bfe.u32 %r14, %r1, 24, 8; -; CHECK-NEXT: bfe.u32 %r15, %r2, 24, 8; -; CHECK-NEXT: selp.b32 %r16, %r14, %r15, %p4; -; CHECK-NEXT: bfe.u32 %r17, %r2, 16, 8; -; CHECK-NEXT: selp.b32 %r18, %r13, %r17, %p3; +; CHECK-NEXT: bfe.u32 %r11, %r2, 0, 8; +; CHECK-NEXT: bfe.u32 %r12, %r2, 8, 8; +; CHECK-NEXT: bfe.u32 %r13, %r2, 16, 8; +; CHECK-NEXT: bfe.u32 %r14, %r2, 24, 8; +; CHECK-NEXT: bfe.u32 %r15, %r1, 24, 8; +; CHECK-NEXT: selp.b32 %r16, %r15, %r14, %p4; +; CHECK-NEXT: bfe.u32 %r17, %r1, 16, 8; +; CHECK-NEXT: selp.b32 %r18, %r17, %r13, %p3; ; CHECK-NEXT: prmt.b32 %r19, %r18, %r16, 0x3340U; -; CHECK-NEXT: bfe.u32 %r20, %r2, 8, 8; -; CHECK-NEXT: selp.b32 %r21, %r12, %r20, %p2; -; CHECK-NEXT: bfe.u32 %r22, %r2, 0, 8; -; CHECK-NEXT: selp.b32 %r23, %r11, %r22, %p1; +; CHECK-NEXT: bfe.u32 %r20, %r1, 8, 8; +; CHECK-NEXT: selp.b32 %r21, %r20, %r12, %p2; +; CHECK-NEXT: bfe.u32 %r22, %r1, 0, 8; +; CHECK-NEXT: selp.b32 %r23, %r22, %r11, %p1; ; CHECK-NEXT: prmt.b32 %r24, %r23, %r21, 0x3340U; ; CHECK-NEXT: prmt.b32 %r25, %r24, %r19, 0x5410U; ; CHECK-NEXT: st.param.b32 [func_retval0], %r25; diff --git a/llvm/test/CodeGen/PowerPC/aix-vector-byval-callee.ll b/llvm/test/CodeGen/PowerPC/aix-vector-byval-callee.ll index 80c26471d8cdb..8eaa8d7713bcd 100644 --- a/llvm/test/CodeGen/PowerPC/aix-vector-byval-callee.ll +++ b/llvm/test/CodeGen/PowerPC/aix-vector-byval-callee.ll @@ -15,9 +15,9 @@ define i32 @vec_struct_test(i32 %i, ptr nocapture readonly byval(%struct.vec_str ; 32BIT: bb.0.entry: ; 32BIT-NEXT: liveins: $r3, $r5, $r6, $r7, $r8 ; 32BIT-NEXT: {{ $}} - ; 32BIT-NEXT: STW killed renamable $r7, 8, %fixed-stack.0 :: (store (s32) into %fixed-stack.0 + 8, align 8) - ; 32BIT-NEXT: STW killed renamable $r6, 4, %fixed-stack.0 :: (store (s32) into %fixed-stack.0 + 4) ; 32BIT-NEXT: STW renamable $r5, 0, %fixed-stack.0 :: (store (s32) into %fixed-stack.0, align 16) + ; 32BIT-NEXT: STW killed renamable $r6, 4, %fixed-stack.0 :: (store (s32) into %fixed-stack.0 + 4) + ; 32BIT-NEXT: STW killed renamable $r7, 8, %fixed-stack.0 :: (store (s32) into %fixed-stack.0 + 8, align 8) ; 32BIT-NEXT: STW killed renamable $r8, 12, %fixed-stack.0 :: (store (s32) into %fixed-stack.0 + 12) ; 32BIT-NEXT: renamable $r3 = nsw ADD4 killed renamable $r5, killed renamable $r3 ; 32BIT-NEXT: BLR implicit $lr, implicit $rm, implicit $r3 diff --git a/llvm/test/CodeGen/PowerPC/canonical-merge-shuffles.ll b/llvm/test/CodeGen/PowerPC/canonical-merge-shuffles.ll index 7f6fdc7f88cd1..b5607e3d91e10 100644 --- a/llvm/test/CodeGen/PowerPC/canonical-merge-shuffles.ll +++ b/llvm/test/CodeGen/PowerPC/canonical-merge-shuffles.ll @@ -1105,16 +1105,11 @@ define <2 x i64> @testSplati64_1(ptr nocapture readonly %ptr) #0 { ; ; CHECK-NOVSX-LABEL: testSplati64_1: ; CHECK-NOVSX: # %bb.0: # %entry -; CHECK-NOVSX-NEXT: ld r4, 8(r3) -; CHECK-NOVSX-NEXT: std r4, -8(r1) -; CHECK-NOVSX-NEXT: ld r3, 0(r3) +; CHECK-NOVSX-NEXT: ld r3, 8(r3) +; CHECK-NOVSX-NEXT: std r3, -8(r1) ; CHECK-NOVSX-NEXT: std r3, -16(r1) ; CHECK-NOVSX-NEXT: addi r3, r1, -16 ; CHECK-NOVSX-NEXT: lvx v2, 0, r3 -; CHECK-NOVSX-NEXT: addis r3, r2, .LCPI21_0@toc@ha -; CHECK-NOVSX-NEXT: addi r3, r3, .LCPI21_0@toc@l -; CHECK-NOVSX-NEXT: lvx v3, 0, r3 -; CHECK-NOVSX-NEXT: vperm v2, v2, v2, v3 ; CHECK-NOVSX-NEXT: blr ; 
; CHECK-P7-LABEL: testSplati64_1: diff --git a/llvm/test/CodeGen/PowerPC/const-stov.ll b/llvm/test/CodeGen/PowerPC/const-stov.ll index 69c68a4f27371..c32c1ff1fc06b 100644 --- a/llvm/test/CodeGen/PowerPC/const-stov.ll +++ b/llvm/test/CodeGen/PowerPC/const-stov.ll @@ -132,28 +132,29 @@ entry: define <2 x i64> @i64(ptr nocapture noundef readonly %p) { ; PWR7-BE-LABEL: i64: ; PWR7-BE: # %bb.0: # %entry -; PWR7-BE-NEXT: lxvd2x v2, 0, r3 +; PWR7-BE-NEXT: lfd f0, 0(r3) ; PWR7-BE-NEXT: li r3, 10 +; PWR7-BE-NEXT: xxpermdi v2, vs0, vs0, 1 ; PWR7-BE-NEXT: std r3, -16(r1) -; PWR7-BE-NEXT: std r3, -8(r1) -; PWR7-BE-NEXT: addi r3, r1, -16 -; PWR7-BE-NEXT: lxvd2x v3, 0, r3 +; PWR7-BE-NEXT: lfd f0, -16(r1) +; PWR7-BE-NEXT: xxpermdi v3, vs0, vs0, 1 ; PWR7-BE-NEXT: xxmrghd v2, v2, v3 ; PWR7-BE-NEXT: blr ; ; PWR8-BE-LABEL: i64: ; PWR8-BE: # %bb.0: # %entry -; PWR8-BE-NEXT: lxvd2x v2, 0, r3 +; PWR8-BE-NEXT: lfd f0, 0(r3) ; PWR8-BE-NEXT: li r3, 10 +; PWR8-BE-NEXT: xxpermdi v2, vs0, vs0, 1 ; PWR8-BE-NEXT: mtfprd f0, r3 ; PWR8-BE-NEXT: xxmrghd v2, v2, vs0 ; PWR8-BE-NEXT: blr ; ; PWR8-LE-LABEL: i64: ; PWR8-LE: # %bb.0: # %entry -; PWR8-LE-NEXT: lxvd2x vs0, 0, r3 +; PWR8-LE-NEXT: lfd f0, 0(r3) ; PWR8-LE-NEXT: li r3, 10 -; PWR8-LE-NEXT: xxswapd v2, vs0 +; PWR8-LE-NEXT: xxspltd v2, vs0, 0 ; PWR8-LE-NEXT: mtfprd f0, r3 ; PWR8-LE-NEXT: xxpermdi v2, vs0, v2, 1 ; PWR8-LE-NEXT: blr diff --git a/llvm/test/CodeGen/PowerPC/pr27078.ll b/llvm/test/CodeGen/PowerPC/pr27078.ll index ee4d4ff9c6c79..beb63ce0127bc 100644 --- a/llvm/test/CodeGen/PowerPC/pr27078.ll +++ b/llvm/test/CodeGen/PowerPC/pr27078.ll @@ -4,19 +4,21 @@ define <4 x float> @bar(ptr %p, ptr %q) { ; CHECK-LABEL: bar: ; CHECK: # %bb.0: -; CHECK-NEXT: li 5, 16 -; CHECK-NEXT: lxvw4x 1, 0, 3 -; CHECK-NEXT: lxvw4x 3, 0, 4 -; CHECK-NEXT: xvsubsp 35, 3, 1 -; CHECK-NEXT: lxvw4x 0, 3, 5 -; CHECK-NEXT: lxvw4x 2, 4, 5 +; CHECK-NEXT: li 5, 24 +; CHECK-NEXT: lxvw4x 1, 0, 4 +; CHECK-NEXT: lfiwzx 0, 3, 5 +; CHECK-NEXT: xxmrghw 34, 0, 0 +; CHECK-NEXT: lfiwzx 0, 4, 5 ; CHECK-NEXT: addis 5, 2, .LCPI0_0@toc@ha ; CHECK-NEXT: addi 5, 5, .LCPI0_0@toc@l ; CHECK-NEXT: lxvw4x 36, 0, 5 -; CHECK-NEXT: li 5, 32 -; CHECK-NEXT: xvsubsp 34, 2, 0 -; CHECK-NEXT: lxvw4x 0, 3, 5 -; CHECK-NEXT: lxvw4x 1, 4, 5 +; CHECK-NEXT: li 5, 36 +; CHECK-NEXT: xxmrghw 35, 0, 0 +; CHECK-NEXT: lxvw4x 0, 0, 3 +; CHECK-NEXT: xvsubsp 34, 35, 34 +; CHECK-NEXT: xvsubsp 35, 1, 0 +; CHECK-NEXT: lfiwzx 0, 3, 5 +; CHECK-NEXT: lfiwzx 1, 4, 5 ; CHECK-NEXT: addis 3, 2, .LCPI0_1@toc@ha ; CHECK-NEXT: addi 3, 3, .LCPI0_1@toc@l ; CHECK-NEXT: vperm 2, 3, 2, 4 diff --git a/llvm/test/CodeGen/PowerPC/pre-inc-disable.ll b/llvm/test/CodeGen/PowerPC/pre-inc-disable.ll index 4435484ae0b94..1a897f3498ab9 100644 --- a/llvm/test/CodeGen/PowerPC/pre-inc-disable.ll +++ b/llvm/test/CodeGen/PowerPC/pre-inc-disable.ll @@ -317,24 +317,28 @@ define void @test16(ptr nocapture readonly %sums, i32 signext %delta, i32 signex ; P9BE-AIX32-LABEL: test16: ; P9BE-AIX32: # %bb.0: # %entry ; P9BE-AIX32-NEXT: slwi 4, 4, 1 -; P9BE-AIX32-NEXT: li 6, 0 ; P9BE-AIX32-NEXT: lhzux 4, 3, 4 ; P9BE-AIX32-NEXT: lhz 3, 16(3) -; P9BE-AIX32-NEXT: sth 6, -64(1) -; P9BE-AIX32-NEXT: lxv 2, -64(1) ; P9BE-AIX32-NEXT: sth 4, -48(1) -; P9BE-AIX32-NEXT: lxv 4, -48(1) ; P9BE-AIX32-NEXT: sth 3, -32(1) +; P9BE-AIX32-NEXT: li 3, 0 +; P9BE-AIX32-NEXT: sth 3, -64(1) +; P9BE-AIX32-NEXT: lwz 3, -32(1) +; P9BE-AIX32-NEXT: lxv 3, -64(1) +; P9BE-AIX32-NEXT: mtfprwz 0, 3 +; P9BE-AIX32-NEXT: lwz 3, -48(1) +; P9BE-AIX32-NEXT: xxinsertw 2, 0, 0 +; P9BE-AIX32-NEXT: mtfprwz 0, 3 ; P9BE-AIX32-NEXT: 
lwz 3, L..C3(2) # %const.0 -; P9BE-AIX32-NEXT: lxv 3, -32(1) -; P9BE-AIX32-NEXT: vmrghh 4, 2, 4 +; P9BE-AIX32-NEXT: vmrghh 2, 3, 2 +; P9BE-AIX32-NEXT: xxinsertw 4, 0, 0 +; P9BE-AIX32-NEXT: vmrghh 4, 3, 4 +; P9BE-AIX32-NEXT: vsplth 3, 3, 0 ; P9BE-AIX32-NEXT: lxv 0, 0(3) -; P9BE-AIX32-NEXT: vmrghh 3, 2, 3 -; P9BE-AIX32-NEXT: vsplth 2, 2, 0 -; P9BE-AIX32-NEXT: xxmrghw 2, 2, 4 -; P9BE-AIX32-NEXT: xxperm 3, 2, 0 -; P9BE-AIX32-NEXT: xxspltw 2, 3, 1 -; P9BE-AIX32-NEXT: vadduwm 2, 3, 2 +; P9BE-AIX32-NEXT: xxmrghw 3, 3, 4 +; P9BE-AIX32-NEXT: xxperm 2, 3, 0 +; P9BE-AIX32-NEXT: xxspltw 3, 2, 1 +; P9BE-AIX32-NEXT: vadduwm 2, 2, 3 ; P9BE-AIX32-NEXT: stxv 2, -16(1) ; P9BE-AIX32-NEXT: lwz 3, -16(1) ; P9BE-AIX32-NEXT: cmpw 3, 5 diff --git a/llvm/test/CodeGen/PowerPC/v16i8_scalar_to_vector_shuffle.ll b/llvm/test/CodeGen/PowerPC/v16i8_scalar_to_vector_shuffle.ll index 3ab49cd39f8d8..cefe5ad7b9e77 100644 --- a/llvm/test/CodeGen/PowerPC/v16i8_scalar_to_vector_shuffle.ll +++ b/llvm/test/CodeGen/PowerPC/v16i8_scalar_to_vector_shuffle.ll @@ -251,9 +251,13 @@ define <16 x i8> @test_none_v16i8(i8 %arg, ptr nocapture noundef readonly %b) { ; ; CHECK-AIX-32-P9-LABEL: test_none_v16i8: ; CHECK-AIX-32-P9: # %bb.0: # %entry -; CHECK-AIX-32-P9-NEXT: lxv v2, 0(r4) +; CHECK-AIX-32-P9-NEXT: lwz r4, 0(r4) ; CHECK-AIX-32-P9-NEXT: stb r3, -16(r1) -; CHECK-AIX-32-P9-NEXT: lxv v3, -16(r1) +; CHECK-AIX-32-P9-NEXT: lwz r3, -16(r1) +; CHECK-AIX-32-P9-NEXT: mtfprwz f0, r4 +; CHECK-AIX-32-P9-NEXT: xxinsertw v2, vs0, 0 +; CHECK-AIX-32-P9-NEXT: mtfprwz f0, r3 +; CHECK-AIX-32-P9-NEXT: xxinsertw v3, vs0, 0 ; CHECK-AIX-32-P9-NEXT: vmrghh v2, v2, v3 ; CHECK-AIX-32-P9-NEXT: blr entry: @@ -331,8 +335,12 @@ define <16 x i8> @test_v16i8_v8i16(i16 %arg, i8 %arg1) { ; CHECK-AIX-32-P9: # %bb.0: # %entry ; CHECK-AIX-32-P9-NEXT: stb r4, -32(r1) ; CHECK-AIX-32-P9-NEXT: sth r3, -16(r1) -; CHECK-AIX-32-P9-NEXT: lxv v2, -32(r1) -; CHECK-AIX-32-P9-NEXT: lxv v3, -16(r1) +; CHECK-AIX-32-P9-NEXT: lwz r4, -32(r1) +; CHECK-AIX-32-P9-NEXT: lwz r3, -16(r1) +; CHECK-AIX-32-P9-NEXT: mtfprwz f0, r4 +; CHECK-AIX-32-P9-NEXT: xxinsertw v2, vs0, 0 +; CHECK-AIX-32-P9-NEXT: mtfprwz f0, r3 +; CHECK-AIX-32-P9-NEXT: xxinsertw v3, vs0, 0 ; CHECK-AIX-32-P9-NEXT: vmrghh v2, v2, v3 ; CHECK-AIX-32-P9-NEXT: blr entry: @@ -411,8 +419,12 @@ define <16 x i8> @test_v8i16_v16i8(i16 %arg, i8 %arg1) { ; CHECK-AIX-32-P9: # %bb.0: # %entry ; CHECK-AIX-32-P9-NEXT: stb r4, -32(r1) ; CHECK-AIX-32-P9-NEXT: sth r3, -16(r1) -; CHECK-AIX-32-P9-NEXT: lxv v2, -32(r1) -; CHECK-AIX-32-P9-NEXT: lxv v3, -16(r1) +; CHECK-AIX-32-P9-NEXT: lwz r4, -32(r1) +; CHECK-AIX-32-P9-NEXT: lwz r3, -16(r1) +; CHECK-AIX-32-P9-NEXT: mtfprwz f0, r4 +; CHECK-AIX-32-P9-NEXT: xxinsertw v2, vs0, 0 +; CHECK-AIX-32-P9-NEXT: mtfprwz f0, r3 +; CHECK-AIX-32-P9-NEXT: xxinsertw v3, vs0, 0 ; CHECK-AIX-32-P9-NEXT: vmrghh v2, v3, v2 ; CHECK-AIX-32-P9-NEXT: blr entry: @@ -494,9 +506,13 @@ define <16 x i8> @test_none_v8i16(i16 %arg, ptr nocapture noundef readonly %b) { ; ; CHECK-AIX-32-P9-LABEL: test_none_v8i16: ; CHECK-AIX-32-P9: # %bb.0: # %entry -; CHECK-AIX-32-P9-NEXT: lxv v2, 0(r4) +; CHECK-AIX-32-P9-NEXT: lwz r4, 0(r4) ; CHECK-AIX-32-P9-NEXT: sth r3, -16(r1) -; CHECK-AIX-32-P9-NEXT: lxv v3, -16(r1) +; CHECK-AIX-32-P9-NEXT: lwz r3, -16(r1) +; CHECK-AIX-32-P9-NEXT: mtfprwz f0, r4 +; CHECK-AIX-32-P9-NEXT: xxinsertw v2, vs0, 0 +; CHECK-AIX-32-P9-NEXT: mtfprwz f0, r3 +; CHECK-AIX-32-P9-NEXT: xxinsertw v3, vs0, 0 ; CHECK-AIX-32-P9-NEXT: vmrghh v2, v2, v3 ; CHECK-AIX-32-P9-NEXT: blr entry: @@ -640,9 +656,11 @@ define <16 x i8> 
@test_v16i8_v4i32(i8 %arg, i32 %arg1, <16 x i8> %a, <4 x i32> % ; CHECK-AIX-32-P9-LABEL: test_v16i8_v4i32: ; CHECK-AIX-32-P9: # %bb.0: # %entry ; CHECK-AIX-32-P9-NEXT: stb r3, -16(r1) -; CHECK-AIX-32-P9-NEXT: stw r4, -32(r1) -; CHECK-AIX-32-P9-NEXT: lxv v2, -16(r1) -; CHECK-AIX-32-P9-NEXT: lxv v3, -32(r1) +; CHECK-AIX-32-P9-NEXT: lwz r3, -16(r1) +; CHECK-AIX-32-P9-NEXT: mtfprwz f0, r3 +; CHECK-AIX-32-P9-NEXT: xxinsertw v2, vs0, 0 +; CHECK-AIX-32-P9-NEXT: mtfprwz f0, r4 +; CHECK-AIX-32-P9-NEXT: xxinsertw v3, vs0, 0 ; CHECK-AIX-32-P9-NEXT: vmrghb v2, v2, v3 ; CHECK-AIX-32-P9-NEXT: blr entry: @@ -721,9 +739,11 @@ define <16 x i8> @test_v4i32_v16i8(i32 %arg, i8 %arg1) { ; CHECK-AIX-32-P9-LABEL: test_v4i32_v16i8: ; CHECK-AIX-32-P9: # %bb.0: # %entry ; CHECK-AIX-32-P9-NEXT: stb r4, -32(r1) -; CHECK-AIX-32-P9-NEXT: stw r3, -16(r1) -; CHECK-AIX-32-P9-NEXT: lxv v2, -32(r1) -; CHECK-AIX-32-P9-NEXT: lxv v3, -16(r1) +; CHECK-AIX-32-P9-NEXT: lwz r4, -32(r1) +; CHECK-AIX-32-P9-NEXT: mtfprwz f0, r4 +; CHECK-AIX-32-P9-NEXT: xxinsertw v2, vs0, 0 +; CHECK-AIX-32-P9-NEXT: mtfprwz f0, r3 +; CHECK-AIX-32-P9-NEXT: xxinsertw v3, vs0, 0 ; CHECK-AIX-32-P9-NEXT: vmrghh v2, v3, v2 ; CHECK-AIX-32-P9-NEXT: blr entry: @@ -984,9 +1004,11 @@ define <16 x i8> @test_v16i8_v2i64(i8 %arg, i64 %arg1, <16 x i8> %a, <2 x i64> % ; CHECK-AIX-32-P9-LABEL: test_v16i8_v2i64: ; CHECK-AIX-32-P9: # %bb.0: # %entry ; CHECK-AIX-32-P9-NEXT: stb r3, -16(r1) -; CHECK-AIX-32-P9-NEXT: stw r4, -32(r1) -; CHECK-AIX-32-P9-NEXT: lxv v2, -16(r1) -; CHECK-AIX-32-P9-NEXT: lxv v3, -32(r1) +; CHECK-AIX-32-P9-NEXT: lwz r3, -16(r1) +; CHECK-AIX-32-P9-NEXT: mtfprwz f0, r3 +; CHECK-AIX-32-P9-NEXT: xxinsertw v2, vs0, 0 +; CHECK-AIX-32-P9-NEXT: mtfprwz f0, r4 +; CHECK-AIX-32-P9-NEXT: xxinsertw v3, vs0, 0 ; CHECK-AIX-32-P9-NEXT: vmrghb v2, v2, v3 ; CHECK-AIX-32-P9-NEXT: blr entry: @@ -1059,9 +1081,11 @@ define <16 x i8> @test_v2i64_v16i8(i64 %arg, i8 %arg1) { ; CHECK-AIX-32-P9-LABEL: test_v2i64_v16i8: ; CHECK-AIX-32-P9: # %bb.0: # %entry ; CHECK-AIX-32-P9-NEXT: stb r5, -32(r1) -; CHECK-AIX-32-P9-NEXT: stw r3, -16(r1) -; CHECK-AIX-32-P9-NEXT: lxv v2, -32(r1) -; CHECK-AIX-32-P9-NEXT: lxv v3, -16(r1) +; CHECK-AIX-32-P9-NEXT: lwz r4, -32(r1) +; CHECK-AIX-32-P9-NEXT: mtfprwz f0, r4 +; CHECK-AIX-32-P9-NEXT: xxinsertw v2, vs0, 0 +; CHECK-AIX-32-P9-NEXT: mtfprwz f0, r3 +; CHECK-AIX-32-P9-NEXT: xxinsertw v3, vs0, 0 ; CHECK-AIX-32-P9-NEXT: vmrghh v2, v3, v2 ; CHECK-AIX-32-P9-NEXT: blr entry: @@ -1413,8 +1437,12 @@ define <16 x i8> @test_v8i16_v8i16rhs(i16 %arg, i16 %arg1) { ; CHECK-AIX-32-P9: # %bb.0: # %entry ; CHECK-AIX-32-P9-NEXT: sth r4, -16(r1) ; CHECK-AIX-32-P9-NEXT: sth r3, -32(r1) -; CHECK-AIX-32-P9-NEXT: lxv v2, -16(r1) -; CHECK-AIX-32-P9-NEXT: lxv v3, -32(r1) +; CHECK-AIX-32-P9-NEXT: lwz r3, -16(r1) +; CHECK-AIX-32-P9-NEXT: mtfprwz f0, r3 +; CHECK-AIX-32-P9-NEXT: lwz r3, -32(r1) +; CHECK-AIX-32-P9-NEXT: xxinsertw v2, vs0, 0 +; CHECK-AIX-32-P9-NEXT: mtfprwz f0, r3 +; CHECK-AIX-32-P9-NEXT: xxinsertw v3, vs0, 0 ; CHECK-AIX-32-P9-NEXT: vmrghh v2, v3, v2 ; CHECK-AIX-32-P9-NEXT: blr entry: @@ -1493,9 +1521,11 @@ define <16 x i8> @test_v8i16_v4i32(<8 x i16> %a, <4 x i32> %b, i16 %arg, i32 %ar ; CHECK-AIX-32-P9-LABEL: test_v8i16_v4i32: ; CHECK-AIX-32-P9: # %bb.0: # %entry ; CHECK-AIX-32-P9-NEXT: sth r3, -16(r1) -; CHECK-AIX-32-P9-NEXT: stw r4, -32(r1) -; CHECK-AIX-32-P9-NEXT: lxv v2, -16(r1) -; CHECK-AIX-32-P9-NEXT: lxv v3, -32(r1) +; CHECK-AIX-32-P9-NEXT: lwz r3, -16(r1) +; CHECK-AIX-32-P9-NEXT: mtfprwz f0, r3 +; CHECK-AIX-32-P9-NEXT: xxinsertw v2, vs0, 0 +; 
CHECK-AIX-32-P9-NEXT: mtfprwz f0, r4 +; CHECK-AIX-32-P9-NEXT: xxinsertw v3, vs0, 0 ; CHECK-AIX-32-P9-NEXT: vmrghb v2, v2, v3 ; CHECK-AIX-32-P9-NEXT: blr entry: @@ -1568,9 +1598,11 @@ define <16 x i8> @test_v8i16_v2i64(<8 x i16> %a, <2 x i64> %b, i16 %arg, i64 %ar ; CHECK-AIX-32-P9-LABEL: test_v8i16_v2i64: ; CHECK-AIX-32-P9: # %bb.0: # %entry ; CHECK-AIX-32-P9-NEXT: sth r3, -16(r1) -; CHECK-AIX-32-P9-NEXT: stw r4, -32(r1) -; CHECK-AIX-32-P9-NEXT: lxv v2, -16(r1) -; CHECK-AIX-32-P9-NEXT: lxv v3, -32(r1) +; CHECK-AIX-32-P9-NEXT: lwz r3, -16(r1) +; CHECK-AIX-32-P9-NEXT: mtfprwz f0, r3 +; CHECK-AIX-32-P9-NEXT: xxinsertw v2, vs0, 0 +; CHECK-AIX-32-P9-NEXT: mtfprwz f0, r4 +; CHECK-AIX-32-P9-NEXT: xxinsertw v3, vs0, 0 ; CHECK-AIX-32-P9-NEXT: vmrghb v2, v2, v3 ; CHECK-AIX-32-P9-NEXT: blr entry: @@ -1638,10 +1670,10 @@ define <16 x i8> @test_v4i32_v4i32(i32 %arg, i32 %arg1, <4 x i32> %a, <4 x i32> ; ; CHECK-AIX-32-P9-LABEL: test_v4i32_v4i32: ; CHECK-AIX-32-P9: # %bb.0: # %entry -; CHECK-AIX-32-P9-NEXT: stw r4, -16(r1) -; CHECK-AIX-32-P9-NEXT: stw r3, -32(r1) -; CHECK-AIX-32-P9-NEXT: lxv vs0, -16(r1) -; CHECK-AIX-32-P9-NEXT: lxv vs1, -32(r1) +; CHECK-AIX-32-P9-NEXT: mtfprwz f0, r4 +; CHECK-AIX-32-P9-NEXT: mtfprwz f1, r3 +; CHECK-AIX-32-P9-NEXT: xxinsertw vs0, vs0, 0 +; CHECK-AIX-32-P9-NEXT: xxinsertw vs1, vs1, 0 ; CHECK-AIX-32-P9-NEXT: xxmrghw v2, vs1, vs0 ; CHECK-AIX-32-P9-NEXT: blr entry: @@ -1719,10 +1751,12 @@ define <16 x i8> @test_v4i32_v8i16(i32 %arg, i16 %arg1) { ; ; CHECK-AIX-32-P9-LABEL: test_v4i32_v8i16: ; CHECK-AIX-32-P9: # %bb.0: # %entry -; CHECK-AIX-32-P9-NEXT: stw r3, -16(r1) ; CHECK-AIX-32-P9-NEXT: sth r4, -32(r1) -; CHECK-AIX-32-P9-NEXT: lxv v2, -16(r1) -; CHECK-AIX-32-P9-NEXT: lxv v3, -32(r1) +; CHECK-AIX-32-P9-NEXT: mtfprwz f0, r3 +; CHECK-AIX-32-P9-NEXT: lwz r3, -32(r1) +; CHECK-AIX-32-P9-NEXT: xxinsertw v2, vs0, 0 +; CHECK-AIX-32-P9-NEXT: mtfprwz f0, r3 +; CHECK-AIX-32-P9-NEXT: xxinsertw v3, vs0, 0 ; CHECK-AIX-32-P9-NEXT: vmrghh v2, v2, v3 ; CHECK-AIX-32-P9-NEXT: blr entry: @@ -1866,10 +1900,10 @@ define <16 x i8> @test_v2i64_v4i32(i64 %arg, i32 %arg1, <2 x i64> %a, <4 x i32> ; ; CHECK-AIX-32-P9-LABEL: test_v2i64_v4i32: ; CHECK-AIX-32-P9: # %bb.0: # %entry -; CHECK-AIX-32-P9-NEXT: stw r5, -16(r1) -; CHECK-AIX-32-P9-NEXT: stw r3, -32(r1) -; CHECK-AIX-32-P9-NEXT: lxv vs0, -16(r1) -; CHECK-AIX-32-P9-NEXT: lxv vs1, -32(r1) +; CHECK-AIX-32-P9-NEXT: mtfprwz f0, r5 +; CHECK-AIX-32-P9-NEXT: mtfprwz f1, r3 +; CHECK-AIX-32-P9-NEXT: xxinsertw vs0, vs0, 0 +; CHECK-AIX-32-P9-NEXT: xxinsertw vs1, vs1, 0 ; CHECK-AIX-32-P9-NEXT: xxmrghw v2, vs1, vs0 ; CHECK-AIX-32-P9-NEXT: blr entry: @@ -1941,10 +1975,12 @@ define <16 x i8> @test_v2i64_v8i16(i64 %arg, i16 %arg1) { ; ; CHECK-AIX-32-P9-LABEL: test_v2i64_v8i16: ; CHECK-AIX-32-P9: # %bb.0: # %entry -; CHECK-AIX-32-P9-NEXT: stw r3, -16(r1) ; CHECK-AIX-32-P9-NEXT: sth r5, -32(r1) -; CHECK-AIX-32-P9-NEXT: lxv v2, -16(r1) -; CHECK-AIX-32-P9-NEXT: lxv v3, -32(r1) +; CHECK-AIX-32-P9-NEXT: mtfprwz f0, r3 +; CHECK-AIX-32-P9-NEXT: lwz r3, -32(r1) +; CHECK-AIX-32-P9-NEXT: xxinsertw v2, vs0, 0 +; CHECK-AIX-32-P9-NEXT: mtfprwz f0, r3 +; CHECK-AIX-32-P9-NEXT: xxinsertw v3, vs0, 0 ; CHECK-AIX-32-P9-NEXT: vmrghh v2, v2, v3 ; CHECK-AIX-32-P9-NEXT: blr entry: diff --git a/llvm/test/CodeGen/PowerPC/v2i64_scalar_to_vector_shuffle.ll b/llvm/test/CodeGen/PowerPC/v2i64_scalar_to_vector_shuffle.ll index fcfcda586694d..e7596e8cb7888 100644 --- a/llvm/test/CodeGen/PowerPC/v2i64_scalar_to_vector_shuffle.ll +++ b/llvm/test/CodeGen/PowerPC/v2i64_scalar_to_vector_shuffle.ll @@ 
-99,15 +99,14 @@ entry: define <2 x i64> @test_none_v16i8(i8 %arg1, ptr nocapture noundef readonly %b) { ; CHECK-LE-P8-LABEL: test_none_v16i8: ; CHECK-LE-P8: # %bb.0: # %entry -; CHECK-LE-P8-NEXT: lxvd2x vs0, 0, r4 -; CHECK-LE-P8-NEXT: xxswapd v2, vs0 ; CHECK-LE-P8-NEXT: mtfprd f0, r3 +; CHECK-LE-P8-NEXT: lxvdsx v2, 0, r4 ; CHECK-LE-P8-NEXT: xxpermdi v2, vs0, v2, 1 ; CHECK-LE-P8-NEXT: blr ; ; CHECK-LE-P9-LABEL: test_none_v16i8: ; CHECK-LE-P9: # %bb.0: # %entry -; CHECK-LE-P9-NEXT: lxv v2, 0(r4) +; CHECK-LE-P9-NEXT: lxvdsx v2, 0, r4 ; CHECK-LE-P9-NEXT: mtfprd f0, r3 ; CHECK-LE-P9-NEXT: xxpermdi v2, vs0, v2, 1 ; CHECK-LE-P9-NEXT: blr @@ -115,13 +114,13 @@ define <2 x i64> @test_none_v16i8(i8 %arg1, ptr nocapture noundef readonly %b) { ; CHECK-BE-P8-LABEL: test_none_v16i8: ; CHECK-BE-P8: # %bb.0: # %entry ; CHECK-BE-P8-NEXT: mtfprwz f0, r3 -; CHECK-BE-P8-NEXT: lxvd2x v2, 0, r4 +; CHECK-BE-P8-NEXT: lxvdsx v2, 0, r4 ; CHECK-BE-P8-NEXT: xxpermdi v2, v2, vs0, 1 ; CHECK-BE-P8-NEXT: blr ; ; CHECK-BE-P9-LABEL: test_none_v16i8: ; CHECK-BE-P9: # %bb.0: # %entry -; CHECK-BE-P9-NEXT: lxv v2, 0(r4) +; CHECK-BE-P9-NEXT: lxvdsx v2, 0, r4 ; CHECK-BE-P9-NEXT: mtfprwz f0, r3 ; CHECK-BE-P9-NEXT: xxpermdi v2, v2, vs0, 1 ; CHECK-BE-P9-NEXT: blr @@ -129,13 +128,13 @@ define <2 x i64> @test_none_v16i8(i8 %arg1, ptr nocapture noundef readonly %b) { ; CHECK-AIX-64-P8-LABEL: test_none_v16i8: ; CHECK-AIX-64-P8: # %bb.0: # %entry ; CHECK-AIX-64-P8-NEXT: mtfprwz f0, r3 -; CHECK-AIX-64-P8-NEXT: lxvd2x v2, 0, r4 +; CHECK-AIX-64-P8-NEXT: lxvdsx v2, 0, r4 ; CHECK-AIX-64-P8-NEXT: xxpermdi v2, v2, vs0, 1 ; CHECK-AIX-64-P8-NEXT: blr ; ; CHECK-AIX-64-P9-LABEL: test_none_v16i8: ; CHECK-AIX-64-P9: # %bb.0: # %entry -; CHECK-AIX-64-P9-NEXT: lxv v2, 0(r4) +; CHECK-AIX-64-P9-NEXT: lxvdsx v2, 0, r4 ; CHECK-AIX-64-P9-NEXT: mtfprwz f0, r3 ; CHECK-AIX-64-P9-NEXT: xxpermdi v2, v2, vs0, 1 ; CHECK-AIX-64-P9-NEXT: blr @@ -144,7 +143,8 @@ define <2 x i64> @test_none_v16i8(i8 %arg1, ptr nocapture noundef readonly %b) { ; CHECK-AIX-32-P8: # %bb.0: # %entry ; CHECK-AIX-32-P8-NEXT: stb r3, -16(r1) ; CHECK-AIX-32-P8-NEXT: addi r3, r1, -16 -; CHECK-AIX-32-P8-NEXT: lxvd2x v2, 0, r4 +; CHECK-AIX-32-P8-NEXT: lfd f0, 0(r4) +; CHECK-AIX-32-P8-NEXT: xxpermdi v2, vs0, vs0, 1 ; CHECK-AIX-32-P8-NEXT: lxvw4x vs0, 0, r3 ; CHECK-AIX-32-P8-NEXT: xxmrghd v2, v2, vs0 ; CHECK-AIX-32-P8-NEXT: blr @@ -153,7 +153,8 @@ define <2 x i64> @test_none_v16i8(i8 %arg1, ptr nocapture noundef readonly %b) { ; CHECK-AIX-32-P9: # %bb.0: # %entry ; CHECK-AIX-32-P9-NEXT: stb r3, -16(r1) ; CHECK-AIX-32-P9-NEXT: lxv vs0, -16(r1) -; CHECK-AIX-32-P9-NEXT: lxv v2, 0(r4) +; CHECK-AIX-32-P9-NEXT: lfd f1, 0(r4) +; CHECK-AIX-32-P9-NEXT: xxpermdi v2, vs1, vs0, 1 ; CHECK-AIX-32-P9-NEXT: xxmrghd v2, v2, vs0 ; CHECK-AIX-32-P9-NEXT: blr entry: @@ -167,15 +168,14 @@ entry: define <2 x i64> @test_v16i8_none(i8 %arg1, ptr nocapture noundef readonly %b) { ; CHECK-LE-P8-LABEL: test_v16i8_none: ; CHECK-LE-P8: # %bb.0: # %entry -; CHECK-LE-P8-NEXT: lxvd2x vs0, 0, r4 -; CHECK-LE-P8-NEXT: xxswapd v2, vs0 ; CHECK-LE-P8-NEXT: mtfprd f0, r3 +; CHECK-LE-P8-NEXT: lxvdsx v2, 0, r4 ; CHECK-LE-P8-NEXT: xxpermdi v2, v2, vs0, 2 ; CHECK-LE-P8-NEXT: blr ; ; CHECK-LE-P9-LABEL: test_v16i8_none: ; CHECK-LE-P9: # %bb.0: # %entry -; CHECK-LE-P9-NEXT: lxv v2, 0(r4) +; CHECK-LE-P9-NEXT: lxvdsx v2, 0, r4 ; CHECK-LE-P9-NEXT: mtfprd f0, r3 ; CHECK-LE-P9-NEXT: xxpermdi v2, v2, vs0, 2 ; CHECK-LE-P9-NEXT: blr @@ -183,13 +183,13 @@ define <2 x i64> @test_v16i8_none(i8 %arg1, ptr nocapture noundef readonly %b) { ; 
CHECK-BE-P8-LABEL: test_v16i8_none: ; CHECK-BE-P8: # %bb.0: # %entry ; CHECK-BE-P8-NEXT: mtfprwz f0, r3 -; CHECK-BE-P8-NEXT: lxvd2x v2, 0, r4 +; CHECK-BE-P8-NEXT: lxvdsx v2, 0, r4 ; CHECK-BE-P8-NEXT: xxpermdi v2, vs0, v2, 2 ; CHECK-BE-P8-NEXT: blr ; ; CHECK-BE-P9-LABEL: test_v16i8_none: ; CHECK-BE-P9: # %bb.0: # %entry -; CHECK-BE-P9-NEXT: lxv v2, 0(r4) +; CHECK-BE-P9-NEXT: lxvdsx v2, 0, r4 ; CHECK-BE-P9-NEXT: mtfprwz f0, r3 ; CHECK-BE-P9-NEXT: xxpermdi v2, vs0, v2, 2 ; CHECK-BE-P9-NEXT: blr @@ -197,13 +197,13 @@ define <2 x i64> @test_v16i8_none(i8 %arg1, ptr nocapture noundef readonly %b) { ; CHECK-AIX-64-P8-LABEL: test_v16i8_none: ; CHECK-AIX-64-P8: # %bb.0: # %entry ; CHECK-AIX-64-P8-NEXT: mtfprwz f0, r3 -; CHECK-AIX-64-P8-NEXT: lxvd2x v2, 0, r4 +; CHECK-AIX-64-P8-NEXT: lxvdsx v2, 0, r4 ; CHECK-AIX-64-P8-NEXT: xxpermdi v2, vs0, v2, 2 ; CHECK-AIX-64-P8-NEXT: blr ; ; CHECK-AIX-64-P9-LABEL: test_v16i8_none: ; CHECK-AIX-64-P9: # %bb.0: # %entry -; CHECK-AIX-64-P9-NEXT: lxv v2, 0(r4) +; CHECK-AIX-64-P9-NEXT: lxvdsx v2, 0, r4 ; CHECK-AIX-64-P9-NEXT: mtfprwz f0, r3 ; CHECK-AIX-64-P9-NEXT: xxpermdi v2, vs0, v2, 2 ; CHECK-AIX-64-P9-NEXT: blr @@ -212,7 +212,8 @@ define <2 x i64> @test_v16i8_none(i8 %arg1, ptr nocapture noundef readonly %b) { ; CHECK-AIX-32-P8: # %bb.0: # %entry ; CHECK-AIX-32-P8-NEXT: stb r3, -16(r1) ; CHECK-AIX-32-P8-NEXT: addi r3, r1, -16 -; CHECK-AIX-32-P8-NEXT: lxvd2x v2, 0, r4 +; CHECK-AIX-32-P8-NEXT: lfd f0, 0(r4) +; CHECK-AIX-32-P8-NEXT: xxpermdi v2, vs0, vs0, 1 ; CHECK-AIX-32-P8-NEXT: lxvw4x vs0, 0, r3 ; CHECK-AIX-32-P8-NEXT: xxmrghd v2, vs0, v2 ; CHECK-AIX-32-P8-NEXT: blr @@ -221,7 +222,8 @@ define <2 x i64> @test_v16i8_none(i8 %arg1, ptr nocapture noundef readonly %b) { ; CHECK-AIX-32-P9: # %bb.0: # %entry ; CHECK-AIX-32-P9-NEXT: stb r3, -16(r1) ; CHECK-AIX-32-P9-NEXT: lxv vs0, -16(r1) -; CHECK-AIX-32-P9-NEXT: lxv v2, 0(r4) +; CHECK-AIX-32-P9-NEXT: lfd f1, 0(r4) +; CHECK-AIX-32-P9-NEXT: xxpermdi v2, vs1, vs0, 1 ; CHECK-AIX-32-P9-NEXT: xxmrghd v2, vs0, v2 ; CHECK-AIX-32-P9-NEXT: blr entry: @@ -377,15 +379,14 @@ entry: define <2 x i64> @test_v8i16_none(i16 %arg1, ptr nocapture noundef readonly %b) { ; CHECK-LE-P8-LABEL: test_v8i16_none: ; CHECK-LE-P8: # %bb.0: # %entry -; CHECK-LE-P8-NEXT: lxvd2x vs0, 0, r4 -; CHECK-LE-P8-NEXT: xxswapd v2, vs0 ; CHECK-LE-P8-NEXT: mtfprd f0, r3 +; CHECK-LE-P8-NEXT: lxvdsx v2, 0, r4 ; CHECK-LE-P8-NEXT: xxpermdi v2, v2, vs0, 2 ; CHECK-LE-P8-NEXT: blr ; ; CHECK-LE-P9-LABEL: test_v8i16_none: ; CHECK-LE-P9: # %bb.0: # %entry -; CHECK-LE-P9-NEXT: lxv v2, 0(r4) +; CHECK-LE-P9-NEXT: lxvdsx v2, 0, r4 ; CHECK-LE-P9-NEXT: mtfprd f0, r3 ; CHECK-LE-P9-NEXT: xxpermdi v2, v2, vs0, 2 ; CHECK-LE-P9-NEXT: blr @@ -393,13 +394,13 @@ define <2 x i64> @test_v8i16_none(i16 %arg1, ptr nocapture noundef readonly %b) ; CHECK-BE-P8-LABEL: test_v8i16_none: ; CHECK-BE-P8: # %bb.0: # %entry ; CHECK-BE-P8-NEXT: mtfprwz f0, r3 -; CHECK-BE-P8-NEXT: lxvd2x v2, 0, r4 +; CHECK-BE-P8-NEXT: lxvdsx v2, 0, r4 ; CHECK-BE-P8-NEXT: xxpermdi v2, vs0, v2, 2 ; CHECK-BE-P8-NEXT: blr ; ; CHECK-BE-P9-LABEL: test_v8i16_none: ; CHECK-BE-P9: # %bb.0: # %entry -; CHECK-BE-P9-NEXT: lxv v2, 0(r4) +; CHECK-BE-P9-NEXT: lxvdsx v2, 0, r4 ; CHECK-BE-P9-NEXT: mtfprwz f0, r3 ; CHECK-BE-P9-NEXT: xxpermdi v2, vs0, v2, 2 ; CHECK-BE-P9-NEXT: blr @@ -407,13 +408,13 @@ define <2 x i64> @test_v8i16_none(i16 %arg1, ptr nocapture noundef readonly %b) ; CHECK-AIX-64-P8-LABEL: test_v8i16_none: ; CHECK-AIX-64-P8: # %bb.0: # %entry ; CHECK-AIX-64-P8-NEXT: mtfprwz f0, r3 -; CHECK-AIX-64-P8-NEXT: lxvd2x v2, 0, r4 
+; CHECK-AIX-64-P8-NEXT: lxvdsx v2, 0, r4 ; CHECK-AIX-64-P8-NEXT: xxpermdi v2, vs0, v2, 2 ; CHECK-AIX-64-P8-NEXT: blr ; ; CHECK-AIX-64-P9-LABEL: test_v8i16_none: ; CHECK-AIX-64-P9: # %bb.0: # %entry -; CHECK-AIX-64-P9-NEXT: lxv v2, 0(r4) +; CHECK-AIX-64-P9-NEXT: lxvdsx v2, 0, r4 ; CHECK-AIX-64-P9-NEXT: mtfprwz f0, r3 ; CHECK-AIX-64-P9-NEXT: xxpermdi v2, vs0, v2, 2 ; CHECK-AIX-64-P9-NEXT: blr @@ -422,7 +423,8 @@ define <2 x i64> @test_v8i16_none(i16 %arg1, ptr nocapture noundef readonly %b) ; CHECK-AIX-32-P8: # %bb.0: # %entry ; CHECK-AIX-32-P8-NEXT: sth r3, -16(r1) ; CHECK-AIX-32-P8-NEXT: addi r3, r1, -16 -; CHECK-AIX-32-P8-NEXT: lxvd2x v2, 0, r4 +; CHECK-AIX-32-P8-NEXT: lfd f0, 0(r4) +; CHECK-AIX-32-P8-NEXT: xxpermdi v2, vs0, vs0, 1 ; CHECK-AIX-32-P8-NEXT: lxvw4x vs0, 0, r3 ; CHECK-AIX-32-P8-NEXT: xxmrghd v2, vs0, v2 ; CHECK-AIX-32-P8-NEXT: blr @@ -431,7 +433,8 @@ define <2 x i64> @test_v8i16_none(i16 %arg1, ptr nocapture noundef readonly %b) ; CHECK-AIX-32-P9: # %bb.0: # %entry ; CHECK-AIX-32-P9-NEXT: sth r3, -16(r1) ; CHECK-AIX-32-P9-NEXT: lxv vs0, -16(r1) -; CHECK-AIX-32-P9-NEXT: lxv v2, 0(r4) +; CHECK-AIX-32-P9-NEXT: lfd f1, 0(r4) +; CHECK-AIX-32-P9-NEXT: xxpermdi v2, vs1, vs0, 1 ; CHECK-AIX-32-P9-NEXT: xxmrghd v2, vs0, v2 ; CHECK-AIX-32-P9-NEXT: blr entry: @@ -445,15 +448,14 @@ entry: define <2 x i64> @test_none_v8i16(i16 %arg1, ptr nocapture noundef readonly %b) { ; CHECK-LE-P8-LABEL: test_none_v8i16: ; CHECK-LE-P8: # %bb.0: # %entry -; CHECK-LE-P8-NEXT: lxvd2x vs0, 0, r4 -; CHECK-LE-P8-NEXT: xxswapd v2, vs0 ; CHECK-LE-P8-NEXT: mtfprd f0, r3 +; CHECK-LE-P8-NEXT: lxvdsx v2, 0, r4 ; CHECK-LE-P8-NEXT: xxpermdi v2, vs0, v2, 1 ; CHECK-LE-P8-NEXT: blr ; ; CHECK-LE-P9-LABEL: test_none_v8i16: ; CHECK-LE-P9: # %bb.0: # %entry -; CHECK-LE-P9-NEXT: lxv v2, 0(r4) +; CHECK-LE-P9-NEXT: lxvdsx v2, 0, r4 ; CHECK-LE-P9-NEXT: mtfprd f0, r3 ; CHECK-LE-P9-NEXT: xxpermdi v2, vs0, v2, 1 ; CHECK-LE-P9-NEXT: blr @@ -461,13 +463,13 @@ define <2 x i64> @test_none_v8i16(i16 %arg1, ptr nocapture noundef readonly %b) ; CHECK-BE-P8-LABEL: test_none_v8i16: ; CHECK-BE-P8: # %bb.0: # %entry ; CHECK-BE-P8-NEXT: mtfprwz f0, r3 -; CHECK-BE-P8-NEXT: lxvd2x v2, 0, r4 +; CHECK-BE-P8-NEXT: lxvdsx v2, 0, r4 ; CHECK-BE-P8-NEXT: xxpermdi v2, v2, vs0, 1 ; CHECK-BE-P8-NEXT: blr ; ; CHECK-BE-P9-LABEL: test_none_v8i16: ; CHECK-BE-P9: # %bb.0: # %entry -; CHECK-BE-P9-NEXT: lxv v2, 0(r4) +; CHECK-BE-P9-NEXT: lxvdsx v2, 0, r4 ; CHECK-BE-P9-NEXT: mtfprwz f0, r3 ; CHECK-BE-P9-NEXT: xxpermdi v2, v2, vs0, 1 ; CHECK-BE-P9-NEXT: blr @@ -475,13 +477,13 @@ define <2 x i64> @test_none_v8i16(i16 %arg1, ptr nocapture noundef readonly %b) ; CHECK-AIX-64-P8-LABEL: test_none_v8i16: ; CHECK-AIX-64-P8: # %bb.0: # %entry ; CHECK-AIX-64-P8-NEXT: mtfprwz f0, r3 -; CHECK-AIX-64-P8-NEXT: lxvd2x v2, 0, r4 +; CHECK-AIX-64-P8-NEXT: lxvdsx v2, 0, r4 ; CHECK-AIX-64-P8-NEXT: xxpermdi v2, v2, vs0, 1 ; CHECK-AIX-64-P8-NEXT: blr ; ; CHECK-AIX-64-P9-LABEL: test_none_v8i16: ; CHECK-AIX-64-P9: # %bb.0: # %entry -; CHECK-AIX-64-P9-NEXT: lxv v2, 0(r4) +; CHECK-AIX-64-P9-NEXT: lxvdsx v2, 0, r4 ; CHECK-AIX-64-P9-NEXT: mtfprwz f0, r3 ; CHECK-AIX-64-P9-NEXT: xxpermdi v2, v2, vs0, 1 ; CHECK-AIX-64-P9-NEXT: blr @@ -490,7 +492,8 @@ define <2 x i64> @test_none_v8i16(i16 %arg1, ptr nocapture noundef readonly %b) ; CHECK-AIX-32-P8: # %bb.0: # %entry ; CHECK-AIX-32-P8-NEXT: sth r3, -16(r1) ; CHECK-AIX-32-P8-NEXT: addi r3, r1, -16 -; CHECK-AIX-32-P8-NEXT: lxvd2x v2, 0, r4 +; CHECK-AIX-32-P8-NEXT: lfd f0, 0(r4) +; CHECK-AIX-32-P8-NEXT: xxpermdi v2, vs0, vs0, 1 ; 
CHECK-AIX-32-P8-NEXT: lxvw4x vs0, 0, r3 ; CHECK-AIX-32-P8-NEXT: xxmrghd v2, v2, vs0 ; CHECK-AIX-32-P8-NEXT: blr @@ -499,7 +502,8 @@ define <2 x i64> @test_none_v8i16(i16 %arg1, ptr nocapture noundef readonly %b) ; CHECK-AIX-32-P9: # %bb.0: # %entry ; CHECK-AIX-32-P9-NEXT: sth r3, -16(r1) ; CHECK-AIX-32-P9-NEXT: lxv vs0, -16(r1) -; CHECK-AIX-32-P9-NEXT: lxv v2, 0(r4) +; CHECK-AIX-32-P9-NEXT: lfd f1, 0(r4) +; CHECK-AIX-32-P9-NEXT: xxpermdi v2, vs1, vs0, 1 ; CHECK-AIX-32-P9-NEXT: xxmrghd v2, v2, vs0 ; CHECK-AIX-32-P9-NEXT: blr entry: @@ -655,15 +659,14 @@ entry: define <2 x i64> @test_none_v4i32(i32 %arg1, ptr nocapture noundef readonly %b) { ; CHECK-LE-P8-LABEL: test_none_v4i32: ; CHECK-LE-P8: # %bb.0: # %entry -; CHECK-LE-P8-NEXT: lxvd2x vs0, 0, r4 -; CHECK-LE-P8-NEXT: xxswapd v2, vs0 ; CHECK-LE-P8-NEXT: mtfprwz f0, r3 +; CHECK-LE-P8-NEXT: lxvdsx v2, 0, r4 ; CHECK-LE-P8-NEXT: xxpermdi v2, vs0, v2, 1 ; CHECK-LE-P8-NEXT: blr ; ; CHECK-LE-P9-LABEL: test_none_v4i32: ; CHECK-LE-P9: # %bb.0: # %entry -; CHECK-LE-P9-NEXT: lxv v2, 0(r4) +; CHECK-LE-P9-NEXT: lxvdsx v2, 0, r4 ; CHECK-LE-P9-NEXT: mtfprwz f0, r3 ; CHECK-LE-P9-NEXT: xxpermdi v2, vs0, v2, 1 ; CHECK-LE-P9-NEXT: blr @@ -671,13 +674,13 @@ define <2 x i64> @test_none_v4i32(i32 %arg1, ptr nocapture noundef readonly %b) ; CHECK-BE-P8-LABEL: test_none_v4i32: ; CHECK-BE-P8: # %bb.0: # %entry ; CHECK-BE-P8-NEXT: mtfprwz f0, r3 -; CHECK-BE-P8-NEXT: lxvd2x v2, 0, r4 +; CHECK-BE-P8-NEXT: lxvdsx v2, 0, r4 ; CHECK-BE-P8-NEXT: xxpermdi v2, v2, vs0, 1 ; CHECK-BE-P8-NEXT: blr ; ; CHECK-BE-P9-LABEL: test_none_v4i32: ; CHECK-BE-P9: # %bb.0: # %entry -; CHECK-BE-P9-NEXT: lxv v2, 0(r4) +; CHECK-BE-P9-NEXT: lxvdsx v2, 0, r4 ; CHECK-BE-P9-NEXT: mtfprwz f0, r3 ; CHECK-BE-P9-NEXT: xxpermdi v2, v2, vs0, 1 ; CHECK-BE-P9-NEXT: blr @@ -685,13 +688,13 @@ define <2 x i64> @test_none_v4i32(i32 %arg1, ptr nocapture noundef readonly %b) ; CHECK-AIX-64-P8-LABEL: test_none_v4i32: ; CHECK-AIX-64-P8: # %bb.0: # %entry ; CHECK-AIX-64-P8-NEXT: mtfprwz f0, r3 -; CHECK-AIX-64-P8-NEXT: lxvd2x v2, 0, r4 +; CHECK-AIX-64-P8-NEXT: lxvdsx v2, 0, r4 ; CHECK-AIX-64-P8-NEXT: xxpermdi v2, v2, vs0, 1 ; CHECK-AIX-64-P8-NEXT: blr ; ; CHECK-AIX-64-P9-LABEL: test_none_v4i32: ; CHECK-AIX-64-P9: # %bb.0: # %entry -; CHECK-AIX-64-P9-NEXT: lxv v2, 0(r4) +; CHECK-AIX-64-P9-NEXT: lxvdsx v2, 0, r4 ; CHECK-AIX-64-P9-NEXT: mtfprwz f0, r3 ; CHECK-AIX-64-P9-NEXT: xxpermdi v2, v2, vs0, 1 ; CHECK-AIX-64-P9-NEXT: blr @@ -700,7 +703,8 @@ define <2 x i64> @test_none_v4i32(i32 %arg1, ptr nocapture noundef readonly %b) ; CHECK-AIX-32-P8: # %bb.0: # %entry ; CHECK-AIX-32-P8-NEXT: stw r3, -16(r1) ; CHECK-AIX-32-P8-NEXT: addi r3, r1, -16 -; CHECK-AIX-32-P8-NEXT: lxvd2x v2, 0, r4 +; CHECK-AIX-32-P8-NEXT: lfd f0, 0(r4) +; CHECK-AIX-32-P8-NEXT: xxpermdi v2, vs0, vs0, 1 ; CHECK-AIX-32-P8-NEXT: lxvw4x vs0, 0, r3 ; CHECK-AIX-32-P8-NEXT: xxmrghd v2, v2, vs0 ; CHECK-AIX-32-P8-NEXT: blr @@ -709,7 +713,8 @@ define <2 x i64> @test_none_v4i32(i32 %arg1, ptr nocapture noundef readonly %b) ; CHECK-AIX-32-P9: # %bb.0: # %entry ; CHECK-AIX-32-P9-NEXT: stw r3, -16(r1) ; CHECK-AIX-32-P9-NEXT: lxv vs0, -16(r1) -; CHECK-AIX-32-P9-NEXT: lxv v2, 0(r4) +; CHECK-AIX-32-P9-NEXT: lfd f1, 0(r4) +; CHECK-AIX-32-P9-NEXT: xxpermdi v2, vs1, vs0, 1 ; CHECK-AIX-32-P9-NEXT: xxmrghd v2, v2, vs0 ; CHECK-AIX-32-P9-NEXT: blr entry: @@ -723,15 +728,14 @@ entry: define <2 x i64> @test_v4i32_none(i32 %arg1, ptr nocapture noundef readonly %b) { ; CHECK-LE-P8-LABEL: test_v4i32_none: ; CHECK-LE-P8: # %bb.0: # %entry -; CHECK-LE-P8-NEXT: lxvd2x vs0, 0, r4 -; 
CHECK-LE-P8-NEXT: xxswapd v2, vs0 ; CHECK-LE-P8-NEXT: mtfprwz f0, r3 +; CHECK-LE-P8-NEXT: lxvdsx v2, 0, r4 ; CHECK-LE-P8-NEXT: xxpermdi v2, v2, vs0, 2 ; CHECK-LE-P8-NEXT: blr ; ; CHECK-LE-P9-LABEL: test_v4i32_none: ; CHECK-LE-P9: # %bb.0: # %entry -; CHECK-LE-P9-NEXT: lxv v2, 0(r4) +; CHECK-LE-P9-NEXT: lxvdsx v2, 0, r4 ; CHECK-LE-P9-NEXT: mtfprwz f0, r3 ; CHECK-LE-P9-NEXT: xxpermdi v2, v2, vs0, 2 ; CHECK-LE-P9-NEXT: blr @@ -739,13 +743,13 @@ define <2 x i64> @test_v4i32_none(i32 %arg1, ptr nocapture noundef readonly %b) ; CHECK-BE-P8-LABEL: test_v4i32_none: ; CHECK-BE-P8: # %bb.0: # %entry ; CHECK-BE-P8-NEXT: mtfprwz f0, r3 -; CHECK-BE-P8-NEXT: lxvd2x v2, 0, r4 +; CHECK-BE-P8-NEXT: lxvdsx v2, 0, r4 ; CHECK-BE-P8-NEXT: xxpermdi v2, vs0, v2, 2 ; CHECK-BE-P8-NEXT: blr ; ; CHECK-BE-P9-LABEL: test_v4i32_none: ; CHECK-BE-P9: # %bb.0: # %entry -; CHECK-BE-P9-NEXT: lxv v2, 0(r4) +; CHECK-BE-P9-NEXT: lxvdsx v2, 0, r4 ; CHECK-BE-P9-NEXT: mtfprwz f0, r3 ; CHECK-BE-P9-NEXT: xxpermdi v2, vs0, v2, 2 ; CHECK-BE-P9-NEXT: blr @@ -753,13 +757,13 @@ define <2 x i64> @test_v4i32_none(i32 %arg1, ptr nocapture noundef readonly %b) ; CHECK-AIX-64-P8-LABEL: test_v4i32_none: ; CHECK-AIX-64-P8: # %bb.0: # %entry ; CHECK-AIX-64-P8-NEXT: mtfprwz f0, r3 -; CHECK-AIX-64-P8-NEXT: lxvd2x v2, 0, r4 +; CHECK-AIX-64-P8-NEXT: lxvdsx v2, 0, r4 ; CHECK-AIX-64-P8-NEXT: xxpermdi v2, vs0, v2, 2 ; CHECK-AIX-64-P8-NEXT: blr ; ; CHECK-AIX-64-P9-LABEL: test_v4i32_none: ; CHECK-AIX-64-P9: # %bb.0: # %entry -; CHECK-AIX-64-P9-NEXT: lxv v2, 0(r4) +; CHECK-AIX-64-P9-NEXT: lxvdsx v2, 0, r4 ; CHECK-AIX-64-P9-NEXT: mtfprwz f0, r3 ; CHECK-AIX-64-P9-NEXT: xxpermdi v2, vs0, v2, 2 ; CHECK-AIX-64-P9-NEXT: blr @@ -768,7 +772,8 @@ define <2 x i64> @test_v4i32_none(i32 %arg1, ptr nocapture noundef readonly %b) ; CHECK-AIX-32-P8: # %bb.0: # %entry ; CHECK-AIX-32-P8-NEXT: stw r3, -16(r1) ; CHECK-AIX-32-P8-NEXT: addi r3, r1, -16 -; CHECK-AIX-32-P8-NEXT: lxvd2x v2, 0, r4 +; CHECK-AIX-32-P8-NEXT: lfd f0, 0(r4) +; CHECK-AIX-32-P8-NEXT: xxpermdi v2, vs0, vs0, 1 ; CHECK-AIX-32-P8-NEXT: lxvw4x vs0, 0, r3 ; CHECK-AIX-32-P8-NEXT: xxmrghd v2, vs0, v2 ; CHECK-AIX-32-P8-NEXT: blr @@ -777,7 +782,8 @@ define <2 x i64> @test_v4i32_none(i32 %arg1, ptr nocapture noundef readonly %b) ; CHECK-AIX-32-P9: # %bb.0: # %entry ; CHECK-AIX-32-P9-NEXT: stw r3, -16(r1) ; CHECK-AIX-32-P9-NEXT: lxv vs0, -16(r1) -; CHECK-AIX-32-P9-NEXT: lxv v2, 0(r4) +; CHECK-AIX-32-P9-NEXT: lfd f1, 0(r4) +; CHECK-AIX-32-P9-NEXT: xxpermdi v2, vs1, vs0, 1 ; CHECK-AIX-32-P9-NEXT: xxmrghd v2, vs0, v2 ; CHECK-AIX-32-P9-NEXT: blr entry: @@ -934,12 +940,12 @@ define <2 x i64> @test_v2i64_v16i8(i8 %arg1, i64 %arg) { ; ; CHECK-AIX-32-P9-LABEL: test_v2i64_v16i8: ; CHECK-AIX-32-P9: # %bb.0: # %entry -; CHECK-AIX-32-P9-NEXT: stw r5, -32(r1) -; CHECK-AIX-32-P9-NEXT: stw r4, -48(r1) -; CHECK-AIX-32-P9-NEXT: lxv vs1, -32(r1) -; CHECK-AIX-32-P9-NEXT: lxv vs2, -48(r1) +; CHECK-AIX-32-P9-NEXT: mtfprwz f1, r5 +; CHECK-AIX-32-P9-NEXT: mtfprwz f2, r4 ; CHECK-AIX-32-P9-NEXT: stb r3, -16(r1) ; CHECK-AIX-32-P9-NEXT: lxv vs0, -16(r1) +; CHECK-AIX-32-P9-NEXT: xxinsertw vs1, vs1, 0 +; CHECK-AIX-32-P9-NEXT: xxinsertw vs2, vs2, 0 ; CHECK-AIX-32-P9-NEXT: xxmrghw vs1, vs2, vs1 ; CHECK-AIX-32-P9-NEXT: xxmrghd v2, vs1, vs0 ; CHECK-AIX-32-P9-NEXT: blr @@ -955,43 +961,48 @@ entry: define <2 x i64> @test_none_v2i64(ptr nocapture noundef readonly %b, i64 %arg) { ; CHECK-LE-P8-LABEL: test_none_v2i64: ; CHECK-LE-P8: # %bb.0: # %entry -; CHECK-LE-P8-NEXT: lxvd2x vs0, 0, r3 -; CHECK-LE-P8-NEXT: xxswapd v2, vs0 +; CHECK-LE-P8-NEXT: lfd 
f0, 0(r3) +; CHECK-LE-P8-NEXT: xxspltd v2, vs0, 0 ; CHECK-LE-P8-NEXT: mtfprd f0, r4 ; CHECK-LE-P8-NEXT: xxpermdi v2, vs0, v2, 1 ; CHECK-LE-P8-NEXT: blr ; ; CHECK-LE-P9-LABEL: test_none_v2i64: ; CHECK-LE-P9: # %bb.0: # %entry -; CHECK-LE-P9-NEXT: lxv v2, 0(r3) +; CHECK-LE-P9-NEXT: lfd f0, 0(r3) +; CHECK-LE-P9-NEXT: xxspltd v2, vs0, 0 ; CHECK-LE-P9-NEXT: mtfprd f0, r4 ; CHECK-LE-P9-NEXT: xxpermdi v2, vs0, v2, 1 ; CHECK-LE-P9-NEXT: blr ; ; CHECK-BE-P8-LABEL: test_none_v2i64: ; CHECK-BE-P8: # %bb.0: # %entry -; CHECK-BE-P8-NEXT: lxvd2x v2, 0, r3 +; CHECK-BE-P8-NEXT: lfd f0, 0(r3) +; CHECK-BE-P8-NEXT: xxpermdi v2, vs0, vs0, 1 ; CHECK-BE-P8-NEXT: mtfprd f0, r4 ; CHECK-BE-P8-NEXT: xxmrghd v2, v2, vs0 ; CHECK-BE-P8-NEXT: blr ; ; CHECK-BE-P9-LABEL: test_none_v2i64: ; CHECK-BE-P9: # %bb.0: # %entry -; CHECK-BE-P9-NEXT: lxv v2, 0(r3) +; CHECK-BE-P9-NEXT: lfd f0, 0(r3) +; CHECK-BE-P9-NEXT: xxpermdi v2, vs0, vs0, 1 ; CHECK-BE-P9-NEXT: mtfprd f0, r4 ; CHECK-BE-P9-NEXT: xxmrghd v2, v2, vs0 ; CHECK-BE-P9-NEXT: blr ; ; CHECK-AIX-64-P8-LABEL: test_none_v2i64: ; CHECK-AIX-64-P8: # %bb.0: # %entry -; CHECK-AIX-64-P8-NEXT: lxvd2x v2, 0, r3 +; CHECK-AIX-64-P8-NEXT: lfd f0, 0(r3) +; CHECK-AIX-64-P8-NEXT: xxpermdi v2, vs0, vs0, 1 ; CHECK-AIX-64-P8-NEXT: mtfprd f0, r4 ; CHECK-AIX-64-P8-NEXT: xxmrghd v2, v2, vs0 ; CHECK-AIX-64-P8-NEXT: blr ; ; CHECK-AIX-64-P9-LABEL: test_none_v2i64: ; CHECK-AIX-64-P9: # %bb.0: # %entry -; CHECK-AIX-64-P9-NEXT: lxv v2, 0(r3) +; CHECK-AIX-64-P9-NEXT: lfd f0, 0(r3) +; CHECK-AIX-64-P9-NEXT: xxpermdi v2, vs0, vs0, 1 ; CHECK-AIX-64-P9-NEXT: mtfprd f0, r4 ; CHECK-AIX-64-P9-NEXT: xxmrghd v2, v2, vs0 ; CHECK-AIX-64-P9-NEXT: blr @@ -1031,55 +1042,53 @@ entry: define <2 x i64> @test_v2i64_none(ptr nocapture noundef readonly %b, i64 %arg) { ; CHECK-LE-P8-LABEL: test_v2i64_none: ; CHECK-LE-P8: # %bb.0: # %entry -; CHECK-LE-P8-NEXT: lxvd2x vs0, 0, r3 -; CHECK-LE-P8-NEXT: xxswapd v2, vs0 +; CHECK-LE-P8-NEXT: ld r3, 0(r3) ; CHECK-LE-P8-NEXT: mtfprd f0, r4 -; CHECK-LE-P8-NEXT: xxpermdi v2, v2, vs0, 2 +; CHECK-LE-P8-NEXT: mtfprd f1, r3 +; CHECK-LE-P8-NEXT: xxmrghd v2, vs1, vs0 ; CHECK-LE-P8-NEXT: blr ; ; CHECK-LE-P9-LABEL: test_v2i64_none: ; CHECK-LE-P9: # %bb.0: # %entry -; CHECK-LE-P9-NEXT: lxv v2, 0(r3) -; CHECK-LE-P9-NEXT: mtfprd f0, r4 -; CHECK-LE-P9-NEXT: xxpermdi v2, v2, vs0, 2 +; CHECK-LE-P9-NEXT: ld r3, 0(r3) +; CHECK-LE-P9-NEXT: mtvsrdd v2, r3, r4 ; CHECK-LE-P9-NEXT: blr ; ; CHECK-BE-P8-LABEL: test_v2i64_none: ; CHECK-BE-P8: # %bb.0: # %entry +; CHECK-BE-P8-NEXT: ld r3, 0(r3) ; CHECK-BE-P8-NEXT: mtfprd f0, r4 -; CHECK-BE-P8-NEXT: lxvd2x v3, 0, r3 -; CHECK-BE-P8-NEXT: xxspltd v2, vs0, 0 -; CHECK-BE-P8-NEXT: xxmrghd v2, v2, v3 +; CHECK-BE-P8-NEXT: mtfprd f1, r3 +; CHECK-BE-P8-NEXT: xxmrghd v2, vs0, vs1 ; CHECK-BE-P8-NEXT: blr ; ; CHECK-BE-P9-LABEL: test_v2i64_none: ; CHECK-BE-P9: # %bb.0: # %entry -; CHECK-BE-P9-NEXT: lxv v2, 0(r3) -; CHECK-BE-P9-NEXT: mtvsrdd v3, r4, r4 -; CHECK-BE-P9-NEXT: xxmrghd v2, v3, v2 +; CHECK-BE-P9-NEXT: ld r3, 0(r3) +; CHECK-BE-P9-NEXT: mtvsrdd v2, r4, r3 ; CHECK-BE-P9-NEXT: blr ; ; CHECK-AIX-64-P8-LABEL: test_v2i64_none: ; CHECK-AIX-64-P8: # %bb.0: # %entry +; CHECK-AIX-64-P8-NEXT: ld r3, 0(r3) ; CHECK-AIX-64-P8-NEXT: mtfprd f0, r4 -; CHECK-AIX-64-P8-NEXT: lxvd2x v3, 0, r3 -; CHECK-AIX-64-P8-NEXT: xxmrghd v2, vs0, vs0 -; CHECK-AIX-64-P8-NEXT: xxmrghd v2, v2, v3 +; CHECK-AIX-64-P8-NEXT: mtfprd f1, r3 +; CHECK-AIX-64-P8-NEXT: xxmrghd v2, vs0, vs1 ; CHECK-AIX-64-P8-NEXT: blr ; ; CHECK-AIX-64-P9-LABEL: test_v2i64_none: ; CHECK-AIX-64-P9: # %bb.0: # %entry -; 
CHECK-AIX-64-P9-NEXT: lxv v2, 0(r3) -; CHECK-AIX-64-P9-NEXT: mtvsrdd v3, r4, r4 -; CHECK-AIX-64-P9-NEXT: xxmrghd v2, v3, v2 +; CHECK-AIX-64-P9-NEXT: ld r3, 0(r3) +; CHECK-AIX-64-P9-NEXT: mtvsrdd v2, r4, r3 ; CHECK-AIX-64-P9-NEXT: blr ; ; CHECK-AIX-32-P8-LABEL: test_v2i64_none: ; CHECK-AIX-32-P8: # %bb.0: # %entry -; CHECK-AIX-32-P8-NEXT: lxvd2x v2, 0, r3 +; CHECK-AIX-32-P8-NEXT: lfd f0, 0(r3) ; CHECK-AIX-32-P8-NEXT: addi r3, r1, -16 ; CHECK-AIX-32-P8-NEXT: stw r5, -16(r1) ; CHECK-AIX-32-P8-NEXT: stw r4, -32(r1) +; CHECK-AIX-32-P8-NEXT: xxpermdi v2, vs0, vs0, 1 ; CHECK-AIX-32-P8-NEXT: lxvw4x vs0, 0, r3 ; CHECK-AIX-32-P8-NEXT: addi r3, r1, -32 ; CHECK-AIX-32-P8-NEXT: lxvw4x vs1, 0, r3 @@ -1089,11 +1098,12 @@ define <2 x i64> @test_v2i64_none(ptr nocapture noundef readonly %b, i64 %arg) { ; ; CHECK-AIX-32-P9-LABEL: test_v2i64_none: ; CHECK-AIX-32-P9: # %bb.0: # %entry -; CHECK-AIX-32-P9-NEXT: lxv v2, 0(r3) -; CHECK-AIX-32-P9-NEXT: stw r5, -16(r1) -; CHECK-AIX-32-P9-NEXT: stw r4, -32(r1) -; CHECK-AIX-32-P9-NEXT: lxv vs0, -16(r1) -; CHECK-AIX-32-P9-NEXT: lxv vs1, -32(r1) +; CHECK-AIX-32-P9-NEXT: lfd f0, 0(r3) +; CHECK-AIX-32-P9-NEXT: mtfprwz f1, r4 +; CHECK-AIX-32-P9-NEXT: xxinsertw vs1, vs1, 0 +; CHECK-AIX-32-P9-NEXT: xxpermdi v2, vs0, vs0, 1 +; CHECK-AIX-32-P9-NEXT: mtfprwz f0, r5 +; CHECK-AIX-32-P9-NEXT: xxinsertw vs0, vs0, 0 ; CHECK-AIX-32-P9-NEXT: xxmrghw vs0, vs1, vs0 ; CHECK-AIX-32-P9-NEXT: xxmrghd v2, vs0, v2 ; CHECK-AIX-32-P9-NEXT: blr @@ -1536,13 +1546,13 @@ define <2 x i64> @test_v4i32_v2i64(i32 %arg1, i64 %arg) { ; ; CHECK-AIX-32-P9-LABEL: test_v4i32_v2i64: ; CHECK-AIX-32-P9: # %bb.0: # %entry -; CHECK-AIX-32-P9-NEXT: stw r4, -32(r1) -; CHECK-AIX-32-P9-NEXT: stw r3, -48(r1) +; CHECK-AIX-32-P9-NEXT: mtfprwz f0, r4 +; CHECK-AIX-32-P9-NEXT: mtfprwz f1, r3 ; CHECK-AIX-32-P9-NEXT: lwz r3, L..C0(r2) # %const.0 -; CHECK-AIX-32-P9-NEXT: lxv vs0, -32(r1) -; CHECK-AIX-32-P9-NEXT: lxv vs1, -48(r1) ; CHECK-AIX-32-P9-NEXT: stw r5, -16(r1) ; CHECK-AIX-32-P9-NEXT: lxv v2, -16(r1) +; CHECK-AIX-32-P9-NEXT: xxinsertw vs0, vs0, 0 +; CHECK-AIX-32-P9-NEXT: xxinsertw vs1, vs1, 0 ; CHECK-AIX-32-P9-NEXT: xxmrghw v3, vs1, vs0 ; CHECK-AIX-32-P9-NEXT: lxv vs0, 0(r3) ; CHECK-AIX-32-P9-NEXT: xxperm v2, v3, vs0 @@ -1713,12 +1723,12 @@ define <2 x i64> @test_v2i64_v4i32(i64 %arg1, i32 %arg) { ; ; CHECK-AIX-32-P9-LABEL: test_v2i64_v4i32: ; CHECK-AIX-32-P9: # %bb.0: # %entry -; CHECK-AIX-32-P9-NEXT: stw r4, -16(r1) -; CHECK-AIX-32-P9-NEXT: stw r3, -32(r1) -; CHECK-AIX-32-P9-NEXT: lxv vs1, -16(r1) -; CHECK-AIX-32-P9-NEXT: lxv vs2, -32(r1) +; CHECK-AIX-32-P9-NEXT: mtfprwz f1, r4 +; CHECK-AIX-32-P9-NEXT: mtfprwz f2, r3 ; CHECK-AIX-32-P9-NEXT: stw r5, -48(r1) ; CHECK-AIX-32-P9-NEXT: lxv vs0, -48(r1) +; CHECK-AIX-32-P9-NEXT: xxinsertw vs1, vs1, 0 +; CHECK-AIX-32-P9-NEXT: xxinsertw vs2, vs2, 0 ; CHECK-AIX-32-P9-NEXT: xxmrghw vs1, vs2, vs1 ; CHECK-AIX-32-P9-NEXT: xxmrghd v2, vs1, vs0 ; CHECK-AIX-32-P9-NEXT: blr @@ -1793,12 +1803,12 @@ define <2 x i64> @test_v2i64_v8i16(i64 %arg1, i16 %arg) { ; ; CHECK-AIX-32-P9-LABEL: test_v2i64_v8i16: ; CHECK-AIX-32-P9: # %bb.0: # %entry -; CHECK-AIX-32-P9-NEXT: stw r4, -16(r1) -; CHECK-AIX-32-P9-NEXT: stw r3, -32(r1) -; CHECK-AIX-32-P9-NEXT: lxv vs1, -16(r1) -; CHECK-AIX-32-P9-NEXT: lxv vs2, -32(r1) +; CHECK-AIX-32-P9-NEXT: mtfprwz f1, r4 +; CHECK-AIX-32-P9-NEXT: mtfprwz f2, r3 ; CHECK-AIX-32-P9-NEXT: sth r5, -48(r1) ; CHECK-AIX-32-P9-NEXT: lxv vs0, -48(r1) +; CHECK-AIX-32-P9-NEXT: xxinsertw vs1, vs1, 0 +; CHECK-AIX-32-P9-NEXT: xxinsertw vs2, vs2, 0 ; CHECK-AIX-32-P9-NEXT: xxmrghw vs1, 
vs2, vs1 ; CHECK-AIX-32-P9-NEXT: xxmrghd v2, vs1, vs0 ; CHECK-AIX-32-P9-NEXT: blr diff --git a/llvm/test/CodeGen/PowerPC/v8i16_scalar_to_vector_shuffle.ll b/llvm/test/CodeGen/PowerPC/v8i16_scalar_to_vector_shuffle.ll index 47ffdb4625ed3..a4aa8eac2033d 100644 --- a/llvm/test/CodeGen/PowerPC/v8i16_scalar_to_vector_shuffle.ll +++ b/llvm/test/CodeGen/PowerPC/v8i16_scalar_to_vector_shuffle.ll @@ -339,9 +339,11 @@ define void @test_none_v4i32(ptr %ptr, ptr %ptr2, i8 %v3) local_unnamed_addr #0 ; CHECK-AIX-32-P9-LABEL: test_none_v4i32: ; CHECK-AIX-32-P9: # %bb.0: # %entry ; CHECK-AIX-32-P9-NEXT: lxsiwzx v2, 0, r3 -; CHECK-AIX-32-P9-NEXT: lwz r3, L..C2(r2) # %const.0 ; CHECK-AIX-32-P9-NEXT: stb r5, -32(r1) -; CHECK-AIX-32-P9-NEXT: lxv v3, -32(r1) +; CHECK-AIX-32-P9-NEXT: lwz r3, -32(r1) +; CHECK-AIX-32-P9-NEXT: mtfprwz f0, r3 +; CHECK-AIX-32-P9-NEXT: lwz r3, L..C2(r2) # %const.0 +; CHECK-AIX-32-P9-NEXT: xxinsertw v3, vs0, 0 ; CHECK-AIX-32-P9-NEXT: lxv v4, 0(r3) ; CHECK-AIX-32-P9-NEXT: vmrghh v3, v3, v3 ; CHECK-AIX-32-P9-NEXT: vperm v2, v2, v3, v4 diff --git a/llvm/test/CodeGen/PowerPC/vsx_shuffle_le.ll b/llvm/test/CodeGen/PowerPC/vsx_shuffle_le.ll index 66c1b6f6d26da..cc2fe5604a371 100644 --- a/llvm/test/CodeGen/PowerPC/vsx_shuffle_le.ll +++ b/llvm/test/CodeGen/PowerPC/vsx_shuffle_le.ll @@ -45,18 +45,16 @@ define <2 x double> @test01(ptr %p1, ptr %p2) { define <2 x double> @test02(ptr %p1, ptr %p2) { ; CHECK-LABEL: test02: ; CHECK: # %bb.0: -; CHECK-NEXT: lxvd2x 0, 0, 3 -; CHECK-NEXT: lxvd2x 1, 0, 4 -; CHECK-NEXT: xxswapd 0, 0 -; CHECK-NEXT: xxswapd 1, 1 -; CHECK-NEXT: xxmrgld 34, 1, 0 +; CHECK-NEXT: lfd 0, 0(3) +; CHECK-NEXT: lfd 1, 0(4) +; CHECK-NEXT: xxmrghd 34, 1, 0 ; CHECK-NEXT: blr ; ; CHECK-P9-LABEL: test02: ; CHECK-P9: # %bb.0: -; CHECK-P9-NEXT: lxv 0, 0(3) -; CHECK-P9-NEXT: lxv 1, 0(4) -; CHECK-P9-NEXT: xxmrgld 34, 1, 0 +; CHECK-P9-NEXT: lfd 0, 0(3) +; CHECK-P9-NEXT: lfd 1, 0(4) +; CHECK-P9-NEXT: xxmrghd 34, 1, 0 ; CHECK-P9-NEXT: blr %v1 = load <2 x double>, ptr %p1 %v2 = load <2 x double>, ptr %p2 @@ -67,18 +65,16 @@ define <2 x double> @test02(ptr %p1, ptr %p2) { define <2 x double> @test03(ptr %p1, ptr %p2) { ; CHECK-LABEL: test03: ; CHECK: # %bb.0: -; CHECK-NEXT: lxvd2x 0, 0, 3 -; CHECK-NEXT: lxvd2x 1, 0, 4 -; CHECK-NEXT: xxswapd 0, 0 -; CHECK-NEXT: xxswapd 1, 1 -; CHECK-NEXT: xxpermdi 34, 1, 0, 1 +; CHECK-NEXT: lfd 0, 0(3) +; CHECK-NEXT: lfd 1, 8(4) +; CHECK-NEXT: xxmrghd 34, 1, 0 ; CHECK-NEXT: blr ; ; CHECK-P9-LABEL: test03: ; CHECK-P9: # %bb.0: -; CHECK-P9-NEXT: lxv 0, 0(3) -; CHECK-P9-NEXT: lxv 1, 0(4) -; CHECK-P9-NEXT: xxpermdi 34, 1, 0, 1 +; CHECK-P9-NEXT: lfd 0, 0(3) +; CHECK-P9-NEXT: lfd 1, 8(4) +; CHECK-P9-NEXT: xxmrghd 34, 1, 0 ; CHECK-P9-NEXT: blr %v1 = load <2 x double>, ptr %p1 %v2 = load <2 x double>, ptr %p2 @@ -123,18 +119,16 @@ define <2 x double> @test11(ptr %p1, ptr %p2) { define <2 x double> @test12(ptr %p1, ptr %p2) { ; CHECK-LABEL: test12: ; CHECK: # %bb.0: -; CHECK-NEXT: lxvd2x 0, 0, 3 -; CHECK-NEXT: lxvd2x 1, 0, 4 -; CHECK-NEXT: xxswapd 0, 0 -; CHECK-NEXT: xxswapd 1, 1 -; CHECK-NEXT: xxpermdi 34, 1, 0, 2 +; CHECK-NEXT: lfd 0, 8(3) +; CHECK-NEXT: lfd 1, 0(4) +; CHECK-NEXT: xxmrghd 34, 1, 0 ; CHECK-NEXT: blr ; ; CHECK-P9-LABEL: test12: ; CHECK-P9: # %bb.0: -; CHECK-P9-NEXT: lxv 0, 0(3) -; CHECK-P9-NEXT: lxv 1, 0(4) -; CHECK-P9-NEXT: xxpermdi 34, 1, 0, 2 +; CHECK-P9-NEXT: lfd 0, 8(3) +; CHECK-P9-NEXT: lfd 1, 0(4) +; CHECK-P9-NEXT: xxmrghd 34, 1, 0 ; CHECK-P9-NEXT: blr %v1 = load <2 x double>, ptr %p1 %v2 = load <2 x double>, ptr %p2 @@ -145,17 +139,15 @@ define <2 
x double> @test12(ptr %p1, ptr %p2) { define <2 x double> @test13(ptr %p1, ptr %p2) { ; CHECK-LABEL: test13: ; CHECK: # %bb.0: -; CHECK-NEXT: lxvd2x 0, 0, 3 -; CHECK-NEXT: lxvd2x 1, 0, 4 -; CHECK-NEXT: xxswapd 0, 0 -; CHECK-NEXT: xxswapd 1, 1 +; CHECK-NEXT: lfd 0, 8(3) +; CHECK-NEXT: lfd 1, 8(4) ; CHECK-NEXT: xxmrghd 34, 1, 0 ; CHECK-NEXT: blr ; ; CHECK-P9-LABEL: test13: ; CHECK-P9: # %bb.0: -; CHECK-P9-NEXT: lxv 0, 0(3) -; CHECK-P9-NEXT: lxv 1, 0(4) +; CHECK-P9-NEXT: lfd 0, 8(3) +; CHECK-P9-NEXT: lfd 1, 8(4) ; CHECK-P9-NEXT: xxmrghd 34, 1, 0 ; CHECK-P9-NEXT: blr %v1 = load <2 x double>, ptr %p1 @@ -167,18 +159,16 @@ define <2 x double> @test13(ptr %p1, ptr %p2) { define <2 x double> @test20(ptr %p1, ptr %p2) { ; CHECK-LABEL: test20: ; CHECK: # %bb.0: -; CHECK-NEXT: lxvd2x 0, 0, 3 -; CHECK-NEXT: lxvd2x 1, 0, 4 -; CHECK-NEXT: xxswapd 0, 0 -; CHECK-NEXT: xxswapd 1, 1 -; CHECK-NEXT: xxmrgld 34, 0, 1 +; CHECK-NEXT: lfd 0, 0(3) +; CHECK-NEXT: lfd 1, 0(4) +; CHECK-NEXT: xxmrghd 34, 0, 1 ; CHECK-NEXT: blr ; ; CHECK-P9-LABEL: test20: ; CHECK-P9: # %bb.0: -; CHECK-P9-NEXT: lxv 0, 0(3) -; CHECK-P9-NEXT: lxv 1, 0(4) -; CHECK-P9-NEXT: xxmrgld 34, 0, 1 +; CHECK-P9-NEXT: lfd 0, 0(3) +; CHECK-P9-NEXT: lfd 1, 0(4) +; CHECK-P9-NEXT: xxmrghd 34, 0, 1 ; CHECK-P9-NEXT: blr %v1 = load <2 x double>, ptr %p1 %v2 = load <2 x double>, ptr %p2 @@ -189,18 +179,16 @@ define <2 x double> @test20(ptr %p1, ptr %p2) { define <2 x double> @test21(ptr %p1, ptr %p2) { ; CHECK-LABEL: test21: ; CHECK: # %bb.0: -; CHECK-NEXT: lxvd2x 0, 0, 3 -; CHECK-NEXT: lxvd2x 1, 0, 4 -; CHECK-NEXT: xxswapd 0, 0 -; CHECK-NEXT: xxswapd 1, 1 -; CHECK-NEXT: xxpermdi 34, 0, 1, 1 +; CHECK-NEXT: lfd 0, 8(3) +; CHECK-NEXT: lfd 1, 0(4) +; CHECK-NEXT: xxmrghd 34, 0, 1 ; CHECK-NEXT: blr ; ; CHECK-P9-LABEL: test21: ; CHECK-P9: # %bb.0: -; CHECK-P9-NEXT: lxv 0, 0(3) -; CHECK-P9-NEXT: lxv 1, 0(4) -; CHECK-P9-NEXT: xxpermdi 34, 0, 1, 1 +; CHECK-P9-NEXT: lfd 0, 8(3) +; CHECK-P9-NEXT: lfd 1, 0(4) +; CHECK-P9-NEXT: xxmrghd 34, 0, 1 ; CHECK-P9-NEXT: blr %v1 = load <2 x double>, ptr %p1 %v2 = load <2 x double>, ptr %p2 @@ -244,18 +232,16 @@ define <2 x double> @test23(ptr %p1, ptr %p2) { define <2 x double> @test30(ptr %p1, ptr %p2) { ; CHECK-LABEL: test30: ; CHECK: # %bb.0: -; CHECK-NEXT: lxvd2x 0, 0, 3 -; CHECK-NEXT: lxvd2x 1, 0, 4 -; CHECK-NEXT: xxswapd 0, 0 -; CHECK-NEXT: xxswapd 1, 1 -; CHECK-NEXT: xxpermdi 34, 0, 1, 2 +; CHECK-NEXT: lfd 0, 0(3) +; CHECK-NEXT: lfd 1, 8(4) +; CHECK-NEXT: xxmrghd 34, 0, 1 ; CHECK-NEXT: blr ; ; CHECK-P9-LABEL: test30: ; CHECK-P9: # %bb.0: -; CHECK-P9-NEXT: lxv 0, 0(3) -; CHECK-P9-NEXT: lxv 1, 0(4) -; CHECK-P9-NEXT: xxpermdi 34, 0, 1, 2 +; CHECK-P9-NEXT: lfd 0, 0(3) +; CHECK-P9-NEXT: lfd 1, 8(4) +; CHECK-P9-NEXT: xxmrghd 34, 0, 1 ; CHECK-P9-NEXT: blr %v1 = load <2 x double>, ptr %p1 %v2 = load <2 x double>, ptr %p2 @@ -266,17 +252,15 @@ define <2 x double> @test30(ptr %p1, ptr %p2) { define <2 x double> @test31(ptr %p1, ptr %p2) { ; CHECK-LABEL: test31: ; CHECK: # %bb.0: -; CHECK-NEXT: lxvd2x 0, 0, 3 -; CHECK-NEXT: lxvd2x 1, 0, 4 -; CHECK-NEXT: xxswapd 0, 0 -; CHECK-NEXT: xxswapd 1, 1 +; CHECK-NEXT: lfd 0, 8(3) +; CHECK-NEXT: lfd 1, 8(4) ; CHECK-NEXT: xxmrghd 34, 0, 1 ; CHECK-NEXT: blr ; ; CHECK-P9-LABEL: test31: ; CHECK-P9: # %bb.0: -; CHECK-P9-NEXT: lxv 0, 0(3) -; CHECK-P9-NEXT: lxv 1, 0(4) +; CHECK-P9-NEXT: lfd 0, 8(3) +; CHECK-P9-NEXT: lfd 1, 8(4) ; CHECK-P9-NEXT: xxmrghd 34, 0, 1 ; CHECK-P9-NEXT: blr %v1 = load <2 x double>, ptr %p1 diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fp-shuffles.ll 
b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fp-shuffles.ll index 41d8abb9b73eb..b1735ec832d9b 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fp-shuffles.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fp-shuffles.ll @@ -1,8 +1,8 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc -mtriple=riscv32 -mattr=+d,+zvfh,+zvfbfmin,+v -verify-machineinstrs < %s | FileCheck %s -; RUN: llc -mtriple=riscv64 -mattr=+d,+zvfh,+zvfbfmin,+v -verify-machineinstrs < %s | FileCheck %s -; RUN: llc -mtriple=riscv32 -mattr=+d,+zvfhmin,+zvfbfmin,+v -verify-machineinstrs < %s | FileCheck %s -; RUN: llc -mtriple=riscv64 -mattr=+d,+zvfhmin,+zvfbfmin,+v -verify-machineinstrs < %s | FileCheck %s +; RUN: llc -mtriple=riscv32 -mattr=+d,+zvfh,+zvfbfmin,+v -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,ZVFH +; RUN: llc -mtriple=riscv64 -mattr=+d,+zvfh,+zvfbfmin,+v -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,ZVFH +; RUN: llc -mtriple=riscv32 -mattr=+d,+zvfhmin,+zvfbfmin,+v -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,ZVFHMIN +; RUN: llc -mtriple=riscv64 -mattr=+d,+zvfhmin,+zvfbfmin,+v -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,ZVFHMIN define <4 x bfloat> @shuffle_v4bf16(<4 x bfloat> %x, <4 x bfloat> %y) { ; CHECK-LABEL: shuffle_v4bf16: @@ -385,12 +385,19 @@ define <4 x half> @vrgather_shuffle_vv_v4f16(<4 x half> %x, <4 x half> %y) { } define <4 x half> @vrgather_shuffle_vx_v4f16_load(ptr %p) { -; CHECK-LABEL: vrgather_shuffle_vx_v4f16_load: -; CHECK: # %bb.0: -; CHECK-NEXT: lh a0, 2(a0) -; CHECK-NEXT: vsetivli zero, 4, e16, mf2, ta, ma -; CHECK-NEXT: vmv.v.x v8, a0 -; CHECK-NEXT: ret +; ZVFH-LABEL: vrgather_shuffle_vx_v4f16_load: +; ZVFH: # %bb.0: +; ZVFH-NEXT: flh fa5, 2(a0) +; ZVFH-NEXT: vsetivli zero, 4, e16, mf2, ta, ma +; ZVFH-NEXT: vfmv.v.f v8, fa5 +; ZVFH-NEXT: ret +; +; ZVFHMIN-LABEL: vrgather_shuffle_vx_v4f16_load: +; ZVFHMIN: # %bb.0: +; ZVFHMIN-NEXT: lh a0, 2(a0) +; ZVFHMIN-NEXT: vsetivli zero, 4, e16, mf2, ta, ma +; ZVFHMIN-NEXT: vmv.v.x v8, a0 +; ZVFHMIN-NEXT: ret %v = load <4 x half>, ptr %p %s = shufflevector <4 x half> %v, <4 x half> undef, <4 x i32> ret <4 x half> %s diff --git a/llvm/test/CodeGen/Thumb2/mve-extractstore.ll b/llvm/test/CodeGen/Thumb2/mve-extractstore.ll index 941ae78cc9a79..18ff9034a4530 100644 --- a/llvm/test/CodeGen/Thumb2/mve-extractstore.ll +++ b/llvm/test/CodeGen/Thumb2/mve-extractstore.ll @@ -5,10 +5,9 @@ define half @extret1_f16_sf(<8 x half> %a, <8 x half> %b, ptr nocapture %p) { ; CHECK-LABEL: extret1_f16_sf: ; CHECK: @ %bb.0: ; CHECK-NEXT: vmov d0, r0, r1 -; CHECK-NEXT: mov r0, sp -; CHECK-NEXT: vldrw.u32 q1, [r0] +; CHECK-NEXT: ldrh.w r0, [sp, #2] +; CHECK-NEXT: vadd.f16 q0, q0, r0 ; CHECK-NEXT: ldr r0, [sp, #16] -; CHECK-NEXT: vadd.f16 q0, q0, q1 ; CHECK-NEXT: vmovx.f16 s0, s0 ; CHECK-NEXT: vstr.16 s0, [r0] ; CHECK-NEXT: vmov r0, s0 @@ -22,11 +21,10 @@ define half @extret1_f16_sf(<8 x half> %a, <8 x half> %b, ptr nocapture %p) { define half @extret4_f16_sf(<8 x half> %a, <8 x half> %b, ptr nocapture %p) { ; CHECK-LABEL: extret4_f16_sf: ; CHECK: @ %bb.0: -; CHECK-NEXT: mov r0, sp +; CHECK-NEXT: ldrh.w r0, [sp, #8] ; CHECK-NEXT: vmov d1, r2, r3 -; CHECK-NEXT: vldrw.u32 q1, [r0] +; CHECK-NEXT: vadd.f16 q0, q0, r0 ; CHECK-NEXT: ldr r0, [sp, #16] -; CHECK-NEXT: vadd.f16 q0, q0, q1 ; CHECK-NEXT: vstr.16 s2, [r0] ; CHECK-NEXT: vmov r0, s2 ; CHECK-NEXT: bx lr @@ -98,11 +96,10 @@ define arm_aapcs_vfpcc <8 x half> @extret4_v8f16_hf(<8 x half> %a, <8 x half> 
%b define float @extret1_f32_sf(<4 x float> %a, <4 x float> %b, ptr nocapture %p) { ; CHECK-LABEL: extret1_f32_sf: ; CHECK: @ %bb.0: -; CHECK-NEXT: vmov d0, r0, r1 -; CHECK-NEXT: mov r0, sp -; CHECK-NEXT: vldrw.u32 q1, [r0] +; CHECK-NEXT: vldr s1, [sp, #4] +; CHECK-NEXT: vmov d2, r0, r1 ; CHECK-NEXT: ldr r1, [sp, #16] -; CHECK-NEXT: vadd.f32 q0, q0, q1 +; CHECK-NEXT: vadd.f32 q0, q1, q0 ; CHECK-NEXT: vmov r0, s1 ; CHECK-NEXT: vstr s1, [r1] ; CHECK-NEXT: bx lr @@ -115,11 +112,10 @@ define float @extret1_f32_sf(<4 x float> %a, <4 x float> %b, ptr nocapture %p) { define float @extret2_f32_sf(<4 x float> %a, <4 x float> %b, ptr nocapture %p) { ; CHECK-LABEL: extret2_f32_sf: ; CHECK: @ %bb.0: -; CHECK-NEXT: mov r0, sp -; CHECK-NEXT: vmov d1, r2, r3 -; CHECK-NEXT: vldrw.u32 q1, [r0] +; CHECK-NEXT: vldr s2, [sp, #8] +; CHECK-NEXT: vmov d3, r2, r3 ; CHECK-NEXT: ldr r1, [sp, #16] -; CHECK-NEXT: vadd.f32 q0, q0, q1 +; CHECK-NEXT: vadd.f32 q0, q1, q0 ; CHECK-NEXT: vmov r0, s2 ; CHECK-NEXT: vstr s2, [r1] ; CHECK-NEXT: bx lr diff --git a/llvm/test/CodeGen/Thumb2/mve-insertshuffleload.ll b/llvm/test/CodeGen/Thumb2/mve-insertshuffleload.ll index 5f56a82f3c511..78c754f712bfa 100644 --- a/llvm/test/CodeGen/Thumb2/mve-insertshuffleload.ll +++ b/llvm/test/CodeGen/Thumb2/mve-insertshuffleload.ll @@ -128,9 +128,11 @@ define <8 x i16> @inserti8_last_zext(ptr %p) { define <8 x i32> @inserti32_first(ptr %p) { ; CHECKLE-LABEL: inserti32_first: ; CHECKLE: @ %bb.0: +; CHECKLE-NEXT: ldr r1, [r0, #16] ; CHECKLE-NEXT: vldrw.u32 q2, [r0, #20] -; CHECKLE-NEXT: vldr s4, [r0, #16] ; CHECKLE-NEXT: vldrw.u32 q0, [r0] +; CHECKLE-NEXT: vmov.32 q1[3], r1 +; CHECKLE-NEXT: vmov.f32 s4, s7 ; CHECKLE-NEXT: vmov.f32 s5, s8 ; CHECKLE-NEXT: vmov.f32 s6, s9 ; CHECKLE-NEXT: vmov.f32 s7, s10 @@ -138,14 +140,16 @@ define <8 x i32> @inserti32_first(ptr %p) { ; ; CHECKBE-LABEL: inserti32_first: ; CHECKBE: @ %bb.0: -; CHECKBE-NEXT: vldrw.u32 q3, [r0, #20] ; CHECKBE-NEXT: vldrb.u8 q1, [r0] -; CHECKBE-NEXT: vldr s8, [r0, #16] -; CHECKBE-NEXT: vmov.f32 s9, s12 +; CHECKBE-NEXT: ldr r1, [r0, #16] +; CHECKBE-NEXT: vldrw.u32 q2, [r0, #20] ; CHECKBE-NEXT: vrev64.8 q0, q1 -; CHECKBE-NEXT: vmov.f32 s10, s13 -; CHECKBE-NEXT: vmov.f32 s11, s14 -; CHECKBE-NEXT: vrev64.32 q1, q2 +; CHECKBE-NEXT: vmov.32 q1[3], r1 +; CHECKBE-NEXT: vmov.f32 s12, s7 +; CHECKBE-NEXT: vmov.f32 s13, s8 +; CHECKBE-NEXT: vmov.f32 s14, s9 +; CHECKBE-NEXT: vmov.f32 s15, s10 +; CHECKBE-NEXT: vrev64.32 q1, q3 ; CHECKBE-NEXT: bx lr %q = getelementptr inbounds i8, ptr %p, i32 4 %l1 = load <8 x i32>, ptr %q @@ -158,24 +162,28 @@ define <8 x i32> @inserti32_first(ptr %p) { define <8 x i32> @inserti32_last(ptr %p) { ; CHECKLE-LABEL: inserti32_last: ; CHECKLE: @ %bb.0: -; CHECKLE-NEXT: vldrw.u32 q2, [r0] -; CHECKLE-NEXT: vldr s3, [r0, #16] +; CHECKLE-NEXT: ldr r1, [r0, #16] +; CHECKLE-NEXT: vldrw.u32 q0, [r0] ; CHECKLE-NEXT: vldrw.u32 q1, [r0, #20] -; CHECKLE-NEXT: vmov.f32 s0, s9 -; CHECKLE-NEXT: vmov.f32 s1, s10 -; CHECKLE-NEXT: vmov.f32 s2, s11 +; CHECKLE-NEXT: vmov.32 q2[0], r1 +; CHECKLE-NEXT: vmov.f32 s0, s1 +; CHECKLE-NEXT: vmov.f32 s1, s2 +; CHECKLE-NEXT: vmov.f32 s2, s3 +; CHECKLE-NEXT: vmov.f32 s3, s8 ; CHECKLE-NEXT: bx lr ; ; CHECKBE-LABEL: inserti32_last: ; CHECKBE: @ %bb.0: -; CHECKBE-NEXT: vldrw.u32 q3, [r0] -; CHECKBE-NEXT: vldrb.u8 q0, [r0, #20] -; CHECKBE-NEXT: vldr s11, [r0, #16] -; CHECKBE-NEXT: vmov.f32 s8, s13 -; CHECKBE-NEXT: vrev64.8 q1, q0 -; CHECKBE-NEXT: vmov.f32 s9, s14 -; CHECKBE-NEXT: vmov.f32 s10, s15 -; CHECKBE-NEXT: vrev64.32 q0, q2 +; CHECKBE-NEXT: ldr r1, 
[r0, #16] +; CHECKBE-NEXT: vldrw.u32 q0, [r0] +; CHECKBE-NEXT: vldrb.u8 q2, [r0, #20] +; CHECKBE-NEXT: vmov.32 q1[0], r1 +; CHECKBE-NEXT: vmov.f32 s12, s1 +; CHECKBE-NEXT: vmov.f32 s15, s4 +; CHECKBE-NEXT: vrev64.8 q1, q2 +; CHECKBE-NEXT: vmov.f32 s13, s2 +; CHECKBE-NEXT: vmov.f32 s14, s3 +; CHECKBE-NEXT: vrev64.32 q0, q3 ; CHECKBE-NEXT: bx lr %q = getelementptr inbounds i8, ptr %p, i32 32 %l1 = load <8 x i32>, ptr %p diff --git a/llvm/test/CodeGen/X86/SwizzleShuff.ll b/llvm/test/CodeGen/X86/SwizzleShuff.ll index 0cfafdd86863e..9f3dffd75cfa8 100644 --- a/llvm/test/CodeGen/X86/SwizzleShuff.ll +++ b/llvm/test/CodeGen/X86/SwizzleShuff.ll @@ -19,7 +19,7 @@ define void @pull_bitcast(ptr %pA, ptr %pB) { define <4 x i32> @multi_use_swizzle(ptr %pA, ptr %pB) { ; CHECK-LABEL: multi_use_swizzle: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps (%rdi), %xmm0 +; CHECK-NEXT: vbroadcastss 4(%rdi), %xmm0 ; CHECK-NEXT: vshufps {{.*#+}} xmm0 = xmm0[1,1],mem[1,2] ; CHECK-NEXT: vshufps {{.*#+}} xmm1 = xmm0[1,3,2,2] ; CHECK-NEXT: vshufps {{.*#+}} xmm0 = xmm0[2,1,0,2] diff --git a/llvm/test/CodeGen/X86/avx-vbroadcast.ll b/llvm/test/CodeGen/X86/avx-vbroadcast.ll index 0bfd8921e8b42..e5eb366fe22e3 100644 --- a/llvm/test/CodeGen/X86/avx-vbroadcast.ll +++ b/llvm/test/CodeGen/X86/avx-vbroadcast.ll @@ -370,12 +370,12 @@ define <4 x i32> @load_splat_4i32_4i32_1111(ptr %ptr) nounwind uwtable readnone ; X86-LABEL: load_splat_4i32_4i32_1111: ; X86: ## %bb.0: ## %entry ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: vpermilps {{.*#+}} xmm0 = mem[1,1,1,1] +; X86-NEXT: vbroadcastss 4(%eax), %xmm0 ; X86-NEXT: retl ; ; X64-LABEL: load_splat_4i32_4i32_1111: ; X64: ## %bb.0: ## %entry -; X64-NEXT: vpermilps {{.*#+}} xmm0 = mem[1,1,1,1] +; X64-NEXT: vbroadcastss 4(%rdi), %xmm0 ; X64-NEXT: retq entry: %ld = load <4 x i32>, ptr %ptr @@ -477,7 +477,7 @@ define <2 x i64> @load_splat_2i64_2i64_1111(ptr %ptr) nounwind uwtable readnone ; ; X64-LABEL: load_splat_2i64_2i64_1111: ; X64: ## %bb.0: ## %entry -; X64-NEXT: vpermilps {{.*#+}} xmm0 = mem[2,3,2,3] +; X64-NEXT: vmovddup {{.*#+}} xmm0 = mem[0,0] ; X64-NEXT: retq entry: %ld = load <2 x i64>, ptr %ptr diff --git a/llvm/test/CodeGen/X86/avx.ll b/llvm/test/CodeGen/X86/avx.ll index 4ce092c099b08..3b52773db8673 100644 --- a/llvm/test/CodeGen/X86/avx.ll +++ b/llvm/test/CodeGen/X86/avx.ll @@ -81,13 +81,15 @@ define <4 x float> @insertps_from_vector_load_offset_2(<4 x float> %a, ptr nocap ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-NEXT: shll $4, %ecx -; X86-NEXT: vinsertps $0, 12(%eax,%ecx), %xmm0, %xmm0 ## xmm0 = mem[0],xmm0[1,2,3] +; X86-NEXT: vmovss 12(%eax,%ecx), %xmm1 ## xmm1 = mem[0],zero,zero,zero +; X86-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3] ; X86-NEXT: retl ; ; X64-LABEL: insertps_from_vector_load_offset_2: ; X64: ## %bb.0: ; X64-NEXT: shlq $4, %rsi -; X64-NEXT: vinsertps $0, 12(%rdi,%rsi), %xmm0, %xmm0 ## xmm0 = mem[0],xmm0[1,2,3] +; X64-NEXT: vmovss 12(%rdi,%rsi), %xmm1 ## xmm1 = mem[0],zero,zero,zero +; X64-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3] ; X64-NEXT: retq %1 = getelementptr inbounds <4 x float>, ptr %pb, i64 %index %2 = load <4 x float>, ptr %1, align 16 diff --git a/llvm/test/CodeGen/X86/avx1-logical-load-folding.ll b/llvm/test/CodeGen/X86/avx1-logical-load-folding.ll index e971d1e471bf7..49cc58c8e73a8 100644 --- a/llvm/test/CodeGen/X86/avx1-logical-load-folding.ll +++ b/llvm/test/CodeGen/X86/avx1-logical-load-folding.ll @@ -9,14 +9,16 @@ define void @test1(ptr %A, ptr %C) #0 { ; X86-NEXT: movl 
{{[0-9]+}}(%esp), %eax ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero -; X86-NEXT: vandps {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0, %xmm0 +; X86-NEXT: vmovss {{.*#+}} xmm1 = [NaN,0.0E+0,0.0E+0,0.0E+0] +; X86-NEXT: vandps %xmm1, %xmm0, %xmm0 ; X86-NEXT: vmovss %xmm0, (%eax) ; X86-NEXT: retl ; ; X64-LABEL: test1: ; X64: ## %bb.0: ; X64-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero -; X64-NEXT: vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; X64-NEXT: vmovss {{.*#+}} xmm1 = [NaN,0.0E+0,0.0E+0,0.0E+0] +; X64-NEXT: vandps %xmm1, %xmm0, %xmm0 ; X64-NEXT: vmovss %xmm0, (%rsi) ; X64-NEXT: retq %tmp2 = load <8 x float>, ptr %A, align 32 @@ -34,15 +36,15 @@ define void @test2(ptr %A, ptr %C) #0 { ; X86: ## %bb.0: ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: vmovaps (%ecx), %xmm0 -; X86-NEXT: vorps {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0, %xmm0 +; X86-NEXT: vmovss {{.*#+}} xmm0 = [2147483647,0,0,0] +; X86-NEXT: vorps (%ecx), %xmm0, %xmm0 ; X86-NEXT: vmovss %xmm0, (%eax) ; X86-NEXT: retl ; ; X64-LABEL: test2: ; X64: ## %bb.0: -; X64-NEXT: vmovaps (%rdi), %xmm0 -; X64-NEXT: vorps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; X64-NEXT: vmovss {{.*#+}} xmm0 = [2147483647,0,0,0] +; X64-NEXT: vorps (%rdi), %xmm0, %xmm0 ; X64-NEXT: vmovss %xmm0, (%rsi) ; X64-NEXT: retq %tmp2 = load <8 x float>, ptr %A, align 32 @@ -60,15 +62,15 @@ define void @test3(ptr %A, ptr %C) #0 { ; X86: ## %bb.0: ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: vmovaps (%ecx), %xmm0 -; X86-NEXT: vxorps {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0, %xmm0 +; X86-NEXT: vmovss {{.*#+}} xmm0 = [2147483647,0,0,0] +; X86-NEXT: vxorps (%ecx), %xmm0, %xmm0 ; X86-NEXT: vmovss %xmm0, (%eax) ; X86-NEXT: retl ; ; X64-LABEL: test3: ; X64: ## %bb.0: -; X64-NEXT: vmovaps (%rdi), %xmm0 -; X64-NEXT: vxorps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; X64-NEXT: vmovss {{.*#+}} xmm0 = [2147483647,0,0,0] +; X64-NEXT: vxorps (%rdi), %xmm0, %xmm0 ; X64-NEXT: vmovss %xmm0, (%rsi) ; X64-NEXT: retq %tmp2 = load <8 x float>, ptr %A, align 32 @@ -86,14 +88,16 @@ define void @test4(ptr %A, ptr %C) #0 { ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-NEXT: vmovaps (%ecx), %xmm0 -; X86-NEXT: vandnps {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0, %xmm0 +; X86-NEXT: vmovss {{.*#+}} xmm1 = [2147483647,0,0,0] +; X86-NEXT: vandnps %xmm1, %xmm0, %xmm0 ; X86-NEXT: vmovss %xmm0, (%eax) ; X86-NEXT: retl ; ; X64-LABEL: test4: ; X64: ## %bb.0: ; X64-NEXT: vmovaps (%rdi), %xmm0 -; X64-NEXT: vandnps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; X64-NEXT: vmovss {{.*#+}} xmm1 = [2147483647,0,0,0] +; X64-NEXT: vandnps %xmm1, %xmm0, %xmm0 ; X64-NEXT: vmovss %xmm0, (%rsi) ; X64-NEXT: retq %tmp2 = load <8 x float>, ptr %A, align 32 diff --git a/llvm/test/CodeGen/X86/avx512-arith.ll b/llvm/test/CodeGen/X86/avx512-arith.ll index 0fae921b1ca83..20550fc4eb9fa 100644 --- a/llvm/test/CodeGen/X86/avx512-arith.ll +++ b/llvm/test/CodeGen/X86/avx512-arith.ll @@ -1158,7 +1158,7 @@ define <16 x i32> @masked_inc_test(<16 x i32> %i, <16 x i32> %mask1) nounwind re ; CHECK-LABEL: masked_inc_test: ; CHECK: # %bb.0: ; CHECK-NEXT: vptestmd %zmm1, %zmm1, %k1 -; CHECK-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 +; CHECK-NEXT: vpternlogd {{.*#+}} zmm1 = -1 ; CHECK-NEXT: vpsubd %zmm1, %zmm0, %zmm0 {%k1} ; CHECK-NEXT: retq %mask = icmp ne <16 x i32> %mask1, zeroinitializer @@ -1171,7 +1171,7 @@ define <16 x i32> @masked_dec_test(<16 x i32> %i, <16 x i32> 
%mask1) nounwind re ; CHECK-LABEL: masked_dec_test: ; CHECK: # %bb.0: ; CHECK-NEXT: vptestmd %zmm1, %zmm1, %k1 -; CHECK-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 +; CHECK-NEXT: vpternlogd {{.*#+}} zmm1 = -1 ; CHECK-NEXT: vpaddd %zmm1, %zmm0, %zmm0 {%k1} ; CHECK-NEXT: retq %mask = icmp ne <16 x i32> %mask1, zeroinitializer diff --git a/llvm/test/CodeGen/X86/avx512-intrinsics-fast-isel.ll b/llvm/test/CodeGen/X86/avx512-intrinsics-fast-isel.ll index a8574c0b7516c..80b4ae4942097 100644 --- a/llvm/test/CodeGen/X86/avx512-intrinsics-fast-isel.ll +++ b/llvm/test/CodeGen/X86/avx512-intrinsics-fast-isel.ll @@ -5824,8 +5824,10 @@ define <2 x double> @test_mm_mask_fmsub_round_sd(<2 x double> %__W, i8 zeroext % ; ; X64-LABEL: test_mm_mask_fmsub_round_sd: ; X64: # %bb.0: # %entry +; X64-NEXT: vmovsd {{.*#+}} xmm3 = [-0.0E+0,0.0E+0] +; X64-NEXT: vxorpd %xmm3, %xmm2, %xmm2 ; X64-NEXT: kmovw %edi, %k1 -; X64-NEXT: vfmsub213sd {rn-sae}, %xmm2, %xmm1, %xmm0 {%k1} +; X64-NEXT: vfmadd213sd {rn-sae}, %xmm2, %xmm1, %xmm0 {%k1} ; X64-NEXT: retq entry: %0 = extractelement <2 x double> %__W, i64 0 @@ -5876,8 +5878,10 @@ define <2 x double> @test_mm_maskz_fmsub_round_sd(i8 zeroext %__U, <2 x double> ; ; X64-LABEL: test_mm_maskz_fmsub_round_sd: ; X64: # %bb.0: # %entry +; X64-NEXT: vmovsd {{.*#+}} xmm3 = [-0.0E+0,0.0E+0] +; X64-NEXT: vxorpd %xmm3, %xmm2, %xmm2 ; X64-NEXT: kmovw %edi, %k1 -; X64-NEXT: vfmsub213sd {rn-sae}, %xmm2, %xmm1, %xmm0 {%k1} {z} +; X64-NEXT: vfmadd213sd {rn-sae}, %xmm2, %xmm1, %xmm0 {%k1} {z} ; X64-NEXT: retq entry: %0 = extractelement <2 x double> %__A, i64 0 @@ -5932,8 +5936,11 @@ define <2 x double> @test_mm_mask3_fmsub_round_sd(<2 x double> %__W, <2 x double ; ; X64-LABEL: test_mm_mask3_fmsub_round_sd: ; X64: # %bb.0: # %entry +; X64-NEXT: vmovsd {{.*#+}} xmm3 = [-0.0E+0,0.0E+0] +; X64-NEXT: vxorpd %xmm3, %xmm2, %xmm3 +; X64-NEXT: vfmadd213sd {rn-sae}, %xmm3, %xmm0, %xmm1 ; X64-NEXT: kmovw %edi, %k1 -; X64-NEXT: vfmsub231sd {rn-sae}, %xmm1, %xmm0, %xmm2 {%k1} +; X64-NEXT: vmovsd %xmm1, %xmm2, %xmm2 {%k1} ; X64-NEXT: vmovapd %xmm2, %xmm0 ; X64-NEXT: retq entry: @@ -5986,8 +5993,10 @@ define <2 x double> @test_mm_mask_fnmadd_round_sd(<2 x double> %__W, i8 zeroext ; ; X64-LABEL: test_mm_mask_fnmadd_round_sd: ; X64: # %bb.0: # %entry +; X64-NEXT: vmovsd {{.*#+}} xmm3 = [-0.0E+0,0.0E+0] +; X64-NEXT: vxorpd %xmm3, %xmm1, %xmm1 ; X64-NEXT: kmovw %edi, %k1 -; X64-NEXT: vfnmadd213sd {rn-sae}, %xmm2, %xmm1, %xmm0 {%k1} +; X64-NEXT: vfmadd213sd {rn-sae}, %xmm2, %xmm1, %xmm0 {%k1} ; X64-NEXT: retq entry: %0 = extractelement <2 x double> %__W, i64 0 @@ -6038,8 +6047,10 @@ define <2 x double> @test_mm_maskz_fnmadd_round_sd(i8 zeroext %__U, <2 x double> ; ; X64-LABEL: test_mm_maskz_fnmadd_round_sd: ; X64: # %bb.0: # %entry +; X64-NEXT: vmovsd {{.*#+}} xmm3 = [-0.0E+0,0.0E+0] +; X64-NEXT: vxorpd %xmm3, %xmm1, %xmm1 ; X64-NEXT: kmovw %edi, %k1 -; X64-NEXT: vfnmadd213sd {rn-sae}, %xmm2, %xmm1, %xmm0 {%k1} {z} +; X64-NEXT: vfmadd213sd {rn-sae}, %xmm2, %xmm1, %xmm0 {%k1} {z} ; X64-NEXT: retq entry: %0 = extractelement <2 x double> %__A, i64 0 @@ -6094,8 +6105,10 @@ define <2 x double> @test_mm_mask3_fnmadd_round_sd(<2 x double> %__W, <2 x doubl ; ; X64-LABEL: test_mm_mask3_fnmadd_round_sd: ; X64: # %bb.0: # %entry +; X64-NEXT: vmovsd {{.*#+}} xmm3 = [-0.0E+0,0.0E+0] +; X64-NEXT: vxorpd %xmm3, %xmm1, %xmm1 ; X64-NEXT: kmovw %edi, %k1 -; X64-NEXT: vfnmadd231sd {rn-sae}, %xmm1, %xmm0, %xmm2 {%k1} +; X64-NEXT: vfmadd231sd {rn-sae}, %xmm1, %xmm0, %xmm2 {%k1} ; X64-NEXT: vmovapd %xmm2, %xmm0 ; X64-NEXT: retq 
entry: diff --git a/llvm/test/CodeGen/X86/avx512-shuffles/partial_permute.ll b/llvm/test/CodeGen/X86/avx512-shuffles/partial_permute.ll index aac5847061cbe..c2afa2b971d75 100644 --- a/llvm/test/CodeGen/X86/avx512-shuffles/partial_permute.ll +++ b/llvm/test/CodeGen/X86/avx512-shuffles/partial_permute.ll @@ -227,9 +227,9 @@ define <8 x i16> @test_masked_z_16xi16_to_8xi16_perm_mem_mask1(ptr %vp, <8 x i16 define <8 x i16> @test_masked_16xi16_to_8xi16_perm_mem_mask2(ptr %vp, <8 x i16> %vec2, <8 x i16> %mask) { ; CHECK-LABEL: test_masked_16xi16_to_8xi16_perm_mem_mask2: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovdqa 16(%rdi), %xmm2 -; CHECK-NEXT: vpmovsxbw {{.*#+}} xmm3 = [9,0,3,0,5,0,7,1] -; CHECK-NEXT: vpermi2w (%rdi), %xmm2, %xmm3 +; CHECK-NEXT: vpbroadcastw 2(%rdi), %xmm2 +; CHECK-NEXT: vpmovsxbw {{.*#+}} xmm3 = [1,8,11,8,13,8,15,9] +; CHECK-NEXT: vpermi2w 16(%rdi), %xmm2, %xmm3 ; CHECK-NEXT: vptestnmw %xmm1, %xmm1, %k1 ; CHECK-NEXT: vmovdqu16 %xmm3, %xmm0 {%k1} ; CHECK-NEXT: retq @@ -243,10 +243,10 @@ define <8 x i16> @test_masked_16xi16_to_8xi16_perm_mem_mask2(ptr %vp, <8 x i16> define <8 x i16> @test_masked_z_16xi16_to_8xi16_perm_mem_mask2(ptr %vp, <8 x i16> %mask) { ; CHECK-LABEL: test_masked_z_16xi16_to_8xi16_perm_mem_mask2: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovdqa 16(%rdi), %xmm2 -; CHECK-NEXT: vpmovsxbw {{.*#+}} xmm1 = [9,0,3,0,5,0,7,1] +; CHECK-NEXT: vpbroadcastw 2(%rdi), %xmm2 +; CHECK-NEXT: vpmovsxbw {{.*#+}} xmm1 = [1,8,11,8,13,8,15,9] ; CHECK-NEXT: vptestnmw %xmm0, %xmm0, %k1 -; CHECK-NEXT: vpermi2w (%rdi), %xmm2, %xmm1 {%k1} {z} +; CHECK-NEXT: vpermi2w 16(%rdi), %xmm2, %xmm1 {%k1} {z} ; CHECK-NEXT: vmovdqa %xmm1, %xmm0 ; CHECK-NEXT: retq %vec = load <16 x i16>, ptr %vp @@ -259,10 +259,9 @@ define <8 x i16> @test_masked_z_16xi16_to_8xi16_perm_mem_mask2(ptr %vp, <8 x i16 define <8 x i16> @test_16xi16_to_8xi16_perm_mem_mask3(ptr %vp) { ; CHECK-LABEL: test_16xi16_to_8xi16_perm_mem_mask3: ; CHECK: # %bb.0: -; CHECK-NEXT: vpmovsxbw {{.*#+}} xmm0 = [9,7,9,6,9,4,3,2] -; CHECK-NEXT: vpermw (%rdi), %ymm0, %ymm0 -; CHECK-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 -; CHECK-NEXT: vzeroupper +; CHECK-NEXT: vpbroadcastw 18(%rdi), %xmm1 +; CHECK-NEXT: vpmovsxbw {{.*#+}} xmm0 = [1,15,1,14,1,12,11,10] +; CHECK-NEXT: vpermi2w (%rdi), %xmm1, %xmm0 ; CHECK-NEXT: retq %vec = load <16 x i16>, ptr %vp %res = shufflevector <16 x i16> %vec, <16 x i16> undef, <8 x i32> @@ -271,12 +270,11 @@ define <8 x i16> @test_16xi16_to_8xi16_perm_mem_mask3(ptr %vp) { define <8 x i16> @test_masked_16xi16_to_8xi16_perm_mem_mask3(ptr %vp, <8 x i16> %vec2, <8 x i16> %mask) { ; CHECK-LABEL: test_masked_16xi16_to_8xi16_perm_mem_mask3: ; CHECK: # %bb.0: -; CHECK-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0 -; CHECK-NEXT: vpmovsxbw {{.*#+}} xmm2 = [9,7,9,6,9,4,3,2] +; CHECK-NEXT: vpbroadcastw 18(%rdi), %xmm2 +; CHECK-NEXT: vpmovsxbw {{.*#+}} xmm3 = [1,15,1,14,1,12,11,10] +; CHECK-NEXT: vpermi2w (%rdi), %xmm2, %xmm3 ; CHECK-NEXT: vptestnmw %xmm1, %xmm1, %k1 -; CHECK-NEXT: vpermw (%rdi), %ymm2, %ymm0 {%k1} -; CHECK-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 -; CHECK-NEXT: vzeroupper +; CHECK-NEXT: vmovdqu16 %xmm3, %xmm0 {%k1} ; CHECK-NEXT: retq %vec = load <16 x i16>, ptr %vp %shuf = shufflevector <16 x i16> %vec, <16 x i16> undef, <8 x i32> @@ -288,11 +286,11 @@ define <8 x i16> @test_masked_16xi16_to_8xi16_perm_mem_mask3(ptr %vp, <8 x i16> define <8 x i16> @test_masked_z_16xi16_to_8xi16_perm_mem_mask3(ptr %vp, <8 x i16> %mask) { ; CHECK-LABEL: test_masked_z_16xi16_to_8xi16_perm_mem_mask3: ; CHECK: # %bb.0: -; 
CHECK-NEXT: vpmovsxbw {{.*#+}} xmm1 = [9,7,9,6,9,4,3,2] +; CHECK-NEXT: vpbroadcastw 18(%rdi), %xmm2 +; CHECK-NEXT: vpmovsxbw {{.*#+}} xmm1 = [1,15,1,14,1,12,11,10] ; CHECK-NEXT: vptestnmw %xmm0, %xmm0, %k1 -; CHECK-NEXT: vpermw (%rdi), %ymm1, %ymm0 {%k1} {z} -; CHECK-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 -; CHECK-NEXT: vzeroupper +; CHECK-NEXT: vpermi2w (%rdi), %xmm2, %xmm1 {%k1} {z} +; CHECK-NEXT: vmovdqa %xmm1, %xmm0 ; CHECK-NEXT: retq %vec = load <16 x i16>, ptr %vp %shuf = shufflevector <16 x i16> %vec, <16 x i16> undef, <8 x i32> @@ -778,9 +776,9 @@ define <8 x i16> @test_masked_z_32xi16_to_8xi16_perm_mem_mask0(ptr %vp, <8 x i16 define <8 x i16> @test_masked_32xi16_to_8xi16_perm_mem_mask1(ptr %vp, <8 x i16> %vec2, <8 x i16> %mask) { ; CHECK-LABEL: test_masked_32xi16_to_8xi16_perm_mem_mask1: ; CHECK: # %bb.0: -; CHECK-NEXT: vpmovsxbw {{.*#+}} xmm2 = [7,6,4,6,12,4,27,1] -; CHECK-NEXT: vmovdqa 32(%rdi), %ymm3 -; CHECK-NEXT: vpermt2w (%rdi), %ymm2, %ymm3 +; CHECK-NEXT: vpmovsxbw {{.*#+}} xmm2 = [23,22,20,22,28,20,11,17] +; CHECK-NEXT: vpbroadcastw 22(%rdi), %ymm3 +; CHECK-NEXT: vpermt2w 32(%rdi), %ymm2, %ymm3 ; CHECK-NEXT: vptestnmw %xmm1, %xmm1, %k1 ; CHECK-NEXT: vmovdqu16 %xmm3, %xmm0 {%k1} ; CHECK-NEXT: vzeroupper @@ -795,10 +793,10 @@ define <8 x i16> @test_masked_32xi16_to_8xi16_perm_mem_mask1(ptr %vp, <8 x i16> define <8 x i16> @test_masked_z_32xi16_to_8xi16_perm_mem_mask1(ptr %vp, <8 x i16> %mask) { ; CHECK-LABEL: test_masked_z_32xi16_to_8xi16_perm_mem_mask1: ; CHECK: # %bb.0: -; CHECK-NEXT: vpmovsxbw {{.*#+}} xmm2 = [7,6,4,6,12,4,27,1] -; CHECK-NEXT: vmovdqa 32(%rdi), %ymm1 +; CHECK-NEXT: vpmovsxbw {{.*#+}} xmm2 = [23,22,20,22,28,20,11,17] +; CHECK-NEXT: vpbroadcastw 22(%rdi), %ymm1 ; CHECK-NEXT: vptestnmw %xmm0, %xmm0, %k1 -; CHECK-NEXT: vpermt2w (%rdi), %ymm2, %ymm1 {%k1} {z} +; CHECK-NEXT: vpermt2w 32(%rdi), %ymm2, %ymm1 {%k1} {z} ; CHECK-NEXT: vmovdqa %xmm1, %xmm0 ; CHECK-NEXT: vzeroupper ; CHECK-NEXT: retq @@ -1041,8 +1039,9 @@ define <4 x i32> @test_masked_z_8xi32_to_4xi32_perm_mask3(<8 x i32> %vec, <4 x i define <4 x i32> @test_8xi32_to_4xi32_perm_mem_mask0(ptr %vp) { ; CHECK-LABEL: test_8xi32_to_4xi32_perm_mem_mask0: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps 16(%rdi), %xmm0 -; CHECK-NEXT: vshufps $7, (%rdi), %xmm0, %xmm0 # xmm0 = xmm0[3,1],mem[0,0] +; CHECK-NEXT: vmovss (%rdi), %xmm0 # xmm0 = mem[0],zero,zero,zero +; CHECK-NEXT: vmovaps 16(%rdi), %xmm1 +; CHECK-NEXT: vshufps {{.*#+}} xmm0 = xmm1[3,1],xmm0[0,0] ; CHECK-NEXT: retq %vec = load <8 x i32>, ptr %vp %res = shufflevector <8 x i32> %vec, <8 x i32> undef, <4 x i32> @@ -1051,8 +1050,9 @@ define <4 x i32> @test_8xi32_to_4xi32_perm_mem_mask0(ptr %vp) { define <4 x i32> @test_masked_8xi32_to_4xi32_perm_mem_mask0(ptr %vp, <4 x i32> %vec2, <4 x i32> %mask) { ; CHECK-LABEL: test_masked_8xi32_to_4xi32_perm_mem_mask0: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps 16(%rdi), %xmm2 -; CHECK-NEXT: vshufps $7, (%rdi), %xmm2, %xmm2 # xmm2 = xmm2[3,1],mem[0,0] +; CHECK-NEXT: vmovss (%rdi), %xmm2 # xmm2 = mem[0],zero,zero,zero +; CHECK-NEXT: vmovaps 16(%rdi), %xmm3 +; CHECK-NEXT: vshufps {{.*#+}} xmm2 = xmm3[3,1],xmm2[0,0] ; CHECK-NEXT: vptestnmd %xmm1, %xmm1, %k1 ; CHECK-NEXT: vmovdqa32 %xmm2, %xmm0 {%k1} ; CHECK-NEXT: retq @@ -1066,8 +1066,9 @@ define <4 x i32> @test_masked_8xi32_to_4xi32_perm_mem_mask0(ptr %vp, <4 x i32> % define <4 x i32> @test_masked_z_8xi32_to_4xi32_perm_mem_mask0(ptr %vp, <4 x i32> %mask) { ; CHECK-LABEL: test_masked_z_8xi32_to_4xi32_perm_mem_mask0: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps 16(%rdi), 
%xmm1 -; CHECK-NEXT: vshufps $7, (%rdi), %xmm1, %xmm1 # xmm1 = xmm1[3,1],mem[0,0] +; CHECK-NEXT: vmovss (%rdi), %xmm1 # xmm1 = mem[0],zero,zero,zero +; CHECK-NEXT: vmovaps 16(%rdi), %xmm2 +; CHECK-NEXT: vshufps {{.*#+}} xmm1 = xmm2[3,1],xmm1[0,0] ; CHECK-NEXT: vptestnmd %xmm0, %xmm0, %k1 ; CHECK-NEXT: vmovdqa32 %xmm1, %xmm0 {%k1} {z} ; CHECK-NEXT: retq @@ -1081,12 +1082,11 @@ define <4 x i32> @test_masked_z_8xi32_to_4xi32_perm_mem_mask0(ptr %vp, <4 x i32> define <4 x i32> @test_masked_8xi32_to_4xi32_perm_mem_mask1(ptr %vp, <4 x i32> %vec2, <4 x i32> %mask) { ; CHECK-LABEL: test_masked_8xi32_to_4xi32_perm_mem_mask1: ; CHECK: # %bb.0: -; CHECK-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0 -; CHECK-NEXT: vpmovsxbd {{.*#+}} xmm2 = [5,0,0,3] +; CHECK-NEXT: vpbroadcastd 20(%rdi), %xmm2 +; CHECK-NEXT: vpmovsxbd {{.*#+}} xmm3 = [1,4,4,7] +; CHECK-NEXT: vpermi2d (%rdi), %xmm2, %xmm3 ; CHECK-NEXT: vptestnmd %xmm1, %xmm1, %k1 -; CHECK-NEXT: vpermd (%rdi), %ymm2, %ymm0 {%k1} -; CHECK-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 -; CHECK-NEXT: vzeroupper +; CHECK-NEXT: vmovdqa32 %xmm3, %xmm0 {%k1} ; CHECK-NEXT: retq %vec = load <8 x i32>, ptr %vp %shuf = shufflevector <8 x i32> %vec, <8 x i32> undef, <4 x i32> @@ -1098,11 +1098,11 @@ define <4 x i32> @test_masked_8xi32_to_4xi32_perm_mem_mask1(ptr %vp, <4 x i32> % define <4 x i32> @test_masked_z_8xi32_to_4xi32_perm_mem_mask1(ptr %vp, <4 x i32> %mask) { ; CHECK-LABEL: test_masked_z_8xi32_to_4xi32_perm_mem_mask1: ; CHECK: # %bb.0: -; CHECK-NEXT: vpmovsxbd {{.*#+}} xmm1 = [5,0,0,3] +; CHECK-NEXT: vpbroadcastd 20(%rdi), %xmm2 +; CHECK-NEXT: vpmovsxbd {{.*#+}} xmm1 = [1,4,4,7] ; CHECK-NEXT: vptestnmd %xmm0, %xmm0, %k1 -; CHECK-NEXT: vpermd (%rdi), %ymm1, %ymm0 {%k1} {z} -; CHECK-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 -; CHECK-NEXT: vzeroupper +; CHECK-NEXT: vpermi2d (%rdi), %xmm2, %xmm1 {%k1} {z} +; CHECK-NEXT: vmovdqa %xmm1, %xmm0 ; CHECK-NEXT: retq %vec = load <8 x i32>, ptr %vp %shuf = shufflevector <8 x i32> %vec, <8 x i32> undef, <4 x i32> @@ -1114,9 +1114,9 @@ define <4 x i32> @test_masked_z_8xi32_to_4xi32_perm_mem_mask1(ptr %vp, <4 x i32> define <4 x i32> @test_masked_8xi32_to_4xi32_perm_mem_mask2(ptr %vp, <4 x i32> %vec2, <4 x i32> %mask) { ; CHECK-LABEL: test_masked_8xi32_to_4xi32_perm_mem_mask2: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovdqa 16(%rdi), %xmm2 +; CHECK-NEXT: vmovd 16(%rdi), %xmm2 # xmm2 = mem[0],zero,zero,zero ; CHECK-NEXT: vpmovsxbd {{.*#+}} xmm3 = [0,7,7,0] -; CHECK-NEXT: vpermi2d (%rdi), %xmm2, %xmm3 +; CHECK-NEXT: vpermi2d 12(%rdi){1to4}, %xmm2, %xmm3 ; CHECK-NEXT: vptestnmd %xmm1, %xmm1, %k1 ; CHECK-NEXT: vmovdqa32 %xmm3, %xmm0 {%k1} ; CHECK-NEXT: retq @@ -1130,10 +1130,10 @@ define <4 x i32> @test_masked_8xi32_to_4xi32_perm_mem_mask2(ptr %vp, <4 x i32> % define <4 x i32> @test_masked_z_8xi32_to_4xi32_perm_mem_mask2(ptr %vp, <4 x i32> %mask) { ; CHECK-LABEL: test_masked_z_8xi32_to_4xi32_perm_mem_mask2: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovdqa 16(%rdi), %xmm2 +; CHECK-NEXT: vmovd 16(%rdi), %xmm2 # xmm2 = mem[0],zero,zero,zero ; CHECK-NEXT: vpmovsxbd {{.*#+}} xmm1 = [0,7,7,0] ; CHECK-NEXT: vptestnmd %xmm0, %xmm0, %k1 -; CHECK-NEXT: vpermi2d (%rdi), %xmm2, %xmm1 {%k1} {z} +; CHECK-NEXT: vpermi2d 12(%rdi){1to4}, %xmm2, %xmm1 {%k1} {z} ; CHECK-NEXT: vmovdqa %xmm1, %xmm0 ; CHECK-NEXT: retq %vec = load <8 x i32>, ptr %vp @@ -1609,9 +1609,10 @@ define <8 x i32> @test_masked_z_16xi32_to_8xi32_perm_mem_mask3(ptr %vp, <8 x i32 define <4 x i32> @test_16xi32_to_4xi32_perm_mem_mask0(ptr %vp) { ; CHECK-LABEL: 
test_16xi32_to_4xi32_perm_mem_mask0: ; CHECK: # %bb.0: -; CHECK-NEXT: vpmovsxbd {{.*#+}} xmm0 = [13,0,0,6] -; CHECK-NEXT: vpermps (%rdi), %zmm0, %zmm0 -; CHECK-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 +; CHECK-NEXT: vpmovsxbd {{.*#+}} xmm1 = [5,8,8,14] +; CHECK-NEXT: vpbroadcastd 52(%rdi), %ymm0 +; CHECK-NEXT: vpermt2d (%rdi), %ymm1, %ymm0 +; CHECK-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 ; CHECK-NEXT: vzeroupper ; CHECK-NEXT: retq %vec = load <16 x i32>, ptr %vp @@ -1621,11 +1622,11 @@ define <4 x i32> @test_16xi32_to_4xi32_perm_mem_mask0(ptr %vp) { define <4 x i32> @test_masked_16xi32_to_4xi32_perm_mem_mask0(ptr %vp, <4 x i32> %vec2, <4 x i32> %mask) { ; CHECK-LABEL: test_masked_16xi32_to_4xi32_perm_mem_mask0: ; CHECK: # %bb.0: -; CHECK-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 -; CHECK-NEXT: vpmovsxbd {{.*#+}} xmm2 = [13,0,0,6] +; CHECK-NEXT: vpmovsxbd {{.*#+}} xmm2 = [5,8,8,14] +; CHECK-NEXT: vpbroadcastd 52(%rdi), %ymm3 +; CHECK-NEXT: vpermt2d (%rdi), %ymm2, %ymm3 ; CHECK-NEXT: vptestnmd %xmm1, %xmm1, %k1 -; CHECK-NEXT: vpermd (%rdi), %zmm2, %zmm0 {%k1} -; CHECK-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 +; CHECK-NEXT: vmovdqa32 %xmm3, %xmm0 {%k1} ; CHECK-NEXT: vzeroupper ; CHECK-NEXT: retq %vec = load <16 x i32>, ptr %vp @@ -1638,10 +1639,11 @@ define <4 x i32> @test_masked_16xi32_to_4xi32_perm_mem_mask0(ptr %vp, <4 x i32> define <4 x i32> @test_masked_z_16xi32_to_4xi32_perm_mem_mask0(ptr %vp, <4 x i32> %mask) { ; CHECK-LABEL: test_masked_z_16xi32_to_4xi32_perm_mem_mask0: ; CHECK: # %bb.0: -; CHECK-NEXT: vpmovsxbd {{.*#+}} xmm1 = [13,0,0,6] +; CHECK-NEXT: vpmovsxbd {{.*#+}} xmm2 = [5,8,8,14] +; CHECK-NEXT: vpbroadcastd 52(%rdi), %ymm1 ; CHECK-NEXT: vptestnmd %xmm0, %xmm0, %k1 -; CHECK-NEXT: vpermd (%rdi), %zmm1, %zmm0 {%k1} {z} -; CHECK-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 +; CHECK-NEXT: vpermt2d (%rdi), %ymm2, %ymm1 {%k1} {z} +; CHECK-NEXT: vmovdqa %xmm1, %xmm0 ; CHECK-NEXT: vzeroupper ; CHECK-NEXT: retq %vec = load <16 x i32>, ptr %vp @@ -1654,9 +1656,9 @@ define <4 x i32> @test_masked_z_16xi32_to_4xi32_perm_mem_mask0(ptr %vp, <4 x i32 define <4 x i32> @test_masked_16xi32_to_4xi32_perm_mem_mask1(ptr %vp, <4 x i32> %vec2, <4 x i32> %mask) { ; CHECK-LABEL: test_masked_16xi32_to_4xi32_perm_mem_mask1: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovdqa 32(%rdi), %ymm2 -; CHECK-NEXT: vpmovsxbd {{.*#+}} ymm3 = [15,5,3,2,0,0,0,0] -; CHECK-NEXT: vpermi2d (%rdi), %ymm2, %ymm3 +; CHECK-NEXT: vpbroadcastd 28(%rdi), %ymm2 +; CHECK-NEXT: vpmovsxbd {{.*#+}} ymm3 = [7,13,11,10,0,0,0,0] +; CHECK-NEXT: vpermi2d 32(%rdi), %ymm2, %ymm3 ; CHECK-NEXT: vptestnmd %xmm1, %xmm1, %k1 ; CHECK-NEXT: vmovdqa32 %xmm3, %xmm0 {%k1} ; CHECK-NEXT: vzeroupper @@ -1671,10 +1673,10 @@ define <4 x i32> @test_masked_16xi32_to_4xi32_perm_mem_mask1(ptr %vp, <4 x i32> define <4 x i32> @test_masked_z_16xi32_to_4xi32_perm_mem_mask1(ptr %vp, <4 x i32> %mask) { ; CHECK-LABEL: test_masked_z_16xi32_to_4xi32_perm_mem_mask1: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovdqa 32(%rdi), %ymm2 -; CHECK-NEXT: vpmovsxbd {{.*#+}} ymm1 = [15,5,3,2,0,0,0,0] +; CHECK-NEXT: vpbroadcastd 28(%rdi), %ymm2 +; CHECK-NEXT: vpmovsxbd {{.*#+}} ymm1 = [7,13,11,10,0,0,0,0] ; CHECK-NEXT: vptestnmd %xmm0, %xmm0, %k1 -; CHECK-NEXT: vpermi2d (%rdi), %ymm2, %ymm1 {%k1} {z} +; CHECK-NEXT: vpermi2d 32(%rdi), %ymm2, %ymm1 {%k1} {z} ; CHECK-NEXT: vmovdqa %xmm1, %xmm0 ; CHECK-NEXT: vzeroupper ; CHECK-NEXT: retq @@ -1853,8 +1855,9 @@ define <2 x i64> @test_masked_z_4xi64_to_2xi64_perm_mask1(<4 x i64> %vec, <2 x i define <2 x i64> 
@test_4xi64_to_2xi64_perm_mem_mask0(ptr %vp) {
 ; CHECK-LABEL: test_4xi64_to_2xi64_perm_mem_mask0:
 ; CHECK: # %bb.0:
-; CHECK-NEXT: vmovaps (%rdi), %xmm0
-; CHECK-NEXT: vunpckhpd 16(%rdi), %xmm0, %xmm0 # xmm0 = xmm0[1],mem[1]
+; CHECK-NEXT: vmovsd 24(%rdi), %xmm0 # xmm0 = mem[0],zero
+; CHECK-NEXT: vmovsd 8(%rdi), %xmm1 # xmm1 = mem[0],zero
+; CHECK-NEXT: vmovlhps {{.*#+}} xmm0 = xmm1[0],xmm0[0]
 ; CHECK-NEXT: retq
 %vec = load <4 x i64>, ptr %vp
 %res = shufflevector <4 x i64> %vec, <4 x i64> undef, <2 x i32>
@@ -1863,9 +1866,10 @@ define <2 x i64> @test_4xi64_to_2xi64_perm_mem_mask0(ptr %vp) {
 define <2 x i64> @test_masked_4xi64_to_2xi64_perm_mem_mask0(ptr %vp, <2 x i64> %vec2, <2 x i64> %mask) {
 ; CHECK-LABEL: test_masked_4xi64_to_2xi64_perm_mem_mask0:
 ; CHECK: # %bb.0:
-; CHECK-NEXT: vmovdqa (%rdi), %xmm2
+; CHECK-NEXT: vmovq 24(%rdi), %xmm2 # xmm2 = mem[0],zero
+; CHECK-NEXT: vmovq 8(%rdi), %xmm3 # xmm3 = mem[0],zero
 ; CHECK-NEXT: vptestnmq %xmm1, %xmm1, %k1
-; CHECK-NEXT: vpunpckhqdq 16(%rdi), %xmm2, %xmm0 {%k1} # xmm0 {%k1} = xmm2[1],mem[1]
+; CHECK-NEXT: vpunpcklqdq {{.*#+}} xmm0 {%k1} = xmm3[0],xmm2[0]
 ; CHECK-NEXT: retq
 %vec = load <4 x i64>, ptr %vp
 %shuf = shufflevector <4 x i64> %vec, <4 x i64> undef, <2 x i32>
@@ -1877,9 +1881,10 @@ define <2 x i64> @test_masked_4xi64_to_2xi64_perm_mem_mask0(ptr %vp, <2 x i64> %
 define <2 x i64> @test_masked_z_4xi64_to_2xi64_perm_mem_mask0(ptr %vp, <2 x i64> %mask) {
 ; CHECK-LABEL: test_masked_z_4xi64_to_2xi64_perm_mem_mask0:
 ; CHECK: # %bb.0:
-; CHECK-NEXT: vmovdqa (%rdi), %xmm1
+; CHECK-NEXT: vmovq 24(%rdi), %xmm1 # xmm1 = mem[0],zero
+; CHECK-NEXT: vmovq 8(%rdi), %xmm2 # xmm2 = mem[0],zero
 ; CHECK-NEXT: vptestnmq %xmm0, %xmm0, %k1
-; CHECK-NEXT: vpunpckhqdq 16(%rdi), %xmm1, %xmm0 {%k1} {z} # xmm0 {%k1} {z} = xmm1[1],mem[1]
+; CHECK-NEXT: vpunpcklqdq {{.*#+}} xmm0 {%k1} {z} = xmm2[0],xmm1[0]
 ; CHECK-NEXT: retq
 %vec = load <4 x i64>, ptr %vp
 %shuf = shufflevector <4 x i64> %vec, <4 x i64> undef, <2 x i32>
@@ -1891,10 +1896,10 @@ define <2 x i64> @test_masked_z_4xi64_to_2xi64_perm_mem_mask0(ptr %vp, <2 x i64>
 define <2 x i64> @test_masked_4xi64_to_2xi64_perm_mem_mask1(ptr %vp, <2 x i64> %vec2, <2 x i64> %mask) {
 ; CHECK-LABEL: test_masked_4xi64_to_2xi64_perm_mem_mask1:
 ; CHECK: # %bb.0:
-; CHECK-NEXT: vmovdqa 16(%rdi), %xmm2
-; CHECK-NEXT: vpblendd $12, (%rdi), %xmm2, %xmm2 # xmm2 = xmm2[0,1],mem[2,3]
+; CHECK-NEXT: vmovq 8(%rdi), %xmm2 # xmm2 = mem[0],zero
+; CHECK-NEXT: vmovq 16(%rdi), %xmm3 # xmm3 = mem[0],zero
 ; CHECK-NEXT: vptestnmq %xmm1, %xmm1, %k1
-; CHECK-NEXT: vmovdqa64 %xmm2, %xmm0 {%k1}
+; CHECK-NEXT: vpunpcklqdq {{.*#+}} xmm0 {%k1} = xmm3[0],xmm2[0]
 ; CHECK-NEXT: retq
 %vec = load <4 x i64>, ptr %vp
 %shuf = shufflevector <4 x i64> %vec, <4 x i64> undef, <2 x i32>
@@ -1906,10 +1911,10 @@ define <2 x i64> @test_masked_4xi64_to_2xi64_perm_mem_mask1(ptr %vp, <2 x i64> %
 define <2 x i64> @test_masked_z_4xi64_to_2xi64_perm_mem_mask1(ptr %vp, <2 x i64> %mask) {
 ; CHECK-LABEL: test_masked_z_4xi64_to_2xi64_perm_mem_mask1:
 ; CHECK: # %bb.0:
-; CHECK-NEXT: vmovdqa 16(%rdi), %xmm1
-; CHECK-NEXT: vpblendd $12, (%rdi), %xmm1, %xmm1 # xmm1 = xmm1[0,1],mem[2,3]
+; CHECK-NEXT: vmovq 8(%rdi), %xmm1 # xmm1 = mem[0],zero
+; CHECK-NEXT: vmovq 16(%rdi), %xmm2 # xmm2 = mem[0],zero
 ; CHECK-NEXT: vptestnmq %xmm0, %xmm0, %k1
-; CHECK-NEXT: vmovdqa64 %xmm1, %xmm0 {%k1} {z}
+; CHECK-NEXT: vpunpcklqdq {{.*#+}} xmm0 {%k1} {z} = xmm2[0],xmm1[0]
 ; CHECK-NEXT: retq
 %vec = load <4 x i64>, ptr %vp
 %shuf = shufflevector <4 x i64> %vec, <4 x i64> undef, <2 x i32>
@@ -2374,17 +2379,17 @@ define <4 x i64> @test_masked_z_8xi64_to_4xi64_perm_mem_mask0(ptr %vp, <4 x i64>
 define <4 x i64> @test_masked_8xi64_to_4xi64_perm_mem_mask1(ptr %vp, <4 x i64> %vec2, <4 x i64> %mask) {
 ; CHECK-FAST-LABEL: test_masked_8xi64_to_4xi64_perm_mem_mask1:
 ; CHECK-FAST: # %bb.0:
-; CHECK-FAST-NEXT: vmovdqa 32(%rdi), %ymm2
-; CHECK-FAST-NEXT: vpmovsxbq {{.*#+}} ymm3 = [4,3,2,4]
-; CHECK-FAST-NEXT: vpermi2q (%rdi), %ymm2, %ymm3
+; CHECK-FAST-NEXT: vmovq (%rdi), %xmm2 # xmm2 = mem[0],zero
+; CHECK-FAST-NEXT: vpmovsxbq {{.*#+}} ymm3 = [0,7,6,0]
+; CHECK-FAST-NEXT: vpermi2q 32(%rdi), %ymm2, %ymm3
 ; CHECK-FAST-NEXT: vptestnmq %ymm1, %ymm1, %k1
 ; CHECK-FAST-NEXT: vmovdqa64 %ymm3, %ymm0 {%k1}
 ; CHECK-FAST-NEXT: retq
 ;
 ; CHECK-FAST-PERLANE-LABEL: test_masked_8xi64_to_4xi64_perm_mem_mask1:
 ; CHECK-FAST-PERLANE: # %bb.0:
-; CHECK-FAST-PERLANE-NEXT: vmovdqa 32(%rdi), %ymm2
-; CHECK-FAST-PERLANE-NEXT: vpblendd $15, (%rdi), %ymm2, %ymm2 # ymm2 = mem[0,1,2,3],ymm2[4,5,6,7]
+; CHECK-FAST-PERLANE-NEXT: vmovq (%rdi), %xmm2 # xmm2 = mem[0],zero
+; CHECK-FAST-PERLANE-NEXT: vpblendd $240, 32(%rdi), %ymm2, %ymm2 # ymm2 = ymm2[0,1,2,3],mem[4,5,6,7]
 ; CHECK-FAST-PERLANE-NEXT: vptestnmq %ymm1, %ymm1, %k1
 ; CHECK-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm0 {%k1} = ymm2[0,3,2,0]
 ; CHECK-FAST-PERLANE-NEXT: retq
@@ -2398,17 +2403,17 @@ define <4 x i64> @test_masked_8xi64_to_4xi64_perm_mem_mask1(ptr %vp, <4 x i64> %
 define <4 x i64> @test_masked_z_8xi64_to_4xi64_perm_mem_mask1(ptr %vp, <4 x i64> %mask) {
 ; CHECK-FAST-LABEL: test_masked_z_8xi64_to_4xi64_perm_mem_mask1:
 ; CHECK-FAST: # %bb.0:
-; CHECK-FAST-NEXT: vmovdqa 32(%rdi), %ymm2
-; CHECK-FAST-NEXT: vpmovsxbq {{.*#+}} ymm1 = [4,3,2,4]
+; CHECK-FAST-NEXT: vmovq (%rdi), %xmm2 # xmm2 = mem[0],zero
+; CHECK-FAST-NEXT: vpmovsxbq {{.*#+}} ymm1 = [0,7,6,0]
 ; CHECK-FAST-NEXT: vptestnmq %ymm0, %ymm0, %k1
-; CHECK-FAST-NEXT: vpermi2q (%rdi), %ymm2, %ymm1 {%k1} {z}
+; CHECK-FAST-NEXT: vpermi2q 32(%rdi), %ymm2, %ymm1 {%k1} {z}
 ; CHECK-FAST-NEXT: vmovdqa %ymm1, %ymm0
 ; CHECK-FAST-NEXT: retq
 ;
 ; CHECK-FAST-PERLANE-LABEL: test_masked_z_8xi64_to_4xi64_perm_mem_mask1:
 ; CHECK-FAST-PERLANE: # %bb.0:
-; CHECK-FAST-PERLANE-NEXT: vmovdqa 32(%rdi), %ymm1
-; CHECK-FAST-PERLANE-NEXT: vpblendd $15, (%rdi), %ymm1, %ymm1 # ymm1 = mem[0,1,2,3],ymm1[4,5,6,7]
+; CHECK-FAST-PERLANE-NEXT: vmovq (%rdi), %xmm1 # xmm1 = mem[0],zero
+; CHECK-FAST-PERLANE-NEXT: vpblendd $240, 32(%rdi), %ymm1, %ymm1 # ymm1 = ymm1[0,1,2,3],mem[4,5,6,7]
 ; CHECK-FAST-PERLANE-NEXT: vptestnmq %ymm0, %ymm0, %k1
 ; CHECK-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm0 {%k1} {z} = ymm1[0,3,2,0]
 ; CHECK-FAST-PERLANE-NEXT: retq
@@ -2422,17 +2427,18 @@ define <4 x i64> @test_masked_z_8xi64_to_4xi64_perm_mem_mask1(ptr %vp, <4 x i64>
 define <4 x i64> @test_masked_8xi64_to_4xi64_perm_mem_mask2(ptr %vp, <4 x i64> %vec2, <4 x i64> %mask) {
 ; CHECK-FAST-LABEL: test_masked_8xi64_to_4xi64_perm_mem_mask2:
 ; CHECK-FAST: # %bb.0:
-; CHECK-FAST-NEXT: vmovdqa 32(%rdi), %ymm2
-; CHECK-FAST-NEXT: vpmovsxbq {{.*#+}} ymm3 = [3,5,5,1]
-; CHECK-FAST-NEXT: vpermi2q (%rdi), %ymm2, %ymm3
+; CHECK-FAST-NEXT: vpbroadcastq 8(%rdi), %xmm2
+; CHECK-FAST-NEXT: vpmovsxbq {{.*#+}} ymm3 = [7,1,1,5]
+; CHECK-FAST-NEXT: vpermi2q 32(%rdi), %ymm2, %ymm3
 ; CHECK-FAST-NEXT: vptestnmq %ymm1, %ymm1, %k1
 ; CHECK-FAST-NEXT: vmovdqa64 %ymm3, %ymm0 {%k1}
 ; CHECK-FAST-NEXT: retq
 ;
 ; CHECK-FAST-PERLANE-LABEL: test_masked_8xi64_to_4xi64_perm_mem_mask2:
 ; CHECK-FAST-PERLANE: # %bb.0:
-; CHECK-FAST-PERLANE-NEXT: vmovdqa 32(%rdi), %ymm2
-; CHECK-FAST-PERLANE-NEXT: vpunpckhqdq (%rdi), %ymm2, %ymm2 # ymm2 = ymm2[1],mem[1],ymm2[3],mem[3]
+; CHECK-FAST-PERLANE-NEXT: vpbroadcastq 8(%rdi), %xmm2
+; CHECK-FAST-PERLANE-NEXT: vmovdqa 32(%rdi), %ymm3
+; CHECK-FAST-PERLANE-NEXT: vpunpckhqdq {{.*#+}} ymm2 = ymm3[1],ymm2[1],ymm3[3],ymm2[3]
 ; CHECK-FAST-PERLANE-NEXT: vptestnmq %ymm1, %ymm1, %k1
 ; CHECK-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm0 {%k1} = ymm2[2,1,1,0]
 ; CHECK-FAST-PERLANE-NEXT: retq
@@ -2446,17 +2452,18 @@ define <4 x i64> @test_masked_8xi64_to_4xi64_perm_mem_mask2(ptr %vp, <4 x i64> %
 define <4 x i64> @test_masked_z_8xi64_to_4xi64_perm_mem_mask2(ptr %vp, <4 x i64> %mask) {
 ; CHECK-FAST-LABEL: test_masked_z_8xi64_to_4xi64_perm_mem_mask2:
 ; CHECK-FAST: # %bb.0:
-; CHECK-FAST-NEXT: vmovdqa 32(%rdi), %ymm2
-; CHECK-FAST-NEXT: vpmovsxbq {{.*#+}} ymm1 = [3,5,5,1]
+; CHECK-FAST-NEXT: vpbroadcastq 8(%rdi), %xmm2
+; CHECK-FAST-NEXT: vpmovsxbq {{.*#+}} ymm1 = [7,1,1,5]
 ; CHECK-FAST-NEXT: vptestnmq %ymm0, %ymm0, %k1
-; CHECK-FAST-NEXT: vpermi2q (%rdi), %ymm2, %ymm1 {%k1} {z}
+; CHECK-FAST-NEXT: vpermi2q 32(%rdi), %ymm2, %ymm1 {%k1} {z}
 ; CHECK-FAST-NEXT: vmovdqa %ymm1, %ymm0
 ; CHECK-FAST-NEXT: retq
 ;
 ; CHECK-FAST-PERLANE-LABEL: test_masked_z_8xi64_to_4xi64_perm_mem_mask2:
 ; CHECK-FAST-PERLANE: # %bb.0:
-; CHECK-FAST-PERLANE-NEXT: vmovdqa 32(%rdi), %ymm1
-; CHECK-FAST-PERLANE-NEXT: vpunpckhqdq (%rdi), %ymm1, %ymm1 # ymm1 = ymm1[1],mem[1],ymm1[3],mem[3]
+; CHECK-FAST-PERLANE-NEXT: vpbroadcastq 8(%rdi), %xmm1
+; CHECK-FAST-PERLANE-NEXT: vmovdqa 32(%rdi), %ymm2
+; CHECK-FAST-PERLANE-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm2[1],ymm1[1],ymm2[3],ymm1[3]
 ; CHECK-FAST-PERLANE-NEXT: vptestnmq %ymm0, %ymm0, %k1
 ; CHECK-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm0 {%k1} {z} = ymm1[2,1,1,0]
 ; CHECK-FAST-PERLANE-NEXT: retq
@@ -2470,16 +2477,16 @@ define <4 x i64> @test_masked_z_8xi64_to_4xi64_perm_mem_mask2(ptr %vp, <4 x i64>
 define <4 x i64> @test_8xi64_to_4xi64_perm_mem_mask3(ptr %vp) {
 ; CHECK-FAST-LABEL: test_8xi64_to_4xi64_perm_mem_mask3:
 ; CHECK-FAST: # %bb.0:
-; CHECK-FAST-NEXT: vpmovsxbq {{.*#+}} ymm0 = [7,0,0,2]
-; CHECK-FAST-NEXT: vpermpd (%rdi), %zmm0, %zmm0
-; CHECK-FAST-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0
+; CHECK-FAST-NEXT: vpbroadcastq 56(%rdi), %ymm1
+; CHECK-FAST-NEXT: vpmovsxbq {{.*#+}} ymm0 = [2,4,4,6]
+; CHECK-FAST-NEXT: vpermi2q (%rdi), %ymm1, %ymm0
 ; CHECK-FAST-NEXT: retq
 ;
 ; CHECK-FAST-PERLANE-LABEL: test_8xi64_to_4xi64_perm_mem_mask3:
 ; CHECK-FAST-PERLANE: # %bb.0:
-; CHECK-FAST-PERLANE-NEXT: vmovdqa (%rdi), %ymm0
-; CHECK-FAST-PERLANE-NEXT: vpalignr $8, 32(%rdi), %ymm0, %ymm0 # ymm0 = mem[8,9,10,11,12,13,14,15],ymm0[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm0[16,17,18,19,20,21,22,23]
-; CHECK-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,1,1,3]
+; CHECK-FAST-PERLANE-NEXT: vbroadcastsd 56(%rdi), %ymm0
+; CHECK-FAST-PERLANE-NEXT: vunpcklpd (%rdi), %ymm0, %ymm0 # ymm0 = ymm0[0],mem[0],ymm0[2],mem[2]
+; CHECK-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,1,1,3]
 ; CHECK-FAST-PERLANE-NEXT: retq
 %vec = load <8 x i64>, ptr %vp
 %res = shufflevector <8 x i64> %vec, <8 x i64> undef, <4 x i32>
@@ -2488,17 +2495,17 @@ define <4 x i64> @test_8xi64_to_4xi64_perm_mem_mask3(ptr %vp) {
 define <4 x i64> @test_masked_8xi64_to_4xi64_perm_mem_mask3(ptr %vp, <4 x i64> %vec2, <4 x i64> %mask) {
 ; CHECK-FAST-LABEL: test_masked_8xi64_to_4xi64_perm_mem_mask3:
 ; CHECK-FAST: # %bb.0:
-; CHECK-FAST-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
-; CHECK-FAST-NEXT: vpmovsxbq {{.*#+}} ymm2 = [7,0,0,2]
+; CHECK-FAST-NEXT: vpbroadcastq 56(%rdi), %ymm2
+; CHECK-FAST-NEXT: vpmovsxbq {{.*#+}} ymm3 = [2,4,4,6]
+; CHECK-FAST-NEXT: vpermi2q (%rdi), %ymm2, %ymm3
 ; CHECK-FAST-NEXT: vptestnmq %ymm1, %ymm1, %k1
-; CHECK-FAST-NEXT: vpermq (%rdi), %zmm2, %zmm0 {%k1}
-; CHECK-FAST-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0
+; CHECK-FAST-NEXT: vmovdqa64 %ymm3, %ymm0 {%k1}
 ; CHECK-FAST-NEXT: retq
 ;
 ; CHECK-FAST-PERLANE-LABEL: test_masked_8xi64_to_4xi64_perm_mem_mask3:
 ; CHECK-FAST-PERLANE: # %bb.0:
-; CHECK-FAST-PERLANE-NEXT: vmovdqa (%rdi), %ymm2
-; CHECK-FAST-PERLANE-NEXT: vpalignr $8, 32(%rdi), %ymm2, %ymm2 # ymm2 = mem[8,9,10,11,12,13,14,15],ymm2[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm2[16,17,18,19,20,21,22,23]
+; CHECK-FAST-PERLANE-NEXT: vpbroadcastq 56(%rdi), %ymm2
+; CHECK-FAST-PERLANE-NEXT: vpunpcklqdq (%rdi), %ymm2, %ymm2 # ymm2 = ymm2[0],mem[0],ymm2[2],mem[2]
 ; CHECK-FAST-PERLANE-NEXT: vptestnmq %ymm1, %ymm1, %k1
 ; CHECK-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm0 {%k1} = ymm2[2,1,1,3]
 ; CHECK-FAST-PERLANE-NEXT: retq
@@ -2512,16 +2519,17 @@ define <4 x i64> @test_masked_8xi64_to_4xi64_perm_mem_mask3(ptr %vp, <4 x i64> %
 define <4 x i64> @test_masked_z_8xi64_to_4xi64_perm_mem_mask3(ptr %vp, <4 x i64> %mask) {
 ; CHECK-FAST-LABEL: test_masked_z_8xi64_to_4xi64_perm_mem_mask3:
 ; CHECK-FAST: # %bb.0:
-; CHECK-FAST-NEXT: vpmovsxbq {{.*#+}} ymm1 = [7,0,0,2]
+; CHECK-FAST-NEXT: vpbroadcastq 56(%rdi), %ymm2
+; CHECK-FAST-NEXT: vpmovsxbq {{.*#+}} ymm1 = [2,4,4,6]
 ; CHECK-FAST-NEXT: vptestnmq %ymm0, %ymm0, %k1
-; CHECK-FAST-NEXT: vpermq (%rdi), %zmm1, %zmm0 {%k1} {z}
-; CHECK-FAST-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0
+; CHECK-FAST-NEXT: vpermi2q (%rdi), %ymm2, %ymm1 {%k1} {z}
+; CHECK-FAST-NEXT: vmovdqa %ymm1, %ymm0
 ; CHECK-FAST-NEXT: retq
 ;
 ; CHECK-FAST-PERLANE-LABEL: test_masked_z_8xi64_to_4xi64_perm_mem_mask3:
 ; CHECK-FAST-PERLANE: # %bb.0:
-; CHECK-FAST-PERLANE-NEXT: vmovdqa (%rdi), %ymm1
-; CHECK-FAST-PERLANE-NEXT: vpalignr $8, 32(%rdi), %ymm1, %ymm1 # ymm1 = mem[8,9,10,11,12,13,14,15],ymm1[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm1[16,17,18,19,20,21,22,23]
+; CHECK-FAST-PERLANE-NEXT: vpbroadcastq 56(%rdi), %ymm1
+; CHECK-FAST-PERLANE-NEXT: vpunpcklqdq (%rdi), %ymm1, %ymm1 # ymm1 = ymm1[0],mem[0],ymm1[2],mem[2]
 ; CHECK-FAST-PERLANE-NEXT: vptestnmq %ymm0, %ymm0, %k1
 ; CHECK-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm0 {%k1} {z} = ymm1[2,1,1,3]
 ; CHECK-FAST-PERLANE-NEXT: retq
@@ -2567,17 +2575,17 @@ define <4 x i64> @test_masked_z_8xi64_to_4xi64_perm_mem_mask4(ptr %vp, <4 x i64>
 define <4 x i64> @test_masked_8xi64_to_4xi64_perm_mem_mask5(ptr %vp, <4 x i64> %vec2, <4 x i64> %mask) {
 ; CHECK-FAST-LABEL: test_masked_8xi64_to_4xi64_perm_mem_mask5:
 ; CHECK-FAST: # %bb.0:
-; CHECK-FAST-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
-; CHECK-FAST-NEXT: vpmovsxbq {{.*#+}} ymm2 = [0,2,7,1]
+; CHECK-FAST-NEXT: vpbroadcastq 56(%rdi), %ymm2
+; CHECK-FAST-NEXT: vpmovsxbq {{.*#+}} ymm3 = [4,6,3,5]
+; CHECK-FAST-NEXT: vpermi2q (%rdi), %ymm2, %ymm3
 ; CHECK-FAST-NEXT: vptestnmq %ymm1, %ymm1, %k1
-; CHECK-FAST-NEXT: vpermq (%rdi), %zmm2, %zmm0 {%k1}
-; CHECK-FAST-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0
+; CHECK-FAST-NEXT: vmovdqa64 %ymm3, %ymm0 {%k1}
 ; CHECK-FAST-NEXT: retq
 ;
 ; CHECK-FAST-PERLANE-LABEL: test_masked_8xi64_to_4xi64_perm_mem_mask5:
 ; CHECK-FAST-PERLANE: # %bb.0:
-; CHECK-FAST-PERLANE-NEXT: vmovdqa (%rdi), %ymm2
-; CHECK-FAST-PERLANE-NEXT: vpblendd $192, 32(%rdi), %ymm2, %ymm2 # ymm2 = ymm2[0,1,2,3,4,5],mem[6,7]
+; CHECK-FAST-PERLANE-NEXT: vpbroadcastq 56(%rdi), %ymm2
+; CHECK-FAST-PERLANE-NEXT: vpblendd $63, (%rdi), %ymm2, %ymm2 # ymm2 = mem[0,1,2,3,4,5],ymm2[6,7]
 ; CHECK-FAST-PERLANE-NEXT: vptestnmq %ymm1, %ymm1, %k1
 ; CHECK-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm0 {%k1} = ymm2[0,2,3,1]
 ; CHECK-FAST-PERLANE-NEXT: retq
@@ -2591,16 +2599,17 @@ define <4 x i64> @test_masked_8xi64_to_4xi64_perm_mem_mask5(ptr %vp, <4 x i64> %
 define <4 x i64> @test_masked_z_8xi64_to_4xi64_perm_mem_mask5(ptr %vp, <4 x i64> %mask) {
 ; CHECK-FAST-LABEL: test_masked_z_8xi64_to_4xi64_perm_mem_mask5:
 ; CHECK-FAST: # %bb.0:
-; CHECK-FAST-NEXT: vpmovsxbq {{.*#+}} ymm1 = [0,2,7,1]
+; CHECK-FAST-NEXT: vpbroadcastq 56(%rdi), %ymm2
+; CHECK-FAST-NEXT: vpmovsxbq {{.*#+}} ymm1 = [4,6,3,5]
 ; CHECK-FAST-NEXT: vptestnmq %ymm0, %ymm0, %k1
-; CHECK-FAST-NEXT: vpermq (%rdi), %zmm1, %zmm0 {%k1} {z}
-; CHECK-FAST-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0
+; CHECK-FAST-NEXT: vpermi2q (%rdi), %ymm2, %ymm1 {%k1} {z}
+; CHECK-FAST-NEXT: vmovdqa %ymm1, %ymm0
 ; CHECK-FAST-NEXT: retq
 ;
 ; CHECK-FAST-PERLANE-LABEL: test_masked_z_8xi64_to_4xi64_perm_mem_mask5:
 ; CHECK-FAST-PERLANE: # %bb.0:
-; CHECK-FAST-PERLANE-NEXT: vmovdqa (%rdi), %ymm1
-; CHECK-FAST-PERLANE-NEXT: vpblendd $192, 32(%rdi), %ymm1, %ymm1 # ymm1 = ymm1[0,1,2,3,4,5],mem[6,7]
+; CHECK-FAST-PERLANE-NEXT: vpbroadcastq 56(%rdi), %ymm1
+; CHECK-FAST-PERLANE-NEXT: vpblendd $63, (%rdi), %ymm1, %ymm1 # ymm1 = mem[0,1,2,3,4,5],ymm1[6,7]
 ; CHECK-FAST-PERLANE-NEXT: vptestnmq %ymm0, %ymm0, %k1
 ; CHECK-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm0 {%k1} {z} = ymm1[0,2,3,1]
 ; CHECK-FAST-PERLANE-NEXT: retq
@@ -2614,9 +2623,9 @@ define <4 x i64> @test_masked_z_8xi64_to_4xi64_perm_mem_mask5(ptr %vp, <4 x i64>
 define <4 x i64> @test_8xi64_to_4xi64_perm_mem_mask6(ptr %vp) {
 ; CHECK-LABEL: test_8xi64_to_4xi64_perm_mem_mask6:
 ; CHECK: # %bb.0:
-; CHECK-NEXT: vpmovsxbq {{.*#+}} ymm0 = [7,2,3,2]
-; CHECK-NEXT: vpermpd (%rdi), %zmm0, %zmm0
-; CHECK-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0
+; CHECK-NEXT: vpbroadcastq 56(%rdi), %xmm1
+; CHECK-NEXT: vpmovsxbq {{.*#+}} ymm0 = [1,6,7,6]
+; CHECK-NEXT: vpermi2q (%rdi), %ymm1, %ymm0
 ; CHECK-NEXT: retq
 %vec = load <8 x i64>, ptr %vp
 %res = shufflevector <8 x i64> %vec, <8 x i64> undef, <4 x i32>
@@ -2625,11 +2634,11 @@ define <4 x i64> @test_8xi64_to_4xi64_perm_mem_mask6(ptr %vp) {
 define <4 x i64> @test_masked_8xi64_to_4xi64_perm_mem_mask6(ptr %vp, <4 x i64> %vec2, <4 x i64> %mask) {
 ; CHECK-LABEL: test_masked_8xi64_to_4xi64_perm_mem_mask6:
 ; CHECK: # %bb.0:
-; CHECK-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
-; CHECK-NEXT: vpmovsxbq {{.*#+}} ymm2 = [7,2,3,2]
+; CHECK-NEXT: vpbroadcastq 56(%rdi), %xmm2
+; CHECK-NEXT: vpmovsxbq {{.*#+}} ymm3 = [1,6,7,6]
+; CHECK-NEXT: vpermi2q (%rdi), %ymm2, %ymm3
 ; CHECK-NEXT: vptestnmq %ymm1, %ymm1, %k1
-; CHECK-NEXT: vpermq (%rdi), %zmm2, %zmm0 {%k1}
-; CHECK-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0
+; CHECK-NEXT: vmovdqa64 %ymm3, %ymm0 {%k1}
 ; CHECK-NEXT: retq
 %vec = load <8 x i64>, ptr %vp
 %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <4 x i32>
@@ -2641,10 +2650,11 @@ define <4 x i64> @test_masked_8xi64_to_4xi64_perm_mem_mask6(ptr %vp, <4 x i64> %
 define <4 x i64> @test_masked_z_8xi64_to_4xi64_perm_mem_mask6(ptr %vp, <4 x i64> %mask) {
 ; CHECK-LABEL: test_masked_z_8xi64_to_4xi64_perm_mem_mask6:
 ; CHECK: # %bb.0:
-; CHECK-NEXT: vpmovsxbq {{.*#+}} ymm1 = [7,2,3,2]
+; CHECK-NEXT: vpbroadcastq 56(%rdi), %xmm2
+; CHECK-NEXT: vpmovsxbq {{.*#+}} ymm1 = [1,6,7,6]
 ; CHECK-NEXT: vptestnmq %ymm0, %ymm0, %k1
-; CHECK-NEXT: vpermq (%rdi), %zmm1, %zmm0 {%k1} {z}
-; CHECK-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0
+; CHECK-NEXT: vpermi2q (%rdi), %ymm2, %ymm1 {%k1} {z}
+; CHECK-NEXT: vmovdqa %ymm1, %ymm0
 ; CHECK-NEXT: retq
 %vec = load <8 x i64>, ptr %vp
 %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <4 x i32>
@@ -2656,17 +2666,18 @@ define <4 x i64> @test_masked_z_8xi64_to_4xi64_perm_mem_mask6(ptr %vp, <4 x i64>
 define <4 x i64> @test_masked_8xi64_to_4xi64_perm_mem_mask7(ptr %vp, <4 x i64> %vec2, <4 x i64> %mask) {
 ; CHECK-FAST-LABEL: test_masked_8xi64_to_4xi64_perm_mem_mask7:
 ; CHECK-FAST: # %bb.0:
-; CHECK-FAST-NEXT: vmovdqa 32(%rdi), %ymm2
-; CHECK-FAST-NEXT: vpmovsxbq {{.*#+}} ymm3 = [3,3,1,5]
-; CHECK-FAST-NEXT: vpermi2q (%rdi), %ymm2, %ymm3
+; CHECK-FAST-NEXT: vpbroadcastq 8(%rdi), %xmm2
+; CHECK-FAST-NEXT: vpmovsxbq {{.*#+}} ymm3 = [7,7,5,1]
+; CHECK-FAST-NEXT: vpermi2q 32(%rdi), %ymm2, %ymm3
 ; CHECK-FAST-NEXT: vptestnmq %ymm1, %ymm1, %k1
 ; CHECK-FAST-NEXT: vmovdqa64 %ymm3, %ymm0 {%k1}
 ; CHECK-FAST-NEXT: retq
 ;
 ; CHECK-FAST-PERLANE-LABEL: test_masked_8xi64_to_4xi64_perm_mem_mask7:
 ; CHECK-FAST-PERLANE: # %bb.0:
-; CHECK-FAST-PERLANE-NEXT: vmovdqa 32(%rdi), %ymm2
-; CHECK-FAST-PERLANE-NEXT: vpunpckhqdq (%rdi), %ymm2, %ymm2 # ymm2 = ymm2[1],mem[1],ymm2[3],mem[3]
+; CHECK-FAST-PERLANE-NEXT: vpbroadcastq 8(%rdi), %xmm2
+; CHECK-FAST-PERLANE-NEXT: vmovdqa 32(%rdi), %ymm3
+; CHECK-FAST-PERLANE-NEXT: vpunpckhqdq {{.*#+}} ymm2 = ymm3[1],ymm2[1],ymm3[3],ymm2[3]
 ; CHECK-FAST-PERLANE-NEXT: vptestnmq %ymm1, %ymm1, %k1
 ; CHECK-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm0 {%k1} = ymm2[2,2,0,1]
 ; CHECK-FAST-PERLANE-NEXT: retq
@@ -2680,17 +2691,18 @@ define <4 x i64> @test_masked_8xi64_to_4xi64_perm_mem_mask7(ptr %vp, <4 x i64> %
 define <4 x i64> @test_masked_z_8xi64_to_4xi64_perm_mem_mask7(ptr %vp, <4 x i64> %mask) {
 ; CHECK-FAST-LABEL: test_masked_z_8xi64_to_4xi64_perm_mem_mask7:
 ; CHECK-FAST: # %bb.0:
-; CHECK-FAST-NEXT: vmovdqa 32(%rdi), %ymm2
-; CHECK-FAST-NEXT: vpmovsxbq {{.*#+}} ymm1 = [3,3,1,5]
+; CHECK-FAST-NEXT: vpbroadcastq 8(%rdi), %xmm2
+; CHECK-FAST-NEXT: vpmovsxbq {{.*#+}} ymm1 = [7,7,5,1]
 ; CHECK-FAST-NEXT: vptestnmq %ymm0, %ymm0, %k1
-; CHECK-FAST-NEXT: vpermi2q (%rdi), %ymm2, %ymm1 {%k1} {z}
+; CHECK-FAST-NEXT: vpermi2q 32(%rdi), %ymm2, %ymm1 {%k1} {z}
 ; CHECK-FAST-NEXT: vmovdqa %ymm1, %ymm0
 ; CHECK-FAST-NEXT: retq
 ;
 ; CHECK-FAST-PERLANE-LABEL: test_masked_z_8xi64_to_4xi64_perm_mem_mask7:
 ; CHECK-FAST-PERLANE: # %bb.0:
-; CHECK-FAST-PERLANE-NEXT: vmovdqa 32(%rdi), %ymm1
-; CHECK-FAST-PERLANE-NEXT: vpunpckhqdq (%rdi), %ymm1, %ymm1 # ymm1 = ymm1[1],mem[1],ymm1[3],mem[3]
+; CHECK-FAST-PERLANE-NEXT: vpbroadcastq 8(%rdi), %xmm1
+; CHECK-FAST-PERLANE-NEXT: vmovdqa 32(%rdi), %ymm2
+; CHECK-FAST-PERLANE-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm2[1],ymm1[1],ymm2[3],ymm1[3]
 ; CHECK-FAST-PERLANE-NEXT: vptestnmq %ymm0, %ymm0, %k1
 ; CHECK-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm0 {%k1} {z} = ymm1[2,2,0,1]
 ; CHECK-FAST-PERLANE-NEXT: retq
@@ -2712,8 +2724,9 @@ define <2 x i64> @test_8xi64_to_2xi64_perm_mem_mask0(ptr %vp) {
 ;
 ; CHECK-FAST-PERLANE-LABEL: test_8xi64_to_2xi64_perm_mem_mask0:
 ; CHECK-FAST-PERLANE: # %bb.0:
-; CHECK-FAST-PERLANE-NEXT: vmovaps 32(%rdi), %xmm0
-; CHECK-FAST-PERLANE-NEXT: vblendps $12, (%rdi), %xmm0, %xmm0 # xmm0 = xmm0[0,1],mem[2,3]
+; CHECK-FAST-PERLANE-NEXT: vpbroadcastq 32(%rdi), %xmm0
+; CHECK-FAST-PERLANE-NEXT: vpinsrq $1, 8(%rdi), %xmm0, %xmm1
+; CHECK-FAST-PERLANE-NEXT: vpunpckhqdq {{.*#+}} xmm0 = xmm0[1],xmm1[1]
 ; CHECK-FAST-PERLANE-NEXT: retq
 %vec = load <8 x i64>, ptr %vp
 %res = shufflevector <8 x i64> %vec, <8 x i64> undef, <2 x i32>
@@ -2732,10 +2745,10 @@ define <2 x i64> @test_masked_8xi64_to_2xi64_perm_mem_mask0(ptr %vp, <2 x i64> %
 ;
 ; CHECK-FAST-PERLANE-LABEL: test_masked_8xi64_to_2xi64_perm_mem_mask0:
 ; CHECK-FAST-PERLANE: # %bb.0:
-; CHECK-FAST-PERLANE-NEXT: vmovdqa 32(%rdi), %xmm2
-; CHECK-FAST-PERLANE-NEXT: vpblendd $12, (%rdi), %xmm2, %xmm2 # xmm2 = xmm2[0,1],mem[2,3]
+; CHECK-FAST-PERLANE-NEXT: vpbroadcastq 32(%rdi), %xmm2
+; CHECK-FAST-PERLANE-NEXT: vpinsrq $1, 8(%rdi), %xmm0, %xmm3
 ; CHECK-FAST-PERLANE-NEXT: vptestnmq %xmm1, %xmm1, %k1
-; CHECK-FAST-PERLANE-NEXT: vmovdqa64 %xmm2, %xmm0 {%k1}
+; CHECK-FAST-PERLANE-NEXT: vpunpckhqdq {{.*#+}} xmm0 {%k1} = xmm2[1],xmm3[1]
 ; CHECK-FAST-PERLANE-NEXT: retq
 %vec = load <8 x i64>, ptr %vp
 %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <2 x i32>
@@ -2756,10 +2769,10 @@ define <2 x i64> @test_masked_z_8xi64_to_2xi64_perm_mem_mask0(ptr %vp, <2 x i64>
 ;
 ; CHECK-FAST-PERLANE-LABEL: test_masked_z_8xi64_to_2xi64_perm_mem_mask0:
 ; CHECK-FAST-PERLANE: # %bb.0:
-; CHECK-FAST-PERLANE-NEXT: vmovdqa 32(%rdi), %xmm1
-; CHECK-FAST-PERLANE-NEXT: vpblendd $12, (%rdi), %xmm1, %xmm1 # xmm1 = xmm1[0,1],mem[2,3]
+; CHECK-FAST-PERLANE-NEXT: vpbroadcastq 32(%rdi), %xmm1
+; CHECK-FAST-PERLANE-NEXT: vpinsrq $1, 8(%rdi), %xmm0, %xmm2
 ; CHECK-FAST-PERLANE-NEXT: vptestnmq %xmm0, %xmm0, %k1
-; CHECK-FAST-PERLANE-NEXT: vmovdqa64 %xmm1, %xmm0 {%k1} {z}
+; CHECK-FAST-PERLANE-NEXT: vpunpckhqdq {{.*#+}} xmm0 {%k1} {z} = xmm1[1],xmm2[1]
 ; CHECK-FAST-PERLANE-NEXT: retq
 %vec = load <8 x i64>, ptr %vp
 %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <2 x i32>
@@ -2771,9 +2784,10 @@ define <2 x i64> @test_masked_z_8xi64_to_2xi64_perm_mem_mask0(ptr %vp, <2 x i64>
 define <2 x i64> @test_masked_8xi64_to_2xi64_perm_mem_mask1(ptr %vp, <2 x i64> %vec2, <2 x i64> %mask) {
 ; CHECK-LABEL: test_masked_8xi64_to_2xi64_perm_mem_mask1:
 ; CHECK: # %bb.0:
-; CHECK-NEXT: vmovdqa 48(%rdi), %xmm2
+; CHECK-NEXT: vmovq 16(%rdi), %xmm2 # xmm2 = mem[0],zero
+; CHECK-NEXT: vmovq 48(%rdi), %xmm3 # xmm3 = mem[0],zero
 ; CHECK-NEXT: vptestnmq %xmm1, %xmm1, %k1
-; CHECK-NEXT: vpunpcklqdq 16(%rdi), %xmm2, %xmm0 {%k1} # xmm0 {%k1} = xmm2[0],mem[0]
+; CHECK-NEXT: vpunpcklqdq {{.*#+}} xmm0 {%k1} = xmm3[0],xmm2[0]
 ; CHECK-NEXT: retq
 %vec = load <8 x i64>, ptr %vp
 %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <2 x i32>
@@ -2785,9 +2799,10 @@ define <2 x i64> @test_masked_8xi64_to_2xi64_perm_mem_mask1(ptr %vp, <2 x i64> %
 define <2 x i64> @test_masked_z_8xi64_to_2xi64_perm_mem_mask1(ptr %vp, <2 x i64> %mask) {
 ; CHECK-LABEL: test_masked_z_8xi64_to_2xi64_perm_mem_mask1:
 ; CHECK: # %bb.0:
-; CHECK-NEXT: vmovdqa 48(%rdi), %xmm1
+; CHECK-NEXT: vmovq 16(%rdi), %xmm1 # xmm1 = mem[0],zero
+; CHECK-NEXT: vmovq 48(%rdi), %xmm2 # xmm2 = mem[0],zero
 ; CHECK-NEXT: vptestnmq %xmm0, %xmm0, %k1
-; CHECK-NEXT: vpunpcklqdq 16(%rdi), %xmm1, %xmm0 {%k1} {z} # xmm0 {%k1} {z} = xmm1[0],mem[0]
+; CHECK-NEXT: vpunpcklqdq {{.*#+}} xmm0 {%k1} {z} = xmm2[0],xmm1[0]
 ; CHECK-NEXT: retq
 %vec = load <8 x i64>, ptr %vp
 %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <2 x i32>
@@ -2946,9 +2961,9 @@ define <4 x float> @test_masked_z_8xfloat_to_4xfloat_perm_mask3(<8 x float> %vec
 define <4 x float> @test_8xfloat_to_4xfloat_perm_mem_mask0(ptr %vp) {
 ; CHECK-LABEL: test_8xfloat_to_4xfloat_perm_mem_mask0:
 ; CHECK: # %bb.0:
-; CHECK-NEXT: vmovaps 16(%rdi), %xmm1
-; CHECK-NEXT: vpmovsxbd {{.*#+}} xmm0 = [2,6,0,1]
-; CHECK-NEXT: vpermi2ps (%rdi), %xmm1, %xmm0
+; CHECK-NEXT: vbroadcastss 8(%rdi), %xmm1
+; CHECK-NEXT: vpmovsxbd {{.*#+}} xmm0 = [6,2,4,5]
+; CHECK-NEXT: vpermi2ps 16(%rdi), %xmm1, %xmm0
 ; CHECK-NEXT: retq
 %vec = load <8 x float>, ptr %vp
 %res = shufflevector <8 x float> %vec, <8 x float> undef, <4 x i32>
@@ -2957,9 +2972,9 @@ define <4 x float> @test_8xfloat_to_4xfloat_perm_mem_mask0(ptr %vp) {
 define <4 x float> @test_masked_8xfloat_to_4xfloat_perm_mem_mask0(ptr %vp, <4 x float> %vec2, <4 x float> %mask) {
 ; CHECK-LABEL: test_masked_8xfloat_to_4xfloat_perm_mem_mask0:
 ; CHECK: # %bb.0:
-; CHECK-NEXT: vmovaps 16(%rdi), %xmm2
-; CHECK-NEXT: vpmovsxbd {{.*#+}} xmm3 = [2,6,0,1]
-; CHECK-NEXT: vpermi2ps (%rdi), %xmm2, %xmm3
+; CHECK-NEXT: vbroadcastss 8(%rdi), %xmm2
+; CHECK-NEXT: vpmovsxbd {{.*#+}} xmm3 = [6,2,4,5]
+; CHECK-NEXT: vpermi2ps 16(%rdi), %xmm2, %xmm3
 ; CHECK-NEXT: vxorps %xmm2, %xmm2, %xmm2
 ; CHECK-NEXT: vcmpeqps %xmm2, %xmm1, %k1
 ; CHECK-NEXT: vmovaps %xmm3, %xmm0 {%k1}
@@ -2974,11 +2989,11 @@ define <4 x float> @test_masked_8xfloat_to_4xfloat_perm_mem_mask0(ptr %vp, <4 x
 define <4 x float> @test_masked_z_8xfloat_to_4xfloat_perm_mem_mask0(ptr %vp, <4 x float> %mask) {
 ; CHECK-LABEL: test_masked_z_8xfloat_to_4xfloat_perm_mem_mask0:
 ; CHECK: # %bb.0:
-; CHECK-NEXT: vmovaps 16(%rdi), %xmm2
-; CHECK-NEXT: vpmovsxbd {{.*#+}} xmm1 = [2,6,0,1]
+; CHECK-NEXT: vbroadcastss 8(%rdi), %xmm2
+; CHECK-NEXT: vpmovsxbd {{.*#+}} xmm1 = [6,2,4,5]
 ; CHECK-NEXT: vxorps %xmm3, %xmm3, %xmm3
 ; CHECK-NEXT: vcmpeqps %xmm3, %xmm0, %k1
-; CHECK-NEXT: vpermi2ps (%rdi), %xmm2, %xmm1 {%k1} {z}
+; CHECK-NEXT: vpermi2ps 16(%rdi), %xmm2, %xmm1 {%k1} {z}
 ; CHECK-NEXT: vmovaps %xmm1, %xmm0
 ; CHECK-NEXT: retq
 %vec = load <8 x float>, ptr %vp
@@ -2991,12 +3006,12 @@ define <4 x float> @test_masked_z_8xfloat_to_4xfloat_perm_mem_mask0(ptr %vp, <4
 define <4 x float> @test_masked_8xfloat_to_4xfloat_perm_mem_mask1(ptr %vp, <4 x float> %vec2, <4 x float> %mask) {
 ; CHECK-LABEL: test_masked_8xfloat_to_4xfloat_perm_mem_mask1:
 ; CHECK: # %bb.0:
-; CHECK-NEXT: vmovaps 16(%rdi), %xmm2
-; CHECK-NEXT: vpmovsxbd {{.*#+}} xmm3 = [2,7,7,2]
-; CHECK-NEXT: vpermi2ps (%rdi), %xmm2, %xmm3
-; CHECK-NEXT: vxorps %xmm2, %xmm2, %xmm2
-; CHECK-NEXT: vcmpeqps %xmm2, %xmm1, %k1
-; CHECK-NEXT: vmovaps %xmm3, %xmm0 {%k1}
+; CHECK-NEXT: vbroadcastss 12(%rdi), %xmm2
+; CHECK-NEXT: vbroadcastss 24(%rdi), %xmm3
+; CHECK-NEXT: vblendps {{.*#+}} xmm2 = xmm3[0],xmm2[1,2],xmm3[3]
+; CHECK-NEXT: vxorps %xmm3, %xmm3, %xmm3
+; CHECK-NEXT: vcmpeqps %xmm3, %xmm1, %k1
+; CHECK-NEXT: vmovaps %xmm2, %xmm0 {%k1}
 ; CHECK-NEXT: retq
 %vec = load <8 x float>, ptr %vp
 %shuf = shufflevector <8 x float> %vec, <8 x float> undef, <4 x i32>
@@ -3008,12 +3023,12 @@ define <4 x float> @test_masked_8xfloat_to_4xfloat_perm_mem_mask1(ptr %vp, <4 x
 define <4 x float> @test_masked_z_8xfloat_to_4xfloat_perm_mem_mask1(ptr %vp, <4 x float> %mask) {
 ; CHECK-LABEL: test_masked_z_8xfloat_to_4xfloat_perm_mem_mask1:
 ; CHECK: # %bb.0:
-; CHECK-NEXT: vmovaps 16(%rdi), %xmm2
-; CHECK-NEXT: vpmovsxbd {{.*#+}} xmm1 = [2,7,7,2]
-; CHECK-NEXT: vxorps %xmm3, %xmm3, %xmm3
-; CHECK-NEXT: vcmpeqps %xmm3, %xmm0, %k1
-; CHECK-NEXT: vpermi2ps (%rdi), %xmm2, %xmm1 {%k1} {z}
-; CHECK-NEXT: vmovaps %xmm1, %xmm0
+; CHECK-NEXT: vbroadcastss 12(%rdi), %xmm1
+; CHECK-NEXT: vbroadcastss 24(%rdi), %xmm2
+; CHECK-NEXT: vblendps {{.*#+}} xmm1 = xmm2[0],xmm1[1,2],xmm2[3]
+; CHECK-NEXT: vxorps %xmm2, %xmm2, %xmm2
+; CHECK-NEXT: vcmpeqps %xmm2, %xmm0, %k1
+; CHECK-NEXT: vmovaps %xmm1, %xmm0 {%k1} {z}
 ; CHECK-NEXT: retq
 %vec = load <8 x float>, ptr %vp
 %shuf = shufflevector <8 x float> %vec, <8 x float> undef, <4 x i32>
@@ -3025,13 +3040,12 @@ define <4 x float> @test_masked_z_8xfloat_to_4xfloat_perm_mem_mask1(ptr %vp, <4
 define <4 x float> @test_masked_8xfloat_to_4xfloat_perm_mem_mask2(ptr %vp, <4 x float> %vec2, <4 x float> %mask) {
 ; CHECK-LABEL: test_masked_8xfloat_to_4xfloat_perm_mem_mask2:
 ; CHECK: # %bb.0:
-; CHECK-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0
-; CHECK-NEXT: vpmovsxbd {{.*#+}} xmm2 = [3,1,3,7]
-; CHECK-NEXT: vxorps %xmm3, %xmm3, %xmm3
-; CHECK-NEXT: vcmpeqps %xmm3, %xmm1, %k1
-; CHECK-NEXT: vpermps (%rdi), %ymm2, %ymm0 {%k1}
-; CHECK-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
-; CHECK-NEXT: vzeroupper
+; CHECK-NEXT: vbroadcastss 28(%rdi), %xmm2
+; CHECK-NEXT: vpmovsxbd {{.*#+}} xmm3 = [7,5,7,3]
+; CHECK-NEXT: vpermi2ps (%rdi), %xmm2, %xmm3
+; CHECK-NEXT: vxorps %xmm2, %xmm2, %xmm2
+; CHECK-NEXT: vcmpeqps %xmm2, %xmm1, %k1
+; CHECK-NEXT: vmovaps %xmm3, %xmm0 {%k1}
 ; CHECK-NEXT: retq
 %vec = load <8 x float>, ptr %vp
 %shuf = shufflevector <8 x float> %vec, <8 x float> undef, <4 x i32>
@@ -3043,12 +3057,12 @@ define <4 x float> @test_masked_8xfloat_to_4xfloat_perm_mem_mask2(ptr %vp, <4 x
 define <4 x float> @test_masked_z_8xfloat_to_4xfloat_perm_mem_mask2(ptr %vp, <4 x float> %mask) {
 ; CHECK-LABEL: test_masked_z_8xfloat_to_4xfloat_perm_mem_mask2:
 ; CHECK: # %bb.0:
-; CHECK-NEXT: vpmovsxbd {{.*#+}} xmm1 = [3,1,3,7]
-; CHECK-NEXT: vxorps %xmm2, %xmm2, %xmm2
-; CHECK-NEXT: vcmpeqps %xmm2, %xmm0, %k1
-; CHECK-NEXT: vpermps (%rdi), %ymm1, %ymm0 {%k1} {z}
-; CHECK-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
-; CHECK-NEXT: vzeroupper
+; CHECK-NEXT: vbroadcastss 28(%rdi), %xmm2
+; CHECK-NEXT: vpmovsxbd {{.*#+}} xmm1 = [7,5,7,3]
+; CHECK-NEXT: vxorps %xmm3, %xmm3, %xmm3
+; CHECK-NEXT: vcmpeqps %xmm3, %xmm0, %k1
+; CHECK-NEXT: vpermi2ps (%rdi), %xmm2, %xmm1 {%k1} {z}
+; CHECK-NEXT: vmovaps %xmm1, %xmm0
 ; CHECK-NEXT: retq
 %vec = load <8 x float>, ptr %vp
 %shuf = shufflevector <8 x float> %vec, <8 x float> undef, <4 x i32>
@@ -3060,10 +3074,9 @@ define <4 x float> @test_masked_z_8xfloat_to_4xfloat_perm_mem_mask2(ptr %vp, <4
 define <4 x float> @test_8xfloat_to_4xfloat_perm_mem_mask3(ptr %vp) {
 ; CHECK-LABEL: test_8xfloat_to_4xfloat_perm_mem_mask3:
 ; CHECK: # %bb.0:
-; CHECK-NEXT: vpmovsxbd {{.*#+}} xmm0 = [1,3,5,3]
-; CHECK-NEXT: vpermps (%rdi), %ymm0, %ymm0
-; CHECK-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
-; CHECK-NEXT: vzeroupper
+; CHECK-NEXT: vbroadcastss 20(%rdi), %xmm1
+; CHECK-NEXT: vpmovsxbd {{.*#+}} xmm0 = [5,7,1,7]
+; CHECK-NEXT: vpermi2ps (%rdi), %xmm1, %xmm0
 ; CHECK-NEXT: retq
 %vec = load <8 x float>, ptr %vp
 %res = shufflevector <8 x float> %vec, <8 x float> undef, <4 x i32>
@@ -3072,13 +3085,12 @@ define <4 x float> @test_8xfloat_to_4xfloat_perm_mem_mask3(ptr %vp) {
 define <4 x float> @test_masked_8xfloat_to_4xfloat_perm_mem_mask3(ptr %vp, <4 x float> %vec2, <4 x float> %mask) {
 ; CHECK-LABEL: test_masked_8xfloat_to_4xfloat_perm_mem_mask3:
 ; CHECK: # %bb.0:
-; CHECK-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0
-; CHECK-NEXT: vpmovsxbd {{.*#+}} xmm2 = [1,3,5,3]
-; CHECK-NEXT: vxorps %xmm3, %xmm3, %xmm3
-; CHECK-NEXT: vcmpeqps %xmm3, %xmm1, %k1
-; CHECK-NEXT: vpermps (%rdi), %ymm2, %ymm0 {%k1}
-; CHECK-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
-; CHECK-NEXT: vzeroupper
+; CHECK-NEXT: vbroadcastss 20(%rdi), %xmm2
+; CHECK-NEXT: vpmovsxbd {{.*#+}} xmm3 = [5,7,1,7]
+; CHECK-NEXT: vpermi2ps (%rdi), %xmm2, %xmm3
+; CHECK-NEXT: vxorps %xmm2, %xmm2, %xmm2
+; CHECK-NEXT: vcmpeqps %xmm2, %xmm1, %k1
+; CHECK-NEXT: vmovaps %xmm3, %xmm0 {%k1}
 ; CHECK-NEXT: retq
 %vec = load <8 x float>, ptr %vp
 %shuf = shufflevector <8 x float> %vec, <8 x float> undef, <4 x i32>
@@ -3090,12 +3102,12 @@ define <4 x float> @test_masked_8xfloat_to_4xfloat_perm_mem_mask3(ptr %vp, <4 x
 define <4 x float> @test_masked_z_8xfloat_to_4xfloat_perm_mem_mask3(ptr %vp, <4 x float> %mask) {
 ; CHECK-LABEL: test_masked_z_8xfloat_to_4xfloat_perm_mem_mask3:
 ; CHECK: # %bb.0:
-; CHECK-NEXT: vpmovsxbd {{.*#+}} xmm1 = [1,3,5,3]
-; CHECK-NEXT: vxorps %xmm2, %xmm2, %xmm2
-; CHECK-NEXT: vcmpeqps %xmm2, %xmm0, %k1
-; CHECK-NEXT: vpermps (%rdi), %ymm1, %ymm0 {%k1} {z}
-; CHECK-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
-; CHECK-NEXT: vzeroupper
+; CHECK-NEXT: vbroadcastss 20(%rdi), %xmm2
+; CHECK-NEXT: vpmovsxbd {{.*#+}} xmm1 = [5,7,1,7]
+; CHECK-NEXT: vxorps %xmm3, %xmm3, %xmm3
+; CHECK-NEXT: vcmpeqps %xmm3, %xmm0, %k1
+; CHECK-NEXT: vpermi2ps (%rdi), %xmm2, %xmm1 {%k1} {z}
+; CHECK-NEXT: vmovaps %xmm1, %xmm0
 ; CHECK-NEXT: retq
 %vec = load <8 x float>, ptr %vp
 %shuf = shufflevector <8 x float> %vec, <8 x float> undef, <4 x i32>
@@ -3680,14 +3692,10 @@ define <4 x float> @test_masked_z_16xfloat_to_4xfloat_perm_mem_mask1(ptr %vp, <4
 define <4 x float> @test_masked_16xfloat_to_4xfloat_perm_mem_mask2(ptr %vp, <4 x float> %vec2, <4 x float> %mask) {
 ; CHECK-LABEL: test_masked_16xfloat_to_4xfloat_perm_mem_mask2:
 ; CHECK: # %bb.0:
-; CHECK-NEXT: vmovddup {{.*#+}} xmm2 = [4,14,4,14]
-; CHECK-NEXT: # xmm2 = mem[0,0]
-; CHECK-NEXT: vmovaps 32(%rdi), %ymm3
-; CHECK-NEXT: vpermt2ps (%rdi), %ymm2, %ymm3
-; CHECK-NEXT: vxorps %xmm2, %xmm2, %xmm2
-; CHECK-NEXT: vcmpeqps %xmm2, %xmm1, %k1
-; CHECK-NEXT: vmovaps %xmm3, %xmm0 {%k1}
-; CHECK-NEXT: vzeroupper
+; CHECK-NEXT: vbroadcastss 48(%rdi), %xmm2
+; CHECK-NEXT: vxorps %xmm3, %xmm3, %xmm3
+; CHECK-NEXT: vcmpeqps %xmm3, %xmm1, %k1
+; CHECK-NEXT: vunpcklps 24(%rdi){1to4}, %xmm2, %xmm0 {%k1}
 ; CHECK-NEXT: retq
 %vec = load <16 x float>, ptr %vp
 %shuf = shufflevector <16 x float> %vec, <16 x float> undef, <4 x i32>
@@ -3699,14 +3707,10 @@ define <4 x float> @test_masked_16xfloat_to_4xfloat_perm_mem_mask2(ptr %vp, <4 x
 define <4 x float> @test_masked_z_16xfloat_to_4xfloat_perm_mem_mask2(ptr %vp, <4 x float> %mask) {
 ; CHECK-LABEL: test_masked_z_16xfloat_to_4xfloat_perm_mem_mask2:
 ; CHECK: # %bb.0:
-; CHECK-NEXT: vmovddup {{.*#+}} xmm2 = [4,14,4,14]
-; CHECK-NEXT: # xmm2 = mem[0,0]
-; CHECK-NEXT: vmovaps 32(%rdi), %ymm1
-; CHECK-NEXT: vxorps %xmm3, %xmm3, %xmm3
-; CHECK-NEXT: vcmpeqps %xmm3, %xmm0, %k1
-; CHECK-NEXT: vpermt2ps (%rdi), %ymm2, %ymm1 {%k1} {z}
-; CHECK-NEXT: vmovaps %xmm1, %xmm0
-; CHECK-NEXT: vzeroupper
+; CHECK-NEXT: vbroadcastss 48(%rdi), %xmm1
+; CHECK-NEXT: vxorps %xmm2, %xmm2, %xmm2
+; CHECK-NEXT: vcmpeqps %xmm2, %xmm0, %k1
+; CHECK-NEXT: vunpcklps 24(%rdi){1to4}, %xmm1, %xmm0 {%k1} {z}
 ; CHECK-NEXT: retq
 %vec = load <16 x float>, ptr %vp
 %shuf = shufflevector <16 x float> %vec, <16 x float> undef, <4 x i32>
@@ -3718,9 +3722,10 @@ define <4 x float> @test_masked_z_16xfloat_to_4xfloat_perm_mem_mask2(ptr %vp, <4
 define <4 x float> @test_16xfloat_to_4xfloat_perm_mem_mask3(ptr %vp) {
 ; CHECK-LABEL: test_16xfloat_to_4xfloat_perm_mem_mask3:
 ; CHECK: # %bb.0:
-; CHECK-NEXT: vpmovsxbd {{.*#+}} xmm0 = [3,3,15,9]
-; CHECK-NEXT: vpermps (%rdi), %zmm0, %zmm0
-; CHECK-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
+; CHECK-NEXT: vpmovsxbd {{.*#+}} xmm1 = [3,3,15,9]
+; CHECK-NEXT: vbroadcastss 12(%rdi), %xmm0
+; CHECK-NEXT: vpermt2ps 32(%rdi), %ymm1, %ymm0
+; CHECK-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
 ; CHECK-NEXT: vzeroupper
 ; CHECK-NEXT: retq
 %vec = load <16 x float>, ptr %vp
@@ -3730,12 +3735,12 @@ define <4 x float> @test_16xfloat_to_4xfloat_perm_mem_mask3(ptr %vp) {
 define <4 x float> @test_masked_16xfloat_to_4xfloat_perm_mem_mask3(ptr %vp, <4 x float> %vec2, <4 x float> %mask) {
 ; CHECK-LABEL: test_masked_16xfloat_to_4xfloat_perm_mem_mask3:
 ; CHECK: # %bb.0:
-; CHECK-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0
 ; CHECK-NEXT: vpmovsxbd {{.*#+}} xmm2 = [3,3,15,9]
-; CHECK-NEXT: vxorps %xmm3, %xmm3, %xmm3
-; CHECK-NEXT: vcmpeqps %xmm3, %xmm1, %k1
-; CHECK-NEXT: vpermps (%rdi), %zmm2, %zmm0 {%k1}
-; CHECK-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
+; CHECK-NEXT: vbroadcastss 12(%rdi), %xmm3
+; CHECK-NEXT: vpermt2ps 32(%rdi), %ymm2, %ymm3
+; CHECK-NEXT: vxorps %xmm2, %xmm2, %xmm2
+; CHECK-NEXT: vcmpeqps %xmm2, %xmm1, %k1
+; CHECK-NEXT: vmovaps %xmm3, %xmm0 {%k1}
 ; CHECK-NEXT: vzeroupper
 ; CHECK-NEXT: retq
 %vec = load <16 x float>, ptr %vp
@@ -3748,11 +3753,12 @@ define <4 x float> @test_masked_16xfloat_to_4xfloat_perm_mem_mask3(ptr %vp, <4 x
 define <4 x float> @test_masked_z_16xfloat_to_4xfloat_perm_mem_mask3(ptr %vp, <4 x float> %mask) {
 ; CHECK-LABEL: test_masked_z_16xfloat_to_4xfloat_perm_mem_mask3:
 ; CHECK: # %bb.0:
-; CHECK-NEXT: vpmovsxbd {{.*#+}} xmm1 = [3,3,15,9]
-; CHECK-NEXT: vxorps %xmm2, %xmm2, %xmm2
-; CHECK-NEXT: vcmpeqps %xmm2, %xmm0, %k1
-; CHECK-NEXT: vpermps (%rdi), %zmm1, %zmm0 {%k1} {z}
-; CHECK-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
+; CHECK-NEXT: vpmovsxbd {{.*#+}} xmm2 = [3,3,15,9]
+; CHECK-NEXT: vbroadcastss 12(%rdi), %xmm1
+; CHECK-NEXT: vxorps %xmm3, %xmm3, %xmm3
+; CHECK-NEXT: vcmpeqps %xmm3, %xmm0, %k1
+; CHECK-NEXT: vpermt2ps 32(%rdi), %ymm2, %ymm1 {%k1} {z}
+; CHECK-NEXT: vmovaps %xmm1, %xmm0
 ; CHECK-NEXT: vzeroupper
 ; CHECK-NEXT: retq
 %vec = load <16 x float>, ptr %vp
@@ -3835,8 +3841,8 @@ define <2 x double> @test_masked_z_4xdouble_to_2xdouble_perm_mask1(<4 x double>
 define <2 x double> @test_4xdouble_to_2xdouble_perm_mem_mask0(ptr %vp) {
 ; CHECK-LABEL: test_4xdouble_to_2xdouble_perm_mem_mask0:
 ; CHECK: # %bb.0:
-; CHECK-NEXT: vmovaps (%rdi), %xmm0
-; CHECK-NEXT: vblendps $3, 16(%rdi), %xmm0, %xmm0 # xmm0 = mem[0,1],xmm0[2,3]
+; CHECK-NEXT: vmovsd 16(%rdi), %xmm0 # xmm0 = mem[0],zero
+; CHECK-NEXT: vmovhps 8(%rdi), %xmm0, %xmm0 # xmm0 = xmm0[0,1],mem[0,1]
 ; CHECK-NEXT: retq
 %vec = load <4 x double>, ptr %vp
 %res = shufflevector <4 x double> %vec, <4 x double> undef, <2 x i32>
@@ -3845,11 +3851,11 @@ define <2 x double> @test_4xdouble_to_2xdouble_perm_mem_mask0(ptr %vp) {
 define <2 x double> @test_masked_4xdouble_to_2xdouble_perm_mem_mask0(ptr %vp, <2 x double> %vec2, <2 x double> %mask) {
 ; CHECK-LABEL: test_masked_4xdouble_to_2xdouble_perm_mem_mask0:
 ; CHECK: # %bb.0:
-; CHECK-NEXT: vmovapd (%rdi), %xmm2
-; CHECK-NEXT: vblendpd $1, 16(%rdi), %xmm2, %xmm2 # xmm2 = mem[0],xmm2[1]
-; CHECK-NEXT: vxorpd %xmm3, %xmm3, %xmm3
-; CHECK-NEXT: vcmpeqpd %xmm3, %xmm1, %k1
-; CHECK-NEXT: vmovapd %xmm2, %xmm0 {%k1}
+; CHECK-NEXT: vmovsd 8(%rdi), %xmm2 # xmm2 = mem[0],zero
+; CHECK-NEXT: vmovsd 16(%rdi), %xmm3 # xmm3 = mem[0],zero
+; CHECK-NEXT: vxorpd %xmm4, %xmm4, %xmm4
+; CHECK-NEXT: vcmpeqpd %xmm4, %xmm1, %k1
+; CHECK-NEXT: vunpcklpd {{.*#+}} xmm0 {%k1} = xmm3[0],xmm2[0]
 ; CHECK-NEXT: retq
 %vec = load <4 x double>, ptr %vp
 %shuf = shufflevector <4 x double> %vec, <4 x double> undef, <2 x i32>
@@ -3861,11 +3867,11 @@ define <2 x double> @test_masked_4xdouble_to_2xdouble_perm_mem_mask0(ptr %vp, <2
 define <2 x double> @test_masked_z_4xdouble_to_2xdouble_perm_mem_mask0(ptr %vp, <2 x double> %mask) {
 ; CHECK-LABEL: test_masked_z_4xdouble_to_2xdouble_perm_mem_mask0:
 ; CHECK: # %bb.0:
-; CHECK-NEXT: vmovapd (%rdi), %xmm1
-; CHECK-NEXT: vblendpd $1, 16(%rdi), %xmm1, %xmm1 # xmm1 = mem[0],xmm1[1]
-; CHECK-NEXT: vxorpd %xmm2, %xmm2, %xmm2
-; CHECK-NEXT: vcmpeqpd %xmm2, %xmm0, %k1
-; CHECK-NEXT: vmovapd %xmm1, %xmm0 {%k1} {z}
+; CHECK-NEXT: vmovsd 8(%rdi), %xmm1 # xmm1 = mem[0],zero
+; CHECK-NEXT: vmovsd 16(%rdi), %xmm2 # xmm2 = mem[0],zero
+; CHECK-NEXT: vxorpd %xmm3, %xmm3, %xmm3
+; CHECK-NEXT: vcmpeqpd %xmm3, %xmm0, %k1
+; CHECK-NEXT: vunpcklpd {{.*#+}} xmm0 {%k1} {z} = xmm2[0],xmm1[0]
 ; CHECK-NEXT: retq
 %vec = load <4 x double>, ptr %vp
 %shuf = shufflevector <4 x double> %vec, <4 x double> undef, <2 x i32>
@@ -3877,10 +3883,11 @@ define <2 x double> @test_masked_z_4xdouble_to_2xdouble_perm_mem_mask0(ptr %vp,
 define <2 x double> @test_masked_4xdouble_to_2xdouble_perm_mem_mask1(ptr %vp, <2 x double> %vec2, <2 x double> %mask) {
 ; CHECK-LABEL: test_masked_4xdouble_to_2xdouble_perm_mem_mask1:
 ; CHECK: # %bb.0:
-; CHECK-NEXT: vmovapd 16(%rdi), %xmm2
-; CHECK-NEXT: vxorpd %xmm3, %xmm3, %xmm3
-; CHECK-NEXT: vcmpeqpd %xmm3, %xmm1, %k1
-; CHECK-NEXT: vunpcklpd (%rdi), %xmm2, %xmm0 {%k1} # xmm0 {%k1} = xmm2[0],mem[0]
+; CHECK-NEXT: vmovsd (%rdi), %xmm2 # xmm2 = mem[0],zero
+; CHECK-NEXT: vmovsd 16(%rdi), %xmm3 # xmm3 = mem[0],zero
+; CHECK-NEXT: vxorpd %xmm4, %xmm4, %xmm4
+; CHECK-NEXT: vcmpeqpd %xmm4, %xmm1, %k1
+; CHECK-NEXT: vunpcklpd {{.*#+}} xmm0 {%k1} = xmm3[0],xmm2[0]
 ; CHECK-NEXT: retq
 %vec = load <4 x double>, ptr %vp
 %shuf = shufflevector <4 x double> %vec, <4 x double> undef, <2 x i32>
@@ -3892,10 +3899,11 @@ define <2 x double> @test_masked_4xdouble_to_2xdouble_perm_mem_mask1(ptr %vp, <2
 define <2 x double> @test_masked_z_4xdouble_to_2xdouble_perm_mem_mask1(ptr %vp, <2 x double> %mask) {
 ; CHECK-LABEL: test_masked_z_4xdouble_to_2xdouble_perm_mem_mask1:
 ; CHECK: # %bb.0:
-; CHECK-NEXT: vmovapd 16(%rdi), %xmm1
-; CHECK-NEXT: vxorpd %xmm2, %xmm2, %xmm2
-; CHECK-NEXT: vcmpeqpd %xmm2, %xmm0, %k1
-; CHECK-NEXT: vunpcklpd (%rdi), %xmm1, %xmm0 {%k1} {z} # xmm0 {%k1} {z} = xmm1[0],mem[0]
+; CHECK-NEXT: vmovsd (%rdi), %xmm1 # xmm1 = mem[0],zero
+; CHECK-NEXT: vmovsd 16(%rdi), %xmm2 # xmm2 = mem[0],zero
+; CHECK-NEXT: vxorpd %xmm3, %xmm3, %xmm3
+; CHECK-NEXT: vcmpeqpd %xmm3, %xmm0, %k1
+; CHECK-NEXT: vunpcklpd {{.*#+}} xmm0 {%k1} {z} = xmm2[0],xmm1[0]
 ; CHECK-NEXT: retq
 %vec = load <4 x double>, ptr %vp
 %shuf = shufflevector <4 x double> %vec, <4 x double> undef, <2 x i32>
@@ -4382,8 +4390,8 @@ define <4 x double> @test_masked_z_8xdouble_to_4xdouble_perm_mem_mask0(ptr %vp,
 define <4 x double> @test_masked_8xdouble_to_4xdouble_perm_mem_mask1(ptr %vp, <4 x double> %vec2, <4 x double> %mask) {
 ; CHECK-FAST-LABEL: test_masked_8xdouble_to_4xdouble_perm_mem_mask1:
 ; CHECK-FAST: # %bb.0:
-; CHECK-FAST-NEXT: vbroadcastsd 32(%rdi), %ymm2
-; CHECK-FAST-NEXT: vpmovsxbq {{.*#+}} ymm3 = [7,0,6,2]
+; CHECK-FAST-NEXT: vmovsd 32(%rdi), %xmm2 # xmm2 = mem[0],zero
+; CHECK-FAST-NEXT: vpmovsxbq {{.*#+}} ymm3 = [7,0,6,0]
 ; CHECK-FAST-NEXT: vpermi2pd (%rdi), %ymm2, %ymm3
 ; CHECK-FAST-NEXT: vxorpd %xmm2, %xmm2, %xmm2
 ; CHECK-FAST-NEXT: vcmpeqpd %ymm2, %ymm1, %k1
@@ -4392,10 +4400,11 @@ define <4 x double> @test_masked_8xdouble_to_4xdouble_perm_mem_mask1(ptr %vp, <4
 ;
 ; CHECK-FAST-PERLANE-LABEL: test_masked_8xdouble_to_4xdouble_perm_mem_mask1:
 ; CHECK-FAST-PERLANE: # %bb.0:
-; CHECK-FAST-PERLANE-NEXT: vpermpd $236, (%rdi), %ymm2 # ymm2 = mem[0,3,2,3]
+; CHECK-FAST-PERLANE-NEXT: vmovsd 32(%rdi), %xmm2 # xmm2 = mem[0],zero
+; CHECK-FAST-PERLANE-NEXT: vblendpd $12, (%rdi), %ymm2, %ymm2 # ymm2 = ymm2[0,1],mem[2,3]
 ; CHECK-FAST-PERLANE-NEXT: vxorpd %xmm3, %xmm3, %xmm3
 ; CHECK-FAST-PERLANE-NEXT: vcmpeqpd %ymm3, %ymm1, %k1
-; CHECK-FAST-PERLANE-NEXT: vshufpd $1, 32(%rdi){1to4}, %ymm2, %ymm0 {%k1}
+; CHECK-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm0 {%k1} = ymm2[3,0,2,0]
 ; CHECK-FAST-PERLANE-NEXT: retq
 %vec = load <8 x double>, ptr %vp
 %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <4 x i32>
@@ -4407,8 +4416,8 @@ define <4 x double> @test_masked_8xdouble_to_4xdouble_perm_mem_mask1(ptr %vp, <4
 define <4 x double> @test_masked_z_8xdouble_to_4xdouble_perm_mem_mask1(ptr %vp, <4 x double> %mask) {
 ; CHECK-FAST-LABEL: test_masked_z_8xdouble_to_4xdouble_perm_mem_mask1:
 ; CHECK-FAST: # %bb.0:
-; CHECK-FAST-NEXT: vbroadcastsd 32(%rdi), %ymm2
-; CHECK-FAST-NEXT: vpmovsxbq {{.*#+}} ymm1 = [7,0,6,2]
+; CHECK-FAST-NEXT: vmovsd 32(%rdi), %xmm2 # xmm2 = mem[0],zero
+; CHECK-FAST-NEXT: vpmovsxbq {{.*#+}} ymm1 = [7,0,6,0]
 ; CHECK-FAST-NEXT: vxorpd %xmm3, %xmm3, %xmm3
 ; CHECK-FAST-NEXT: vcmpeqpd %ymm3, %ymm0, %k1
 ; CHECK-FAST-NEXT: vpermi2pd (%rdi), %ymm2, %ymm1 {%k1} {z}
@@ -4417,10 +4426,11 @@ define <4 x double> @test_masked_z_8xdouble_to_4xdouble_perm_mem_mask1(ptr %vp,
 ;
 ; CHECK-FAST-PERLANE-LABEL: test_masked_z_8xdouble_to_4xdouble_perm_mem_mask1:
 ; CHECK-FAST-PERLANE: # %bb.0:
-; CHECK-FAST-PERLANE-NEXT: vpermpd $236, (%rdi), %ymm1 # ymm1 = mem[0,3,2,3]
+; CHECK-FAST-PERLANE-NEXT: vmovsd 32(%rdi), %xmm1 # xmm1 = mem[0],zero
+; CHECK-FAST-PERLANE-NEXT: vblendpd $12, (%rdi), %ymm1, %ymm1 # ymm1 = ymm1[0,1],mem[2,3]
 ; CHECK-FAST-PERLANE-NEXT: vxorpd %xmm2, %xmm2, %xmm2
 ; CHECK-FAST-PERLANE-NEXT: vcmpeqpd %ymm2, %ymm0, %k1
-; CHECK-FAST-PERLANE-NEXT: vshufpd $1, 32(%rdi){1to4}, %ymm1, %ymm0 {%k1} {z}
+; CHECK-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm0 {%k1} {z} = ymm1[3,0,2,0]
 ; CHECK-FAST-PERLANE-NEXT: retq
 %vec = load <8 x double>, ptr %vp
 %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <4 x i32>
@@ -4432,21 +4442,22 @@ define <4 x double> @test_masked_z_8xdouble_to_4xdouble_perm_mem_mask1(ptr %vp,
 define <4 x double> @test_masked_8xdouble_to_4xdouble_perm_mem_mask2(ptr %vp, <4 x double> %vec2, <4 x double> %mask) {
 ; CHECK-FAST-LABEL: test_masked_8xdouble_to_4xdouble_perm_mem_mask2:
 ; CHECK-FAST: # %bb.0:
-; CHECK-FAST-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
-; CHECK-FAST-NEXT: vpmovsxbq {{.*#+}} ymm2 = [1,2,3,4]
-; CHECK-FAST-NEXT: vxorpd %xmm3, %xmm3, %xmm3
-; CHECK-FAST-NEXT: vcmpeqpd %ymm3, %ymm1, %k1
-; CHECK-FAST-NEXT: vpermpd (%rdi), %zmm2, %zmm0 {%k1}
-; CHECK-FAST-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0
+; CHECK-FAST-NEXT: vmovsd 32(%rdi), %xmm2 # xmm2 = mem[0],zero
+; CHECK-FAST-NEXT: vpmovsxbq {{.*#+}} ymm3 = [5,6,7,0]
+; CHECK-FAST-NEXT: vpermi2pd (%rdi), %ymm2, %ymm3
+; CHECK-FAST-NEXT: vxorpd %xmm2, %xmm2, %xmm2
+; CHECK-FAST-NEXT: vcmpeqpd %ymm2, %ymm1, %k1
+; CHECK-FAST-NEXT: vmovapd %ymm3, %ymm0 {%k1}
 ; CHECK-FAST-NEXT: retq
 ;
 ; CHECK-FAST-PERLANE-LABEL: test_masked_8xdouble_to_4xdouble_perm_mem_mask2:
 ; CHECK-FAST-PERLANE: # %bb.0:
-; CHECK-FAST-PERLANE-NEXT: vmovapd (%rdi), %ymm2
-; CHECK-FAST-PERLANE-NEXT: vperm2f128 $33, 32(%rdi), %ymm2, %ymm3 # ymm3 = ymm2[2,3],mem[0,1]
+; CHECK-FAST-PERLANE-NEXT: vmovsd 32(%rdi), %xmm2 # xmm2 = mem[0],zero
+; CHECK-FAST-PERLANE-NEXT: vmovapd (%rdi), %ymm3
+; CHECK-FAST-PERLANE-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm3[2,3],ymm2[0,1]
 ; CHECK-FAST-PERLANE-NEXT: vxorpd %xmm4, %xmm4, %xmm4
 ; CHECK-FAST-PERLANE-NEXT: vcmpeqpd %ymm4, %ymm1, %k1
-; CHECK-FAST-PERLANE-NEXT: vshufpd {{.*#+}} ymm0 {%k1} = ymm2[1],ymm3[0],ymm2[3],ymm3[2]
+; CHECK-FAST-PERLANE-NEXT: vshufpd {{.*#+}} ymm0 {%k1} = ymm3[1],ymm2[0],ymm3[3],ymm2[2]
 ; CHECK-FAST-PERLANE-NEXT: retq
 %vec = load <8 x double>, ptr %vp
 %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <4 x i32>
@@ -4458,20 +4469,22 @@ define <4 x double> @test_masked_8xdouble_to_4xdouble_perm_mem_mask2(ptr %vp, <4
 define <4 x double> @test_masked_z_8xdouble_to_4xdouble_perm_mem_mask2(ptr %vp, <4 x double> %mask) {
 ; CHECK-FAST-LABEL: test_masked_z_8xdouble_to_4xdouble_perm_mem_mask2:
 ; CHECK-FAST: # %bb.0:
-; CHECK-FAST-NEXT: vpmovsxbq {{.*#+}} ymm1 = [1,2,3,4]
-; CHECK-FAST-NEXT: vxorpd %xmm2, %xmm2, %xmm2
-; CHECK-FAST-NEXT: vcmpeqpd %ymm2, %ymm0, %k1
-; CHECK-FAST-NEXT: vpermpd (%rdi), %zmm1, %zmm0 {%k1} {z}
-; CHECK-FAST-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0
+; CHECK-FAST-NEXT: vmovsd 32(%rdi), %xmm2 # xmm2 = mem[0],zero
+; CHECK-FAST-NEXT: vpmovsxbq {{.*#+}} ymm1 = [5,6,7,0]
+; CHECK-FAST-NEXT: vxorpd %xmm3, %xmm3, %xmm3
+; CHECK-FAST-NEXT: vcmpeqpd %ymm3, %ymm0, %k1
+; CHECK-FAST-NEXT: vpermi2pd (%rdi), %ymm2, %ymm1 {%k1} {z}
+; CHECK-FAST-NEXT: vmovapd %ymm1, %ymm0
 ; CHECK-FAST-NEXT: retq
 ;
 ; CHECK-FAST-PERLANE-LABEL: test_masked_z_8xdouble_to_4xdouble_perm_mem_mask2:
 ; CHECK-FAST-PERLANE: # %bb.0:
-; CHECK-FAST-PERLANE-NEXT: vmovapd (%rdi), %ymm1
-; CHECK-FAST-PERLANE-NEXT: vperm2f128 $33, 32(%rdi), %ymm1, %ymm2 # ymm2 = ymm1[2,3],mem[0,1]
+; CHECK-FAST-PERLANE-NEXT: vmovsd 32(%rdi), %xmm1 # xmm1 = mem[0],zero
+; CHECK-FAST-PERLANE-NEXT: vmovapd (%rdi), %ymm2
+; CHECK-FAST-PERLANE-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm2[2,3],ymm1[0,1]
 ; CHECK-FAST-PERLANE-NEXT: vxorpd %xmm3, %xmm3, %xmm3
 ; CHECK-FAST-PERLANE-NEXT: vcmpeqpd %ymm3, %ymm0, %k1
-; CHECK-FAST-PERLANE-NEXT: vshufpd {{.*#+}} ymm0 {%k1} {z} = ymm1[1],ymm2[0],ymm1[3],ymm2[2]
+; CHECK-FAST-PERLANE-NEXT: vshufpd {{.*#+}} ymm0 {%k1} {z} = ymm2[1],ymm1[0],ymm2[3],ymm1[2]
 ; CHECK-FAST-PERLANE-NEXT: retq
 %vec = load <8 x double>, ptr %vp
 %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <4 x i32>
@@ -4481,26 +4494,43 @@ define <4 x double> @test_masked_z_8xdouble_to_4xdouble_perm_mem_mask2(ptr %vp,
 }
 define <4 x double> @test_8xdouble_to_4xdouble_perm_mem_mask3(ptr %vp) {
-; CHECK-LABEL: test_8xdouble_to_4xdouble_perm_mem_mask3:
-; CHECK: # %bb.0:
-; CHECK-NEXT: vpmovsxbq {{.*#+}} ymm0 = [4,2,1,0]
-; CHECK-NEXT: vpermpd (%rdi), %zmm0, %zmm0
-; CHECK-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0
-; CHECK-NEXT: retq
+; CHECK-FAST-LABEL: test_8xdouble_to_4xdouble_perm_mem_mask3:
+; CHECK-FAST: # %bb.0:
+; CHECK-FAST-NEXT: vmovsd 32(%rdi), %xmm1 # xmm1 = mem[0],zero
+; CHECK-FAST-NEXT: vpmovsxbq {{.*#+}} ymm0 = [0,6,5,4]
+; CHECK-FAST-NEXT: vpermi2pd (%rdi), %ymm1, %ymm0
+; CHECK-FAST-NEXT: retq
+;
+; CHECK-FAST-PERLANE-LABEL: test_8xdouble_to_4xdouble_perm_mem_mask3:
+; CHECK-FAST-PERLANE: # %bb.0:
+; CHECK-FAST-PERLANE-NEXT: vmovsd 32(%rdi), %xmm0 # xmm0 = mem[0],zero
+; CHECK-FAST-PERLANE-NEXT: vpermpd $24, (%rdi), %ymm1 # ymm1 = mem[0,2,1,0]
+; CHECK-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3,4,5,6,7]
+; CHECK-FAST-PERLANE-NEXT: retq
 %vec = load <8 x double>, ptr %vp
 %res = shufflevector <8 x double> %vec, <8 x double> undef, <4 x i32>
 ret <4 x double> %res
 }
 define <4 x double> @test_masked_8xdouble_to_4xdouble_perm_mem_mask3(ptr %vp, <4 x double> %vec2, <4 x double> %mask) {
-; CHECK-LABEL: test_masked_8xdouble_to_4xdouble_perm_mem_mask3:
-; CHECK: # %bb.0:
-; CHECK-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
-; CHECK-NEXT: vpmovsxbq {{.*#+}} ymm2 = [4,2,1,0]
-; CHECK-NEXT: vxorpd %xmm3, %xmm3, %xmm3
-; CHECK-NEXT: vcmpeqpd %ymm3, %ymm1, %k1
-; CHECK-NEXT: vpermpd (%rdi), %zmm2, %zmm0 {%k1}
-; CHECK-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0
-; CHECK-NEXT: retq
+; CHECK-FAST-LABEL: test_masked_8xdouble_to_4xdouble_perm_mem_mask3:
+; CHECK-FAST: # %bb.0:
+; CHECK-FAST-NEXT: vmovsd 32(%rdi), %xmm2 # xmm2 = mem[0],zero
+; CHECK-FAST-NEXT: vpmovsxbq {{.*#+}} ymm3 = [0,6,5,4]
+; CHECK-FAST-NEXT: vpermi2pd (%rdi), %ymm2, %ymm3
+; CHECK-FAST-NEXT: vxorpd %xmm2, %xmm2, %xmm2
+; CHECK-FAST-NEXT: vcmpeqpd %ymm2, %ymm1, %k1
+; CHECK-FAST-NEXT: vmovapd %ymm3, %ymm0 {%k1}
+; CHECK-FAST-NEXT: retq
+;
+; CHECK-FAST-PERLANE-LABEL: test_masked_8xdouble_to_4xdouble_perm_mem_mask3:
+; CHECK-FAST-PERLANE: # %bb.0:
+; CHECK-FAST-PERLANE-NEXT: vmovsd 32(%rdi), %xmm2 # xmm2 = mem[0],zero
+; CHECK-FAST-PERLANE-NEXT: vpermpd $24, (%rdi), %ymm3 # ymm3 = mem[0,2,1,0]
+; CHECK-FAST-PERLANE-NEXT: vblendpd {{.*#+}} ymm2 = ymm2[0],ymm3[1,2,3]
+; CHECK-FAST-PERLANE-NEXT: vxorpd %xmm3, %xmm3, %xmm3
+; CHECK-FAST-PERLANE-NEXT: vcmpeqpd %ymm3, %ymm1, %k1
+; CHECK-FAST-PERLANE-NEXT: vmovapd %ymm2, %ymm0 {%k1}
+; CHECK-FAST-PERLANE-NEXT: retq
 %vec = load <8 x double>, ptr %vp
 %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <4 x i32>
 %cmp = fcmp oeq <4 x double> %mask, zeroinitializer
@@ -4509,14 +4539,25 @@ define <4 x double> @test_masked_8xdouble_to_4xdouble_perm_mem_mask3(ptr %vp, <4
 }
 define <4 x double> @test_masked_z_8xdouble_to_4xdouble_perm_mem_mask3(ptr %vp, <4 x double> %mask) {
-; CHECK-LABEL: test_masked_z_8xdouble_to_4xdouble_perm_mem_mask3:
-; CHECK: # %bb.0:
-; CHECK-NEXT: vpmovsxbq {{.*#+}} ymm1 = [4,2,1,0]
-; CHECK-NEXT: vxorpd %xmm2, %xmm2, %xmm2
-; CHECK-NEXT: vcmpeqpd %ymm2, %ymm0, %k1
-; CHECK-NEXT: vpermpd (%rdi), %zmm1, %zmm0 {%k1} {z}
-; CHECK-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0
-; CHECK-NEXT: retq
+; CHECK-FAST-LABEL: test_masked_z_8xdouble_to_4xdouble_perm_mem_mask3:
+; CHECK-FAST: # %bb.0:
+; CHECK-FAST-NEXT: vmovsd 32(%rdi), %xmm2 # xmm2 = mem[0],zero
+; CHECK-FAST-NEXT: vpmovsxbq {{.*#+}} ymm1 = [0,6,5,4]
+; CHECK-FAST-NEXT: vxorpd %xmm3, %xmm3, %xmm3
+; CHECK-FAST-NEXT: vcmpeqpd %ymm3, %ymm0, %k1
+; CHECK-FAST-NEXT: vpermi2pd (%rdi), %ymm2, %ymm1 {%k1} {z}
+; CHECK-FAST-NEXT: vmovapd %ymm1, %ymm0
+; CHECK-FAST-NEXT: retq
+;
+; CHECK-FAST-PERLANE-LABEL: test_masked_z_8xdouble_to_4xdouble_perm_mem_mask3:
+; CHECK-FAST-PERLANE: # %bb.0:
+; CHECK-FAST-PERLANE-NEXT: vmovsd 32(%rdi), %xmm1 # xmm1 = mem[0],zero
+; CHECK-FAST-PERLANE-NEXT: vpermpd $24, (%rdi), %ymm2 # ymm2 = mem[0,2,1,0]
+; CHECK-FAST-PERLANE-NEXT: vblendpd {{.*#+}} ymm1 = ymm1[0],ymm2[1,2,3]
+; CHECK-FAST-PERLANE-NEXT: vxorpd %xmm2, %xmm2, %xmm2
+; CHECK-FAST-PERLANE-NEXT: vcmpeqpd %ymm2, %ymm0, %k1
+; CHECK-FAST-PERLANE-NEXT: vmovapd %ymm1, %ymm0 {%k1} {z}
+; CHECK-FAST-PERLANE-NEXT: retq
 %vec = load <8 x double>, ptr %vp
 %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <4 x i32>
 %cmp = fcmp oeq <4 x double> %mask, zeroinitializer
@@ -4559,14 +4600,25 @@ define <4 x double> @test_masked_z_8xdouble_to_4xdouble_perm_mem_mask4(ptr %vp,
 }
 define <4 x double> @test_masked_8xdouble_to_4xdouble_perm_mem_mask5(ptr %vp, <4 x double> %vec2, <4 x double> %mask) {
-; CHECK-LABEL: test_masked_8xdouble_to_4xdouble_perm_mem_mask5:
-; CHECK: # %bb.0:
-; CHECK-NEXT: vmovapd (%rdi), %ymm2
-; CHECK-NEXT: vperm2f128 $33, 32(%rdi), %ymm2, %ymm2 # ymm2 = ymm2[2,3],mem[0,1]
-; CHECK-NEXT: vxorpd %xmm3, %xmm3, %xmm3
-; CHECK-NEXT: vcmpeqpd %ymm3, %ymm1, %k1
-; CHECK-NEXT: vshufpd $14, 40(%rdi){1to4}, %ymm2, %ymm0 {%k1}
-; CHECK-NEXT: retq
+; CHECK-FAST-LABEL: test_masked_8xdouble_to_4xdouble_perm_mem_mask5:
+; CHECK-FAST: # %bb.0:
+; CHECK-FAST-NEXT: vmovddup 40(%rdi), %xmm2 # xmm2 = mem[0,0]
+; CHECK-FAST-NEXT: vpmovsxbq {{.*#+}} ymm3 = [6,1,1,1]
+; CHECK-FAST-NEXT: vpermi2pd 16(%rdi){1to4}, %ymm2, %ymm3
+; CHECK-FAST-NEXT: vxorpd %xmm2, %xmm2, %xmm2
+; CHECK-FAST-NEXT: vcmpeqpd %ymm2, %ymm1, %k1
+; CHECK-FAST-NEXT: vmovapd %ymm3, %ymm0 {%k1}
+; CHECK-FAST-NEXT: retq
+;
+; CHECK-FAST-PERLANE-LABEL: test_masked_8xdouble_to_4xdouble_perm_mem_mask5:
+; CHECK-FAST-PERLANE: # %bb.0:
+; CHECK-FAST-PERLANE-NEXT: vmovddup 40(%rdi), %xmm2 # xmm2 = mem[0,0]
+; CHECK-FAST-PERLANE-NEXT: vbroadcastsd 16(%rdi), %ymm3
+; CHECK-FAST-PERLANE-NEXT: vblendpd {{.*#+}} ymm2 = ymm2[0,1],ymm3[2,3]
+; CHECK-FAST-PERLANE-NEXT: vxorpd %xmm3, %xmm3, %xmm3
+; CHECK-FAST-PERLANE-NEXT: vcmpeqpd %ymm3, %ymm1, %k1
+; CHECK-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm0 {%k1} = ymm2[2,1,1,1]
+; CHECK-FAST-PERLANE-NEXT: retq
 %vec = load <8 x double>, ptr %vp
 %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <4 x i32>
 %cmp = fcmp oeq <4 x double> %mask, zeroinitializer
@@ -4575,14 +4627,25 @@ define <4 x double> @test_masked_8xdouble_to_4xdouble_perm_mem_mask5(ptr %vp, <4
 }
 define <4 x double> @test_masked_z_8xdouble_to_4xdouble_perm_mem_mask5(ptr %vp, <4 x double> %mask) {
-; CHECK-LABEL: test_masked_z_8xdouble_to_4xdouble_perm_mem_mask5:
-; CHECK: # %bb.0:
-; CHECK-NEXT: vmovapd (%rdi), %ymm1
-; CHECK-NEXT: vperm2f128 $33, 32(%rdi), %ymm1, %ymm1 # ymm1 = ymm1[2,3],mem[0,1]
-; CHECK-NEXT: vxorpd %xmm2, %xmm2, %xmm2
-; CHECK-NEXT: vcmpeqpd %ymm2, %ymm0, %k1
-; CHECK-NEXT: vshufpd $14, 40(%rdi){1to4}, %ymm1, %ymm0 {%k1} {z}
-; CHECK-NEXT: retq
+; CHECK-FAST-LABEL: test_masked_z_8xdouble_to_4xdouble_perm_mem_mask5:
+; CHECK-FAST: # %bb.0:
+; CHECK-FAST-NEXT: vmovddup 40(%rdi), %xmm2 # xmm2 = mem[0,0]
+; CHECK-FAST-NEXT: vpmovsxbq {{.*#+}} ymm1 = [6,1,1,1]
+; CHECK-FAST-NEXT: vxorpd %xmm3, %xmm3, %xmm3
+; CHECK-FAST-NEXT: vcmpeqpd %ymm3, %ymm0, %k1
+; CHECK-FAST-NEXT: vpermi2pd 16(%rdi){1to4}, %ymm2, %ymm1 {%k1} {z}
+; CHECK-FAST-NEXT: vmovapd %ymm1, %ymm0
+; CHECK-FAST-NEXT: retq
+;
+; CHECK-FAST-PERLANE-LABEL: test_masked_z_8xdouble_to_4xdouble_perm_mem_mask5:
+; CHECK-FAST-PERLANE: # %bb.0:
+; CHECK-FAST-PERLANE-NEXT: vmovddup 40(%rdi), %xmm1 # xmm1 = mem[0,0]
+; CHECK-FAST-PERLANE-NEXT: vbroadcastsd 16(%rdi), %ymm2
+; CHECK-FAST-PERLANE-NEXT: vblendpd {{.*#+}} ymm1 = ymm1[0,1],ymm2[2,3]
+; CHECK-FAST-PERLANE-NEXT: vxorpd %xmm2, %xmm2, %xmm2
+; CHECK-FAST-PERLANE-NEXT: vcmpeqpd %ymm2, %ymm0, %k1
+; CHECK-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm0 {%k1} {z} = ymm1[2,1,1,1]
+; CHECK-FAST-PERLANE-NEXT: retq
 %vec = load <8 x double>, ptr %vp
 %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <4 x i32>
 %cmp = fcmp oeq <4 x double> %mask, zeroinitializer
@@ -4591,26 +4654,47 @@ define <4 x double> @test_masked_z_8xdouble_to_4xdouble_perm_mem_mask5(ptr %vp,
 }
 define <4 x double> @test_8xdouble_to_4xdouble_perm_mem_mask6(ptr %vp) {
-; CHECK-LABEL: test_8xdouble_to_4xdouble_perm_mem_mask6:
-; CHECK: # %bb.0:
-; CHECK-NEXT: vmovapd 32(%rdi), %ymm1
-; CHECK-NEXT: vpmovsxbq {{.*#+}} ymm0 = [0,2,4,1]
-; CHECK-NEXT: vpermi2pd (%rdi), %ymm1, %ymm0
-; CHECK-NEXT: retq
+; CHECK-FAST-LABEL: test_8xdouble_to_4xdouble_perm_mem_mask6:
+; CHECK-FAST: # %bb.0:
+; CHECK-FAST-NEXT: vmovsd (%rdi), %xmm0 # xmm0 = mem[0],zero
+; CHECK-FAST-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm1
+; CHECK-FAST-NEXT: vpmovsxbq {{.*#+}} ymm0 = [4,6,2,5]
+; CHECK-FAST-NEXT: vpermi2pd 32(%rdi), %ymm1, %ymm0
+; CHECK-FAST-NEXT: retq
+;
+; CHECK-FAST-PERLANE-LABEL: test_8xdouble_to_4xdouble_perm_mem_mask6:
+; CHECK-FAST-PERLANE: # %bb.0:
+; CHECK-FAST-PERLANE-NEXT: vmovsd (%rdi), %xmm0 # xmm0 = mem[0],zero
+; CHECK-FAST-PERLANE-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
+; CHECK-FAST-PERLANE-NEXT: vpermpd $104, 32(%rdi), %ymm1 # ymm1 = mem[0,2,2,1]
+; CHECK-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5],ymm1[6,7]
+; CHECK-FAST-PERLANE-NEXT: retq
 %vec = load <8 x double>, ptr %vp
 %res = shufflevector <8 x double> %vec, <8 x double> undef, <4 x i32>
 ret <4 x double> %res
 }
 define <4 x double> @test_masked_8xdouble_to_4xdouble_perm_mem_mask6(ptr %vp, <4 x double> %vec2, <4 x double> %mask) {
-; CHECK-LABEL: test_masked_8xdouble_to_4xdouble_perm_mem_mask6:
-; CHECK: # %bb.0:
-; CHECK-NEXT: vmovapd 32(%rdi), %ymm2
-; CHECK-NEXT: vpmovsxbq {{.*#+}} ymm3 = [0,2,4,1]
-; CHECK-NEXT: vpermi2pd (%rdi), %ymm2, %ymm3
-; CHECK-NEXT: vxorpd %xmm2, %xmm2, %xmm2
-; CHECK-NEXT: vcmpeqpd %ymm2, %ymm1, %k1
-; CHECK-NEXT: vmovapd %ymm3, %ymm0 {%k1}
-; CHECK-NEXT: retq
+; CHECK-FAST-LABEL: test_masked_8xdouble_to_4xdouble_perm_mem_mask6:
+; CHECK-FAST: # %bb.0:
+; CHECK-FAST-NEXT: vmovsd (%rdi), %xmm2 # xmm2 = mem[0],zero
+; CHECK-FAST-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2
+; CHECK-FAST-NEXT: vpmovsxbq {{.*#+}} ymm3 = [4,6,2,5]
+; CHECK-FAST-NEXT: vpermi2pd 32(%rdi), %ymm2, %ymm3
+; CHECK-FAST-NEXT: vxorpd %xmm2, %xmm2, %xmm2
+; CHECK-FAST-NEXT: vcmpeqpd %ymm2, %ymm1, %k1
+; CHECK-FAST-NEXT: vmovapd %ymm3, %ymm0 {%k1}
+; CHECK-FAST-NEXT: retq
+;
+; CHECK-FAST-PERLANE-LABEL: test_masked_8xdouble_to_4xdouble_perm_mem_mask6:
+; CHECK-FAST-PERLANE: # %bb.0:
+; CHECK-FAST-PERLANE-NEXT: vmovsd (%rdi), %xmm2 # xmm2 = mem[0],zero
+; CHECK-FAST-PERLANE-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2
+; CHECK-FAST-PERLANE-NEXT: vpermpd $104, 32(%rdi), %ymm3 # ymm3 = mem[0,2,2,1]
+; CHECK-FAST-PERLANE-NEXT: vblendpd {{.*#+}} ymm2 = ymm3[0,1],ymm2[2],ymm3[3]
+; CHECK-FAST-PERLANE-NEXT: vxorpd %xmm3, %xmm3, %xmm3
+; CHECK-FAST-PERLANE-NEXT: vcmpeqpd %ymm3, %ymm1, %k1
+; CHECK-FAST-PERLANE-NEXT: vmovapd %ymm2, %ymm0 {%k1}
+; CHECK-FAST-PERLANE-NEXT: retq
 %vec = load <8 x double>, ptr %vp
 %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <4 x i32>
 %cmp = fcmp oeq <4 x double> %mask, zeroinitializer
@@ -4619,15 +4703,27 @@ define <4 x double> @test_masked_8xdouble_to_4xdouble_perm_mem_mask6(ptr %vp, <4
 }
 define <4 x double> @test_masked_z_8xdouble_to_4xdouble_perm_mem_mask6(ptr %vp, <4 x double> %mask) {
-; CHECK-LABEL: test_masked_z_8xdouble_to_4xdouble_perm_mem_mask6:
-; CHECK: # %bb.0:
-; CHECK-NEXT: vmovapd 32(%rdi), %ymm2
-; CHECK-NEXT: vpmovsxbq {{.*#+}} ymm1 = [0,2,4,1]
-; CHECK-NEXT: vxorpd %xmm3, %xmm3, %xmm3
-; CHECK-NEXT: vcmpeqpd %ymm3, %ymm0, %k1
-; CHECK-NEXT: vpermi2pd (%rdi), %ymm2, %ymm1 {%k1} {z}
-; CHECK-NEXT: vmovapd %ymm1, %ymm0
-; CHECK-NEXT: retq
+; CHECK-FAST-LABEL: test_masked_z_8xdouble_to_4xdouble_perm_mem_mask6:
+; CHECK-FAST: # %bb.0:
+; CHECK-FAST-NEXT: vmovsd (%rdi), %xmm1 # xmm1 = mem[0],zero
+; CHECK-FAST-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm2
+; CHECK-FAST-NEXT: vpmovsxbq {{.*#+}} ymm1 = [4,6,2,5]
+; CHECK-FAST-NEXT: vxorpd %xmm3, %xmm3, %xmm3
+; CHECK-FAST-NEXT: vcmpeqpd %ymm3, %ymm0, %k1
+; CHECK-FAST-NEXT: vpermi2pd 32(%rdi), %ymm2, %ymm1 {%k1} {z}
+; CHECK-FAST-NEXT: vmovapd %ymm1, %ymm0
+; CHECK-FAST-NEXT: retq
+;
+; CHECK-FAST-PERLANE-LABEL: test_masked_z_8xdouble_to_4xdouble_perm_mem_mask6:
+; CHECK-FAST-PERLANE: # %bb.0:
+; CHECK-FAST-PERLANE-NEXT: vmovsd (%rdi), %xmm1 # xmm1 = mem[0],zero
+; CHECK-FAST-PERLANE-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1
+; CHECK-FAST-PERLANE-NEXT: vpermpd $104, 32(%rdi), %ymm2 # ymm2 = mem[0,2,2,1]
+; CHECK-FAST-PERLANE-NEXT: vblendpd {{.*#+}} ymm1 = ymm2[0,1],ymm1[2],ymm2[3]
+; CHECK-FAST-PERLANE-NEXT: vxorpd %xmm2, %xmm2, %xmm2
+; CHECK-FAST-PERLANE-NEXT: vcmpeqpd %ymm2, %ymm0, %k1
+; CHECK-FAST-PERLANE-NEXT: vmovapd %ymm1, %ymm0 {%k1} {z}
+; CHECK-FAST-PERLANE-NEXT: retq
 %vec = load <8 x double>, ptr %vp
 %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <4 x i32>
 %cmp = fcmp oeq <4 x double> %mask, zeroinitializer
@@ -4668,8 +4764,8 @@ define <4 x double> @test_masked_z_8xdouble_to_4xdouble_perm_mem_mask7(ptr %vp,
 define <2 x double> @test_8xdouble_to_2xdouble_perm_mem_mask0(ptr %vp) {
 ; CHECK-LABEL: test_8xdouble_to_2xdouble_perm_mem_mask0:
 ; CHECK: # %bb.0:
-; CHECK-NEXT: vmovapd (%rdi), %xmm0
-; CHECK-NEXT: vshufpd $1, 48(%rdi), %xmm0, %xmm0 # xmm0 = xmm0[1],mem[0]
+; CHECK-NEXT: vmovsd 8(%rdi), %xmm0 # xmm0 = mem[0],zero
+; CHECK-NEXT: vmovhps 48(%rdi), %xmm0, %xmm0 # xmm0 = xmm0[0,1],mem[0,1]
 ; CHECK-NEXT: retq
 %vec = load <8 x double>, ptr %vp
 %res = shufflevector <8 x double> %vec, <8 x double> undef, <2 x i32>
@@ -4678,10 +4774,11 @@ define <2 x double> @test_8xdouble_to_2xdouble_perm_mem_mask0(ptr %vp) {
 define <2 x double> @test_masked_8xdouble_to_2xdouble_perm_mem_mask0(ptr %vp, <2 x double> %vec2, <2 x double> %mask) {
 ; CHECK-LABEL: test_masked_8xdouble_to_2xdouble_perm_mem_mask0:
 ; CHECK: # %bb.0:
-; CHECK-NEXT: vmovapd (%rdi), %xmm2
-; CHECK-NEXT: vxorpd %xmm3, %xmm3, %xmm3
-; CHECK-NEXT: vcmpeqpd %xmm3, %xmm1, %k1
-; CHECK-NEXT: vshufpd $1, 48(%rdi), %xmm2, %xmm0 {%k1} # xmm0 {%k1} = xmm2[1],mem[0]
+; CHECK-NEXT: vmovsd 8(%rdi), %xmm2 # xmm2 = mem[0],zero
+; CHECK-NEXT: vmovsd 48(%rdi), %xmm3 # xmm3 = mem[0],zero
+; CHECK-NEXT: vxorpd %xmm4, %xmm4, %xmm4
+; CHECK-NEXT: vcmpeqpd %xmm4, %xmm1, %k1
+; CHECK-NEXT: vunpcklpd {{.*#+}} xmm0 {%k1} = xmm2[0],xmm3[0]
 ; CHECK-NEXT: retq
 %vec = load <8 x double>, ptr %vp
 %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <2 x i32>
@@ -4693,10 +4790,11 @@ define <2 x double> @test_masked_8xdouble_to_2xdouble_perm_mem_mask0(ptr %vp, <2
 define <2 x double> @test_masked_z_8xdouble_to_2xdouble_perm_mem_mask0(ptr %vp, <2 x double> %mask) {
 ; CHECK-LABEL: test_masked_z_8xdouble_to_2xdouble_perm_mem_mask0:
 ; CHECK: # %bb.0:
-; CHECK-NEXT: vmovapd (%rdi), %xmm1
-; CHECK-NEXT: vxorpd %xmm2, %xmm2, %xmm2
-; CHECK-NEXT: vcmpeqpd %xmm2, %xmm0, %k1
-; CHECK-NEXT: vshufpd $1, 48(%rdi), %xmm1, %xmm0 {%k1} {z} # xmm0 {%k1} {z} = xmm1[1],mem[0]
+; CHECK-NEXT: vmovsd 8(%rdi), %xmm1 # xmm1 = mem[0],zero
+; CHECK-NEXT: vmovsd 48(%rdi), %xmm2 # xmm2 = mem[0],zero
+; CHECK-NEXT: vxorpd %xmm3, %xmm3, %xmm3
+; CHECK-NEXT: vcmpeqpd %xmm3, %xmm0, %k1
+; CHECK-NEXT: vunpcklpd {{.*#+}} xmm0 {%k1} {z} = xmm1[0],xmm2[0]
 ; CHECK-NEXT: retq
 %vec = load <8 x double>, ptr %vp
 %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <2 x i32>
@@ -4711,7 +4809,7 @@ define <2 x double> @test_masked_8xdouble_to_2xdouble_perm_mem_mask1(ptr %vp, <2
 ; CHECK-NEXT: vmovddup 8(%rdi), %xmm2 # xmm2 = mem[0,0]
 ; CHECK-NEXT: vxorpd %xmm3, %xmm3, %xmm3
 ; CHECK-NEXT: vcmpeqpd %xmm3, %xmm1, %k1
-; CHECK-NEXT: vunpcklpd 32(%rdi), %xmm2, %xmm0 {%k1} # xmm0 {%k1} = xmm2[0],mem[0]
+; CHECK-NEXT: vunpcklpd 32(%rdi){1to2}, %xmm2, %xmm0 {%k1}
 ; CHECK-NEXT: retq
 %vec = load <8 x double>, ptr %vp
 %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <2 x i32>
@@ -4726,7 +4824,7 @@ define <2 x double> @test_masked_z_8xdouble_to_2xdouble_perm_mem_mask1(ptr %vp,
 ; CHECK-NEXT: vmovddup 8(%rdi), %xmm1 # xmm1 = mem[0,0]
 ; CHECK-NEXT: vxorpd %xmm2, %xmm2, %xmm2
 ; CHECK-NEXT: vcmpeqpd %xmm2, %xmm0, %k1
-; CHECK-NEXT: vunpcklpd 32(%rdi), %xmm1, %xmm0 {%k1} {z} # xmm0 {%k1} {z} = xmm1[0],mem[0]
+; CHECK-NEXT: vunpcklpd 32(%rdi){1to2}, %xmm1, %xmm0 {%k1} {z}
 ; CHECK-NEXT: retq
 %vec = load <8 x double>, ptr %vp
 %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <2 x i32>
diff --git a/llvm/test/CodeGen/X86/avx512-shuffles/shuffle-interleave.ll b/llvm/test/CodeGen/X86/avx512-shuffles/shuffle-interleave.ll
index b7b1212e76722..0df466cdab5bb 100644
--- a/llvm/test/CodeGen/X86/avx512-shuffles/shuffle-interleave.ll
+++ b/llvm/test/CodeGen/X86/avx512-shuffles/shuffle-interleave.ll
@@ -826,7 +826,8 @@ define <2 x double> @test_2xdouble_zero_masked_shuff_mask1(<2 x double> %vec1, <
 define <2 x double> @test_2xdouble_shuff_mem_mask0(<2 x double> %vec1, ptr %vec2p) {
 ; CHECK-LABEL: test_2xdouble_shuff_mem_mask0:
 ; CHECK: # %bb.0:
-; CHECK-NEXT: vshufpd {{.*#+}} xmm0 = xmm0[1],mem[0]
+; CHECK-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero
+; CHECK-NEXT: vshufpd {{.*#+}} xmm0 = xmm0[1],xmm1[0]
 ; CHECK-NEXT: retq
 %vec2 = load <2 x double>, ptr %vec2p
 %res = shufflevector <2 x double> %vec1, <2 x double> %vec2, <2 x i32>
@@ -835,9 +836,10 @@ define <2 x double> @test_2xdouble_shuff_mem_mask0(<2 x double> %vec1, ptr %vec2
 define <2 x double> @test_2xdouble_masked_shuff_mem_mask0(<2 x double> %vec1, ptr %vec2p, <2 x double> %vec3, <2 x double> %mask) {
 ; CHECK-LABEL: test_2xdouble_masked_shuff_mem_mask0:
 ; CHECK: # %bb.0:
-; CHECK-NEXT: vxorpd %xmm3, %xmm3, %xmm3
-; CHECK-NEXT: vcmpeqpd %xmm3, %xmm2, %k1
-; CHECK-NEXT: vshufpd {{.*#+}} xmm1 {%k1} = xmm0[1],mem[0]
+; CHECK-NEXT: vmovsd {{.*#+}} xmm3 = mem[0],zero
+; CHECK-NEXT: vxorpd %xmm4, %xmm4, %xmm4
+; CHECK-NEXT: vcmpeqpd %xmm4, %xmm2, %k1
+; CHECK-NEXT: vshufpd {{.*#+}} xmm1 {%k1} = xmm0[1],xmm3[0]
 ; CHECK-NEXT: vmovapd %xmm1, %xmm0
 ; CHECK-NEXT: retq
 %vec2 = load <2 x double>, ptr %vec2p
@@ -850,9 +852,10 @@ define <2 x double> @test_2xdouble_masked_shuff_mem_mask0(<2 x double> %vec1, pt
 define <2 x double> @test_2xdouble_zero_masked_shuff_mem_mask0(<2 x double> %vec1, ptr %vec2p, <2 x double> %mask) {
 ; CHECK-LABEL: test_2xdouble_zero_masked_shuff_mem_mask0:
 ; CHECK: # %bb.0:
-; CHECK-NEXT: vxorpd %xmm2, %xmm2, %xmm2
-; CHECK-NEXT: vcmpeqpd %xmm2, %xmm1, %k1
-; CHECK-NEXT: vshufpd {{.*#+}} xmm0 {%k1} {z} = xmm0[1],mem[0]
+; CHECK-NEXT: vmovsd {{.*#+}} xmm2 = mem[0],zero
+; CHECK-NEXT: vxorpd %xmm3, %xmm3, %xmm3
+; CHECK-NEXT: vcmpeqpd %xmm3, %xmm1, %k1
+; CHECK-NEXT: vshufpd {{.*#+}} xmm0 {%k1} {z} = xmm0[1],xmm2[0]
 ; CHECK-NEXT: retq
 %vec2 = load <2 x double>, ptr %vec2p
 %shuf = shufflevector <2 x double> %vec1, <2 x double> %vec2, <2 x i32>
@@ -864,9 +867,10 @@ define <2 x double> @test_2xdouble_zero_masked_shuff_mem_mask0(<2 x double> %vec
 define <2 x double> @test_2xdouble_masked_shuff_mem_mask1(<2 x double> %vec1, ptr %vec2p, <2 x double> %vec3, <2 x double> %mask) {
 ; CHECK-LABEL: test_2xdouble_masked_shuff_mem_mask1:
 ; CHECK: # %bb.0:
-; CHECK-NEXT: vxorpd %xmm3, %xmm3, %xmm3
-; CHECK-NEXT: vcmpeqpd %xmm3, %xmm2, %k1
-; CHECK-NEXT: vshufpd {{.*#+}} xmm1 {%k1} = xmm0[1],mem[0]
+; CHECK-NEXT: vmovsd {{.*#+}} xmm3 = mem[0],zero
+; CHECK-NEXT: vxorpd %xmm4, %xmm4, %xmm4
+; CHECK-NEXT: vcmpeqpd %xmm4, %xmm2, %k1
+; CHECK-NEXT: vshufpd {{.*#+}} xmm1 {%k1} = xmm0[1],xmm3[0]
 ; CHECK-NEXT: vmovapd %xmm1, %xmm0
 ; CHECK-NEXT: retq
 %vec2 = load <2 x double>, ptr %vec2p
@@ -879,9 +883,10 @@ define <2 x double> @test_2xdouble_masked_shuff_mem_mask1(<2 x double> %vec1, pt
 define <2 x double> @test_2xdouble_zero_masked_shuff_mem_mask1(<2 x double> %vec1, ptr %vec2p, <2 x double> %mask) {
 ; CHECK-LABEL: test_2xdouble_zero_masked_shuff_mem_mask1:
 ; CHECK: # %bb.0:
-; CHECK-NEXT: vxorpd %xmm2, %xmm2, %xmm2
-; CHECK-NEXT: vcmpeqpd %xmm2, %xmm1, %k1
-; CHECK-NEXT: vshufpd {{.*#+}} xmm0 {%k1} {z} = xmm0[1],mem[0]
+; CHECK-NEXT: vmovsd {{.*#+}} xmm2 = mem[0],zero
+; CHECK-NEXT: vxorpd %xmm3, %xmm3, %xmm3
+; CHECK-NEXT: vcmpeqpd %xmm3, %xmm1, %k1
+; CHECK-NEXT: vshufpd {{.*#+}} xmm0 {%k1} {z} = xmm0[1],xmm2[0]
 ; CHECK-NEXT: retq
 %vec2 = load <2 x double>, ptr %vec2p
 %shuf = shufflevector <2 x double> %vec1, <2 x double> %vec2, <2 x i32>
diff --git a/llvm/test/CodeGen/X86/avx512-shuffles/unpack.ll b/llvm/test/CodeGen/X86/avx512-shuffles/unpack.ll
index d0b183dfeae6e..73e2c7e564c75 100644
--- a/llvm/test/CodeGen/X86/avx512-shuffles/unpack.ll
+++ b/llvm/test/CodeGen/X86/avx512-shuffles/unpack.ll
@@ -826,7 +826,7 @@ define <2 x double> @test_2xdouble_zero_masked_unpack_low_mask1(<2 x double> %ve
 define <2 x double> @test_2xdouble_unpack_low_mem_mask0(<2 x double> %vec1, ptr %vec2p) {
 ; CHECK-LABEL: test_2xdouble_unpack_low_mem_mask0:
 ; CHECK: # %bb.0:
-; CHECK-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0],mem[0]
+; CHECK-NEXT: vmovhps {{.*#+}} xmm0 = xmm0[0,1],mem[0,1]
 ; CHECK-NEXT: retq
 %vec2 = load <2 x double>, ptr %vec2p
 %res = shufflevector <2 x double> %vec1, <2 x double> %vec2, <2 x i32>
@@ -835,9 +835,10 @@ define <2 x double> @test_2xdouble_unpack_low_mem_mask0(<2 x double> %vec1, ptr
 define <2 x double> @test_2xdouble_masked_unpack_low_mem_mask0(<2 x double> %vec1, ptr %vec2p, <2 x double> %vec3, <2 x double> %mask) {
 ; CHECK-LABEL: test_2xdouble_masked_unpack_low_mem_mask0:
 ; CHECK: # %bb.0:
-; CHECK-NEXT: vxorpd %xmm3, %xmm3, %xmm3
-; CHECK-NEXT: vcmpeqpd %xmm3, %xmm2, %k1
-; CHECK-NEXT: vunpcklpd {{.*#+}} xmm1 {%k1} = xmm0[0],mem[0]
+; CHECK-NEXT: vmovsd {{.*#+}} xmm3 = mem[0],zero
+; CHECK-NEXT: vxorpd %xmm4, %xmm4, %xmm4
+; CHECK-NEXT: vcmpeqpd %xmm4, %xmm2, %k1
+; CHECK-NEXT: vunpcklpd {{.*#+}} xmm1 {%k1} = xmm0[0],xmm3[0]
 ; CHECK-NEXT: vmovapd %xmm1, %xmm0
 ; CHECK-NEXT: retq
 %vec2 = load <2 x double>, ptr %vec2p
@@ -850,9 +851,10 @@ define <2 x double> @test_2xdouble_masked_unpack_low_mem_mask0(<2 x double> %vec
 define <2 x double> @test_2xdouble_zero_masked_unpack_low_mem_mask0(<2 x double> %vec1, ptr %vec2p, <2 x double> %mask) {
 ; CHECK-LABEL: test_2xdouble_zero_masked_unpack_low_mem_mask0:
 ; CHECK: # %bb.0:
-; CHECK-NEXT: vxorpd %xmm2, %xmm2, %xmm2
-; CHECK-NEXT: vcmpeqpd %xmm2, %xmm1, %k1
-; CHECK-NEXT: vunpcklpd {{.*#+}} xmm0 {%k1} {z} = xmm0[0],mem[0]
+; CHECK-NEXT: vmovsd {{.*#+}} xmm2 = mem[0],zero
+; CHECK-NEXT: vxorpd %xmm3, %xmm3, %xmm3
+; CHECK-NEXT: vcmpeqpd %xmm3, %xmm1, %k1
+; CHECK-NEXT: vunpcklpd {{.*#+}} xmm0 {%k1} {z} = xmm0[0],xmm2[0]
 ; CHECK-NEXT: retq
 %vec2 = load <2 x double>, ptr %vec2p
 %shuf = shufflevector <2 x double> %vec1, <2 x double> %vec2, <2 x i32>
@@ -864,9 +866,10 @@ define <2 x double> @test_2xdouble_zero_masked_unpack_low_mem_mask0(<2 x double>
 define <2 x double> @test_2xdouble_masked_unpack_low_mem_mask1(<2 x double> %vec1, ptr %vec2p, <2 x double> %vec3, <2 x double> %mask) {
 ; CHECK-LABEL: test_2xdouble_masked_unpack_low_mem_mask1:
 ; CHECK: # %bb.0:
-; CHECK-NEXT: vxorpd %xmm3, %xmm3, %xmm3
-; CHECK-NEXT: vcmpeqpd %xmm3, %xmm2, %k1
-; CHECK-NEXT: vunpcklpd {{.*#+}} xmm1 {%k1} = xmm0[0],mem[0]
+; CHECK-NEXT: vmovsd {{.*#+}} xmm3 = mem[0],zero
+; CHECK-NEXT: vxorpd %xmm4, %xmm4, %xmm4
+; CHECK-NEXT: vcmpeqpd %xmm4, %xmm2, %k1
+; CHECK-NEXT: vunpcklpd {{.*#+}} xmm1 {%k1} = xmm0[0],xmm3[0]
 ; CHECK-NEXT: vmovapd %xmm1, %xmm0
 ; CHECK-NEXT: retq
 %vec2 = load <2 x double>, ptr %vec2p
@@ -879,9 +882,10 @@ define <2 x double> @test_2xdouble_masked_unpack_low_mem_mask1(<2 x double> %vec
 define <2 x double> @test_2xdouble_zero_masked_unpack_low_mem_mask1(<2 x double> %vec1, ptr %vec2p, <2 x double> %mask) {
 ; CHECK-LABEL: test_2xdouble_zero_masked_unpack_low_mem_mask1:
 ; CHECK: # %bb.0:
-; CHECK-NEXT: vxorpd %xmm2, %xmm2, %xmm2
-; CHECK-NEXT: vcmpeqpd %xmm2, %xmm1, %k1
-; CHECK-NEXT: vunpcklpd {{.*#+}} xmm0 {%k1} {z} = xmm0[0],mem[0]
+; CHECK-NEXT: vmovsd {{.*#+}} xmm2 = mem[0],zero
+; CHECK-NEXT: vxorpd %xmm3, %xmm3, %xmm3
+; CHECK-NEXT: vcmpeqpd %xmm3, %xmm1, %k1
+; CHECK-NEXT: vunpcklpd {{.*#+}} xmm0 {%k1} {z} = xmm0[0],xmm2[0]
 ; CHECK-NEXT: retq
 %vec2 = load <2 x double>, ptr %vec2p
 %shuf = shufflevector <2 x double> %vec1, <2 x double> %vec2, <2 x i32>
@@ -2223,7 +2227,7 @@ define <2 x double> @test_2xdouble_zero_masked_unpack_high_mask1(<2 x double> %v
 define <2 x double> @test_2xdouble_unpack_high_mem_mask0(<2 x double> %vec1, ptr %vec2p) {
 ; CHECK-LABEL: test_2xdouble_unpack_high_mem_mask0:
 ; CHECK: # %bb.0:
-; CHECK-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],mem[1]
+; CHECK-NEXT: vunpckhpd 8(%rdi){1to2}, %xmm0, %xmm0
 ; CHECK-NEXT: retq
 %vec2 = load <2 x double>, ptr %vec2p
 %res = shufflevector <2 x double> %vec1, <2 x double> %vec2, <2 x i32>
@@ -2234,7 +2238,7 @@ define <2 x double> @test_2xdouble_masked_unpack_high_mem_mask0(<2 x double> %ve
 ; CHECK: # %bb.0:
 ; CHECK-NEXT: vxorpd %xmm3, %xmm3, %xmm3
 ; CHECK-NEXT: vcmpeqpd %xmm3, %xmm2, %k1
-; CHECK-NEXT: vunpckhpd {{.*#+}} xmm1 {%k1} = xmm0[1],mem[1]
+; CHECK-NEXT: vunpckhpd 8(%rdi){1to2}, %xmm0, %xmm1 {%k1}
 ; CHECK-NEXT: vmovapd %xmm1, %xmm0
 ; CHECK-NEXT: retq
 %vec2 = load <2 x double>, ptr %vec2p
@@ -2249,7 +2253,7 @@ define <2 x double> @test_2xdouble_zero_masked_unpack_high_mem_mask0(<2 x double
 ; CHECK: # %bb.0:
 ; CHECK-NEXT: vxorpd %xmm2, %xmm2, %xmm2
 ; CHECK-NEXT: vcmpeqpd %xmm2, %xmm1, %k1
-; CHECK-NEXT: vunpckhpd {{.*#+}} xmm0 {%k1} {z} = xmm0[1],mem[1]
+; CHECK-NEXT: vunpckhpd 8(%rdi){1to2}, %xmm0, %xmm0 {%k1} {z}
 ; CHECK-NEXT: retq
 %vec2 = load <2 x double>, ptr %vec2p
 %shuf = shufflevector <2 x double> %vec1, <2 x double> %vec2, <2 x i32>
@@ -2263,7 +2267,7 @@ define <2 x double> @test_2xdouble_masked_unpack_high_mem_mask1(<2 x double> %ve
 ; CHECK: # %bb.0:
 ; CHECK-NEXT: vxorpd %xmm3, %xmm3, %xmm3
 ; CHECK-NEXT: vcmpeqpd %xmm3, %xmm2, %k1
-; CHECK-NEXT: vunpckhpd {{.*#+}} xmm1 {%k1} = xmm0[1],mem[1]
+; CHECK-NEXT: vunpckhpd 8(%rdi){1to2}, %xmm0, %xmm1 {%k1}
 ; CHECK-NEXT: vmovapd %xmm1, %xmm0
 ; CHECK-NEXT: retq
 %vec2 = load <2 x double>, ptr %vec2p
@@ -2278,7 +2282,7 @@ define <2 x double> @test_2xdouble_zero_masked_unpack_high_mem_mask1(<2 x double
 ; CHECK: # %bb.0:
 ; CHECK-NEXT: vxorpd %xmm2, %xmm2, %xmm2
 ; CHECK-NEXT: vcmpeqpd %xmm2, %xmm1, %k1
-; CHECK-NEXT: vunpckhpd {{.*#+}} xmm0 {%k1} {z} = xmm0[1],mem[1]
+; CHECK-NEXT: vunpckhpd 8(%rdi){1to2}, %xmm0, %xmm0 {%k1} {z}
 ; CHECK-NEXT: retq
 %vec2 = load <2 x double>, ptr %vec2p
 %shuf = shufflevector <2 x double> %vec1, <2 x double> %vec2, <2 x i32>
diff
--git a/llvm/test/CodeGen/X86/avx512fp16-mov.ll b/llvm/test/CodeGen/X86/avx512fp16-mov.ll index f4eb5b952ae43..f93a1fff51f39 100644 --- a/llvm/test/CodeGen/X86/avx512fp16-mov.ll +++ b/llvm/test/CodeGen/X86/avx512fp16-mov.ll @@ -1938,7 +1938,8 @@ define <4 x float> @regression2(ptr addrspace(1) %0, <4 x i32> %1, <4 x i32> %2, ; X64-NEXT: vmovw (%rsi), %xmm0 ; X64-NEXT: vpmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero ; X64-NEXT: vcvtdq2ps %xmm0, %xmm0 -; X64-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],mem[2,3] +; X64-NEXT: vmovddup {{.*#+}} xmm1 = mem[0,0] +; X64-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3] ; X64-NEXT: vmulps {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm0, %xmm0 ; X64-NEXT: retq ; @@ -1948,7 +1949,8 @@ define <4 x float> @regression2(ptr addrspace(1) %0, <4 x i32> %1, <4 x i32> %2, ; X86-NEXT: vmovw (%eax), %xmm0 ; X86-NEXT: vpmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero ; X86-NEXT: vcvtdq2ps %xmm0, %xmm0 -; X86-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],mem[2,3] +; X86-NEXT: vmovddup {{.*#+}} xmm1 = mem[0,0] +; X86-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3] ; X86-NEXT: vmulps {{\.?LCPI[0-9]+_[0-9]+}}{1to4}, %xmm0, %xmm0 ; X86-NEXT: retl %6 = load i8, ptr %4, align 1 @@ -2111,7 +2113,7 @@ define <16 x i32> @pr52561(<16 x i32> %a, <16 x i32> %b) "min-legal-vector-width ; X86-NEXT: andl $-32, %esp ; X86-NEXT: subl $32, %esp ; X86-NEXT: vpaddd %ymm2, %ymm0, %ymm0 -; X86-NEXT: vpaddd 8(%ebp), %ymm1, %ymm1 +; X86-NEXT: vpaddd 36(%ebp){1to8}, %ymm1, %ymm1 ; X86-NEXT: vpbroadcastd {{.*#+}} ymm2 = [112,112,112,112,112,112,112,112] ; X86-NEXT: vpaddd %ymm2, %ymm0, %ymm0 ; X86-NEXT: vpaddd %ymm2, %ymm1, %ymm1 diff --git a/llvm/test/CodeGen/X86/bitreverse.ll b/llvm/test/CodeGen/X86/bitreverse.ll index d92e1a1e7b9d4..be02a6071ab58 100644 --- a/llvm/test/CodeGen/X86/bitreverse.ll +++ b/llvm/test/CodeGen/X86/bitreverse.ll @@ -467,8 +467,9 @@ define i8 @test_bitreverse_i8(i8 %a) { ; ; X86XOP-LABEL: test_bitreverse_i8: ; X86XOP: # %bb.0: -; X86XOP-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero -; X86XOP-NEXT: vpperm {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0, %xmm0, %xmm0 +; X86XOP-NEXT: vpinsrb $0, {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0, %xmm0 +; X86XOP-NEXT: vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero +; X86XOP-NEXT: vpperm %xmm0, %xmm1, %xmm0, %xmm0 ; X86XOP-NEXT: vmovd %xmm0, %eax ; X86XOP-NEXT: # kill: def $al killed $al killed $eax ; X86XOP-NEXT: retl @@ -533,8 +534,9 @@ define i4 @test_bitreverse_i4(i4 %a) { ; ; X86XOP-LABEL: test_bitreverse_i4: ; X86XOP: # %bb.0: -; X86XOP-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero -; X86XOP-NEXT: vpperm {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0, %xmm0, %xmm0 +; X86XOP-NEXT: vpinsrb $0, {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0, %xmm0 +; X86XOP-NEXT: vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero +; X86XOP-NEXT: vpperm %xmm0, %xmm1, %xmm0, %xmm0 ; X86XOP-NEXT: vmovd %xmm0, %eax ; X86XOP-NEXT: shrb $4, %al ; X86XOP-NEXT: # kill: def $al killed $al killed $eax diff --git a/llvm/test/CodeGen/X86/buildvec-insertvec.ll b/llvm/test/CodeGen/X86/buildvec-insertvec.ll index 4b0e5441b4abf..dd62183904c88 100644 --- a/llvm/test/CodeGen/X86/buildvec-insertvec.ll +++ b/llvm/test/CodeGen/X86/buildvec-insertvec.ll @@ -77,19 +77,15 @@ entry: ; FIXME: This could be 'movhpd {{.*#+}} xmm0 = xmm0[0],mem[0]'. 
define <2 x double> @test_negative_zero_2(<2 x double> %A) { -; SSE2-LABEL: test_negative_zero_2: -; SSE2: # %bb.0: # %entry -; SSE2-NEXT: shufpd {{.*#+}} xmm0 = xmm0[0],mem[1] -; SSE2-NEXT: retq -; -; SSE41-LABEL: test_negative_zero_2: -; SSE41: # %bb.0: # %entry -; SSE41-NEXT: blendps {{.*#+}} xmm0 = xmm0[0,1],mem[2,3] -; SSE41-NEXT: retq +; SSE-LABEL: test_negative_zero_2: +; SSE: # %bb.0: # %entry +; SSE-NEXT: movhps {{.*#+}} xmm0 = xmm0[0,1],mem[0,1] +; SSE-NEXT: retq ; ; AVX-LABEL: test_negative_zero_2: ; AVX: # %bb.0: # %entry -; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],mem[2,3] +; AVX-NEXT: vmovddup {{.*#+}} xmm1 = mem[0,0] +; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3] ; AVX-NEXT: retq entry: %0 = extractelement <2 x double> %A, i32 0 diff --git a/llvm/test/CodeGen/X86/combine-fabs.ll b/llvm/test/CodeGen/X86/combine-fabs.ll index 7aa6628cb7f39..1f2c0ee0c83d9 100644 --- a/llvm/test/CodeGen/X86/combine-fabs.ll +++ b/llvm/test/CodeGen/X86/combine-fabs.ll @@ -40,7 +40,8 @@ define <4 x float> @combine_vec_fabs_constant() { define float @combine_fabs_fabs(float %a) { ; SSE-LABEL: combine_fabs_fabs: ; SSE: # %bb.0: -; SSE-NEXT: andps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; SSE-NEXT: movss {{.*#+}} xmm1 = [NaN,0.0E+0,0.0E+0,0.0E+0] +; SSE-NEXT: andps %xmm1, %xmm0 ; SSE-NEXT: retq ; ; AVX-LABEL: combine_fabs_fabs: @@ -73,7 +74,8 @@ define <4 x float> @combine_vec_fabs_fabs(<4 x float> %a) { define float @combine_fabs_fneg(float %a) { ; SSE-LABEL: combine_fabs_fneg: ; SSE: # %bb.0: -; SSE-NEXT: andps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; SSE-NEXT: movss {{.*#+}} xmm1 = [NaN,0.0E+0,0.0E+0,0.0E+0] +; SSE-NEXT: andps %xmm1, %xmm0 ; SSE-NEXT: retq ; ; AVX-LABEL: combine_fabs_fneg: @@ -106,7 +108,8 @@ define <4 x float> @combine_vec_fabs_fneg(<4 x float> %a) { define float @combine_fabs_fcopysign(float %a, float %b) { ; SSE-LABEL: combine_fabs_fcopysign: ; SSE: # %bb.0: -; SSE-NEXT: andps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; SSE-NEXT: movss {{.*#+}} xmm1 = [NaN,0.0E+0,0.0E+0,0.0E+0] +; SSE-NEXT: andps %xmm1, %xmm0 ; SSE-NEXT: retq ; ; AVX-LABEL: combine_fabs_fcopysign: diff --git a/llvm/test/CodeGen/X86/combine-sdiv.ll b/llvm/test/CodeGen/X86/combine-sdiv.ll index 42f09d04da26e..8a502eebf0d4e 100644 --- a/llvm/test/CodeGen/X86/combine-sdiv.ll +++ b/llvm/test/CodeGen/X86/combine-sdiv.ll @@ -1609,10 +1609,12 @@ define <2 x i64> @combine_vec_sdiv_by_pow2b_v2i64(<2 x i64> %x) { ; ; XOP-LABEL: combine_vec_sdiv_by_pow2b_v2i64: ; XOP: # %bb.0: -; XOP-NEXT: vpshaq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 +; XOP-NEXT: vpinsrq $1, {{\.?LCPI[0-9]+_[0-9]+}}+8(%rip), %xmm0, %xmm1 +; XOP-NEXT: vpshaq %xmm1, %xmm0, %xmm1 ; XOP-NEXT: vpsrlq $62, %xmm1, %xmm1 ; XOP-NEXT: vpaddq %xmm1, %xmm0, %xmm1 -; XOP-NEXT: vpshaq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 +; XOP-NEXT: vpinsrq $1, {{\.?LCPI[0-9]+_[0-9]+}}+8(%rip), %xmm0, %xmm2 +; XOP-NEXT: vpshaq %xmm2, %xmm1, %xmm1 ; XOP-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4,5,6,7] ; XOP-NEXT: retq %1 = sdiv <2 x i64> %x, @@ -1739,7 +1741,8 @@ define <4 x i64> @combine_vec_sdiv_by_pow2b_v4i64(<4 x i64> %x) { ; XOP-NEXT: vpshaq %xmm1, %xmm0, %xmm2 ; XOP-NEXT: vpsrlq $62, %xmm2, %xmm2 ; XOP-NEXT: vpaddq %xmm2, %xmm0, %xmm2 -; XOP-NEXT: vpshaq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2 +; XOP-NEXT: vpinsrq $1, {{\.?LCPI[0-9]+_[0-9]+}}+8(%rip), %xmm0, %xmm3 +; XOP-NEXT: vpshaq %xmm3, %xmm2, %xmm2 ; XOP-NEXT: vextractf128 $1, %ymm0, %xmm3 ; XOP-NEXT: vpshaq %xmm1, %xmm3, %xmm1 ; XOP-NEXT: vpshlq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, 
%xmm1 @@ -3051,7 +3054,8 @@ define <16 x i8> @pr38658(<16 x i8> %x) { ; XOP-NEXT: vpperm {{.*#+}} xmm1 = xmm1[1,3,5,7,9,11,13,15],xmm2[1,3,5,7,9,11,13,15] ; XOP-NEXT: vpaddb %xmm0, %xmm1, %xmm0 ; XOP-NEXT: vpshab {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 -; XOP-NEXT: vpshlb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; XOP-NEXT: vpinsrb $15, {{\.?LCPI[0-9]+_[0-9]+}}+15(%rip), %xmm0, %xmm2 +; XOP-NEXT: vpshlb %xmm2, %xmm0, %xmm0 ; XOP-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 ; XOP-NEXT: vpaddb %xmm0, %xmm1, %xmm0 ; XOP-NEXT: retq diff --git a/llvm/test/CodeGen/X86/combine-udiv.ll b/llvm/test/CodeGen/X86/combine-udiv.ll index 55715197830b1..58675f5156553 100644 --- a/llvm/test/CodeGen/X86/combine-udiv.ll +++ b/llvm/test/CodeGen/X86/combine-udiv.ll @@ -631,8 +631,12 @@ define <16 x i8> @combine_vec_udiv_nonuniform4(<16 x i8> %x) { ; SSE2-NEXT: pxor %xmm3, %xmm3 ; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3],xmm0[4],xmm3[4],xmm0[5],xmm3[5],xmm0[6],xmm3[6],xmm0[7],xmm3[7] ; SSE2-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; SSE2-NEXT: psrlw $15, %xmm0 -; SSE2-NEXT: pandn %xmm0, %xmm1 +; SSE2-NEXT: psrlw $8, %xmm0 +; SSE2-NEXT: packuswb %xmm0, %xmm0 +; SSE2-NEXT: psrlw $7, %xmm0 +; SSE2-NEXT: movd {{.*#+}} xmm3 = [1,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0] +; SSE2-NEXT: pand %xmm0, %xmm3 +; SSE2-NEXT: pandn %xmm3, %xmm1 ; SSE2-NEXT: por %xmm2, %xmm1 ; SSE2-NEXT: movdqa %xmm1, %xmm0 ; SSE2-NEXT: retq diff --git a/llvm/test/CodeGen/X86/commute-blend-avx2.ll b/llvm/test/CodeGen/X86/commute-blend-avx2.ll index 75511104580e9..2ae06f1ab43c0 100644 --- a/llvm/test/CodeGen/X86/commute-blend-avx2.ll +++ b/llvm/test/CodeGen/X86/commute-blend-avx2.ll @@ -70,7 +70,8 @@ declare <8 x float> @llvm.x86.avx.blend.ps.256(<8 x float>, <8 x float>, i8) nou define <2 x double> @commute_fold_vblendpd_128(<2 x double> %a, ptr %b) #0 { ; CHECK-LABEL: commute_fold_vblendpd_128: ; CHECK: # %bb.0: -; CHECK-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],mem[2,3] +; CHECK-NEXT: vmovddup {{.*#+}} xmm1 = mem[0,0] +; CHECK-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3] ; CHECK-NEXT: retq %1 = load <2 x double>, ptr %b %2 = call <2 x double> @llvm.x86.sse41.blendpd(<2 x double> %1, <2 x double> %a, i8 1) @@ -81,7 +82,8 @@ declare <2 x double> @llvm.x86.sse41.blendpd(<2 x double>, <2 x double>, i8) nou define <4 x double> @commute_fold_vblendpd_256(<4 x double> %a, ptr %b) #0 { ; CHECK-LABEL: commute_fold_vblendpd_256: ; CHECK: # %bb.0: -; CHECK-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],mem[6,7] +; CHECK-NEXT: vbroadcastsd 24(%rdi), %ymm1 +; CHECK-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] ; CHECK-NEXT: retq %1 = load <4 x double>, ptr %b %2 = call <4 x double> @llvm.x86.avx.blend.pd.256(<4 x double> %1, <4 x double> %a, i8 7) diff --git a/llvm/test/CodeGen/X86/commute-blend-sse41.ll b/llvm/test/CodeGen/X86/commute-blend-sse41.ll index 07d6a8ba22bb1..aa6a536755842 100644 --- a/llvm/test/CodeGen/X86/commute-blend-sse41.ll +++ b/llvm/test/CodeGen/X86/commute-blend-sse41.ll @@ -26,7 +26,7 @@ declare <4 x float> @llvm.x86.sse41.blendps(<4 x float>, <4 x float>, i8) nounwi define <2 x double> @commute_fold_blendpd(<2 x double> %a, ptr %b) { ; CHECK-LABEL: commute_fold_blendpd: ; CHECK: # %bb.0: -; CHECK-NEXT: blendps {{.*#+}} xmm0 = xmm0[0,1],mem[2,3] +; CHECK-NEXT: movhps {{.*#+}} xmm0 = xmm0[0,1],mem[0,1] ; CHECK-NEXT: retq %1 = load <2 x double>, ptr %b %2 = call <2 x double> @llvm.x86.sse41.blendpd(<2 x double> %1, <2 x double> %a, i8 
1) @@ -54,11 +54,11 @@ define <4 x i32> @commute_fold_blend_v4i32(ptr %a, <4 x i32> %b) { define void @baz(ptr %arg, ptr %arg1) optsize { ; CHECK-LABEL: baz: ; CHECK: # %bb.0: # %bb -; CHECK-NEXT: movaps (%rdi), %xmm0 -; CHECK-NEXT: movaps {{.*#+}} xmm1 = [3,3] -; CHECK-NEXT: andps %xmm0, %xmm1 -; CHECK-NEXT: blendps {{.*#+}} xmm1 = xmm0[0,1],xmm1[2,3] -; CHECK-NEXT: movups %xmm1, (%rsi) +; CHECK-NEXT: movdqa (%rdi), %xmm0 +; CHECK-NEXT: pinsrq $1, {{\.?LCPI[0-9]+_[0-9]+}}+8(%rip), %xmm1 +; CHECK-NEXT: pand %xmm0, %xmm1 +; CHECK-NEXT: pblendw {{.*#+}} xmm1 = xmm0[0,1,2,3],xmm1[4,5,6,7] +; CHECK-NEXT: movdqu %xmm1, (%rsi) ; CHECK-NEXT: retq bb: %tmp = load <2 x i64>, ptr %arg, align 16 diff --git a/llvm/test/CodeGen/X86/copysign-constant-magnitude.ll b/llvm/test/CodeGen/X86/copysign-constant-magnitude.ll index 0052359eedb50..0d25e85e2042f 100644 --- a/llvm/test/CodeGen/X86/copysign-constant-magnitude.ll +++ b/llvm/test/CodeGen/X86/copysign-constant-magnitude.ll @@ -13,7 +13,8 @@ target datalayout = "e-m:o-i64:64-f80:128-n8:16:32:64-S128" define double @mag_pos0_double(double %x) nounwind { ; CHECK-LABEL: mag_pos0_double: ; CHECK: ## %bb.0: -; CHECK-NEXT: andps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; CHECK-NEXT: movsd {{.*#+}} xmm1 = [-0.0E+0,0.0E+0] +; CHECK-NEXT: andps %xmm1, %xmm0 ; CHECK-NEXT: retq %y = call double @copysign(double 0.0, double %x) ret double %y @@ -25,7 +26,8 @@ define double @mag_pos0_double(double %x) nounwind { define double @mag_neg0_double(double %x) nounwind { ; CHECK-LABEL: mag_neg0_double: ; CHECK: ## %bb.0: -; CHECK-NEXT: andps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; CHECK-NEXT: movsd {{.*#+}} xmm1 = [-0.0E+0,0.0E+0] +; CHECK-NEXT: andps %xmm1, %xmm0 ; CHECK-NEXT: retq %y = call double @copysign(double -0.0, double %x) ret double %y @@ -41,7 +43,8 @@ define double @mag_pos1_double(double %x) nounwind { ; CHECK-LABEL: mag_pos1_double: ; CHECK: ## %bb.0: ; CHECK-NEXT: andps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; CHECK-NEXT: orps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; CHECK-NEXT: movsd {{.*#+}} xmm1 = [1.0E+0,0.0E+0] +; CHECK-NEXT: orps %xmm1, %xmm0 ; CHECK-NEXT: retq %y = call double @copysign(double 1.0, double %x) ret double %y @@ -58,7 +61,8 @@ define double @mag_neg1_double(double %x) nounwind { ; CHECK-LABEL: mag_neg1_double: ; CHECK: ## %bb.0: ; CHECK-NEXT: andps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; CHECK-NEXT: orps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; CHECK-NEXT: movsd {{.*#+}} xmm1 = [1.0E+0,0.0E+0] +; CHECK-NEXT: orps %xmm1, %xmm0 ; CHECK-NEXT: retq %y = call double @copysign(double -1.0, double %x) ret double %y @@ -73,7 +77,8 @@ define double @mag_neg1_double(double %x) nounwind { define float @mag_pos0_float(float %x) nounwind { ; CHECK-LABEL: mag_pos0_float: ; CHECK: ## %bb.0: -; CHECK-NEXT: andps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; CHECK-NEXT: movss {{.*#+}} xmm1 = [-0.0E+0,0.0E+0,0.0E+0,0.0E+0] +; CHECK-NEXT: andps %xmm1, %xmm0 ; CHECK-NEXT: retq %y = call float @copysignf(float 0.0, float %x) ret float %y @@ -85,7 +90,8 @@ define float @mag_pos0_float(float %x) nounwind { define float @mag_neg0_float(float %x) nounwind { ; CHECK-LABEL: mag_neg0_float: ; CHECK: ## %bb.0: -; CHECK-NEXT: andps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; CHECK-NEXT: movss {{.*#+}} xmm1 = [-0.0E+0,0.0E+0,0.0E+0,0.0E+0] +; CHECK-NEXT: andps %xmm1, %xmm0 ; CHECK-NEXT: retq %y = call float @copysignf(float -0.0, float %x) ret float %y @@ -103,7 +109,8 @@ define float @mag_pos1_float(float %x) nounwind { ; CHECK-LABEL: mag_pos1_float: ; CHECK: ## %bb.0: ; 
CHECK-NEXT: andps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; CHECK-NEXT: orps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; CHECK-NEXT: movss {{.*#+}} xmm1 = [1.0E+0,0.0E+0,0.0E+0,0.0E+0] +; CHECK-NEXT: orps %xmm1, %xmm0 ; CHECK-NEXT: retq %y = call float @copysignf(float 1.0, float %x) ret float %y @@ -124,7 +131,8 @@ define float @mag_neg1_float(float %x) nounwind { ; CHECK-LABEL: mag_neg1_float: ; CHECK: ## %bb.0: ; CHECK-NEXT: andps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; CHECK-NEXT: orps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; CHECK-NEXT: movss {{.*#+}} xmm1 = [1.0E+0,0.0E+0,0.0E+0,0.0E+0] +; CHECK-NEXT: orps %xmm1, %xmm0 ; CHECK-NEXT: retq %y = call float @copysignf(float -1.0, float %x) ret float %y diff --git a/llvm/test/CodeGen/X86/extract-concat.ll b/llvm/test/CodeGen/X86/extract-concat.ll index f12693469a3f6..dba07d80c6cd6 100644 --- a/llvm/test/CodeGen/X86/extract-concat.ll +++ b/llvm/test/CodeGen/X86/extract-concat.ll @@ -153,14 +153,16 @@ define <16 x i64> @load_catcat(ptr %p) { define <4 x i32> @cat_ext_straddle(ptr %px, ptr %py) { ; SSE-LABEL: cat_ext_straddle: ; SSE: # %bb.0: -; SSE-NEXT: movaps 16(%rdi), %xmm0 -; SSE-NEXT: unpcklpd {{.*#+}} xmm0 = xmm0[0],mem[0] +; SSE-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero +; SSE-NEXT: movsd {{.*#+}} xmm1 = mem[0],zero +; SSE-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0] ; SSE-NEXT: retq ; ; AVX-LABEL: cat_ext_straddle: ; AVX: # %bb.0: -; AVX-NEXT: vmovaps 16(%rdi), %xmm0 -; AVX-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0],mem[0] +; AVX-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero +; AVX-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero +; AVX-NEXT: vmovlhps {{.*#+}} xmm0 = xmm1[0],xmm0[0] ; AVX-NEXT: retq %x = load <6 x i32>, ptr %px %y = load <6 x i32>, ptr %py diff --git a/llvm/test/CodeGen/X86/extractelement-fp.ll b/llvm/test/CodeGen/X86/extractelement-fp.ll index 944f6bbfd0bfb..dfc81b3c3fb91 100644 --- a/llvm/test/CodeGen/X86/extractelement-fp.ll +++ b/llvm/test/CodeGen/X86/extractelement-fp.ll @@ -353,9 +353,9 @@ define float @select_fcmp_v4f32(<4 x float> %x, <4 x float> %y, <4 x float> %z, ; X86-NEXT: movl %esp, %ebp ; X86-NEXT: andl $-16, %esp ; X86-NEXT: subl $16, %esp -; X86-NEXT: vmovaps 8(%ebp), %xmm3 ; X86-NEXT: vcmpneq_oqss %xmm1, %xmm0, %xmm0 -; X86-NEXT: vblendvps %xmm0, %xmm2, %xmm3, %xmm0 +; X86-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero +; X86-NEXT: vblendvps %xmm0, %xmm2, %xmm1, %xmm0 ; X86-NEXT: vmovss %xmm0, {{[0-9]+}}(%esp) ; X86-NEXT: flds {{[0-9]+}}(%esp) ; X86-NEXT: movl %ebp, %esp @@ -546,7 +546,8 @@ define float @fabs_v4f32(<4 x float> %x) nounwind { define double @fabs_v4f64(<4 x double> %x) nounwind { ; X64-LABEL: fabs_v4f64: ; X64: # %bb.0: -; X64-NEXT: vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; X64-NEXT: vmovsd {{.*#+}} xmm1 = [NaN,0.0E+0] +; X64-NEXT: vandps %xmm1, %xmm0, %xmm0 ; X64-NEXT: vzeroupper ; X64-NEXT: retq ; diff --git a/llvm/test/CodeGen/X86/extractelement-load.ll b/llvm/test/CodeGen/X86/extractelement-load.ll index 022b25a241533..e7758acf126d6 100644 --- a/llvm/test/CodeGen/X86/extractelement-load.ll +++ b/llvm/test/CodeGen/X86/extractelement-load.ll @@ -13,10 +13,16 @@ define i32 @t(ptr %val) nounwind { ; X86-SSE2-NEXT: movl 8(%eax), %eax ; X86-SSE2-NEXT: retl ; -; X64-LABEL: t: -; X64: # %bb.0: -; X64-NEXT: movl 8(%rdi), %eax -; X64-NEXT: retq +; X64-SSSE3-LABEL: t: +; X64-SSSE3: # %bb.0: +; X64-SSSE3-NEXT: movl 8(%rdi), %eax +; X64-SSSE3-NEXT: retq +; +; X64-AVX-LABEL: t: +; X64-AVX: # %bb.0: +; X64-AVX-NEXT: vmovddup {{.*#+}} xmm0 = mem[0,0] +; X64-AVX-NEXT: vextractps $2, %xmm0, %eax +; 
X64-AVX-NEXT: retq %tmp2 = load <2 x i64>, ptr %val, align 16 ; <<2 x i64>> [#uses=1] %tmp3 = bitcast <2 x i64> %tmp2 to <4 x i32> ; <<4 x i32>> [#uses=1] %tmp4 = extractelement <4 x i32> %tmp3, i32 2 ; [#uses=1] @@ -76,9 +82,11 @@ bb: define i64 @t4(ptr %a) { ; X86-SSE2-LABEL: t4: ; X86-SSE2: # %bb.0: -; X86-SSE2-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-SSE2-NEXT: movl (%ecx), %eax -; X86-SSE2-NEXT: movl 4(%ecx), %edx +; X86-SSE2-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-SSE2-NEXT: movq {{.*#+}} xmm0 = mem[0],zero +; X86-SSE2-NEXT: movd %xmm0, %eax +; X86-SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1,1,1] +; X86-SSE2-NEXT: movd %xmm0, %edx ; X86-SSE2-NEXT: retl ; ; X64-LABEL: t4: @@ -126,8 +134,7 @@ define float @t6(ptr%a0) { ; X86-SSE2-NEXT: pushl %eax ; X86-SSE2-NEXT: .cfi_def_cfa_offset 8 ; X86-SSE2-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-SSE2-NEXT: movaps (%eax), %xmm0 -; X86-SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1,1,1] +; X86-SSE2-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero ; X86-SSE2-NEXT: xorps %xmm1, %xmm1 ; X86-SSE2-NEXT: cmpeqss %xmm0, %xmm1 ; X86-SSE2-NEXT: movss {{.*#+}} xmm2 = [1.0E+0,0.0E+0,0.0E+0,0.0E+0] @@ -142,7 +149,7 @@ define float @t6(ptr%a0) { ; ; X64-SSSE3-LABEL: t6: ; X64-SSSE3: # %bb.0: -; X64-SSSE3-NEXT: movshdup {{.*#+}} xmm1 = mem[1,1,3,3] +; X64-SSSE3-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero ; X64-SSSE3-NEXT: xorps %xmm0, %xmm0 ; X64-SSSE3-NEXT: cmpeqss %xmm1, %xmm0 ; X64-SSSE3-NEXT: movss {{.*#+}} xmm2 = [1.0E+0,0.0E+0,0.0E+0,0.0E+0] @@ -226,8 +233,7 @@ define float @PR43971_1(ptr%a0) nounwind { ; X86-SSE2: # %bb.0: # %entry ; X86-SSE2-NEXT: pushl %eax ; X86-SSE2-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-SSE2-NEXT: movaps (%eax), %xmm0 -; X86-SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1,1,1] +; X86-SSE2-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero ; X86-SSE2-NEXT: xorps %xmm1, %xmm1 ; X86-SSE2-NEXT: cmpeqss %xmm0, %xmm1 ; X86-SSE2-NEXT: movss {{.*#+}} xmm2 = [1.0E+0,0.0E+0,0.0E+0,0.0E+0] @@ -241,7 +247,7 @@ define float @PR43971_1(ptr%a0) nounwind { ; ; X64-SSSE3-LABEL: PR43971_1: ; X64-SSSE3: # %bb.0: # %entry -; X64-SSSE3-NEXT: movshdup {{.*#+}} xmm1 = mem[1,1,3,3] +; X64-SSSE3-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero ; X64-SSSE3-NEXT: xorps %xmm0, %xmm0 ; X64-SSSE3-NEXT: cmpeqss %xmm1, %xmm0 ; X64-SSSE3-NEXT: movss {{.*#+}} xmm2 = [1.0E+0,0.0E+0,0.0E+0,0.0E+0] @@ -317,12 +323,27 @@ define void @subextract_broadcast_load_constant(ptr nocapture %0, ptr nocapture ; X86-SSE2-NEXT: movw $-24160, (%eax) # imm = 0xA1A0 ; X86-SSE2-NEXT: retl ; -; X64-LABEL: subextract_broadcast_load_constant: -; X64: # %bb.0: -; X64-NEXT: movl $-1583308898, (%rdi) # imm = 0xA1A09F9E -; X64-NEXT: movw $-24674, (%rsi) # imm = 0x9F9E -; X64-NEXT: movw $-24160, (%rdx) # imm = 0xA1A0 -; X64-NEXT: retq +; X64-SSSE3-LABEL: subextract_broadcast_load_constant: +; X64-SSSE3: # %bb.0: +; X64-SSSE3-NEXT: movl $-1583308898, (%rdi) # imm = 0xA1A09F9E +; X64-SSSE3-NEXT: movw $-24674, (%rsi) # imm = 0x9F9E +; X64-SSSE3-NEXT: movw $-24160, (%rdx) # imm = 0xA1A0 +; X64-SSSE3-NEXT: retq +; +; X64-AVX1-LABEL: subextract_broadcast_load_constant: +; X64-AVX1: # %bb.0: +; X64-AVX1-NEXT: movl $-1583308898, (%rdi) # imm = 0xA1A09F9E +; X64-AVX1-NEXT: movzwl {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %eax +; X64-AVX1-NEXT: movw %ax, (%rsi) +; X64-AVX1-NEXT: movw $-24160, (%rdx) # imm = 0xA1A0 +; X64-AVX1-NEXT: retq +; +; X64-AVX2-LABEL: subextract_broadcast_load_constant: +; X64-AVX2: # %bb.0: +; X64-AVX2-NEXT: movl $-1583308898, (%rdi) # imm = 0xA1A09F9E +; X64-AVX2-NEXT: movw $-24674, 
(%rsi) # imm = 0x9F9E +; X64-AVX2-NEXT: movw $-24160, (%rdx) # imm = 0xA1A0 +; X64-AVX2-NEXT: retq store i8 -98, ptr %0, align 1 %4 = getelementptr inbounds i8, ptr %0, i64 1 store i8 -97, ptr %4, align 1 diff --git a/llvm/test/CodeGen/X86/fabs.ll b/llvm/test/CodeGen/X86/fabs.ll index 82c82ac3e917e..d553cb7516fab 100644 --- a/llvm/test/CodeGen/X86/fabs.ll +++ b/llvm/test/CodeGen/X86/fabs.ll @@ -21,7 +21,8 @@ define float @test1(float %X) { ; ; X64-LABEL: test1: ; X64: # %bb.0: -; X64-NEXT: andps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; X64-NEXT: movss {{.*#+}} xmm1 = [NaN,0.0E+0,0.0E+0,0.0E+0] +; X64-NEXT: andps %xmm1, %xmm0 ; X64-NEXT: retq %Y = call float @fabsf(float %X) readnone ret float %Y diff --git a/llvm/test/CodeGen/X86/fast-isel-fneg.ll b/llvm/test/CodeGen/X86/fast-isel-fneg.ll index 128f5ee0c318b..240da2c847849 100644 --- a/llvm/test/CodeGen/X86/fast-isel-fneg.ll +++ b/llvm/test/CodeGen/X86/fast-isel-fneg.ll @@ -40,8 +40,9 @@ define float @fneg_f32(float %x) nounwind { ; SSE2: # %bb.0: ; SSE2-NEXT: pushl %eax ; SSE2-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero -; SSE2-NEXT: xorps {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0 -; SSE2-NEXT: movss %xmm0, (%esp) +; SSE2-NEXT: movss {{.*#+}} xmm1 = [-0.0E+0,0.0E+0,0.0E+0,0.0E+0] +; SSE2-NEXT: xorps %xmm0, %xmm1 +; SSE2-NEXT: movss %xmm1, (%esp) ; SSE2-NEXT: flds (%esp) ; SSE2-NEXT: popl %eax ; SSE2-NEXT: retl diff --git a/llvm/test/CodeGen/X86/fma-signed-zero.ll b/llvm/test/CodeGen/X86/fma-signed-zero.ll index f9e4e9929c6c4..080469bd7d6da 100644 --- a/llvm/test/CodeGen/X86/fma-signed-zero.ll +++ b/llvm/test/CodeGen/X86/fma-signed-zero.ll @@ -10,7 +10,8 @@ define float @fneg_fma32(float %x, float %y, float %z) { ; CHECK-LABEL: fneg_fma32: ; CHECK: # %bb.0: ; CHECK-NEXT: vfnmsub213ss {{.*#+}} xmm0 = -(xmm1 * xmm0) - xmm2 -; CHECK-NEXT: vxorps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; CHECK-NEXT: vmovss {{.*#+}} xmm1 = [-0.0E+0,0.0E+0,0.0E+0,0.0E+0] +; CHECK-NEXT: vxorps %xmm1, %xmm0, %xmm0 ; CHECK-NEXT: retq %negx = fneg float %x %negz = fneg float %z @@ -37,7 +38,8 @@ define double @fneg_fma64(double %x, double %y, double %z) { ; CHECK-LABEL: fneg_fma64: ; CHECK: # %bb.0: ; CHECK-NEXT: vfnmsub213sd {{.*#+}} xmm0 = -(xmm1 * xmm0) - xmm2 -; CHECK-NEXT: vxorpd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; CHECK-NEXT: vmovsd {{.*#+}} xmm1 = [-0.0E+0,0.0E+0] +; CHECK-NEXT: vxorpd %xmm1, %xmm0, %xmm0 ; CHECK-NEXT: retq %negx = fneg double %x %negz = fneg double %z diff --git a/llvm/test/CodeGen/X86/fp-fold.ll b/llvm/test/CodeGen/X86/fp-fold.ll index 74b5232a4df62..93716a48542ea 100644 --- a/llvm/test/CodeGen/X86/fp-fold.ll +++ b/llvm/test/CodeGen/X86/fp-fold.ll @@ -31,7 +31,7 @@ define float @fadd_produce_zero(float %x) { define float @fadd_reassociate(float %x) { ; CHECK-LABEL: fadd_reassociate: ; CHECK: # %bb.0: -; CHECK-NEXT: addss {{.*}}(%rip), %xmm0 +; CHECK-NEXT: addss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 ; CHECK-NEXT: retq %sum = fadd float %x, 8.0 %r = fadd reassoc nsz float %sum, 12.0 @@ -85,7 +85,7 @@ define float @fsub_neg_x_y(float %x, float %y) { define float @fsub_neg_y(float %x, float %y) { ; CHECK-LABEL: fsub_neg_y: ; CHECK: # %bb.0: -; CHECK-NEXT: mulss {{.*}}(%rip), %xmm0 +; CHECK-NEXT: mulss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 ; CHECK-NEXT: retq %mul = fmul float %x, 5.0 %add = fadd float %mul, %y @@ -96,7 +96,7 @@ define float @fsub_neg_y(float %x, float %y) { define <4 x float> @fsub_neg_y_vector(<4 x float> %x, <4 x float> %y) { ; CHECK-LABEL: fsub_neg_y_vector: ; CHECK: # %bb.0: -; CHECK-NEXT: mulps {{.*}}(%rip), 
%xmm0 +; CHECK-NEXT: mulps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 ; CHECK-NEXT: retq %mul = fmul <4 x float> %x, %add = fadd <4 x float> %mul, %y @@ -107,7 +107,7 @@ define <4 x float> @fsub_neg_y_vector(<4 x float> %x, <4 x float> %y) { define <4 x float> @fsub_neg_y_vector_nonuniform(<4 x float> %x, <4 x float> %y) { ; CHECK-LABEL: fsub_neg_y_vector_nonuniform: ; CHECK: # %bb.0: -; CHECK-NEXT: mulps {{.*}}(%rip), %xmm0 +; CHECK-NEXT: mulps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 ; CHECK-NEXT: retq %mul = fmul <4 x float> %x, %add = fadd <4 x float> %mul, %y @@ -118,7 +118,7 @@ define <4 x float> @fsub_neg_y_vector_nonuniform(<4 x float> %x, <4 x float> %y) define float @fsub_neg_y_commute(float %x, float %y) { ; CHECK-LABEL: fsub_neg_y_commute: ; CHECK: # %bb.0: -; CHECK-NEXT: mulss {{.*}}(%rip), %xmm0 +; CHECK-NEXT: mulss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 ; CHECK-NEXT: retq %mul = fmul float %x, 5.0 %add = fadd float %y, %mul @@ -129,7 +129,7 @@ define float @fsub_neg_y_commute(float %x, float %y) { define <4 x float> @fsub_neg_y_commute_vector(<4 x float> %x, <4 x float> %y) { ; CHECK-LABEL: fsub_neg_y_commute_vector: ; CHECK: # %bb.0: -; CHECK-NEXT: mulps {{.*}}(%rip), %xmm0 +; CHECK-NEXT: mulps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 ; CHECK-NEXT: retq %mul = fmul <4 x float> %x, %add = fadd <4 x float> %y, %mul @@ -142,7 +142,8 @@ define <4 x float> @fsub_neg_y_commute_vector(<4 x float> %x, <4 x float> %y) { define float @fsub_fadd_common_op_fneg(float %x, float %y) { ; CHECK-LABEL: fsub_fadd_common_op_fneg: ; CHECK: # %bb.0: -; CHECK-NEXT: xorps {{.*}}(%rip), %xmm0 +; CHECK-NEXT: movss {{.*#+}} xmm1 = [-0.0E+0,0.0E+0,0.0E+0,0.0E+0] +; CHECK-NEXT: xorps %xmm1, %xmm0 ; CHECK-NEXT: retq %a = fadd float %x, %y %r = fsub reassoc nsz float %y, %a @@ -154,7 +155,7 @@ define float @fsub_fadd_common_op_fneg(float %x, float %y) { define <4 x float> @fsub_fadd_common_op_fneg_vec(<4 x float> %x, <4 x float> %y) { ; CHECK-LABEL: fsub_fadd_common_op_fneg_vec: ; CHECK: # %bb.0: -; CHECK-NEXT: xorps {{.*}}(%rip), %xmm0 +; CHECK-NEXT: xorps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 ; CHECK-NEXT: retq %a = fadd <4 x float> %x, %y %r = fsub nsz reassoc <4 x float> %y, %a @@ -167,7 +168,8 @@ define <4 x float> @fsub_fadd_common_op_fneg_vec(<4 x float> %x, <4 x float> %y) define float @fsub_fadd_common_op_fneg_commute(float %x, float %y) { ; CHECK-LABEL: fsub_fadd_common_op_fneg_commute: ; CHECK: # %bb.0: -; CHECK-NEXT: xorps {{.*}}(%rip), %xmm0 +; CHECK-NEXT: movss {{.*#+}} xmm1 = [-0.0E+0,0.0E+0,0.0E+0,0.0E+0] +; CHECK-NEXT: xorps %xmm1, %xmm0 ; CHECK-NEXT: retq %a = fadd float %y, %x %r = fsub reassoc nsz float %y, %a @@ -179,7 +181,7 @@ define float @fsub_fadd_common_op_fneg_commute(float %x, float %y) { define <4 x float> @fsub_fadd_common_op_fneg_commute_vec(<4 x float> %x, <4 x float> %y) { ; CHECK-LABEL: fsub_fadd_common_op_fneg_commute_vec: ; CHECK: # %bb.0: -; CHECK-NEXT: xorps {{.*}}(%rip), %xmm0 +; CHECK-NEXT: xorps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 ; CHECK-NEXT: retq %a = fadd <4 x float> %y, %x %r = fsub reassoc nsz <4 x float> %y, %a @@ -233,7 +235,8 @@ define float @fsub_zero_nsz_1(float %x) { define float @fsub_zero_nsz_2(float %x) { ; CHECK-LABEL: fsub_zero_nsz_2: ; CHECK: # %bb.0: -; CHECK-NEXT: xorps {{.*}}(%rip), %xmm0 +; CHECK-NEXT: movss {{.*#+}} xmm1 = [-0.0E+0,0.0E+0,0.0E+0,0.0E+0] +; CHECK-NEXT: xorps %xmm1, %xmm0 ; CHECK-NEXT: retq %r = fsub nsz float 0.0, %x ret float %r @@ -259,7 +262,7 @@ define float @fmul_one(float %x) { define float @fmul_x_const_const(float %x) { ; 
CHECK-LABEL: fmul_x_const_const: ; CHECK: # %bb.0: -; CHECK-NEXT: mulss {{.*}}(%rip), %xmm0 +; CHECK-NEXT: mulss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 ; CHECK-NEXT: retq %mul = fmul reassoc float %x, 9.0 %r = fmul reassoc float %mul, 4.0 diff --git a/llvm/test/CodeGen/X86/fp-intrinsics-fma.ll b/llvm/test/CodeGen/X86/fp-intrinsics-fma.ll index 71d49481ebb8e..5bd48d80354fc 100644 --- a/llvm/test/CodeGen/X86/fp-intrinsics-fma.ll +++ b/llvm/test/CodeGen/X86/fp-intrinsics-fma.ll @@ -9,7 +9,8 @@ define float @f1(float %0, float %1, float %2) #0 { ; NOFMA: # %bb.0: # %entry ; NOFMA-NEXT: pushq %rax ; NOFMA-NEXT: .cfi_def_cfa_offset 16 -; NOFMA-NEXT: xorps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; NOFMA-NEXT: movss {{.*#+}} xmm3 = [-0.0E+0,0.0E+0,0.0E+0,0.0E+0] +; NOFMA-NEXT: xorps %xmm3, %xmm0 ; NOFMA-NEXT: callq fmaf@PLT ; NOFMA-NEXT: popq %rax ; NOFMA-NEXT: .cfi_def_cfa_offset 8 @@ -37,7 +38,8 @@ define double @f2(double %0, double %1, double %2) #0 { ; NOFMA: # %bb.0: # %entry ; NOFMA-NEXT: pushq %rax ; NOFMA-NEXT: .cfi_def_cfa_offset 16 -; NOFMA-NEXT: xorps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; NOFMA-NEXT: movsd {{.*#+}} xmm3 = [-0.0E+0,0.0E+0] +; NOFMA-NEXT: xorps %xmm3, %xmm0 ; NOFMA-NEXT: callq fma@PLT ; NOFMA-NEXT: popq %rax ; NOFMA-NEXT: .cfi_def_cfa_offset 8 @@ -65,7 +67,8 @@ define float @f3(float %0, float %1, float %2) #0 { ; NOFMA: # %bb.0: # %entry ; NOFMA-NEXT: pushq %rax ; NOFMA-NEXT: .cfi_def_cfa_offset 16 -; NOFMA-NEXT: xorps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 +; NOFMA-NEXT: movss {{.*#+}} xmm3 = [-0.0E+0,0.0E+0,0.0E+0,0.0E+0] +; NOFMA-NEXT: xorps %xmm3, %xmm2 ; NOFMA-NEXT: callq fmaf@PLT ; NOFMA-NEXT: popq %rax ; NOFMA-NEXT: .cfi_def_cfa_offset 8 @@ -93,7 +96,8 @@ define double @f4(double %0, double %1, double %2) #0 { ; NOFMA: # %bb.0: # %entry ; NOFMA-NEXT: pushq %rax ; NOFMA-NEXT: .cfi_def_cfa_offset 16 -; NOFMA-NEXT: xorps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 +; NOFMA-NEXT: movsd {{.*#+}} xmm3 = [-0.0E+0,0.0E+0] +; NOFMA-NEXT: xorps %xmm3, %xmm2 ; NOFMA-NEXT: callq fma@PLT ; NOFMA-NEXT: popq %rax ; NOFMA-NEXT: .cfi_def_cfa_offset 8 @@ -184,7 +188,8 @@ define float @f7(float %0, float %1, float %2) #0 { ; NOFMA-NEXT: pushq %rax ; NOFMA-NEXT: .cfi_def_cfa_offset 16 ; NOFMA-NEXT: callq fmaf@PLT -; NOFMA-NEXT: xorps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; NOFMA-NEXT: movss {{.*#+}} xmm1 = [-0.0E+0,0.0E+0,0.0E+0,0.0E+0] +; NOFMA-NEXT: xorps %xmm1, %xmm0 ; NOFMA-NEXT: popq %rax ; NOFMA-NEXT: .cfi_def_cfa_offset 8 ; NOFMA-NEXT: retq @@ -192,13 +197,15 @@ define float @f7(float %0, float %1, float %2) #0 { ; FMA-AVX1-LABEL: f7: ; FMA-AVX1: # %bb.0: # %entry ; FMA-AVX1-NEXT: vfmadd213ss {{.*#+}} xmm0 = (xmm1 * xmm0) + xmm2 -; FMA-AVX1-NEXT: vxorps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; FMA-AVX1-NEXT: vmovss {{.*#+}} xmm1 = [-0.0E+0,0.0E+0,0.0E+0,0.0E+0] +; FMA-AVX1-NEXT: vxorps %xmm1, %xmm0, %xmm0 ; FMA-AVX1-NEXT: retq ; ; FMA4-LABEL: f7: ; FMA4: # %bb.0: # %entry ; FMA4-NEXT: vfmaddss {{.*#+}} xmm0 = (xmm0 * xmm1) + xmm2 -; FMA4-NEXT: vxorps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; FMA4-NEXT: vmovss {{.*#+}} xmm1 = [-0.0E+0,0.0E+0,0.0E+0,0.0E+0] +; FMA4-NEXT: vxorps %xmm1, %xmm0, %xmm0 ; FMA4-NEXT: retq ; ; FMA-AVX512-LABEL: f7: @@ -221,7 +228,8 @@ define double @f8(double %0, double %1, double %2) #0 { ; NOFMA-NEXT: pushq %rax ; NOFMA-NEXT: .cfi_def_cfa_offset 16 ; NOFMA-NEXT: callq fma@PLT -; NOFMA-NEXT: xorps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; NOFMA-NEXT: movsd {{.*#+}} xmm1 = [-0.0E+0,0.0E+0] +; NOFMA-NEXT: xorps %xmm1, %xmm0 ; NOFMA-NEXT: popq 
%rax ; NOFMA-NEXT: .cfi_def_cfa_offset 8 ; NOFMA-NEXT: retq @@ -229,13 +237,15 @@ define double @f8(double %0, double %1, double %2) #0 { ; FMA-LABEL: f8: ; FMA: # %bb.0: # %entry ; FMA-NEXT: vfmadd213sd {{.*#+}} xmm0 = (xmm1 * xmm0) + xmm2 -; FMA-NEXT: vxorpd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; FMA-NEXT: vmovsd {{.*#+}} xmm1 = [-0.0E+0,0.0E+0] +; FMA-NEXT: vxorpd %xmm1, %xmm0, %xmm0 ; FMA-NEXT: retq ; ; FMA4-LABEL: f8: ; FMA4: # %bb.0: # %entry ; FMA4-NEXT: vfmaddsd {{.*#+}} xmm0 = (xmm0 * xmm1) + xmm2 -; FMA4-NEXT: vxorpd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; FMA4-NEXT: vmovsd {{.*#+}} xmm1 = [-0.0E+0,0.0E+0] +; FMA4-NEXT: vxorpd %xmm1, %xmm0, %xmm0 ; FMA4-NEXT: retq entry: %3 = call double @llvm.experimental.constrained.fma.f64(double %0, double %1, double %2, @@ -262,13 +272,15 @@ define float @f9(float %0, float %1, float %2) #0 { ; FMA-AVX1-LABEL: f9: ; FMA-AVX1: # %bb.0: # %entry ; FMA-AVX1-NEXT: vfnmsub213ss {{.*#+}} xmm0 = -(xmm1 * xmm0) - xmm2 -; FMA-AVX1-NEXT: vxorps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; FMA-AVX1-NEXT: vmovss {{.*#+}} xmm1 = [-0.0E+0,0.0E+0,0.0E+0,0.0E+0] +; FMA-AVX1-NEXT: vxorps %xmm1, %xmm0, %xmm0 ; FMA-AVX1-NEXT: retq ; ; FMA4-LABEL: f9: ; FMA4: # %bb.0: # %entry ; FMA4-NEXT: vfnmsubss {{.*#+}} xmm0 = -(xmm0 * xmm1) - xmm2 -; FMA4-NEXT: vxorps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; FMA4-NEXT: vmovss {{.*#+}} xmm1 = [-0.0E+0,0.0E+0,0.0E+0,0.0E+0] +; FMA4-NEXT: vxorps %xmm1, %xmm0, %xmm0 ; FMA4-NEXT: retq ; ; FMA-AVX512-LABEL: f9: @@ -304,13 +316,15 @@ define double @f10(double %0, double %1, double %2) #0 { ; FMA-LABEL: f10: ; FMA: # %bb.0: # %entry ; FMA-NEXT: vfnmsub213sd {{.*#+}} xmm0 = -(xmm1 * xmm0) - xmm2 -; FMA-NEXT: vxorpd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; FMA-NEXT: vmovsd {{.*#+}} xmm1 = [-0.0E+0,0.0E+0] +; FMA-NEXT: vxorpd %xmm1, %xmm0, %xmm0 ; FMA-NEXT: retq ; ; FMA4-LABEL: f10: ; FMA4: # %bb.0: # %entry ; FMA4-NEXT: vfnmsubsd {{.*#+}} xmm0 = -(xmm0 * xmm1) - xmm2 -; FMA4-NEXT: vxorpd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; FMA4-NEXT: vmovsd {{.*#+}} xmm1 = [-0.0E+0,0.0E+0] +; FMA4-NEXT: vxorpd %xmm1, %xmm0, %xmm0 ; FMA4-NEXT: retq entry: %3 = fneg double %0 diff --git a/llvm/test/CodeGen/X86/fp-logic.ll b/llvm/test/CodeGen/X86/fp-logic.ll index 522a1589caf09..a21204cb30771 100644 --- a/llvm/test/CodeGen/X86/fp-logic.ll +++ b/llvm/test/CodeGen/X86/fp-logic.ll @@ -243,7 +243,8 @@ define float @movmsk(float %x) { define double @bitcast_fabs(double %x) { ; CHECK-LABEL: bitcast_fabs: ; CHECK: # %bb.0: -; CHECK-NEXT: andps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; CHECK-NEXT: movsd {{.*#+}} xmm1 = [NaN,0.0E+0] +; CHECK-NEXT: andps %xmm1, %xmm0 ; CHECK-NEXT: retq %bc1 = bitcast double %x to i64 %and = and i64 %bc1, 9223372036854775807 @@ -254,7 +255,8 @@ define double @bitcast_fabs(double %x) { define float @bitcast_fneg(float %x) { ; CHECK-LABEL: bitcast_fneg: ; CHECK: # %bb.0: -; CHECK-NEXT: xorps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; CHECK-NEXT: movss {{.*#+}} xmm1 = [-0.0E+0,0.0E+0,0.0E+0,0.0E+0] +; CHECK-NEXT: xorps %xmm1, %xmm0 ; CHECK-NEXT: retq %bc1 = bitcast float %x to i32 %xor = xor i32 %bc1, 2147483648 @@ -311,7 +313,8 @@ define float @fsub_bitcast_fneg(float %x, float %y) { define float @nabsf(float %a) { ; CHECK-LABEL: nabsf: ; CHECK: # %bb.0: -; CHECK-NEXT: orps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; CHECK-NEXT: movss {{.*#+}} xmm1 = [-0.0E+0,0.0E+0,0.0E+0,0.0E+0] +; CHECK-NEXT: orps %xmm1, %xmm0 ; CHECK-NEXT: retq %conv = bitcast float %a to i32 %and = or i32 %conv, 
-2147483648 @@ -322,7 +325,8 @@ define float @nabsf(float %a) { define double @nabsd(double %a) { ; CHECK-LABEL: nabsd: ; CHECK: # %bb.0: -; CHECK-NEXT: orps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; CHECK-NEXT: movsd {{.*#+}} xmm1 = [-0.0E+0,0.0E+0] +; CHECK-NEXT: orps %xmm1, %xmm0 ; CHECK-NEXT: retq %conv = bitcast double %a to i64 %and = or i64 %conv, -9223372036854775808 diff --git a/llvm/test/CodeGen/X86/fp-round.ll b/llvm/test/CodeGen/X86/fp-round.ll index 58c4f71892e90..5689e59e6cb0f 100644 --- a/llvm/test/CodeGen/X86/fp-round.ll +++ b/llvm/test/CodeGen/X86/fp-round.ll @@ -24,10 +24,11 @@ define half @round_f16(half %h) { ; SSE41-NEXT: callq __extendhfsf2@PLT ; SSE41-NEXT: movaps {{.*#+}} xmm1 = [-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0] ; SSE41-NEXT: andps %xmm0, %xmm1 -; SSE41-NEXT: orps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 -; SSE41-NEXT: addss %xmm0, %xmm1 +; SSE41-NEXT: movss {{.*#+}} xmm2 = [4.9999997E-1,0.0E+0,0.0E+0,0.0E+0] +; SSE41-NEXT: orps %xmm1, %xmm2 +; SSE41-NEXT: addss %xmm0, %xmm2 ; SSE41-NEXT: xorps %xmm0, %xmm0 -; SSE41-NEXT: roundss $11, %xmm1, %xmm0 +; SSE41-NEXT: roundss $11, %xmm2, %xmm0 ; SSE41-NEXT: callq __truncsfhf2@PLT ; SSE41-NEXT: popq %rax ; SSE41-NEXT: .cfi_def_cfa_offset 8 @@ -83,10 +84,11 @@ define float @round_f32(float %x) { ; SSE41: # %bb.0: ; SSE41-NEXT: movaps {{.*#+}} xmm1 = [-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0] ; SSE41-NEXT: andps %xmm0, %xmm1 -; SSE41-NEXT: orps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 -; SSE41-NEXT: addss %xmm0, %xmm1 +; SSE41-NEXT: movss {{.*#+}} xmm2 = [4.9999997E-1,0.0E+0,0.0E+0,0.0E+0] +; SSE41-NEXT: orps %xmm1, %xmm2 +; SSE41-NEXT: addss %xmm0, %xmm2 ; SSE41-NEXT: xorps %xmm0, %xmm0 -; SSE41-NEXT: roundss $11, %xmm1, %xmm0 +; SSE41-NEXT: roundss $11, %xmm2, %xmm0 ; SSE41-NEXT: retq ; ; AVX1-LABEL: round_f32: @@ -126,10 +128,11 @@ define double @round_f64(double %x) { ; SSE41: # %bb.0: ; SSE41-NEXT: movapd {{.*#+}} xmm1 = [-0.0E+0,-0.0E+0] ; SSE41-NEXT: andpd %xmm0, %xmm1 -; SSE41-NEXT: orpd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 -; SSE41-NEXT: addsd %xmm0, %xmm1 +; SSE41-NEXT: movsd {{.*#+}} xmm2 = [4.9999999999999994E-1,0.0E+0] +; SSE41-NEXT: orpd %xmm1, %xmm2 +; SSE41-NEXT: addsd %xmm0, %xmm2 ; SSE41-NEXT: xorps %xmm0, %xmm0 -; SSE41-NEXT: roundsd $11, %xmm1, %xmm0 +; SSE41-NEXT: roundsd $11, %xmm2, %xmm0 ; SSE41-NEXT: retq ; ; AVX1-LABEL: round_f64: diff --git a/llvm/test/CodeGen/X86/fp128-cast.ll b/llvm/test/CodeGen/X86/fp128-cast.ll index 1de2484d47ba1..d16fdcdf1752d 100644 --- a/llvm/test/CodeGen/X86/fp128-cast.ll +++ b/llvm/test/CodeGen/X86/fp128-cast.ll @@ -1260,7 +1260,8 @@ define fp128 @TestTruncCopysign(fp128 %x, i32 %n) nounwind { ; X64-SSE-NEXT: pushq %rax ; X64-SSE-NEXT: callq __trunctfdf2@PLT ; X64-SSE-NEXT: andps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; X64-SSE-NEXT: orps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; X64-SSE-NEXT: movsd {{.*#+}} xmm1 = [+Inf,0.0E+0] +; X64-SSE-NEXT: orps %xmm1, %xmm0 ; X64-SSE-NEXT: callq __extenddftf2@PLT ; X64-SSE-NEXT: addq $8, %rsp ; X64-SSE-NEXT: .LBB26_2: # %cleanup diff --git a/llvm/test/CodeGen/X86/fp16-libcalls.ll b/llvm/test/CodeGen/X86/fp16-libcalls.ll index 3af8b1aec1feb..5a1f0da86de63 100644 --- a/llvm/test/CodeGen/X86/fp16-libcalls.ll +++ b/llvm/test/CodeGen/X86/fp16-libcalls.ll @@ -335,7 +335,8 @@ define void @test_half_fabs(half %a0, ptr %p0) nounwind { ; F16C-LABEL: test_half_fabs: ; F16C: # %bb.0: ; F16C-NEXT: vcvtph2ps %xmm0, %xmm0 -; F16C-NEXT: vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; F16C-NEXT: vmovss {{.*#+}} xmm1 = [NaN,0.0E+0,0.0E+0,0.0E+0] +; F16C-NEXT: 
vandps %xmm1, %xmm0, %xmm0 ; F16C-NEXT: vcvtps2ph $4, %xmm0, %xmm0 ; F16C-NEXT: vpextrw $0, %xmm0, (%rdi) ; F16C-NEXT: retq @@ -352,7 +353,8 @@ define void @test_half_fabs(half %a0, ptr %p0) nounwind { ; X64-NEXT: pushq %rbx ; X64-NEXT: movq %rdi, %rbx ; X64-NEXT: callq __extendhfsf2@PLT -; X64-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; X64-NEXT: movd {{.*#+}} xmm1 = [NaN,0.0E+0,0.0E+0,0.0E+0] +; X64-NEXT: pand %xmm1, %xmm0 ; X64-NEXT: callq __truncsfhf2@PLT ; X64-NEXT: pextrw $0, %xmm0, %eax ; X64-NEXT: movw %ax, (%rbx) @@ -515,7 +517,8 @@ define void @test_half_fneg(half %a0, ptr %p0) nounwind { ; F16C-LABEL: test_half_fneg: ; F16C: # %bb.0: ; F16C-NEXT: vcvtph2ps %xmm0, %xmm0 -; F16C-NEXT: vxorps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; F16C-NEXT: vmovss {{.*#+}} xmm1 = [-0.0E+0,0.0E+0,0.0E+0,0.0E+0] +; F16C-NEXT: vxorps %xmm1, %xmm0, %xmm0 ; F16C-NEXT: vcvtps2ph $4, %xmm0, %xmm0 ; F16C-NEXT: vpextrw $0, %xmm0, (%rdi) ; F16C-NEXT: retq @@ -532,7 +535,8 @@ define void @test_half_fneg(half %a0, ptr %p0) nounwind { ; X64-NEXT: pushq %rbx ; X64-NEXT: movq %rdi, %rbx ; X64-NEXT: callq __extendhfsf2@PLT -; X64-NEXT: pxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; X64-NEXT: movd {{.*#+}} xmm1 = [-0.0E+0,0.0E+0,0.0E+0,0.0E+0] +; X64-NEXT: pxor %xmm1, %xmm0 ; X64-NEXT: callq __truncsfhf2@PLT ; X64-NEXT: pextrw $0, %xmm0, %eax ; X64-NEXT: movw %ax, (%rbx) diff --git a/llvm/test/CodeGen/X86/freeze-vector.ll b/llvm/test/CodeGen/X86/freeze-vector.ll index 362b3b945f962..57695316386b7 100644 --- a/llvm/test/CodeGen/X86/freeze-vector.ll +++ b/llvm/test/CodeGen/X86/freeze-vector.ll @@ -311,8 +311,8 @@ define void @freeze_buildvector_single_repeated_maybe_poison_operand(ptr %origin ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-NEXT: movl (%ecx), %ecx ; X86-NEXT: andl $15, %ecx -; X86-NEXT: vbroadcastss {{.*#+}} xmm0 = [42,42,42,42] -; X86-NEXT: vpinsrd $0, %ecx, %xmm0, %xmm0 +; X86-NEXT: vmovd %ecx, %xmm0 +; X86-NEXT: vpinsrd $1, {{\.?LCPI[0-9]+_[0-9]+}}+4, %xmm0, %xmm0 ; X86-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,1] ; X86-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0, %xmm0 ; X86-NEXT: vmovdqa %xmm0, (%eax) @@ -320,8 +320,8 @@ define void @freeze_buildvector_single_repeated_maybe_poison_operand(ptr %origin ; ; X64-LABEL: freeze_buildvector_single_repeated_maybe_poison_operand: ; X64: # %bb.0: -; X64-NEXT: vpbroadcastd {{.*#+}} xmm0 = [42,42,42,42] -; X64-NEXT: vpinsrd $0, (%rdi), %xmm0, %xmm0 +; X64-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero +; X64-NEXT: vpinsrd $1, {{\.?LCPI[0-9]+_[0-9]+}}+4(%rip), %xmm0, %xmm0 ; X64-NEXT: vpbroadcastq %xmm0, %xmm0 ; X64-NEXT: vpbroadcastd {{.*#+}} xmm1 = [7,7,7,7] ; X64-NEXT: vpand %xmm1, %xmm0, %xmm0 diff --git a/llvm/test/CodeGen/X86/gfni-funnel-shifts.ll b/llvm/test/CodeGen/X86/gfni-funnel-shifts.ll index 0ca3380d188b7..2d8484fb82fae 100644 --- a/llvm/test/CodeGen/X86/gfni-funnel-shifts.ll +++ b/llvm/test/CodeGen/X86/gfni-funnel-shifts.ll @@ -2147,7 +2147,7 @@ define <64 x i8> @var_fshr_v64i8(<64 x i8> %a, <64 x i8> %b, <64 x i8> %amt) nou define <64 x i8> @splatvar_fshl_v64i8(<64 x i8> %a, <64 x i8> %b, <64 x i8> %amt) nounwind { ; GFNISSE-LABEL: splatvar_fshl_v64i8: ; GFNISSE: # %bb.0: -; GFNISSE-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm8 +; GFNISSE-NEXT: movd {{.*#+}} xmm8 = mem[0],zero,zero,zero ; GFNISSE-NEXT: movdqa %xmm4, %xmm9 ; GFNISSE-NEXT: punpckhbw {{.*#+}} xmm9 = xmm9[8],xmm0[8],xmm9[9],xmm0[9],xmm9[10],xmm0[10],xmm9[11],xmm0[11],xmm9[12],xmm0[12],xmm9[13],xmm0[13],xmm9[14],xmm0[14],xmm9[15],xmm0[15] ; GFNISSE-NEXT: pand 
{{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm8 @@ -2247,24 +2247,25 @@ define <64 x i8> @splatvar_fshl_v64i8(<64 x i8> %a, <64 x i8> %b, <64 x i8> %amt ; ; GFNIAVX512VL-LABEL: splatvar_fshl_v64i8: ; GFNIAVX512VL: # %bb.0: -; GFNIAVX512VL-NEXT: vextracti64x4 $1, %zmm0, %ymm3 +; GFNIAVX512VL-NEXT: vpinsrb $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm3 +; GFNIAVX512VL-NEXT: vpternlogq {{.*#+}} xmm3 = xmm3 & xmm2 & mem +; GFNIAVX512VL-NEXT: vextracti64x4 $1, %zmm0, %ymm2 ; GFNIAVX512VL-NEXT: vextracti64x4 $1, %zmm1, %ymm4 -; GFNIAVX512VL-NEXT: vpunpckhbw {{.*#+}} ymm5 = ymm4[8],ymm3[8],ymm4[9],ymm3[9],ymm4[10],ymm3[10],ymm4[11],ymm3[11],ymm4[12],ymm3[12],ymm4[13],ymm3[13],ymm4[14],ymm3[14],ymm4[15],ymm3[15],ymm4[24],ymm3[24],ymm4[25],ymm3[25],ymm4[26],ymm3[26],ymm4[27],ymm3[27],ymm4[28],ymm3[28],ymm4[29],ymm3[29],ymm4[30],ymm3[30],ymm4[31],ymm3[31] -; GFNIAVX512VL-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2 -; GFNIAVX512VL-NEXT: vpsllw %xmm2, %ymm5, %ymm5 +; GFNIAVX512VL-NEXT: vpunpckhbw {{.*#+}} ymm5 = ymm4[8],ymm2[8],ymm4[9],ymm2[9],ymm4[10],ymm2[10],ymm4[11],ymm2[11],ymm4[12],ymm2[12],ymm4[13],ymm2[13],ymm4[14],ymm2[14],ymm4[15],ymm2[15],ymm4[24],ymm2[24],ymm4[25],ymm2[25],ymm4[26],ymm2[26],ymm4[27],ymm2[27],ymm4[28],ymm2[28],ymm4[29],ymm2[29],ymm4[30],ymm2[30],ymm4[31],ymm2[31] +; GFNIAVX512VL-NEXT: vpsllw %xmm3, %ymm5, %ymm5 ; GFNIAVX512VL-NEXT: vpsrlw $8, %ymm5, %ymm5 -; GFNIAVX512VL-NEXT: vpunpcklbw {{.*#+}} ymm3 = ymm4[0],ymm3[0],ymm4[1],ymm3[1],ymm4[2],ymm3[2],ymm4[3],ymm3[3],ymm4[4],ymm3[4],ymm4[5],ymm3[5],ymm4[6],ymm3[6],ymm4[7],ymm3[7],ymm4[16],ymm3[16],ymm4[17],ymm3[17],ymm4[18],ymm3[18],ymm4[19],ymm3[19],ymm4[20],ymm3[20],ymm4[21],ymm3[21],ymm4[22],ymm3[22],ymm4[23],ymm3[23] -; GFNIAVX512VL-NEXT: vpsllw %xmm2, %ymm3, %ymm3 -; GFNIAVX512VL-NEXT: vpsrlw $8, %ymm3, %ymm3 -; GFNIAVX512VL-NEXT: vpackuswb %ymm5, %ymm3, %ymm3 +; GFNIAVX512VL-NEXT: vpunpcklbw {{.*#+}} ymm2 = ymm4[0],ymm2[0],ymm4[1],ymm2[1],ymm4[2],ymm2[2],ymm4[3],ymm2[3],ymm4[4],ymm2[4],ymm4[5],ymm2[5],ymm4[6],ymm2[6],ymm4[7],ymm2[7],ymm4[16],ymm2[16],ymm4[17],ymm2[17],ymm4[18],ymm2[18],ymm4[19],ymm2[19],ymm4[20],ymm2[20],ymm4[21],ymm2[21],ymm4[22],ymm2[22],ymm4[23],ymm2[23] +; GFNIAVX512VL-NEXT: vpsllw %xmm3, %ymm2, %ymm2 +; GFNIAVX512VL-NEXT: vpsrlw $8, %ymm2, %ymm2 +; GFNIAVX512VL-NEXT: vpackuswb %ymm5, %ymm2, %ymm2 ; GFNIAVX512VL-NEXT: vpunpckhbw {{.*#+}} ymm4 = ymm1[8],ymm0[8],ymm1[9],ymm0[9],ymm1[10],ymm0[10],ymm1[11],ymm0[11],ymm1[12],ymm0[12],ymm1[13],ymm0[13],ymm1[14],ymm0[14],ymm1[15],ymm0[15],ymm1[24],ymm0[24],ymm1[25],ymm0[25],ymm1[26],ymm0[26],ymm1[27],ymm0[27],ymm1[28],ymm0[28],ymm1[29],ymm0[29],ymm1[30],ymm0[30],ymm1[31],ymm0[31] -; GFNIAVX512VL-NEXT: vpsllw %xmm2, %ymm4, %ymm4 +; GFNIAVX512VL-NEXT: vpsllw %xmm3, %ymm4, %ymm4 ; GFNIAVX512VL-NEXT: vpsrlw $8, %ymm4, %ymm4 ; GFNIAVX512VL-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[2],ymm0[2],ymm1[3],ymm0[3],ymm1[4],ymm0[4],ymm1[5],ymm0[5],ymm1[6],ymm0[6],ymm1[7],ymm0[7],ymm1[16],ymm0[16],ymm1[17],ymm0[17],ymm1[18],ymm0[18],ymm1[19],ymm0[19],ymm1[20],ymm0[20],ymm1[21],ymm0[21],ymm1[22],ymm0[22],ymm1[23],ymm0[23] -; GFNIAVX512VL-NEXT: vpsllw %xmm2, %ymm0, %ymm0 +; GFNIAVX512VL-NEXT: vpsllw %xmm3, %ymm0, %ymm0 ; GFNIAVX512VL-NEXT: vpsrlw $8, %ymm0, %ymm0 ; GFNIAVX512VL-NEXT: vpackuswb %ymm4, %ymm0, %ymm0 -; GFNIAVX512VL-NEXT: vinserti64x4 $1, %ymm3, %zmm0, %zmm0 +; GFNIAVX512VL-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0 ; GFNIAVX512VL-NEXT: retq ; ; GFNIAVX512BW-LABEL: splatvar_fshl_v64i8: @@ -2286,7 +2287,7 @@ define <64 x i8> 
@splatvar_fshl_v64i8(<64 x i8> %a, <64 x i8> %b, <64 x i8> %amt define <64 x i8> @splatvar_fshr_v64i8(<64 x i8> %a, <64 x i8> %b, <64 x i8> %amt) nounwind { ; GFNISSE-LABEL: splatvar_fshr_v64i8: ; GFNISSE: # %bb.0: -; GFNISSE-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm9 +; GFNISSE-NEXT: movd {{.*#+}} xmm9 = mem[0],zero,zero,zero ; GFNISSE-NEXT: movdqa %xmm4, %xmm10 ; GFNISSE-NEXT: punpckhbw {{.*#+}} xmm10 = xmm10[8],xmm0[8],xmm10[9],xmm0[9],xmm10[10],xmm0[10],xmm10[11],xmm0[11],xmm10[12],xmm0[12],xmm10[13],xmm0[13],xmm10[14],xmm0[14],xmm10[15],xmm0[15] ; GFNISSE-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm9 @@ -2389,25 +2390,26 @@ define <64 x i8> @splatvar_fshr_v64i8(<64 x i8> %a, <64 x i8> %b, <64 x i8> %amt ; ; GFNIAVX512VL-LABEL: splatvar_fshr_v64i8: ; GFNIAVX512VL: # %bb.0: -; GFNIAVX512VL-NEXT: vextracti64x4 $1, %zmm0, %ymm3 +; GFNIAVX512VL-NEXT: vpinsrb $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm3 +; GFNIAVX512VL-NEXT: vpternlogq {{.*#+}} xmm3 = xmm3 & xmm2 & mem +; GFNIAVX512VL-NEXT: vextracti64x4 $1, %zmm0, %ymm2 ; GFNIAVX512VL-NEXT: vextracti64x4 $1, %zmm1, %ymm4 -; GFNIAVX512VL-NEXT: vpunpckhbw {{.*#+}} ymm5 = ymm4[8],ymm3[8],ymm4[9],ymm3[9],ymm4[10],ymm3[10],ymm4[11],ymm3[11],ymm4[12],ymm3[12],ymm4[13],ymm3[13],ymm4[14],ymm3[14],ymm4[15],ymm3[15],ymm4[24],ymm3[24],ymm4[25],ymm3[25],ymm4[26],ymm3[26],ymm4[27],ymm3[27],ymm4[28],ymm3[28],ymm4[29],ymm3[29],ymm4[30],ymm3[30],ymm4[31],ymm3[31] -; GFNIAVX512VL-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2 -; GFNIAVX512VL-NEXT: vpsrlw %xmm2, %ymm5, %ymm5 +; GFNIAVX512VL-NEXT: vpunpckhbw {{.*#+}} ymm5 = ymm4[8],ymm2[8],ymm4[9],ymm2[9],ymm4[10],ymm2[10],ymm4[11],ymm2[11],ymm4[12],ymm2[12],ymm4[13],ymm2[13],ymm4[14],ymm2[14],ymm4[15],ymm2[15],ymm4[24],ymm2[24],ymm4[25],ymm2[25],ymm4[26],ymm2[26],ymm4[27],ymm2[27],ymm4[28],ymm2[28],ymm4[29],ymm2[29],ymm4[30],ymm2[30],ymm4[31],ymm2[31] +; GFNIAVX512VL-NEXT: vpsrlw %xmm3, %ymm5, %ymm5 ; GFNIAVX512VL-NEXT: vpbroadcastd {{.*#+}} ymm6 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] ; GFNIAVX512VL-NEXT: vpand %ymm6, %ymm5, %ymm5 -; GFNIAVX512VL-NEXT: vpunpcklbw {{.*#+}} ymm3 = ymm4[0],ymm3[0],ymm4[1],ymm3[1],ymm4[2],ymm3[2],ymm4[3],ymm3[3],ymm4[4],ymm3[4],ymm4[5],ymm3[5],ymm4[6],ymm3[6],ymm4[7],ymm3[7],ymm4[16],ymm3[16],ymm4[17],ymm3[17],ymm4[18],ymm3[18],ymm4[19],ymm3[19],ymm4[20],ymm3[20],ymm4[21],ymm3[21],ymm4[22],ymm3[22],ymm4[23],ymm3[23] -; GFNIAVX512VL-NEXT: vpsrlw %xmm2, %ymm3, %ymm3 -; GFNIAVX512VL-NEXT: vpand %ymm6, %ymm3, %ymm3 -; GFNIAVX512VL-NEXT: vpackuswb %ymm5, %ymm3, %ymm3 +; GFNIAVX512VL-NEXT: vpunpcklbw {{.*#+}} ymm2 = ymm4[0],ymm2[0],ymm4[1],ymm2[1],ymm4[2],ymm2[2],ymm4[3],ymm2[3],ymm4[4],ymm2[4],ymm4[5],ymm2[5],ymm4[6],ymm2[6],ymm4[7],ymm2[7],ymm4[16],ymm2[16],ymm4[17],ymm2[17],ymm4[18],ymm2[18],ymm4[19],ymm2[19],ymm4[20],ymm2[20],ymm4[21],ymm2[21],ymm4[22],ymm2[22],ymm4[23],ymm2[23] +; GFNIAVX512VL-NEXT: vpsrlw %xmm3, %ymm2, %ymm2 +; GFNIAVX512VL-NEXT: vpand %ymm6, %ymm2, %ymm2 +; GFNIAVX512VL-NEXT: vpackuswb %ymm5, %ymm2, %ymm2 ; GFNIAVX512VL-NEXT: vpunpckhbw {{.*#+}} ymm4 = ymm1[8],ymm0[8],ymm1[9],ymm0[9],ymm1[10],ymm0[10],ymm1[11],ymm0[11],ymm1[12],ymm0[12],ymm1[13],ymm0[13],ymm1[14],ymm0[14],ymm1[15],ymm0[15],ymm1[24],ymm0[24],ymm1[25],ymm0[25],ymm1[26],ymm0[26],ymm1[27],ymm0[27],ymm1[28],ymm0[28],ymm1[29],ymm0[29],ymm1[30],ymm0[30],ymm1[31],ymm0[31] -; GFNIAVX512VL-NEXT: vpsrlw %xmm2, %ymm4, %ymm4 +; GFNIAVX512VL-NEXT: vpsrlw %xmm3, %ymm4, %ymm4 ; GFNIAVX512VL-NEXT: vpand %ymm6, %ymm4, %ymm4 ; GFNIAVX512VL-NEXT: vpunpcklbw {{.*#+}} ymm0 = 
ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[2],ymm0[2],ymm1[3],ymm0[3],ymm1[4],ymm0[4],ymm1[5],ymm0[5],ymm1[6],ymm0[6],ymm1[7],ymm0[7],ymm1[16],ymm0[16],ymm1[17],ymm0[17],ymm1[18],ymm0[18],ymm1[19],ymm0[19],ymm1[20],ymm0[20],ymm1[21],ymm0[21],ymm1[22],ymm0[22],ymm1[23],ymm0[23] -; GFNIAVX512VL-NEXT: vpsrlw %xmm2, %ymm0, %ymm0 +; GFNIAVX512VL-NEXT: vpsrlw %xmm3, %ymm0, %ymm0 ; GFNIAVX512VL-NEXT: vpand %ymm6, %ymm0, %ymm0 ; GFNIAVX512VL-NEXT: vpackuswb %ymm4, %ymm0, %ymm0 -; GFNIAVX512VL-NEXT: vinserti64x4 $1, %ymm3, %zmm0, %zmm0 +; GFNIAVX512VL-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0 ; GFNIAVX512VL-NEXT: retq ; ; GFNIAVX512BW-LABEL: splatvar_fshr_v64i8: diff --git a/llvm/test/CodeGen/X86/half.ll b/llvm/test/CodeGen/X86/half.ll index 2472e6e19c862..565c231dffd77 100644 --- a/llvm/test/CodeGen/X86/half.ll +++ b/llvm/test/CodeGen/X86/half.ll @@ -1260,23 +1260,21 @@ define <8 x half> @select(i1 %c, <8 x half> %x, <8 x half> %y) { define <8 x half> @shuffle(ptr %p) { ; CHECK-LIBCALL-LABEL: shuffle: ; CHECK-LIBCALL: # %bb.0: -; CHECK-LIBCALL-NEXT: movdqu (%rdi), %xmm0 -; CHECK-LIBCALL-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,4,4,4] -; CHECK-LIBCALL-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,2,2,2] +; CHECK-LIBCALL-NEXT: pinsrw $0, 8(%rdi), %xmm0 +; CHECK-LIBCALL-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7] ; CHECK-LIBCALL-NEXT: retq ; ; BWON-F16C-LABEL: shuffle: ; BWON-F16C: # %bb.0: -; BWON-F16C-NEXT: vpshufhw {{.*#+}} xmm0 = mem[0,1,2,3,4,4,4,4] -; BWON-F16C-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,2,2,2] +; BWON-F16C-NEXT: vpinsrw $0, 8(%rdi), %xmm0, %xmm0 +; BWON-F16C-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7] ; BWON-F16C-NEXT: retq ; ; CHECK-I686-LABEL: shuffle: ; CHECK-I686: # %bb.0: ; CHECK-I686-NEXT: movl {{[0-9]+}}(%esp), %eax -; CHECK-I686-NEXT: movdqu (%eax), %xmm0 -; CHECK-I686-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,4,4,4] -; CHECK-I686-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,2,2,2] +; CHECK-I686-NEXT: pinsrw $0, 8(%eax), %xmm0 +; CHECK-I686-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7] ; CHECK-I686-NEXT: retl %1 = load <8 x half>, ptr %p, align 8 %2 = shufflevector <8 x half> %1, <8 x half> poison, <8 x i32> diff --git a/llvm/test/CodeGen/X86/insert-into-constant-vector.ll b/llvm/test/CodeGen/X86/insert-into-constant-vector.ll index c44945ac2d929..20aa93bf10ec2 100644 --- a/llvm/test/CodeGen/X86/insert-into-constant-vector.ll +++ b/llvm/test/CodeGen/X86/insert-into-constant-vector.ll @@ -145,9 +145,9 @@ define <2 x i64> @elt0_v2i64(i64 %x) { ; ; X64-SSE2-LABEL: elt0_v2i64: ; X64-SSE2: # %bb.0: -; X64-SSE2-NEXT: movq %rdi, %xmm1 -; X64-SSE2-NEXT: movapd {{.*#+}} xmm0 = [u,1] -; X64-SSE2-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1] +; X64-SSE2-NEXT: movq %rdi, %xmm0 +; X64-SSE2-NEXT: movq {{.*#+}} xmm1 = mem[0],zero +; X64-SSE2-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] ; X64-SSE2-NEXT: retq ; ; X64-SSE4-LABEL: elt0_v2i64: @@ -218,28 +218,26 @@ define <4 x float> @elt1_v4f32(float %x) { define <2 x double> @elt1_v2f64(double %x) { ; X86-SSE-LABEL: elt1_v2f64: ; X86-SSE: # %bb.0: -; X86-SSE-NEXT: movaps {{.*#+}} xmm0 = [4.2E+1,u] +; X86-SSE-NEXT: movsd {{.*#+}} xmm0 = [4.2E+1,0.0E+0] ; X86-SSE-NEXT: movhps {{.*#+}} xmm0 = xmm0[0,1],mem[0,1] ; X86-SSE-NEXT: retl ; ; X64-SSE-LABEL: elt1_v2f64: ; X64-SSE: # %bb.0: -; X64-SSE-NEXT: movaps {{.*#+}} xmm1 = [4.2E+1,u] +; X64-SSE-NEXT: movsd {{.*#+}} xmm1 = [4.2E+1,0.0E+0] ; X64-SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0] ; X64-SSE-NEXT: movaps %xmm1, %xmm0 ; X64-SSE-NEXT: retq ; ; X86-AVX-LABEL: elt1_v2f64: ; X86-AVX: 
# %bb.0: -; X86-AVX-NEXT: vmovddup {{.*#+}} xmm0 = [4.2E+1,4.2E+1] -; X86-AVX-NEXT: # xmm0 = mem[0,0] +; X86-AVX-NEXT: vmovsd {{.*#+}} xmm0 = [4.2E+1,0.0E+0] ; X86-AVX-NEXT: vmovhps {{.*#+}} xmm0 = xmm0[0,1],mem[0,1] ; X86-AVX-NEXT: retl ; ; X64-AVX-LABEL: elt1_v2f64: ; X64-AVX: # %bb.0: -; X64-AVX-NEXT: vmovddup {{.*#+}} xmm1 = [4.2E+1,4.2E+1] -; X64-AVX-NEXT: # xmm1 = mem[0,0] +; X64-AVX-NEXT: vmovsd {{.*#+}} xmm1 = [4.2E+1,0.0E+0] ; X64-AVX-NEXT: vmovlhps {{.*#+}} xmm0 = xmm1[0],xmm0[0] ; X64-AVX-NEXT: retq %ins = insertelement <2 x double> , double %x, i32 1 @@ -384,7 +382,7 @@ define <8 x i64> @elt5_v8i64(i64 %x) { ; X64-SSE2-LABEL: elt5_v8i64: ; X64-SSE2: # %bb.0: ; X64-SSE2-NEXT: movq %rdi, %xmm0 -; X64-SSE2-NEXT: movdqa {{.*#+}} xmm2 = [4,u] +; X64-SSE2-NEXT: movq {{.*#+}} xmm2 = [4,0] ; X64-SSE2-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm0[0] ; X64-SSE2-NEXT: movaps {{.*#+}} xmm0 = [42,1] ; X64-SSE2-NEXT: movaps {{.*#+}} xmm1 = [2,3] @@ -457,7 +455,7 @@ define <8 x i64> @elt5_v8i64(i64 %x) { define <8 x double> @elt1_v8f64(double %x) { ; X86-SSE-LABEL: elt1_v8f64: ; X86-SSE: # %bb.0: -; X86-SSE-NEXT: movaps {{.*#+}} xmm0 = [4.2E+1,u] +; X86-SSE-NEXT: movsd {{.*#+}} xmm0 = [4.2E+1,0.0E+0] ; X86-SSE-NEXT: movhps {{.*#+}} xmm0 = xmm0[0,1],mem[0,1] ; X86-SSE-NEXT: movaps {{.*#+}} xmm1 = [2.0E+0,3.0E+0] ; X86-SSE-NEXT: movaps {{.*#+}} xmm2 = [4.0E+0,5.0E+0] @@ -466,7 +464,7 @@ define <8 x double> @elt1_v8f64(double %x) { ; ; X64-SSE-LABEL: elt1_v8f64: ; X64-SSE: # %bb.0: -; X64-SSE-NEXT: movaps {{.*#+}} xmm4 = [4.2E+1,u] +; X64-SSE-NEXT: movsd {{.*#+}} xmm4 = [4.2E+1,0.0E+0] ; X64-SSE-NEXT: movlhps {{.*#+}} xmm4 = xmm4[0],xmm0[0] ; X64-SSE-NEXT: movaps {{.*#+}} xmm1 = [2.0E+0,3.0E+0] ; X64-SSE-NEXT: movaps {{.*#+}} xmm2 = [4.0E+0,5.0E+0] @@ -476,47 +474,49 @@ define <8 x double> @elt1_v8f64(double %x) { ; ; X86-AVX1-LABEL: elt1_v8f64: ; X86-AVX1: # %bb.0: -; X86-AVX1-NEXT: vmovaps {{.*#+}} ymm0 = [4.2E+1,u,2.0E+0,3.0E+0] -; X86-AVX1-NEXT: vmovhps {{.*#+}} xmm1 = xmm0[0,1],mem[0,1] -; X86-AVX1-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; X86-AVX1-NEXT: vmovsd {{.*#+}} xmm0 = [4.2E+1,0.0E+0] +; X86-AVX1-NEXT: vmovhps {{.*#+}} xmm0 = xmm0[0,1],mem[0,1] +; X86-AVX1-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],mem[4,5,6,7] ; X86-AVX1-NEXT: vmovaps {{.*#+}} ymm1 = [4.0E+0,5.0E+0,6.0E+0,7.0E+0] ; X86-AVX1-NEXT: retl ; ; X64-AVX1-LABEL: elt1_v8f64: ; X64-AVX1: # %bb.0: -; X64-AVX1-NEXT: vmovaps {{.*#+}} ymm1 = [4.2E+1,u,2.0E+0,3.0E+0] +; X64-AVX1-NEXT: vmovsd {{.*#+}} xmm1 = [4.2E+1,0.0E+0] ; X64-AVX1-NEXT: vmovlhps {{.*#+}} xmm0 = xmm1[0],xmm0[0] -; X64-AVX1-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; X64-AVX1-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],mem[4,5,6,7] ; X64-AVX1-NEXT: vmovaps {{.*#+}} ymm1 = [4.0E+0,5.0E+0,6.0E+0,7.0E+0] ; X64-AVX1-NEXT: retq ; ; X86-AVX2-LABEL: elt1_v8f64: ; X86-AVX2: # %bb.0: -; X86-AVX2-NEXT: vmovaps {{.*#+}} ymm0 = [4.2E+1,u,2.0E+0,3.0E+0] -; X86-AVX2-NEXT: vmovhps {{.*#+}} xmm1 = xmm0[0,1],mem[0,1] -; X86-AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; X86-AVX2-NEXT: vmovsd {{.*#+}} xmm0 = [4.2E+1,0.0E+0] +; X86-AVX2-NEXT: vmovhps {{.*#+}} xmm0 = xmm0[0,1],mem[0,1] +; X86-AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],mem[4,5,6,7] ; X86-AVX2-NEXT: vmovaps {{.*#+}} ymm1 = [4.0E+0,5.0E+0,6.0E+0,7.0E+0] ; X86-AVX2-NEXT: retl ; ; X64-AVX2-LABEL: elt1_v8f64: ; X64-AVX2: # %bb.0: -; X64-AVX2-NEXT: vmovaps {{.*#+}} ymm1 = [4.2E+1,u,2.0E+0,3.0E+0] +; X64-AVX2-NEXT: vmovsd {{.*#+}} xmm1 = [4.2E+1,0.0E+0] ; 
X64-AVX2-NEXT: vmovlhps {{.*#+}} xmm0 = xmm1[0],xmm0[0] -; X64-AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; X64-AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],mem[4,5,6,7] ; X64-AVX2-NEXT: vmovaps {{.*#+}} ymm1 = [4.0E+0,5.0E+0,6.0E+0,7.0E+0] ; X64-AVX2-NEXT: retq ; ; X86-AVX512F-LABEL: elt1_v8f64: ; X86-AVX512F: # %bb.0: -; X86-AVX512F-NEXT: vmovaps {{.*#+}} zmm0 = [4.2E+1,u,2.0E+0,3.0E+0,4.0E+0,5.0E+0,6.0E+0,7.0E+0] -; X86-AVX512F-NEXT: vmovhps {{.*#+}} xmm1 = xmm0[0,1],mem[0,1] -; X86-AVX512F-NEXT: vinsertf32x4 $0, %xmm1, %zmm0, %zmm0 +; X86-AVX512F-NEXT: vmovsd {{.*#+}} xmm0 = [4.2E+1,0.0E+0] +; X86-AVX512F-NEXT: vmovhps {{.*#+}} xmm0 = xmm0[0,1],mem[0,1] +; X86-AVX512F-NEXT: vmovaps {{.*#+}} zmm1 = [4.2E+1,u,2.0E+0,3.0E+0,4.0E+0,5.0E+0,6.0E+0,7.0E+0] +; X86-AVX512F-NEXT: vinsertf32x4 $0, %xmm0, %zmm1, %zmm0 ; X86-AVX512F-NEXT: retl ; ; X64-AVX512F-LABEL: elt1_v8f64: ; X64-AVX512F: # %bb.0: -; X64-AVX512F-NEXT: vmovaps {{.*#+}} zmm1 = [4.2E+1,u,2.0E+0,3.0E+0,4.0E+0,5.0E+0,6.0E+0,7.0E+0] +; X64-AVX512F-NEXT: vmovsd {{.*#+}} xmm1 = [4.2E+1,0.0E+0] ; X64-AVX512F-NEXT: vmovlhps {{.*#+}} xmm0 = xmm1[0],xmm0[0] +; X64-AVX512F-NEXT: vmovaps {{.*#+}} zmm1 = [4.2E+1,u,2.0E+0,3.0E+0,4.0E+0,5.0E+0,6.0E+0,7.0E+0] ; X64-AVX512F-NEXT: vinsertf32x4 $0, %xmm0, %zmm1, %zmm0 ; X64-AVX512F-NEXT: retq %ins = insertelement <8 x double> , double %x, i32 1 diff --git a/llvm/test/CodeGen/X86/insertps-combine.ll b/llvm/test/CodeGen/X86/insertps-combine.ll index 18edc83b7edcf..ee7af27b0ac70 100644 --- a/llvm/test/CodeGen/X86/insertps-combine.ll +++ b/llvm/test/CodeGen/X86/insertps-combine.ll @@ -269,12 +269,12 @@ define float @extract_zero_insertps_z0z7(<4 x float> %a0, <4 x float> %a1) { define float @extract_lane_insertps_5123(<4 x float> %a0, ptr%p1) { ; SSE-LABEL: extract_lane_insertps_5123: ; SSE: # %bb.0: -; SSE-NEXT: movshdup (%rdi), %xmm0 # xmm0 = mem[1,1,3,3] +; SSE-NEXT: movss 4(%rdi), %xmm0 # xmm0 = mem[0],zero,zero,zero ; SSE-NEXT: retq ; ; AVX-LABEL: extract_lane_insertps_5123: ; AVX: # %bb.0: -; AVX-NEXT: vmovshdup (%rdi), %xmm0 # xmm0 = mem[1,1,3,3] +; AVX-NEXT: vmovss 4(%rdi), %xmm0 # xmm0 = mem[0],zero,zero,zero ; AVX-NEXT: retq %a1 = load <4 x float>, ptr%p1 %res = call <4 x float> @llvm.x86.sse41.insertps(<4 x float> %a0, <4 x float> %a1, i8 64) @@ -285,13 +285,12 @@ define float @extract_lane_insertps_5123(<4 x float> %a0, ptr%p1) { define float @extract_lane_insertps_6123(<4 x float> %a0, ptr%p1) { ; SSE-LABEL: extract_lane_insertps_6123: ; SSE: # %bb.0: -; SSE-NEXT: movaps (%rdi), %xmm0 -; SSE-NEXT: movhlps {{.*#+}} xmm0 = xmm0[1,1] +; SSE-NEXT: movss 8(%rdi), %xmm0 # xmm0 = mem[0],zero,zero,zero ; SSE-NEXT: retq ; ; AVX-LABEL: extract_lane_insertps_6123: ; AVX: # %bb.0: -; AVX-NEXT: vpermilpd $1, (%rdi), %xmm0 # xmm0 = mem[1,0] +; AVX-NEXT: vmovss 8(%rdi), %xmm0 # xmm0 = mem[0],zero,zero,zero ; AVX-NEXT: retq %a1 = load <4 x float>, ptr%p1 %res = call <4 x float> @llvm.x86.sse41.insertps(<4 x float> %a0, <4 x float> %a1, i8 128) @@ -308,7 +307,8 @@ define <4 x float> @commute_load_insertps(<4 x float>, ptr nocapture readonly) { ; ; AVX-LABEL: commute_load_insertps: ; AVX: # %bb.0: -; AVX-NEXT: vinsertps $53, 12(%rdi), %xmm0, %xmm0 # xmm0 = zero,xmm0[1],zero,mem[0] +; AVX-NEXT: vbroadcastss 12(%rdi), %xmm1 +; AVX-NEXT: vinsertps {{.*#+}} xmm0 = zero,xmm0[1],zero,xmm1[3] ; AVX-NEXT: retq %3 = load <4 x float>, ptr %1 %4 = tail call <4 x float> @llvm.x86.sse41.insertps(<4 x float> %3, <4 x float> %0, i8 85) diff --git 
a/llvm/test/CodeGen/X86/insertps-from-constantpool.ll b/llvm/test/CodeGen/X86/insertps-from-constantpool.ll index f03df634dc1de..99ed327c36c3e 100644 --- a/llvm/test/CodeGen/X86/insertps-from-constantpool.ll +++ b/llvm/test/CodeGen/X86/insertps-from-constantpool.ll @@ -7,12 +7,14 @@ define <4 x float> @fold_from_constantpool(<4 x float> %a) { ; X86-LABEL: fold_from_constantpool: ; X86: # %bb.0: -; X86-NEXT: insertps $0, {{\.?LCPI[0-9]+_[0-9]+}}+4, %xmm0 # xmm0 = mem[0],xmm0[1,2,3] +; X86-NEXT: movss {{\.?LCPI[0-9]+_[0-9]+}}+4, %xmm1 # xmm1 = mem[0],zero,zero,zero +; X86-NEXT: blendps {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3] ; X86-NEXT: retl ; ; X64-LABEL: fold_from_constantpool: ; X64: # %bb.0: -; X64-NEXT: insertps $0, {{\.?LCPI[0-9]+_[0-9]+}}+4(%rip), %xmm0 # xmm0 = mem[0],xmm0[1,2,3] +; X64-NEXT: movss {{\.?LCPI[0-9]+_[0-9]+}}+4(%rip), %xmm1 # xmm1 = mem[0],zero,zero,zero +; X64-NEXT: blendps {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3] ; X64-NEXT: retq %1 = call <4 x float> @llvm.x86.sse41.insertps(<4 x float> %a, <4 x float> , i8 64) ret <4 x float> %1 diff --git a/llvm/test/CodeGen/X86/insertps-unfold-load-bug.ll b/llvm/test/CodeGen/X86/insertps-unfold-load-bug.ll index 93b60c27255f3..29737b3acf55e 100644 --- a/llvm/test/CodeGen/X86/insertps-unfold-load-bug.ll +++ b/llvm/test/CodeGen/X86/insertps-unfold-load-bug.ll @@ -11,7 +11,7 @@ define <4 x float> @insertps_unfold(ptr %v0, ptr %v1) { ; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X32-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero ; X32-NEXT: movaps (%eax), %xmm0 -; X32-NEXT: insertps {{.*#+}} xmm0 = xmm0[0,1,2],mem[0] +; X32-NEXT: insertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[0] ; X32-NEXT: addps %xmm1, %xmm0 ; X32-NEXT: retl ; @@ -19,7 +19,7 @@ define <4 x float> @insertps_unfold(ptr %v0, ptr %v1) { ; X64: # %bb.0: ; X64-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero ; X64-NEXT: movaps (%rdi), %xmm0 -; X64-NEXT: insertps {{.*#+}} xmm0 = xmm0[0,1,2],mem[0] +; X64-NEXT: insertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[0] ; X64-NEXT: addps %xmm1, %xmm0 ; X64-NEXT: retq %a = getelementptr inbounds <4 x float>, ptr %v1, i64 0, i64 1 diff --git a/llvm/test/CodeGen/X86/is_fpclass.ll b/llvm/test/CodeGen/X86/is_fpclass.ll index 97136dafa6c2c..198d1a08a9342 100644 --- a/llvm/test/CodeGen/X86/is_fpclass.ll +++ b/llvm/test/CodeGen/X86/is_fpclass.ll @@ -2595,8 +2595,9 @@ define i1 @issubnormal_or_zero_or_nan_f(float %x) { ; ; X64-LABEL: issubnormal_or_zero_or_nan_f: ; X64: # %bb.0: -; X64-NEXT: andps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; X64-NEXT: ucomiss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; X64-NEXT: movss {{.*#+}} xmm1 = [NaN,0.0E+0,0.0E+0,0.0E+0] +; X64-NEXT: andps %xmm0, %xmm1 +; X64-NEXT: ucomiss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 ; X64-NEXT: setb %al ; X64-NEXT: retq %class = tail call i1 @llvm.is.fpclass.f32(float %x, i32 243) ; 0xf0|0x3 = "subnormal|zero|nan" @@ -2764,8 +2765,9 @@ define i1 @not_issubnormal_or_zero_or_nan_f(float %x) { ; ; X64-LABEL: not_issubnormal_or_zero_or_nan_f: ; X64: # %bb.0: -; X64-NEXT: andps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; X64-NEXT: ucomiss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; X64-NEXT: movss {{.*#+}} xmm1 = [NaN,0.0E+0,0.0E+0,0.0E+0] +; X64-NEXT: andps %xmm0, %xmm1 +; X64-NEXT: ucomiss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 ; X64-NEXT: setae %al ; X64-NEXT: retq %class = tail call i1 @llvm.is.fpclass.f32(float %x, i32 780) ; ~(0xf0|0x3) = ~"subnormal|zero|nan" diff --git a/llvm/test/CodeGen/X86/isel-blendi-gettargetconstant.ll b/llvm/test/CodeGen/X86/isel-blendi-gettargetconstant.ll index 
fb7efc2200c67..ad8878a6f83b7 100644 --- a/llvm/test/CodeGen/X86/isel-blendi-gettargetconstant.ll +++ b/llvm/test/CodeGen/X86/isel-blendi-gettargetconstant.ll @@ -5,8 +5,11 @@ define void @csrot_(ptr %0) { ; CHECK-LABEL: csrot_: ; CHECK: # %bb.0: ; CHECK-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero -; CHECK-NEXT: xorps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; CHECK-NEXT: blendps {{.*#+}} xmm0 = xmm0[0],mem[1,2,3] +; CHECK-NEXT: movss {{.*#+}} xmm1 = [-0.0E+0,0.0E+0,0.0E+0,0.0E+0] +; CHECK-NEXT: xorps %xmm0, %xmm1 +; CHECK-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero +; CHECK-NEXT: movsldup {{.*#+}} xmm0 = xmm0[0,0,2,2] +; CHECK-NEXT: blendps {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3] ; CHECK-NEXT: movlps %xmm0, (%rax) ; CHECK-NEXT: retq 1: diff --git a/llvm/test/CodeGen/X86/load-partial.ll b/llvm/test/CodeGen/X86/load-partial.ll index dba63582ff08b..3a1cfcb9c9a6f 100644 --- a/llvm/test/CodeGen/X86/load-partial.ll +++ b/llvm/test/CodeGen/X86/load-partial.ll @@ -211,13 +211,11 @@ define <4 x float> @load_float4_float3_trunc_0123(ptr nocapture readonly derefer ; SSE2-LABEL: load_float4_float3_trunc_0123: ; SSE2: # %bb.0: ; SSE2-NEXT: movaps (%rdi), %xmm0 -; SSE2-NEXT: movhps {{.*#+}} xmm0 = xmm0[0,1],mem[0,1] ; SSE2-NEXT: retq ; ; SSSE3-LABEL: load_float4_float3_trunc_0123: ; SSSE3: # %bb.0: ; SSSE3-NEXT: movaps (%rdi), %xmm0 -; SSSE3-NEXT: movhps {{.*#+}} xmm0 = xmm0[0,1],mem[0,1] ; SSSE3-NEXT: retq ; ; SSE41-LABEL: load_float4_float3_trunc_0123: @@ -257,13 +255,11 @@ define <4 x float> @load_float4_float3_trunc_0123_unaligned(ptr nocapture readon ; SSE2-LABEL: load_float4_float3_trunc_0123_unaligned: ; SSE2: # %bb.0: ; SSE2-NEXT: movups (%rdi), %xmm0 -; SSE2-NEXT: movhps {{.*#+}} xmm0 = xmm0[0,1],mem[0,1] ; SSE2-NEXT: retq ; ; SSSE3-LABEL: load_float4_float3_trunc_0123_unaligned: ; SSSE3: # %bb.0: ; SSSE3-NEXT: movups (%rdi), %xmm0 -; SSSE3-NEXT: movhps {{.*#+}} xmm0 = xmm0[0,1],mem[0,1] ; SSSE3-NEXT: retq ; ; SSE41-LABEL: load_float4_float3_trunc_0123_unaligned: diff --git a/llvm/test/CodeGen/X86/masked_load.ll b/llvm/test/CodeGen/X86/masked_load.ll index 89459a2d10177..7cba05d61a1c3 100644 --- a/llvm/test/CodeGen/X86/masked_load.ll +++ b/llvm/test/CodeGen/X86/masked_load.ll @@ -6791,7 +6791,8 @@ define <8 x double> @mload_constmask_v8f64(ptr %addr, <8 x double> %dst) { ; AVX1OR2-LABEL: mload_constmask_v8f64: ; AVX1OR2: ## %bb.0: ; AVX1OR2-NEXT: vblendps {{.*#+}} ymm0 = mem[0,1,2,3,4,5],ymm0[6,7] -; AVX1OR2-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],mem[6,7] +; AVX1OR2-NEXT: vbroadcastsd 56(%rdi), %ymm2 +; AVX1OR2-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm2[6,7] ; AVX1OR2-NEXT: retq ; ; AVX512F-LABEL: mload_constmask_v8f64: diff --git a/llvm/test/CodeGen/X86/masked_store.ll b/llvm/test/CodeGen/X86/masked_store.ll index c7ec5e87dcc6b..1a8200c322973 100644 --- a/llvm/test/CodeGen/X86/masked_store.ll +++ b/llvm/test/CodeGen/X86/masked_store.ll @@ -4701,13 +4701,13 @@ define void @mstore_constmask_allones_split(<16 x i64> %trigger, ptr %addr, <16 ; SSE2-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm2 ; SSE2-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm3 ; SSE2-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm4 +; SSE2-NEXT: movq {{[0-9]+}}(%rsp), %rax +; SSE2-NEXT: movq {{[0-9]+}}(%rsp), %rcx ; SSE2-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm5 ; SSE2-NEXT: movq %xmm5, (%rdi) ; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm5[2,3,2,3] ; SSE2-NEXT: movq %xmm5, 8(%rdi) -; SSE2-NEXT: pshufd {{.*#+}} xmm5 = mem[2,3,2,3] -; SSE2-NEXT: movq %xmm5, 24(%rdi) -; SSE2-NEXT: movq {{[0-9]+}}(%rsp), %rax +; SSE2-NEXT: movq %rcx, 24(%rdi) ; 
SSE2-NEXT: movq %rax, 32(%rdi) ; SSE2-NEXT: movq %xmm4, 48(%rdi) ; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm4[2,3,2,3] @@ -4733,11 +4733,10 @@ define void @mstore_constmask_allones_split(<16 x i64> %trigger, ptr %addr, <16 ; SSE4-NEXT: movaps {{[0-9]+}}(%rsp), %xmm2 ; SSE4-NEXT: movaps {{[0-9]+}}(%rsp), %xmm3 ; SSE4-NEXT: movaps {{[0-9]+}}(%rsp), %xmm4 -; SSE4-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm5 -; SSE4-NEXT: movaps {{[0-9]+}}(%rsp), %xmm6 -; SSE4-NEXT: movups %xmm6, (%rdi) -; SSE4-NEXT: palignr {{.*#+}} xmm5 = mem[8,9,10,11,12,13,14,15],xmm5[0,1,2,3,4,5,6,7] -; SSE4-NEXT: movdqu %xmm5, 24(%rdi) +; SSE4-NEXT: movaps {{[0-9]+}}(%rsp), %xmm5 +; SSE4-NEXT: movups %xmm5, (%rdi) +; SSE4-NEXT: movups {{[0-9]+}}(%rsp), %xmm5 +; SSE4-NEXT: movups %xmm5, 24(%rdi) ; SSE4-NEXT: movups %xmm4, 48(%rdi) ; SSE4-NEXT: movups %xmm3, 64(%rdi) ; SSE4-NEXT: movups %xmm2, 80(%rdi) diff --git a/llvm/test/CodeGen/X86/mmx-arith.ll b/llvm/test/CodeGen/X86/mmx-arith.ll index 73d459ba77026..751805db38ec0 100644 --- a/llvm/test/CodeGen/X86/mmx-arith.ll +++ b/llvm/test/CodeGen/X86/mmx-arith.ll @@ -325,14 +325,11 @@ define void @test2(ptr %A, ptr %B) nounwind { ; X64-NEXT: movq %mm0, %rax ; X64-NEXT: movq %mm0, (%rdi) ; X64-NEXT: andq (%rsi), %rax -; X64-NEXT: movq %rax, %xmm0 ; X64-NEXT: movq %rax, (%rdi) -; X64-NEXT: movq {{.*#+}} xmm1 = mem[0],zero -; X64-NEXT: por %xmm0, %xmm1 -; X64-NEXT: movq %xmm1, (%rdi) -; X64-NEXT: movq {{.*#+}} xmm0 = mem[0],zero -; X64-NEXT: pxor %xmm1, %xmm0 -; X64-NEXT: movq %xmm0, (%rdi) +; X64-NEXT: orq (%rsi), %rax +; X64-NEXT: movq %rax, (%rdi) +; X64-NEXT: xorq (%rsi), %rax +; X64-NEXT: movq %rax, (%rdi) ; X64-NEXT: emms ; X64-NEXT: retq entry: diff --git a/llvm/test/CodeGen/X86/neg_fp.ll b/llvm/test/CodeGen/X86/neg_fp.ll index 8020982509819..84bbf18dde36a 100644 --- a/llvm/test/CodeGen/X86/neg_fp.ll +++ b/llvm/test/CodeGen/X86/neg_fp.ll @@ -10,8 +10,9 @@ define float @negfp(float %a, float %b) nounwind { ; CHECK-NEXT: pushl %eax ; CHECK-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero ; CHECK-NEXT: subss {{[0-9]+}}(%esp), %xmm0 -; CHECK-NEXT: xorps {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0 -; CHECK-NEXT: movss %xmm0, (%esp) +; CHECK-NEXT: movss {{.*#+}} xmm1 = [-0.0E+0,0.0E+0,0.0E+0,0.0E+0] +; CHECK-NEXT: xorps %xmm0, %xmm1 +; CHECK-NEXT: movss %xmm1, (%esp) ; CHECK-NEXT: flds (%esp) ; CHECK-NEXT: popl %eax ; CHECK-NEXT: retl diff --git a/llvm/test/CodeGen/X86/negative-sin.ll b/llvm/test/CodeGen/X86/negative-sin.ll index f24507d3a4f38..3368dde860875 100644 --- a/llvm/test/CodeGen/X86/negative-sin.ll +++ b/llvm/test/CodeGen/X86/negative-sin.ll @@ -56,7 +56,8 @@ define double @semi_strict1(double %e) nounwind { ; CHECK-NEXT: vxorpd %xmm1, %xmm1, %xmm1 ; CHECK-NEXT: vsubsd %xmm0, %xmm1, %xmm0 ; CHECK-NEXT: callq sin@PLT -; CHECK-NEXT: vxorpd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; CHECK-NEXT: vmovsd {{.*#+}} xmm1 = [-0.0E+0,0.0E+0] +; CHECK-NEXT: vxorpd %xmm1, %xmm0, %xmm0 ; CHECK-NEXT: popq %rax ; CHECK-NEXT: retq %f = fsub double 0.0, %e diff --git a/llvm/test/CodeGen/X86/packus.ll b/llvm/test/CodeGen/X86/packus.ll index 384e40496d82a..ce82ad7857dda 100644 --- a/llvm/test/CodeGen/X86/packus.ll +++ b/llvm/test/CodeGen/X86/packus.ll @@ -118,25 +118,45 @@ define <8 x i16> @trunc_lshr_v8i32(<8 x i32> %a) nounwind { } define <8 x i16> @trunc_lshr_v4i64_demandedelts(<4 x i64> %a0) { -; SSE2-LABEL: trunc_lshr_v4i64_demandedelts: -; SSE2: # %bb.0: -; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,0,0] -; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [1,1,1,1] -; SSE2-NEXT: pand %xmm2, %xmm1 -; SSE2-NEXT: pshufd 
{{.*#+}} xmm0 = xmm0[0,0,0,0] -; SSE2-NEXT: pand %xmm2, %xmm0 -; SSE2-NEXT: packuswb %xmm1, %xmm0 -; SSE2-NEXT: ret{{[l|q]}} +; X86-SSE2-LABEL: trunc_lshr_v4i64_demandedelts: +; X86-SSE2: # %bb.0: +; X86-SSE2-NEXT: movd {{.*#+}} xmm2 = [1,0,0,0] +; X86-SSE2-NEXT: pand %xmm0, %xmm2 +; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,0,0,0] +; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,0,0] +; X86-SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm1 +; X86-SSE2-NEXT: packuswb %xmm1, %xmm0 +; X86-SSE2-NEXT: retl ; -; SSE4-LABEL: trunc_lshr_v4i64_demandedelts: -; SSE4: # %bb.0: -; SSE4-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,0,0] -; SSE4-NEXT: pmovsxbd {{.*#+}} xmm2 = [1,1,1,1] -; SSE4-NEXT: pand %xmm2, %xmm1 -; SSE4-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] -; SSE4-NEXT: pand %xmm2, %xmm0 -; SSE4-NEXT: packusdw %xmm1, %xmm0 -; SSE4-NEXT: ret{{[l|q]}} +; X64-SSE2-LABEL: trunc_lshr_v4i64_demandedelts: +; X64-SSE2: # %bb.0: +; X64-SSE2-NEXT: movq {{.*#+}} xmm2 = [1,0] +; X64-SSE2-NEXT: pand %xmm0, %xmm2 +; X64-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,0,0,0] +; X64-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,0,0] +; X64-SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 +; X64-SSE2-NEXT: packuswb %xmm1, %xmm0 +; X64-SSE2-NEXT: retq +; +; X86-SSE4-LABEL: trunc_lshr_v4i64_demandedelts: +; X86-SSE4: # %bb.0: +; X86-SSE4-NEXT: movd {{.*#+}} xmm2 = [1,0,0,0] +; X86-SSE4-NEXT: pand %xmm0, %xmm2 +; X86-SSE4-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,0,0,0] +; X86-SSE4-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,0,0] +; X86-SSE4-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm1 +; X86-SSE4-NEXT: packusdw %xmm1, %xmm0 +; X86-SSE4-NEXT: retl +; +; X64-SSE4-LABEL: trunc_lshr_v4i64_demandedelts: +; X64-SSE4: # %bb.0: +; X64-SSE4-NEXT: movq {{.*#+}} xmm2 = [1,0] +; X64-SSE4-NEXT: pand %xmm0, %xmm2 +; X64-SSE4-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,0,0,0] +; X64-SSE4-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,0,0] +; X64-SSE4-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 +; X64-SSE4-NEXT: packusdw %xmm1, %xmm0 +; X64-SSE4-NEXT: retq ; ; X86-AVX1-LABEL: trunc_lshr_v4i64_demandedelts: ; X86-AVX1: # %bb.0: @@ -447,8 +467,4 @@ define <32 x i8> @packuswb_icmp_zero_trunc_256(<16 x i16> %a0) { } ;; NOTE: These prefixes are unused and the list is autogenerated. 
Do not add tests below this line: ; X64-AVX2: {{.*}} -; X64-SSE2: {{.*}} -; X64-SSE4: {{.*}} ; X86-AVX2: {{.*}} -; X86-SSE2: {{.*}} -; X86-SSE4: {{.*}} diff --git a/llvm/test/CodeGen/X86/peephole-fold-movsd.ll b/llvm/test/CodeGen/X86/peephole-fold-movsd.ll index c0a6e00ec695e..46d65e6f375e5 100644 --- a/llvm/test/CodeGen/X86/peephole-fold-movsd.ll +++ b/llvm/test/CodeGen/X86/peephole-fold-movsd.ll @@ -18,7 +18,7 @@ define dso_local void @foo1(double %a.coerce0, double %a.coerce1, double %b.coer ; CHECK-NEXT: movq %rsp, %rdi ; CHECK-NEXT: callq foo3@PLT ; CHECK-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero -; CHECK-NEXT: movapd {{.*#+}} xmm1 = [1.0E+0,u] +; CHECK-NEXT: movsd {{.*#+}} xmm1 = [1.0E+0,0.0E+0] ; CHECK-NEXT: movhpd {{.*#+}} xmm1 = xmm1[0],mem[0] ; CHECK-NEXT: addpd %xmm0, %xmm1 ; CHECK-NEXT: movapd %xmm1, g(%rip) diff --git a/llvm/test/CodeGen/X86/pr14161.ll b/llvm/test/CodeGen/X86/pr14161.ll index cdf3757e05b20..a38ad03117855 100644 --- a/llvm/test/CodeGen/X86/pr14161.ll +++ b/llvm/test/CodeGen/X86/pr14161.ll @@ -24,7 +24,8 @@ entry: define <2 x i16> @bad(ptr, ptr) { ; CHECK-LABEL: bad: ; CHECK: # %bb.0: # %entry -; CHECK-NEXT: pshufd {{.*#+}} xmm0 = mem[1,1,1,1] +; CHECK-NEXT: pinsrd $1, 4(%rdi), %xmm0 +; CHECK-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,1,1] ; CHECK-NEXT: pminud {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 ; CHECK-NEXT: retq entry: diff --git a/llvm/test/CodeGen/X86/pr30511.ll b/llvm/test/CodeGen/X86/pr30511.ll index 088f3bfef8542..0a4428d0a74e5 100644 --- a/llvm/test/CodeGen/X86/pr30511.ll +++ b/llvm/test/CodeGen/X86/pr30511.ll @@ -7,8 +7,9 @@ target triple = "x86_64-pc-linux-gnu" define i64 @PR30511(<2 x double> %a) { ; CHECK-LABEL: PR30511: ; CHECK: # %bb.0: -; CHECK-NEXT: addpd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; CHECK-NEXT: cvtdq2pd %xmm0, %xmm0 +; CHECK-NEXT: movsd {{.*#+}} xmm1 = [6.755399441055744E+15,0.0E+0] +; CHECK-NEXT: addpd %xmm0, %xmm1 +; CHECK-NEXT: cvtdq2pd %xmm1, %xmm0 ; CHECK-NEXT: mulsd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 ; CHECK-NEXT: movq %xmm0, %rax ; CHECK-NEXT: retq diff --git a/llvm/test/CodeGen/X86/pr31956.ll b/llvm/test/CodeGen/X86/pr31956.ll index 38b55a5c32a61..692cdaff33fc1 100644 --- a/llvm/test/CodeGen/X86/pr31956.ll +++ b/llvm/test/CodeGen/X86/pr31956.ll @@ -9,10 +9,11 @@ target triple = "x86_64-scei-ps4" define <4 x float> @foo() { ; CHECK-LABEL: foo: ; CHECK: # %bb.0: # %entry -; CHECK-NEXT: vmovaps G2(%rip), %xmm0 -; CHECK-NEXT: vmovlps {{.*#+}} xmm0 = mem[0,1],xmm0[2,3] -; CHECK-NEXT: vshufps {{.*#+}} xmm0 = xmm0[2,0],mem[0,2] -; CHECK-NEXT: vshufps {{.*#+}} xmm0 = xmm0[2,0,3,1] +; CHECK-NEXT: vbroadcastss G2+16(%rip), %xmm0 +; CHECK-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0],mem[0],zero,zero +; CHECK-NEXT: vbroadcastss G2+24(%rip), %xmm1 +; CHECK-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3] +; CHECK-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],mem[0] ; CHECK-NEXT: retq entry: %V = load <2 x float>, ptr @G1, align 8 diff --git a/llvm/test/CodeGen/X86/pr34592.ll b/llvm/test/CodeGen/X86/pr34592.ll index aed5ea3ed217b..517d93f487883 100644 --- a/llvm/test/CodeGen/X86/pr34592.ll +++ b/llvm/test/CodeGen/X86/pr34592.ll @@ -52,24 +52,24 @@ define <16 x i64> @pluto(<16 x i64> %arg, <16 x i64> %arg1, <16 x i64> %arg2, <1 ; CHECK-O3-NEXT: movq %rsp, %rbp ; CHECK-O3-NEXT: andq $-32, %rsp ; CHECK-O3-NEXT: subq $32, %rsp -; CHECK-O3-NEXT: vmovdqa 208(%rbp), %ymm3 -; CHECK-O3-NEXT: vmovdqa 144(%rbp), %ymm0 -; CHECK-O3-NEXT: vpblendd {{.*#+}} ymm1 = ymm6[0,1,2,3,4,5],ymm2[6,7] -; CHECK-O3-NEXT: vpunpcklqdq {{.*#+}} ymm0 = 
ymm0[0],ymm3[0],ymm0[2],ymm3[2] -; CHECK-O3-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3] -; CHECK-O3-NEXT: vpermq {{.*#+}} ymm1 = ymm1[3,1,2,1] -; CHECK-O3-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5],ymm1[6,7] -; CHECK-O3-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm7[2,3],ymm6[0,1] -; CHECK-O3-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; CHECK-O3-NEXT: vpblendd {{.*#+}} ymm2 = ymm1[0,1],ymm2[2,3],ymm1[4,5,6,7] -; CHECK-O3-NEXT: vpunpcklqdq {{.*#+}} ymm1 = ymm7[0],ymm5[0],ymm7[2],ymm5[2] -; CHECK-O3-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,1,2,3] -; CHECK-O3-NEXT: vpbroadcastq 248(%rbp), %ymm4 -; CHECK-O3-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1],ymm4[2,3,4,5],ymm1[6,7] -; CHECK-O3-NEXT: vpblendd {{.*#+}} xmm3 = xmm7[0,1],xmm3[2,3] -; CHECK-O3-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,0,1,3] -; CHECK-O3-NEXT: vpslldq {{.*#+}} ymm4 = zero,zero,zero,zero,zero,zero,zero,zero,ymm5[0,1,2,3,4,5,6,7],zero,zero,zero,zero,zero,zero,zero,zero,ymm5[16,17,18,19,20,21,22,23] -; CHECK-O3-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1],ymm3[2,3,4,5],ymm4[6,7] +; CHECK-O3-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero +; CHECK-O3-NEXT: vblendps {{.*#+}} ymm1 = ymm6[0,1,2,3],ymm2[4,5,6,7] +; CHECK-O3-NEXT: vxorps %xmm2, %xmm2, %xmm2 +; CHECK-O3-NEXT: vbroadcastsd 160(%rbp), %ymm3 +; CHECK-O3-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm3[0],ymm0[0],ymm3[2],ymm0[2] +; CHECK-O3-NEXT: vblendps {{.*#+}} ymm3 = ymm7[0,1,2,3],ymm5[4,5,6,7] +; CHECK-O3-NEXT: vbroadcastsd 216(%rbp), %ymm4 +; CHECK-O3-NEXT: vblendps {{.*#+}} ymm4 = ymm2[0,1,2,3],ymm4[4,5,6,7] +; CHECK-O3-NEXT: vunpcklpd {{.*#+}} ymm3 = ymm4[0],ymm3[0],ymm4[2],ymm3[2] +; CHECK-O3-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[3,1,2,1] +; CHECK-O3-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,2,1,3] +; CHECK-O3-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5],ymm1[6,7] +; CHECK-O3-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm7[2,3],ymm6[0,1] +; CHECK-O3-NEXT: vblendps {{.*#+}} ymm2 = ymm1[0,1],ymm2[2,3],ymm1[4,5,6,7] +; CHECK-O3-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm7[0],ymm5[0],ymm7[2],ymm5[2] +; CHECK-O3-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[2,1,2,3] +; CHECK-O3-NEXT: vbroadcastsd 248(%rbp), %ymm4 +; CHECK-O3-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1],ymm4[2,3,4,5],ymm1[6,7] ; CHECK-O3-NEXT: movq %rbp, %rsp ; CHECK-O3-NEXT: popq %rbp ; CHECK-O3-NEXT: retq diff --git a/llvm/test/CodeGen/X86/pr36553.ll b/llvm/test/CodeGen/X86/pr36553.ll index b61ec81473081..17649d43352d2 100644 --- a/llvm/test/CodeGen/X86/pr36553.ll +++ b/llvm/test/CodeGen/X86/pr36553.ll @@ -8,7 +8,8 @@ define float @pr36553(float %a, float %b, float %c) nounwind { ; CHECK: ## %bb.0: ## %entry ; CHECK-NEXT: pushq %rax ; CHECK-NEXT: callq _fmaf -; CHECK-NEXT: xorps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; CHECK-NEXT: movss {{.*#+}} xmm1 = [-0.0E+0,0.0E+0,0.0E+0,0.0E+0] +; CHECK-NEXT: xorps %xmm1, %xmm0 ; CHECK-NEXT: popq %rax ; CHECK-NEXT: retq entry: diff --git a/llvm/test/CodeGen/X86/pr40811.ll b/llvm/test/CodeGen/X86/pr40811.ll index 7851856713e82..63bfbcec1e1da 100644 --- a/llvm/test/CodeGen/X86/pr40811.ll +++ b/llvm/test/CodeGen/X86/pr40811.ll @@ -4,10 +4,11 @@ define <8 x i32> @_Z6test70v(ptr %id14793) { ; CHECK-LABEL: _Z6test70v: ; CHECK: # %bb.0: # %entry -; CHECK-NEXT: vmovaps (%rdi), %xmm0 -; CHECK-NEXT: vblendps {{.*#+}} xmm1 = xmm0[0],mem[1],xmm0[2,3] -; CHECK-NEXT: vshufps {{.*#+}} xmm0 = xmm0[1,3,1,0] -; CHECK-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,2,1,0] +; CHECK-NEXT: vmovdqa (%rdi), %xmm0 +; CHECK-NEXT: vpinsrd $1, {{\.?LCPI[0-9]+_[0-9]+}}+4(%rip), %xmm0, %xmm1 +; CHECK-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0,1],xmm1[2,3],xmm0[4,5,6,7] +; 
CHECK-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,3,1,0] +; CHECK-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,2,1,0] ; CHECK-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 ; CHECK-NEXT: retq entry: diff --git a/llvm/test/CodeGen/X86/pr63091.ll b/llvm/test/CodeGen/X86/pr63091.ll index 3f50be8ab8df9..9f4700e94df68 100644 --- a/llvm/test/CodeGen/X86/pr63091.ll +++ b/llvm/test/CodeGen/X86/pr63091.ll @@ -35,9 +35,10 @@ define <4 x i32> @dont_merge_pcmpgt(<16 x i8> %0, <4 x i32> %1) { define <4 x i32> @merge_and(<16 x i8> %0, <4 x i32> %1) { ; SSE-LABEL: merge_and: ; SSE: # %bb.0: -; SSE-NEXT: andps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; SSE-NEXT: andps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 -; SSE-NEXT: blendps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[3] +; SSE-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; SSE-NEXT: pinsrd $3, {{\.?LCPI[0-9]+_[0-9]+}}+12(%rip), %xmm2 +; SSE-NEXT: pand %xmm1, %xmm2 +; SSE-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5],xmm2[6,7] ; SSE-NEXT: retq ; ; AVX2-LABEL: merge_and: diff --git a/llvm/test/CodeGen/X86/sar_fold64.ll b/llvm/test/CodeGen/X86/sar_fold64.ll index 245af74c23891..234dbbd620d51 100644 --- a/llvm/test/CodeGen/X86/sar_fold64.ll +++ b/llvm/test/CodeGen/X86/sar_fold64.ll @@ -99,16 +99,18 @@ define <4 x i32> @all_sign_bit_ashr_vec0(<4 x i32> %x) { define <4 x i32> @all_sign_bit_ashr_vec1(<4 x i32> %x) { ; SSE-LABEL: all_sign_bit_ashr_vec1: ; SSE: # %bb.0: -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[0,0,0,0] -; SSE-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 +; SSE-NEXT: movd {{.*#+}} xmm1 = [1,0,0,0] +; SSE-NEXT: pand %xmm0, %xmm1 +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,0,0] ; SSE-NEXT: pxor %xmm0, %xmm0 ; SSE-NEXT: psubd %xmm1, %xmm0 ; SSE-NEXT: retq ; ; AVX1-LABEL: all_sign_bit_ashr_vec1: ; AVX1: # %bb.0: +; AVX1-NEXT: vmovd {{.*#+}} xmm1 = [1,0,0,0] +; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] -; AVX1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 ; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX1-NEXT: vpsubd %xmm0, %xmm1, %xmm0 ; AVX1-NEXT: retq @@ -159,16 +161,18 @@ define <4 x i32> @all_sign_bit_ashr_vec2(<4 x i32> %x) { define <4 x i32> @all_sign_bit_ashr_vec3(<4 x i32> %x) { ; SSE-LABEL: all_sign_bit_ashr_vec3: ; SSE: # %bb.0: -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[0,0,0,0] -; SSE-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 +; SSE-NEXT: movd {{.*#+}} xmm1 = [1,0,0,0] +; SSE-NEXT: pand %xmm0, %xmm1 +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,0,0] ; SSE-NEXT: pcmpeqd %xmm0, %xmm0 ; SSE-NEXT: paddd %xmm1, %xmm0 ; SSE-NEXT: retq ; ; AVX1-LABEL: all_sign_bit_ashr_vec3: ; AVX1: # %bb.0: +; AVX1-NEXT: vmovd {{.*#+}} xmm1 = [1,0,0,0] +; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] -; AVX1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 ; AVX1-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 ; AVX1-NEXT: vpaddd %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: retq diff --git a/llvm/test/CodeGen/X86/setcc-combine.ll b/llvm/test/CodeGen/X86/setcc-combine.ll index e723569bda8a1..193f2df95a045 100644 --- a/llvm/test/CodeGen/X86/setcc-combine.ll +++ b/llvm/test/CodeGen/X86/setcc-combine.ll @@ -352,7 +352,7 @@ define i64 @sub_constant_to_shift_to_add(i32 %x, i64 %s1, i64 %s2) { define float @olt(float %x) { ; CHECK-LABEL: olt: ; CHECK: # %bb.0: -; CHECK-NEXT: movaps {{.*#+}} xmm1 = [-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0] +; CHECK-NEXT: movss {{.*#+}} xmm1 = [-0.0E+0,0.0E+0,0.0E+0,0.0E+0] ; CHECK-NEXT: xorps %xmm0, %xmm1 ; CHECK-NEXT: minss %xmm1, %xmm0 ; CHECK-NEXT: retq @@ -365,7 +365,7 @@ define 
float @olt(float %x) { define double @ogt(double %x) { ; CHECK-LABEL: ogt: ; CHECK: # %bb.0: -; CHECK-NEXT: movapd {{.*#+}} xmm1 = [-0.0E+0,-0.0E+0] +; CHECK-NEXT: movsd {{.*#+}} xmm1 = [-0.0E+0,0.0E+0] ; CHECK-NEXT: xorpd %xmm0, %xmm1 ; CHECK-NEXT: maxsd %xmm1, %xmm0 ; CHECK-NEXT: retq @@ -481,7 +481,7 @@ define double @ogt_no_fneg(double %x, double %y) { define double @ogt_no_zero(double %x) { ; CHECK-LABEL: ogt_no_zero: ; CHECK: # %bb.0: -; CHECK-NEXT: movapd {{.*#+}} xmm1 = [-0.0E+0,-0.0E+0] +; CHECK-NEXT: movsd {{.*#+}} xmm1 = [-0.0E+0,0.0E+0] ; CHECK-NEXT: xorpd %xmm0, %xmm1 ; CHECK-NEXT: movsd {{.*#+}} xmm2 = [1.0E+0,0.0E+0] ; CHECK-NEXT: cmpltsd %xmm0, %xmm2 diff --git a/llvm/test/CodeGen/X86/setcc-non-simple-type.ll b/llvm/test/CodeGen/X86/setcc-non-simple-type.ll index 2ac2be5545dfd..a69f13839e53f 100644 --- a/llvm/test/CodeGen/X86/setcc-non-simple-type.ll +++ b/llvm/test/CodeGen/X86/setcc-non-simple-type.ll @@ -119,22 +119,18 @@ define void @failing(ptr %0, ptr %1) nounwind { ; CHECK-AVX2-NEXT: .LBB0_2: # %vector.body ; CHECK-AVX2-NEXT: # Parent Loop BB0_1 Depth=1 ; CHECK-AVX2-NEXT: # => This Inner Loop Header: Depth=2 -; CHECK-AVX2-NEXT: vmovdqu 1024(%rdx,%rsi), %xmm5 -; CHECK-AVX2-NEXT: vmovdqu 1040(%rdx,%rsi), %xmm6 -; CHECK-AVX2-NEXT: vpextrq $1, %xmm5, %rdi -; CHECK-AVX2-NEXT: vpextrq $1, %xmm6, %r8 -; CHECK-AVX2-NEXT: vmovq %xmm5, %r9 -; CHECK-AVX2-NEXT: vmovq %xmm6, %r10 -; CHECK-AVX2-NEXT: negq %r10 -; CHECK-AVX2-NEXT: movq %rcx, %r10 -; CHECK-AVX2-NEXT: sbbq %r8, %r10 -; CHECK-AVX2-NEXT: setge %r8b -; CHECK-AVX2-NEXT: movzbl %r8b, %r8d +; CHECK-AVX2-NEXT: movq 1040(%rdx,%rsi), %rdi +; CHECK-AVX2-NEXT: movq 1024(%rdx,%rsi), %r8 +; CHECK-AVX2-NEXT: negq %rdi +; CHECK-AVX2-NEXT: movq %rcx, %rdi +; CHECK-AVX2-NEXT: sbbq 1048(%rdx,%rsi), %rdi +; CHECK-AVX2-NEXT: setge %dil +; CHECK-AVX2-NEXT: movzbl %dil, %edi +; CHECK-AVX2-NEXT: negq %rdi +; CHECK-AVX2-NEXT: vmovq %rdi, %xmm5 ; CHECK-AVX2-NEXT: negq %r8 -; CHECK-AVX2-NEXT: vmovq %r8, %xmm5 -; CHECK-AVX2-NEXT: negq %r9 -; CHECK-AVX2-NEXT: movq %rcx, %r8 -; CHECK-AVX2-NEXT: sbbq %rdi, %r8 +; CHECK-AVX2-NEXT: movq %rcx, %rdi +; CHECK-AVX2-NEXT: sbbq 1032(%rdx,%rsi), %rdi ; CHECK-AVX2-NEXT: setge %dil ; CHECK-AVX2-NEXT: movzbl %dil, %edi ; CHECK-AVX2-NEXT: negq %rdi diff --git a/llvm/test/CodeGen/X86/shrink_vmul.ll b/llvm/test/CodeGen/X86/shrink_vmul.ll index e53eed4587797..e83151f3eaa1e 100644 --- a/llvm/test/CodeGen/X86/shrink_vmul.ll +++ b/llvm/test/CodeGen/X86/shrink_vmul.ll @@ -1864,9 +1864,10 @@ define void @mul_2xi16_varconst3(ptr nocapture readonly %a, i64 %index) { ; X86-SSE-NEXT: movl c, %edx ; X86-SSE-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero ; X86-SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[1,1,1,1,4,5,6,7] -; X86-SSE-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0 -; X86-SSE-NEXT: psllq $32, %xmm0 -; X86-SSE-NEXT: movq %xmm0, (%edx,%eax,4) +; X86-SSE-NEXT: movd {{.*#+}} xmm1 = [65536,0,0,0] +; X86-SSE-NEXT: pmuludq %xmm0, %xmm1 +; X86-SSE-NEXT: psllq $32, %xmm1 +; X86-SSE-NEXT: movq %xmm1, (%edx,%eax,4) ; X86-SSE-NEXT: retl ; ; X86-AVX-LABEL: mul_2xi16_varconst3: @@ -1885,9 +1886,10 @@ define void @mul_2xi16_varconst3(ptr nocapture readonly %a, i64 %index) { ; X64-SSE-NEXT: movq c(%rip), %rax ; X64-SSE-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero ; X64-SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[1,1,1,1,4,5,6,7] -; X64-SSE-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; X64-SSE-NEXT: psllq $32, %xmm0 -; X64-SSE-NEXT: movq %xmm0, (%rax,%rsi,4) +; X64-SSE-NEXT: movd {{.*#+}} xmm1 = [65536,0,0,0] +; 
X64-SSE-NEXT: pmuludq %xmm0, %xmm1 +; X64-SSE-NEXT: psllq $32, %xmm1 +; X64-SSE-NEXT: movq %xmm1, (%rax,%rsi,4) ; X64-SSE-NEXT: retq ; ; X64-AVX-LABEL: mul_2xi16_varconst3: @@ -1922,9 +1924,10 @@ define void @mul_2xi16_varconst4(ptr nocapture readonly %a, i64 %index) { ; X86-SSE-NEXT: movl c, %edx ; X86-SSE-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero ; X86-SSE-NEXT: psrad $16, %xmm0 -; X86-SSE-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0 -; X86-SSE-NEXT: psllq $32, %xmm0 -; X86-SSE-NEXT: movq %xmm0, (%edx,%eax,4) +; X86-SSE-NEXT: movd {{.*#+}} xmm1 = [32768,0,0,0] +; X86-SSE-NEXT: pmuludq %xmm0, %xmm1 +; X86-SSE-NEXT: psllq $32, %xmm1 +; X86-SSE-NEXT: movq %xmm1, (%edx,%eax,4) ; X86-SSE-NEXT: retl ; ; X86-AVX-LABEL: mul_2xi16_varconst4: @@ -1943,9 +1946,10 @@ define void @mul_2xi16_varconst4(ptr nocapture readonly %a, i64 %index) { ; X64-SSE-NEXT: movq c(%rip), %rax ; X64-SSE-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero ; X64-SSE-NEXT: psrad $16, %xmm0 -; X64-SSE-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; X64-SSE-NEXT: psllq $32, %xmm0 -; X64-SSE-NEXT: movq %xmm0, (%rax,%rsi,4) +; X64-SSE-NEXT: movd {{.*#+}} xmm1 = [32768,0,0,0] +; X64-SSE-NEXT: pmuludq %xmm0, %xmm1 +; X64-SSE-NEXT: psllq $32, %xmm1 +; X64-SSE-NEXT: movq %xmm1, (%rax,%rsi,4) ; X64-SSE-NEXT: retq ; ; X64-AVX-LABEL: mul_2xi16_varconst4: diff --git a/llvm/test/CodeGen/X86/shuffle-vs-trunc-512.ll b/llvm/test/CodeGen/X86/shuffle-vs-trunc-512.ll index ec442c185706c..3e880589566cc 100644 --- a/llvm/test/CodeGen/X86/shuffle-vs-trunc-512.ll +++ b/llvm/test/CodeGen/X86/shuffle-vs-trunc-512.ll @@ -384,22 +384,26 @@ define <32 x i8> @trunc_shuffle_v32i16_v32i8_ofs1(<32 x i16> %a0) { define <4 x double> @PR34175(ptr %p) { ; AVX512F-LABEL: PR34175: ; AVX512F: # %bb.0: -; AVX512F-NEXT: vmovdqu (%rdi), %xmm0 -; AVX512F-NEXT: vmovdqu 32(%rdi), %xmm1 -; AVX512F-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],mem[0],xmm1[1],mem[1],xmm1[2],mem[2],xmm1[3],mem[3] -; AVX512F-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3] -; AVX512F-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; AVX512F-NEXT: vpinsrw $0, 48(%rdi), %xmm0, %xmm0 +; AVX512F-NEXT: vpinsrw $0, 32(%rdi), %xmm0, %xmm1 +; AVX512F-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] +; AVX512F-NEXT: vpinsrw $0, 16(%rdi), %xmm0, %xmm1 +; AVX512F-NEXT: vpinsrw $0, (%rdi), %xmm0, %xmm2 +; AVX512F-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] +; AVX512F-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] ; AVX512F-NEXT: vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero ; AVX512F-NEXT: vcvtdq2pd %xmm0, %ymm0 ; AVX512F-NEXT: retq ; ; AVX512VL-LABEL: PR34175: ; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vmovdqu (%rdi), %xmm0 -; AVX512VL-NEXT: vmovdqu 32(%rdi), %xmm1 -; AVX512VL-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],mem[0],xmm1[1],mem[1],xmm1[2],mem[2],xmm1[3],mem[3] -; AVX512VL-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3] -; AVX512VL-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; AVX512VL-NEXT: vpinsrw $0, 48(%rdi), %xmm0, %xmm0 +; AVX512VL-NEXT: vpinsrw $0, 32(%rdi), %xmm0, %xmm1 +; AVX512VL-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] +; AVX512VL-NEXT: vpinsrw $0, 16(%rdi), %xmm0, %xmm1 +; AVX512VL-NEXT: vpinsrw $0, (%rdi), %xmm0, %xmm2 +; AVX512VL-NEXT: 
vpunpcklwd {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] +; AVX512VL-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] ; AVX512VL-NEXT: vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero ; AVX512VL-NEXT: vcvtdq2pd %xmm0, %ymm0 ; AVX512VL-NEXT: retq diff --git a/llvm/test/CodeGen/X86/splat-for-size.ll b/llvm/test/CodeGen/X86/splat-for-size.ll index 76a95e2504570..459f128472ef0 100644 --- a/llvm/test/CodeGen/X86/splat-for-size.ll +++ b/llvm/test/CodeGen/X86/splat-for-size.ll @@ -388,13 +388,13 @@ define <8 x i64> @pr23259() #1 { ; AVX-LABEL: pr23259: ; AVX: # %bb.0: # %entry ; AVX-NEXT: vbroadcastsd {{.*#+}} ymm1 = [1,1,1,1] -; AVX-NEXT: vblendps {{.*#+}} xmm0 = mem[0,1],xmm1[2,3] +; AVX-NEXT: vpinsrq $0, A+16(%rip), %xmm1, %xmm0 ; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],mem[4,5,6,7] ; AVX-NEXT: retq ; ; AVX2-LABEL: pr23259: ; AVX2: # %bb.0: # %entry -; AVX2-NEXT: vmovaps A+16(%rip), %xmm0 +; AVX2-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero ; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],mem[2,3,4,5,6,7] ; AVX2-NEXT: vbroadcastsd {{.*#+}} ymm1 = [1,1,1,1] ; AVX2-NEXT: retq diff --git a/llvm/test/CodeGen/X86/sqrt-fastmath-tune.ll b/llvm/test/CodeGen/X86/sqrt-fastmath-tune.ll index 2b78a70ebcc26..fad383c7b46b0 100644 --- a/llvm/test/CodeGen/X86/sqrt-fastmath-tune.ll +++ b/llvm/test/CodeGen/X86/sqrt-fastmath-tune.ll @@ -20,7 +20,8 @@ define float @f32_no_daz(float %f) #0 { ; NHM-NEXT: mulss %xmm1, %xmm2 ; NHM-NEXT: addss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 ; NHM-NEXT: mulss %xmm3, %xmm2 -; NHM-NEXT: andps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; NHM-NEXT: movss {{.*#+}} xmm1 = [NaN,0.0E+0,0.0E+0,0.0E+0] +; NHM-NEXT: andps %xmm1, %xmm0 ; NHM-NEXT: cmpltss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 ; NHM-NEXT: andnps %xmm2, %xmm0 ; NHM-NEXT: retq diff --git a/llvm/test/CodeGen/X86/sqrt-fastmath-tunecpu-attr.ll b/llvm/test/CodeGen/X86/sqrt-fastmath-tunecpu-attr.ll index 85f7733e671a7..31de79ea0fe64 100644 --- a/llvm/test/CodeGen/X86/sqrt-fastmath-tunecpu-attr.ll +++ b/llvm/test/CodeGen/X86/sqrt-fastmath-tunecpu-attr.ll @@ -12,7 +12,8 @@ define float @f32_tune_nhm(float %f) #0 { ; CHECK-NEXT: mulss %xmm1, %xmm2 ; CHECK-NEXT: addss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 ; CHECK-NEXT: mulss %xmm3, %xmm2 -; CHECK-NEXT: andps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; CHECK-NEXT: movss {{.*#+}} xmm1 = [NaN,0.0E+0,0.0E+0,0.0E+0] +; CHECK-NEXT: andps %xmm1, %xmm0 ; CHECK-NEXT: cmpltss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 ; CHECK-NEXT: andnps %xmm2, %xmm0 ; CHECK-NEXT: retq @@ -49,7 +50,8 @@ define float @f32_tune_x86_64(float %f) #3 { ; CHECK-NEXT: mulss %xmm1, %xmm2 ; CHECK-NEXT: addss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 ; CHECK-NEXT: mulss %xmm3, %xmm2 -; CHECK-NEXT: andps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; CHECK-NEXT: movss {{.*#+}} xmm1 = [NaN,0.0E+0,0.0E+0,0.0E+0] +; CHECK-NEXT: andps %xmm1, %xmm0 ; CHECK-NEXT: cmpltss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 ; CHECK-NEXT: andnps %xmm2, %xmm0 ; CHECK-NEXT: retq diff --git a/llvm/test/CodeGen/X86/sqrt-fastmath.ll b/llvm/test/CodeGen/X86/sqrt-fastmath.ll index 384f8b832afb9..0b304136ccfea 100644 --- a/llvm/test/CodeGen/X86/sqrt-fastmath.ll +++ b/llvm/test/CodeGen/X86/sqrt-fastmath.ll @@ -764,16 +764,18 @@ define <4 x float> @div_sqrt_fabs_v4f32_fmf(<4 x float> %x, <4 x float> %y, <4 x define double @div_sqrt_fabs_f64(double %x, double %y, double %z) { ; SSE-LABEL: div_sqrt_fabs_f64: ; SSE: # %bb.0: -; SSE-NEXT: andpd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 ; SSE-NEXT: 
sqrtsd %xmm2, %xmm2 -; SSE-NEXT: mulsd %xmm2, %xmm1 -; SSE-NEXT: divsd %xmm1, %xmm0 +; SSE-NEXT: movsd {{.*#+}} xmm3 = [NaN,0.0E+0] +; SSE-NEXT: andpd %xmm1, %xmm3 +; SSE-NEXT: mulsd %xmm2, %xmm3 +; SSE-NEXT: divsd %xmm3, %xmm0 ; SSE-NEXT: retq ; ; AVX-LABEL: div_sqrt_fabs_f64: ; AVX: # %bb.0: -; AVX-NEXT: vandpd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 ; AVX-NEXT: vsqrtsd %xmm2, %xmm2, %xmm2 +; AVX-NEXT: vmovsd {{.*#+}} xmm3 = [NaN,0.0E+0] +; AVX-NEXT: vandpd %xmm3, %xmm1, %xmm1 ; AVX-NEXT: vmulsd %xmm1, %xmm2, %xmm1 ; AVX-NEXT: vdivsd %xmm1, %xmm0, %xmm0 ; AVX-NEXT: retq diff --git a/llvm/test/CodeGen/X86/srem-seteq-vec-nonsplat.ll b/llvm/test/CodeGen/X86/srem-seteq-vec-nonsplat.ll index 08d9183bd30b6..d794340d14701 100644 --- a/llvm/test/CodeGen/X86/srem-seteq-vec-nonsplat.ll +++ b/llvm/test/CodeGen/X86/srem-seteq-vec-nonsplat.ll @@ -1014,7 +1014,8 @@ define <4 x i32> @test_srem_odd_INT_MIN(<4 x i32> %X) nounwind { ; CHECK-SSE2-LABEL: test_srem_odd_INT_MIN: ; CHECK-SSE2: # %bb.0: ; CHECK-SSE2-NEXT: pxor %xmm1, %xmm1 -; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm2 = [2147483647,2147483647,2147483647,2147483647] +; CHECK-SSE2-NEXT: movd {{.*#+}} xmm2 = mem[0],zero,zero,zero +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,1,0,1] ; CHECK-SSE2-NEXT: pand %xmm0, %xmm2 ; CHECK-SSE2-NEXT: pcmpeqd %xmm1, %xmm2 ; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] @@ -1036,7 +1037,7 @@ define <4 x i32> @test_srem_odd_INT_MIN(<4 x i32> %X) nounwind { ; CHECK-SSE41-LABEL: test_srem_odd_INT_MIN: ; CHECK-SSE41: # %bb.0: ; CHECK-SSE41-NEXT: pxor %xmm1, %xmm1 -; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm2 = [2147483647,2147483647,2147483647,2147483647] +; CHECK-SSE41-NEXT: pinsrd $2, {{\.?LCPI[0-9]+_[0-9]+}}+8(%rip), %xmm2 ; CHECK-SSE41-NEXT: pand %xmm0, %xmm2 ; CHECK-SSE41-NEXT: pcmpeqd %xmm1, %xmm2 ; CHECK-SSE41-NEXT: pmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 @@ -1051,7 +1052,8 @@ define <4 x i32> @test_srem_odd_INT_MIN(<4 x i32> %X) nounwind { ; CHECK-AVX1-LABEL: test_srem_odd_INT_MIN: ; CHECK-AVX1: # %bb.0: ; CHECK-AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; CHECK-AVX1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm2 +; CHECK-AVX1-NEXT: vpinsrd $2, {{\.?LCPI[0-9]+_[0-9]+}}+8(%rip), %xmm0, %xmm2 +; CHECK-AVX1-NEXT: vpand %xmm2, %xmm0, %xmm2 ; CHECK-AVX1-NEXT: vpcmpeqd %xmm1, %xmm2, %xmm1 ; CHECK-AVX1-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 ; CHECK-AVX1-NEXT: vpaddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 @@ -1122,10 +1124,12 @@ define <4 x i32> @test_srem_even_INT_MIN(<4 x i32> %X) nounwind { ; CHECK-SSE2-NEXT: pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3 ; CHECK-SSE2-NEXT: pcmpeqd %xmm1, %xmm1 ; CHECK-SSE2-NEXT: pxor %xmm3, %xmm1 -; CHECK-SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; CHECK-SSE2-NEXT: pcmpeqd %xmm2, %xmm0 -; CHECK-SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm1[3,0] -; CHECK-SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[0,2] +; CHECK-SSE2-NEXT: movd {{.*#+}} xmm3 = mem[0],zero,zero,zero +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,1,0,1] +; CHECK-SSE2-NEXT: pand %xmm0, %xmm3 +; CHECK-SSE2-NEXT: pcmpeqd %xmm2, %xmm3 +; CHECK-SSE2-NEXT: shufps {{.*#+}} xmm3 = xmm3[2,0],xmm1[3,0] +; CHECK-SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm3[0,2] ; CHECK-SSE2-NEXT: psrld $31, %xmm1 ; CHECK-SSE2-NEXT: movdqa %xmm1, %xmm0 ; CHECK-SSE2-NEXT: retq @@ -1147,7 +1151,8 @@ define <4 x i32> @test_srem_even_INT_MIN(<4 x i32> %X) nounwind { ; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm2 = [306783378,306783378,1,306783378] ; CHECK-SSE41-NEXT: pminud %xmm3, 
%xmm2 ; CHECK-SSE41-NEXT: pcmpeqd %xmm3, %xmm2 -; CHECK-SSE41-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; CHECK-SSE41-NEXT: pinsrd $2, {{\.?LCPI[0-9]+_[0-9]+}}+8(%rip), %xmm3 +; CHECK-SSE41-NEXT: pand %xmm3, %xmm0 ; CHECK-SSE41-NEXT: pcmpeqd %xmm1, %xmm0 ; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm2[0,1,2,3],xmm0[4,5],xmm2[6,7] ; CHECK-SSE41-NEXT: psrld $31, %xmm0 @@ -1168,7 +1173,8 @@ define <4 x i32> @test_srem_even_INT_MIN(<4 x i32> %X) nounwind { ; CHECK-AVX1-NEXT: vpor %xmm4, %xmm2, %xmm2 ; CHECK-AVX1-NEXT: vpminud {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm3 ; CHECK-AVX1-NEXT: vpcmpeqd %xmm3, %xmm2, %xmm2 -; CHECK-AVX1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; CHECK-AVX1-NEXT: vpinsrd $2, {{\.?LCPI[0-9]+_[0-9]+}}+8(%rip), %xmm0, %xmm3 +; CHECK-AVX1-NEXT: vpand %xmm3, %xmm0, %xmm0 ; CHECK-AVX1-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 ; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm2[0,1,2,3],xmm0[4,5],xmm2[6,7] ; CHECK-AVX1-NEXT: vpsrld $31, %xmm0, %xmm0 @@ -1238,10 +1244,12 @@ define <4 x i32> @test_srem_odd_even_INT_MIN(<4 x i32> %X) nounwind { ; CHECK-SSE2-NEXT: pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3 ; CHECK-SSE2-NEXT: pcmpeqd %xmm1, %xmm1 ; CHECK-SSE2-NEXT: pxor %xmm3, %xmm1 -; CHECK-SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; CHECK-SSE2-NEXT: pcmpeqd %xmm2, %xmm0 -; CHECK-SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm1[3,0] -; CHECK-SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[0,2] +; CHECK-SSE2-NEXT: movd {{.*#+}} xmm3 = mem[0],zero,zero,zero +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,1,0,1] +; CHECK-SSE2-NEXT: pand %xmm0, %xmm3 +; CHECK-SSE2-NEXT: pcmpeqd %xmm2, %xmm3 +; CHECK-SSE2-NEXT: shufps {{.*#+}} xmm3 = xmm3[2,0],xmm1[3,0] +; CHECK-SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm3[0,2] ; CHECK-SSE2-NEXT: psrld $31, %xmm1 ; CHECK-SSE2-NEXT: movdqa %xmm1, %xmm0 ; CHECK-SSE2-NEXT: retq @@ -1263,7 +1271,8 @@ define <4 x i32> @test_srem_odd_even_INT_MIN(<4 x i32> %X) nounwind { ; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm2 = [858993458,306783378,1,42949672] ; CHECK-SSE41-NEXT: pminud %xmm3, %xmm2 ; CHECK-SSE41-NEXT: pcmpeqd %xmm3, %xmm2 -; CHECK-SSE41-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; CHECK-SSE41-NEXT: pinsrd $2, {{\.?LCPI[0-9]+_[0-9]+}}+8(%rip), %xmm3 +; CHECK-SSE41-NEXT: pand %xmm3, %xmm0 ; CHECK-SSE41-NEXT: pcmpeqd %xmm1, %xmm0 ; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm2[0,1,2,3],xmm0[4,5],xmm2[6,7] ; CHECK-SSE41-NEXT: psrld $31, %xmm0 @@ -1284,7 +1293,8 @@ define <4 x i32> @test_srem_odd_even_INT_MIN(<4 x i32> %X) nounwind { ; CHECK-AVX1-NEXT: vpor %xmm4, %xmm2, %xmm2 ; CHECK-AVX1-NEXT: vpminud {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm3 ; CHECK-AVX1-NEXT: vpcmpeqd %xmm3, %xmm2, %xmm2 -; CHECK-AVX1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; CHECK-AVX1-NEXT: vpinsrd $2, {{\.?LCPI[0-9]+_[0-9]+}}+8(%rip), %xmm0, %xmm3 +; CHECK-AVX1-NEXT: vpand %xmm3, %xmm0, %xmm0 ; CHECK-AVX1-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 ; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm2[0,1,2,3],xmm0[4,5],xmm2[6,7] ; CHECK-AVX1-NEXT: vpsrld $31, %xmm0, %xmm0 @@ -2211,59 +2221,61 @@ define <32 x i1> @pr51133(<32 x i8> %x, <32 x i8> %y) { ; CHECK-SSE2-LABEL: pr51133: ; CHECK-SSE2: # %bb.0: ; CHECK-SSE2-NEXT: movq %rdi, %rax -; CHECK-SSE2-NEXT: movdqa %xmm1, %xmm5 -; CHECK-SSE2-NEXT: punpckhbw {{.*#+}} xmm5 = xmm5[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] -; CHECK-SSE2-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm5 # [9,0,41,183,1,1,161,221] -; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm4 = 
[255,255,255,255,255,255,255,255] -; CHECK-SSE2-NEXT: pand %xmm4, %xmm5 -; CHECK-SSE2-NEXT: movdqa %xmm1, %xmm6 -; CHECK-SSE2-NEXT: punpcklbw {{.*#+}} xmm6 = xmm6[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] -; CHECK-SSE2-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm6 # [171,103,183,171,61,1,127,183] -; CHECK-SSE2-NEXT: pand %xmm4, %xmm6 -; CHECK-SSE2-NEXT: packuswb %xmm5, %xmm6 -; CHECK-SSE2-NEXT: paddb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm6 -; CHECK-SSE2-NEXT: movdqa %xmm6, %xmm5 -; CHECK-SSE2-NEXT: punpckhbw {{.*#+}} xmm5 = xmm5[8],xmm6[8],xmm5[9],xmm6[9],xmm5[10],xmm6[10],xmm5[11],xmm6[11],xmm5[12],xmm6[12],xmm5[13],xmm6[13],xmm5[14],xmm6[14],xmm5[15],xmm6[15] -; CHECK-SSE2-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm5 # [128,1,128,1,128,32,1,1] -; CHECK-SSE2-NEXT: psrlw $8, %xmm5 -; CHECK-SSE2-NEXT: punpcklbw {{.*#+}} xmm6 = xmm6[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] -; CHECK-SSE2-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm6 # [1,1,1,128,64,2,1,32] -; CHECK-SSE2-NEXT: psrlw $8, %xmm6 -; CHECK-SSE2-NEXT: packuswb %xmm5, %xmm6 -; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm7 = [84,2,36,42,2,1,2,4,2,255,4,36,127,31,2,2] -; CHECK-SSE2-NEXT: pminub %xmm6, %xmm7 -; CHECK-SSE2-NEXT: pcmpeqb %xmm6, %xmm7 -; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm5 = [255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255] -; CHECK-SSE2-NEXT: pandn %xmm5, %xmm7 -; CHECK-SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 -; CHECK-SSE2-NEXT: pxor %xmm6, %xmm6 -; CHECK-SSE2-NEXT: pcmpgtb %xmm6, %xmm1 -; CHECK-SSE2-NEXT: pandn %xmm1, %xmm5 -; CHECK-SSE2-NEXT: por %xmm7, %xmm5 -; CHECK-SSE2-NEXT: movdqa %xmm0, %xmm1 +; CHECK-SSE2-NEXT: movdqa %xmm1, %xmm4 +; CHECK-SSE2-NEXT: movzbl {{\.?LCPI[0-9]+_[0-9]+}}+5(%rip), %ecx +; CHECK-SSE2-NEXT: movd %ecx, %xmm6 +; CHECK-SSE2-NEXT: psllq $40, %xmm6 +; CHECK-SSE2-NEXT: pand %xmm1, %xmm6 ; CHECK-SSE2-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] -; CHECK-SSE2-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 # [223,223,205,183,161,1,171,239] -; CHECK-SSE2-NEXT: pand %xmm4, %xmm1 +; CHECK-SSE2-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 # [9,0,41,183,1,1,161,221] +; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm5 = [255,255,255,255,255,255,255,255] +; CHECK-SSE2-NEXT: pand %xmm5, %xmm1 +; CHECK-SSE2-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; CHECK-SSE2-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm4 # [171,103,183,171,61,1,127,183] +; CHECK-SSE2-NEXT: pand %xmm5, %xmm4 +; CHECK-SSE2-NEXT: packuswb %xmm1, %xmm4 +; CHECK-SSE2-NEXT: paddb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm4 +; CHECK-SSE2-NEXT: movdqa %xmm4, %xmm1 +; CHECK-SSE2-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm4[8],xmm1[9],xmm4[9],xmm1[10],xmm4[10],xmm1[11],xmm4[11],xmm1[12],xmm4[12],xmm1[13],xmm4[13],xmm1[14],xmm4[14],xmm1[15],xmm4[15] +; CHECK-SSE2-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 # [128,1,128,1,128,32,1,1] +; CHECK-SSE2-NEXT: psrlw $8, %xmm1 +; CHECK-SSE2-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; CHECK-SSE2-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm4 # [1,1,1,128,64,2,1,32] +; CHECK-SSE2-NEXT: psrlw $8, %xmm4 +; CHECK-SSE2-NEXT: packuswb %xmm1, %xmm4 +; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm7 = [84,2,36,42,2,1,2,4,2,255,4,36,127,31,2,2] +; CHECK-SSE2-NEXT: pminub %xmm4, %xmm7 +; CHECK-SSE2-NEXT: pcmpeqb %xmm4, %xmm7 +; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm4 = [255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255] +; CHECK-SSE2-NEXT: pandn %xmm4, %xmm7 +; CHECK-SSE2-NEXT: pxor %xmm1, %xmm1 
+; CHECK-SSE2-NEXT: pcmpgtb %xmm1, %xmm6 +; CHECK-SSE2-NEXT: pandn %xmm6, %xmm4 +; CHECK-SSE2-NEXT: por %xmm7, %xmm4 +; CHECK-SSE2-NEXT: movdqa %xmm0, %xmm6 +; CHECK-SSE2-NEXT: punpckhbw {{.*#+}} xmm6 = xmm6[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] +; CHECK-SSE2-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm6 # [223,223,205,183,161,1,171,239] +; CHECK-SSE2-NEXT: pand %xmm5, %xmm6 ; CHECK-SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] ; CHECK-SSE2-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [197,205,27,241,1,1,1,163] -; CHECK-SSE2-NEXT: pand %xmm4, %xmm0 -; CHECK-SSE2-NEXT: packuswb %xmm1, %xmm0 +; CHECK-SSE2-NEXT: pand %xmm5, %xmm0 +; CHECK-SSE2-NEXT: packuswb %xmm6, %xmm0 ; CHECK-SSE2-NEXT: paddb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; CHECK-SSE2-NEXT: movdqa %xmm0, %xmm1 -; CHECK-SSE2-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm0[8],xmm1[9],xmm0[9],xmm1[10],xmm0[10],xmm1[11],xmm0[11],xmm1[12],xmm0[12],xmm1[13],xmm0[13],xmm1[14],xmm0[14],xmm1[15],xmm0[15] -; CHECK-SSE2-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 # [128,128,1,1,1,128,1,64] -; CHECK-SSE2-NEXT: psrlw $8, %xmm1 +; CHECK-SSE2-NEXT: movdqa %xmm0, %xmm5 +; CHECK-SSE2-NEXT: punpckhbw {{.*#+}} xmm5 = xmm5[8],xmm0[8],xmm5[9],xmm0[9],xmm5[10],xmm0[10],xmm5[11],xmm0[11],xmm5[12],xmm0[12],xmm5[13],xmm0[13],xmm5[14],xmm0[14],xmm5[15],xmm0[15] +; CHECK-SSE2-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm5 # [128,128,1,1,1,128,1,64] +; CHECK-SSE2-NEXT: psrlw $8, %xmm5 ; CHECK-SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] ; CHECK-SSE2-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [1,1,1,128,128,32,128,32] ; CHECK-SSE2-NEXT: psrlw $8, %xmm0 -; CHECK-SSE2-NEXT: packuswb %xmm1, %xmm0 -; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm1 = [19,51,13,7,128,32,128,3,5,5,51,37,3,128,85,5] -; CHECK-SSE2-NEXT: pmaxub %xmm0, %xmm1 -; CHECK-SSE2-NEXT: pcmpeqb %xmm0, %xmm1 -; CHECK-SSE2-NEXT: pcmpeqb %xmm6, %xmm3 -; CHECK-SSE2-NEXT: pandn %xmm5, %xmm3 -; CHECK-SSE2-NEXT: pcmpeqb %xmm6, %xmm2 -; CHECK-SSE2-NEXT: pandn %xmm1, %xmm2 +; CHECK-SSE2-NEXT: packuswb %xmm5, %xmm0 +; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm5 = [19,51,13,7,128,32,128,3,5,5,51,37,3,128,85,5] +; CHECK-SSE2-NEXT: pmaxub %xmm0, %xmm5 +; CHECK-SSE2-NEXT: pcmpeqb %xmm0, %xmm5 +; CHECK-SSE2-NEXT: pcmpeqb %xmm1, %xmm3 +; CHECK-SSE2-NEXT: pandn %xmm4, %xmm3 +; CHECK-SSE2-NEXT: pcmpeqb %xmm1, %xmm2 +; CHECK-SSE2-NEXT: pandn %xmm5, %xmm2 ; CHECK-SSE2-NEXT: pmovmskb %xmm2, %ecx ; CHECK-SSE2-NEXT: pmovmskb %xmm3, %edx ; CHECK-SSE2-NEXT: shll $16, %edx @@ -2474,7 +2486,9 @@ define <32 x i1> @pr51133(<32 x i8> %x, <32 x i8> %y) { ; CHECK-AVX512VL-NEXT: vpcmpeqb %ymm3, %ymm2, %ymm2 ; CHECK-AVX512VL-NEXT: vmovdqa {{.*#+}} ymm3 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255] ; CHECK-AVX512VL-NEXT: vpandn %ymm3, %ymm2, %ymm2 -; CHECK-AVX512VL-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %ymm0, %ymm0 +; CHECK-AVX512VL-NEXT: vpinsrb $5, {{\.?LCPI[0-9]+_[0-9]+}}+21(%rip), %xmm0, %xmm4 +; CHECK-AVX512VL-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 +; CHECK-AVX512VL-NEXT: vpand %ymm4, %ymm0, %ymm0 ; CHECK-AVX512VL-NEXT: vpxor %xmm4, %xmm4, %xmm4 ; CHECK-AVX512VL-NEXT: vpcmpgtb %ymm4, %ymm0, %ymm0 ; CHECK-AVX512VL-NEXT: vpandn %ymm0, %ymm3, %ymm3 diff --git a/llvm/test/CodeGen/X86/sse-align-12.ll b/llvm/test/CodeGen/X86/sse-align-12.ll index 7b4bd3ffdf00c..b8873e3839cd2 100644 --- a/llvm/test/CodeGen/X86/sse-align-12.ll +++ 
b/llvm/test/CodeGen/X86/sse-align-12.ll @@ -54,8 +54,8 @@ define <2 x double> @c(ptr %y) nounwind { define <2 x double> @d(ptr %y, <2 x double> %z) nounwind { ; CHECK-LABEL: d: ; CHECK: # %bb.0: -; CHECK-NEXT: movups (%rdi), %xmm1 -; CHECK-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1] +; CHECK-NEXT: movsd {{.*#+}} xmm1 = mem[0],zero +; CHECK-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,3],xmm1[0,1] ; CHECK-NEXT: retq %x = load <2 x double>, ptr %y, align 8 %a = extractelement <2 x double> %x, i32 1 diff --git a/llvm/test/CodeGen/X86/sse2.ll b/llvm/test/CodeGen/X86/sse2.ll index cf5f527b16114..67c0f393c0860 100644 --- a/llvm/test/CodeGen/X86/sse2.ll +++ b/llvm/test/CodeGen/X86/sse2.ll @@ -392,19 +392,19 @@ define <2 x double> @test11(double %a, double %b) nounwind { define void @test12() nounwind { ; SSE-LABEL: test12: ; SSE: # %bb.0: -; SSE-NEXT: movapd 0, %xmm0 -; SSE-NEXT: movapd {{.*#+}} xmm1 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0] -; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1] -; SSE-NEXT: xorps %xmm2, %xmm2 -; SSE-NEXT: movhlps {{.*#+}} xmm2 = xmm0[1],xmm2[1] -; SSE-NEXT: addps %xmm1, %xmm2 -; SSE-NEXT: movaps %xmm2, 0 +; SSE-NEXT: movaps 0, %xmm0 +; SSE-NEXT: xorps %xmm1, %xmm1 +; SSE-NEXT: movhlps {{.*#+}} xmm1 = xmm0[1],xmm1[1] +; SSE-NEXT: movhps {{.*#+}} xmm0 = xmm0[0,1],mem[0,1] +; SSE-NEXT: addps %xmm0, %xmm1 +; SSE-NEXT: movaps %xmm1, 0 ; SSE-NEXT: ret{{[l|q]}} ; ; AVX1-LABEL: test12: ; AVX1: # %bb.0: ; AVX1-NEXT: vmovaps 0, %xmm0 -; AVX1-NEXT: vblendps {{.*#+}} xmm1 = xmm0[0,1],mem[2,3] +; AVX1-NEXT: vmovddup {{.*#+}} xmm1 = mem[0,0] +; AVX1-NEXT: vblendps {{.*#+}} xmm1 = xmm0[0,1],xmm1[2,3] ; AVX1-NEXT: vxorps %xmm2, %xmm2, %xmm2 ; AVX1-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm2[1] ; AVX1-NEXT: vaddps %xmm0, %xmm1, %xmm0 @@ -529,29 +529,45 @@ define <4 x float> @test15(ptr %x, ptr %y) nounwind { ; X86-SSE: # %bb.0: # %entry ; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-SSE-NEXT: movaps (%ecx), %xmm0 -; X86-SSE-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],mem[1] +; X86-SSE-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero +; X86-SSE-NEXT: movhps {{.*#+}} xmm0 = xmm0[0,1],mem[0,1] ; X86-SSE-NEXT: retl ; -; X86-AVX-LABEL: test15: -; X86-AVX: # %bb.0: # %entry -; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-AVX-NEXT: vmovaps (%ecx), %xmm0 -; X86-AVX-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],mem[1] -; X86-AVX-NEXT: retl +; X86-AVX1-LABEL: test15: +; X86-AVX1: # %bb.0: # %entry +; X86-AVX1-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-AVX1-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-AVX1-NEXT: vmovddup {{.*#+}} xmm0 = mem[0,0] +; X86-AVX1-NEXT: vmovddup {{.*#+}} xmm1 = mem[0,0] +; X86-AVX1-NEXT: vmovlhps {{.*#+}} xmm0 = xmm1[0],xmm0[0] +; X86-AVX1-NEXT: retl +; +; X86-AVX512-LABEL: test15: +; X86-AVX512: # %bb.0: # %entry +; X86-AVX512-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-AVX512-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-AVX512-NEXT: vmovddup {{.*#+}} xmm0 = mem[0,0] +; X86-AVX512-NEXT: vunpcklpd 8(%eax){1to2}, %xmm0, %xmm0 +; X86-AVX512-NEXT: retl ; ; X64-SSE-LABEL: test15: ; X64-SSE: # %bb.0: # %entry -; X64-SSE-NEXT: movaps (%rdi), %xmm0 -; X64-SSE-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],mem[1] +; X64-SSE-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero +; X64-SSE-NEXT: movhps {{.*#+}} xmm0 = xmm0[0,1],mem[0,1] ; X64-SSE-NEXT: retq ; -; X64-AVX-LABEL: test15: -; X64-AVX: # %bb.0: # %entry -; X64-AVX-NEXT: vmovaps (%rdi), %xmm0 -; X64-AVX-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],mem[1] -; X64-AVX-NEXT: retq +; X64-AVX1-LABEL: 
test15: +; X64-AVX1: # %bb.0: # %entry +; X64-AVX1-NEXT: vmovddup {{.*#+}} xmm0 = mem[0,0] +; X64-AVX1-NEXT: vmovddup {{.*#+}} xmm1 = mem[0,0] +; X64-AVX1-NEXT: vmovlhps {{.*#+}} xmm0 = xmm1[0],xmm0[0] +; X64-AVX1-NEXT: retq +; +; X64-AVX512-LABEL: test15: +; X64-AVX512: # %bb.0: # %entry +; X64-AVX512-NEXT: vmovddup {{.*#+}} xmm0 = mem[0,0] +; X64-AVX512-NEXT: vunpcklpd 8(%rsi){1to2}, %xmm0, %xmm0 +; X64-AVX512-NEXT: retq entry: %tmp = load <4 x float>, ptr %y ; <<4 x float>> [#uses=1] %tmp3 = load <4 x float>, ptr %x ; <<4 x float>> [#uses=1] @@ -565,27 +581,27 @@ define <2 x double> @test16(ptr nocapture %srcA, ptr nocapture %dst) { ; X86-SSE-LABEL: test16: ; X86-SSE: # %bb.0: ; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-SSE-NEXT: movaps 96(%eax), %xmm0 -; X86-SSE-NEXT: unpcklpd {{.*#+}} xmm0 = xmm0[0],mem[0] +; X86-SSE-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero +; X86-SSE-NEXT: movhps {{.*#+}} xmm0 = xmm0[0,1],mem[0,1] ; X86-SSE-NEXT: retl ; ; X86-AVX-LABEL: test16: ; X86-AVX: # %bb.0: ; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-AVX-NEXT: vmovaps 96(%eax), %xmm0 -; X86-AVX-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0],mem[0] +; X86-AVX-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero +; X86-AVX-NEXT: vmovhps {{.*#+}} xmm0 = xmm0[0,1],mem[0,1] ; X86-AVX-NEXT: retl ; ; X64-SSE-LABEL: test16: ; X64-SSE: # %bb.0: -; X64-SSE-NEXT: movaps 96(%rdi), %xmm0 -; X64-SSE-NEXT: unpcklpd {{.*#+}} xmm0 = xmm0[0],mem[0] +; X64-SSE-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero +; X64-SSE-NEXT: movhps {{.*#+}} xmm0 = xmm0[0,1],mem[0,1] ; X64-SSE-NEXT: retq ; ; X64-AVX-LABEL: test16: ; X64-AVX: # %bb.0: -; X64-AVX-NEXT: vmovaps 96(%rdi), %xmm0 -; X64-AVX-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0],mem[0] +; X64-AVX-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero +; X64-AVX-NEXT: vmovhps {{.*#+}} xmm0 = xmm0[0,1],mem[0,1] ; X64-AVX-NEXT: retq %i5 = getelementptr inbounds <4 x double>, ptr %srcA, i32 3 %i6 = load <4 x double>, ptr %i5, align 32 @@ -700,8 +716,3 @@ define <4 x i32> @test_mul(<4 x i32> %x, <4 x i32> %y) { %m = mul <4 x i32> %x, %y ret <4 x i32> %m } -;; NOTE: These prefixes are unused and the list is autogenerated. 
Do not add tests below this line: -; X64-AVX1: {{.*}} -; X64-AVX512: {{.*}} -; X86-AVX1: {{.*}} -; X86-AVX512: {{.*}} diff --git a/llvm/test/CodeGen/X86/sse3.ll b/llvm/test/CodeGen/X86/sse3.ll index 1a4df9a175ffa..2d3008d980a6d 100644 --- a/llvm/test/CodeGen/X86/sse3.ll +++ b/llvm/test/CodeGen/X86/sse3.ll @@ -39,20 +39,22 @@ define <8 x i16> @t1(ptr %A, ptr %B) nounwind { ; X86: # %bb.0: ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero ; X86-NEXT: movaps {{.*#+}} xmm0 = [0,65535,65535,65535,65535,65535,65535,65535] -; X86-NEXT: movaps %xmm0, %xmm1 -; X86-NEXT: andnps (%ecx), %xmm1 -; X86-NEXT: andps (%eax), %xmm0 -; X86-NEXT: orps %xmm1, %xmm0 +; X86-NEXT: movaps (%eax), %xmm2 +; X86-NEXT: andps %xmm0, %xmm2 +; X86-NEXT: andnps %xmm1, %xmm0 +; X86-NEXT: orps %xmm2, %xmm0 ; X86-NEXT: retl ; ; X64-LABEL: t1: ; X64: # %bb.0: +; X64-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero ; X64-NEXT: movaps {{.*#+}} xmm0 = [0,65535,65535,65535,65535,65535,65535,65535] -; X64-NEXT: movaps %xmm0, %xmm1 -; X64-NEXT: andnps (%rsi), %xmm1 -; X64-NEXT: andps (%rdi), %xmm0 -; X64-NEXT: orps %xmm1, %xmm0 +; X64-NEXT: movaps (%rdi), %xmm2 +; X64-NEXT: andps %xmm0, %xmm2 +; X64-NEXT: andnps %xmm1, %xmm0 +; X64-NEXT: orps %xmm2, %xmm0 ; X64-NEXT: retq %tmp1 = load <8 x i16>, ptr %A %tmp2 = load <8 x i16>, ptr %B @@ -395,14 +397,14 @@ entry: define <4 x i32> @t17() nounwind { ; X86-LABEL: t17: ; X86: # %bb.0: # %entry -; X86-NEXT: pshufd {{.*#+}} xmm0 = mem[0,1,0,1] -; X86-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0 +; X86-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero +; X86-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,0,1] ; X86-NEXT: retl ; ; X64-LABEL: t17: ; X64: # %bb.0: # %entry -; X64-NEXT: pshufd {{.*#+}} xmm0 = mem[0,1,0,1] -; X64-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; X64-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero +; X64-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,0,1] ; X64-NEXT: retq entry: %tmp1 = load <4 x float>, ptr undef, align 16 diff --git a/llvm/test/CodeGen/X86/sse41.ll b/llvm/test/CodeGen/X86/sse41.ll index 2d7258a49f5d0..07d7f82d80906 100644 --- a/llvm/test/CodeGen/X86/sse41.ll +++ b/llvm/test/CodeGen/X86/sse41.ll @@ -560,46 +560,40 @@ define <4 x float> @insertps_from_shufflevector_1(<4 x float> %a, ptr nocapture ; X86-SSE-LABEL: insertps_from_shufflevector_1: ; X86-SSE: ## %bb.0: ## %entry ; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %eax ## encoding: [0x8b,0x44,0x24,0x04] -; X86-SSE-NEXT: movaps (%eax), %xmm1 ## encoding: [0x0f,0x28,0x08] -; X86-SSE-NEXT: insertps $48, %xmm1, %xmm0 ## encoding: [0x66,0x0f,0x3a,0x21,0xc1,0x30] -; X86-SSE-NEXT: ## xmm0 = xmm0[0,1,2],xmm1[0] +; X86-SSE-NEXT: insertps $48, (%eax), %xmm0 ## encoding: [0x66,0x0f,0x3a,0x21,0x00,0x30] +; X86-SSE-NEXT: ## xmm0 = xmm0[0,1,2],mem[0] ; X86-SSE-NEXT: retl ## encoding: [0xc3] ; ; X86-AVX1-LABEL: insertps_from_shufflevector_1: ; X86-AVX1: ## %bb.0: ## %entry ; X86-AVX1-NEXT: movl {{[0-9]+}}(%esp), %eax ## encoding: [0x8b,0x44,0x24,0x04] -; X86-AVX1-NEXT: vmovaps (%eax), %xmm1 ## encoding: [0xc5,0xf8,0x28,0x08] -; X86-AVX1-NEXT: vinsertps $48, %xmm1, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x21,0xc1,0x30] -; X86-AVX1-NEXT: ## xmm0 = xmm0[0,1,2],xmm1[0] +; X86-AVX1-NEXT: vinsertps $48, (%eax), %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x21,0x00,0x30] +; X86-AVX1-NEXT: ## xmm0 = xmm0[0,1,2],mem[0] ; X86-AVX1-NEXT: retl ## encoding: [0xc3] ; ; X86-AVX512-LABEL: insertps_from_shufflevector_1: ; X86-AVX512: ## %bb.0: ## %entry ; 
X86-AVX512-NEXT: movl {{[0-9]+}}(%esp), %eax ## encoding: [0x8b,0x44,0x24,0x04] -; X86-AVX512-NEXT: vmovaps (%eax), %xmm1 ## EVEX TO VEX Compression encoding: [0xc5,0xf8,0x28,0x08] -; X86-AVX512-NEXT: vinsertps $48, %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe3,0x79,0x21,0xc1,0x30] -; X86-AVX512-NEXT: ## xmm0 = xmm0[0,1,2],xmm1[0] +; X86-AVX512-NEXT: vinsertps $48, (%eax), %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe3,0x79,0x21,0x00,0x30] +; X86-AVX512-NEXT: ## xmm0 = xmm0[0,1,2],mem[0] ; X86-AVX512-NEXT: retl ## encoding: [0xc3] ; ; X64-SSE-LABEL: insertps_from_shufflevector_1: ; X64-SSE: ## %bb.0: ## %entry -; X64-SSE-NEXT: movaps (%rdi), %xmm1 ## encoding: [0x0f,0x28,0x0f] -; X64-SSE-NEXT: insertps $48, %xmm1, %xmm0 ## encoding: [0x66,0x0f,0x3a,0x21,0xc1,0x30] -; X64-SSE-NEXT: ## xmm0 = xmm0[0,1,2],xmm1[0] +; X64-SSE-NEXT: insertps $48, (%rdi), %xmm0 ## encoding: [0x66,0x0f,0x3a,0x21,0x07,0x30] +; X64-SSE-NEXT: ## xmm0 = xmm0[0,1,2],mem[0] ; X64-SSE-NEXT: retq ## encoding: [0xc3] ; ; X64-AVX1-LABEL: insertps_from_shufflevector_1: ; X64-AVX1: ## %bb.0: ## %entry -; X64-AVX1-NEXT: vmovaps (%rdi), %xmm1 ## encoding: [0xc5,0xf8,0x28,0x0f] -; X64-AVX1-NEXT: vinsertps $48, %xmm1, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x21,0xc1,0x30] -; X64-AVX1-NEXT: ## xmm0 = xmm0[0,1,2],xmm1[0] +; X64-AVX1-NEXT: vinsertps $48, (%rdi), %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x21,0x07,0x30] +; X64-AVX1-NEXT: ## xmm0 = xmm0[0,1,2],mem[0] ; X64-AVX1-NEXT: retq ## encoding: [0xc3] ; ; X64-AVX512-LABEL: insertps_from_shufflevector_1: ; X64-AVX512: ## %bb.0: ## %entry -; X64-AVX512-NEXT: vmovaps (%rdi), %xmm1 ## EVEX TO VEX Compression encoding: [0xc5,0xf8,0x28,0x0f] -; X64-AVX512-NEXT: vinsertps $48, %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe3,0x79,0x21,0xc1,0x30] -; X64-AVX512-NEXT: ## xmm0 = xmm0[0,1,2],xmm1[0] +; X64-AVX512-NEXT: vinsertps $48, (%rdi), %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe3,0x79,0x21,0x07,0x30] +; X64-AVX512-NEXT: ## xmm0 = xmm0[0,1,2],mem[0] ; X64-AVX512-NEXT: retq ## encoding: [0xc3] entry: %0 = load <4 x float>, ptr %pb, align 16 @@ -636,8 +630,10 @@ define <4 x i32> @pinsrd_from_shufflevector_i32(<4 x i32> %a, ptr nocapture read ; X86-SSE-LABEL: pinsrd_from_shufflevector_i32: ; X86-SSE: ## %bb.0: ## %entry ; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %eax ## encoding: [0x8b,0x44,0x24,0x04] -; X86-SSE-NEXT: pshufd $0, (%eax), %xmm1 ## encoding: [0x66,0x0f,0x70,0x08,0x00] -; X86-SSE-NEXT: ## xmm1 = mem[0,0,0,0] +; X86-SSE-NEXT: movd (%eax), %xmm1 ## xmm1 = mem[0],zero,zero,zero +; X86-SSE-NEXT: ## encoding: [0x66,0x0f,0x6e,0x08] +; X86-SSE-NEXT: pshufd $0, %xmm1, %xmm1 ## encoding: [0x66,0x0f,0x70,0xc9,0x00] +; X86-SSE-NEXT: ## xmm1 = xmm1[0,0,0,0] ; X86-SSE-NEXT: pblendw $192, %xmm1, %xmm0 ## encoding: [0x66,0x0f,0x3a,0x0e,0xc1,0xc0] ; X86-SSE-NEXT: ## xmm0 = xmm0[0,1,2,3,4,5],xmm1[6,7] ; X86-SSE-NEXT: retl ## encoding: [0xc3] @@ -660,8 +656,10 @@ define <4 x i32> @pinsrd_from_shufflevector_i32(<4 x i32> %a, ptr nocapture read ; ; X64-SSE-LABEL: pinsrd_from_shufflevector_i32: ; X64-SSE: ## %bb.0: ## %entry -; X64-SSE-NEXT: pshufd $0, (%rdi), %xmm1 ## encoding: [0x66,0x0f,0x70,0x0f,0x00] -; X64-SSE-NEXT: ## xmm1 = mem[0,0,0,0] +; X64-SSE-NEXT: movd (%rdi), %xmm1 ## xmm1 = mem[0],zero,zero,zero +; X64-SSE-NEXT: ## encoding: [0x66,0x0f,0x6e,0x0f] +; X64-SSE-NEXT: pshufd $0, %xmm1, %xmm1 ## encoding: [0x66,0x0f,0x70,0xc9,0x00] +; X64-SSE-NEXT: ## xmm1 = xmm1[0,0,0,0] ; X64-SSE-NEXT: pblendw $192, %xmm1, %xmm0 ## 
encoding: [0x66,0x0f,0x3a,0x0e,0xc1,0xc0] ; X64-SSE-NEXT: ## xmm0 = xmm0[0,1,2,3,4,5],xmm1[6,7] ; X64-SSE-NEXT: retq ## encoding: [0xc3] @@ -1372,46 +1370,40 @@ define <4 x float> @insertps_from_vector_load(<4 x float> %a, ptr nocapture read ; X86-SSE-LABEL: insertps_from_vector_load: ; X86-SSE: ## %bb.0: ; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %eax ## encoding: [0x8b,0x44,0x24,0x04] -; X86-SSE-NEXT: movaps (%eax), %xmm1 ## encoding: [0x0f,0x28,0x08] -; X86-SSE-NEXT: insertps $48, %xmm1, %xmm0 ## encoding: [0x66,0x0f,0x3a,0x21,0xc1,0x30] -; X86-SSE-NEXT: ## xmm0 = xmm0[0,1,2],xmm1[0] +; X86-SSE-NEXT: insertps $48, (%eax), %xmm0 ## encoding: [0x66,0x0f,0x3a,0x21,0x00,0x30] +; X86-SSE-NEXT: ## xmm0 = xmm0[0,1,2],mem[0] ; X86-SSE-NEXT: retl ## encoding: [0xc3] ; ; X86-AVX1-LABEL: insertps_from_vector_load: ; X86-AVX1: ## %bb.0: ; X86-AVX1-NEXT: movl {{[0-9]+}}(%esp), %eax ## encoding: [0x8b,0x44,0x24,0x04] -; X86-AVX1-NEXT: vmovaps (%eax), %xmm1 ## encoding: [0xc5,0xf8,0x28,0x08] -; X86-AVX1-NEXT: vinsertps $48, %xmm1, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x21,0xc1,0x30] -; X86-AVX1-NEXT: ## xmm0 = xmm0[0,1,2],xmm1[0] +; X86-AVX1-NEXT: vinsertps $48, (%eax), %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x21,0x00,0x30] +; X86-AVX1-NEXT: ## xmm0 = xmm0[0,1,2],mem[0] ; X86-AVX1-NEXT: retl ## encoding: [0xc3] ; ; X86-AVX512-LABEL: insertps_from_vector_load: ; X86-AVX512: ## %bb.0: ; X86-AVX512-NEXT: movl {{[0-9]+}}(%esp), %eax ## encoding: [0x8b,0x44,0x24,0x04] -; X86-AVX512-NEXT: vmovaps (%eax), %xmm1 ## EVEX TO VEX Compression encoding: [0xc5,0xf8,0x28,0x08] -; X86-AVX512-NEXT: vinsertps $48, %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe3,0x79,0x21,0xc1,0x30] -; X86-AVX512-NEXT: ## xmm0 = xmm0[0,1,2],xmm1[0] +; X86-AVX512-NEXT: vinsertps $48, (%eax), %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe3,0x79,0x21,0x00,0x30] +; X86-AVX512-NEXT: ## xmm0 = xmm0[0,1,2],mem[0] ; X86-AVX512-NEXT: retl ## encoding: [0xc3] ; ; X64-SSE-LABEL: insertps_from_vector_load: ; X64-SSE: ## %bb.0: -; X64-SSE-NEXT: movaps (%rdi), %xmm1 ## encoding: [0x0f,0x28,0x0f] -; X64-SSE-NEXT: insertps $48, %xmm1, %xmm0 ## encoding: [0x66,0x0f,0x3a,0x21,0xc1,0x30] -; X64-SSE-NEXT: ## xmm0 = xmm0[0,1,2],xmm1[0] +; X64-SSE-NEXT: insertps $48, (%rdi), %xmm0 ## encoding: [0x66,0x0f,0x3a,0x21,0x07,0x30] +; X64-SSE-NEXT: ## xmm0 = xmm0[0,1,2],mem[0] ; X64-SSE-NEXT: retq ## encoding: [0xc3] ; ; X64-AVX1-LABEL: insertps_from_vector_load: ; X64-AVX1: ## %bb.0: -; X64-AVX1-NEXT: vmovaps (%rdi), %xmm1 ## encoding: [0xc5,0xf8,0x28,0x0f] -; X64-AVX1-NEXT: vinsertps $48, %xmm1, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x21,0xc1,0x30] -; X64-AVX1-NEXT: ## xmm0 = xmm0[0,1,2],xmm1[0] +; X64-AVX1-NEXT: vinsertps $48, (%rdi), %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x21,0x07,0x30] +; X64-AVX1-NEXT: ## xmm0 = xmm0[0,1,2],mem[0] ; X64-AVX1-NEXT: retq ## encoding: [0xc3] ; ; X64-AVX512-LABEL: insertps_from_vector_load: ; X64-AVX512: ## %bb.0: -; X64-AVX512-NEXT: vmovaps (%rdi), %xmm1 ## EVEX TO VEX Compression encoding: [0xc5,0xf8,0x28,0x0f] -; X64-AVX512-NEXT: vinsertps $48, %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe3,0x79,0x21,0xc1,0x30] -; X64-AVX512-NEXT: ## xmm0 = xmm0[0,1,2],xmm1[0] +; X64-AVX512-NEXT: vinsertps $48, (%rdi), %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe3,0x79,0x21,0x07,0x30] +; X64-AVX512-NEXT: ## xmm0 = xmm0[0,1,2],mem[0] ; X64-AVX512-NEXT: retq ## encoding: [0xc3] %1 = load <4 x float>, ptr %pb, align 16 %2 = tail call <4 x float> 
@llvm.x86.sse41.insertps(<4 x float> %a, <4 x float> %1, i32 48) @@ -1424,46 +1416,40 @@ define <4 x float> @insertps_from_vector_load_offset(<4 x float> %a, ptr nocaptu ; X86-SSE-LABEL: insertps_from_vector_load_offset: ; X86-SSE: ## %bb.0: ; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %eax ## encoding: [0x8b,0x44,0x24,0x04] -; X86-SSE-NEXT: movaps (%eax), %xmm1 ## encoding: [0x0f,0x28,0x08] -; X86-SSE-NEXT: insertps $96, %xmm1, %xmm0 ## encoding: [0x66,0x0f,0x3a,0x21,0xc1,0x60] -; X86-SSE-NEXT: ## xmm0 = xmm0[0,1],xmm1[1],xmm0[3] +; X86-SSE-NEXT: insertps $32, 4(%eax), %xmm0 ## encoding: [0x66,0x0f,0x3a,0x21,0x40,0x04,0x20] +; X86-SSE-NEXT: ## xmm0 = xmm0[0,1],mem[0],xmm0[3] ; X86-SSE-NEXT: retl ## encoding: [0xc3] ; ; X86-AVX1-LABEL: insertps_from_vector_load_offset: ; X86-AVX1: ## %bb.0: ; X86-AVX1-NEXT: movl {{[0-9]+}}(%esp), %eax ## encoding: [0x8b,0x44,0x24,0x04] -; X86-AVX1-NEXT: vmovaps (%eax), %xmm1 ## encoding: [0xc5,0xf8,0x28,0x08] -; X86-AVX1-NEXT: vinsertps $96, %xmm1, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x21,0xc1,0x60] -; X86-AVX1-NEXT: ## xmm0 = xmm0[0,1],xmm1[1],xmm0[3] +; X86-AVX1-NEXT: vinsertps $32, 4(%eax), %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x21,0x40,0x04,0x20] +; X86-AVX1-NEXT: ## xmm0 = xmm0[0,1],mem[0],xmm0[3] ; X86-AVX1-NEXT: retl ## encoding: [0xc3] ; ; X86-AVX512-LABEL: insertps_from_vector_load_offset: ; X86-AVX512: ## %bb.0: ; X86-AVX512-NEXT: movl {{[0-9]+}}(%esp), %eax ## encoding: [0x8b,0x44,0x24,0x04] -; X86-AVX512-NEXT: vmovaps (%eax), %xmm1 ## EVEX TO VEX Compression encoding: [0xc5,0xf8,0x28,0x08] -; X86-AVX512-NEXT: vinsertps $96, %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe3,0x79,0x21,0xc1,0x60] -; X86-AVX512-NEXT: ## xmm0 = xmm0[0,1],xmm1[1],xmm0[3] +; X86-AVX512-NEXT: vinsertps $32, 4(%eax), %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe3,0x79,0x21,0x40,0x04,0x20] +; X86-AVX512-NEXT: ## xmm0 = xmm0[0,1],mem[0],xmm0[3] ; X86-AVX512-NEXT: retl ## encoding: [0xc3] ; ; X64-SSE-LABEL: insertps_from_vector_load_offset: ; X64-SSE: ## %bb.0: -; X64-SSE-NEXT: movaps (%rdi), %xmm1 ## encoding: [0x0f,0x28,0x0f] -; X64-SSE-NEXT: insertps $96, %xmm1, %xmm0 ## encoding: [0x66,0x0f,0x3a,0x21,0xc1,0x60] -; X64-SSE-NEXT: ## xmm0 = xmm0[0,1],xmm1[1],xmm0[3] +; X64-SSE-NEXT: insertps $32, 4(%rdi), %xmm0 ## encoding: [0x66,0x0f,0x3a,0x21,0x47,0x04,0x20] +; X64-SSE-NEXT: ## xmm0 = xmm0[0,1],mem[0],xmm0[3] ; X64-SSE-NEXT: retq ## encoding: [0xc3] ; ; X64-AVX1-LABEL: insertps_from_vector_load_offset: ; X64-AVX1: ## %bb.0: -; X64-AVX1-NEXT: vmovaps (%rdi), %xmm1 ## encoding: [0xc5,0xf8,0x28,0x0f] -; X64-AVX1-NEXT: vinsertps $96, %xmm1, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x21,0xc1,0x60] -; X64-AVX1-NEXT: ## xmm0 = xmm0[0,1],xmm1[1],xmm0[3] +; X64-AVX1-NEXT: vinsertps $32, 4(%rdi), %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x21,0x47,0x04,0x20] +; X64-AVX1-NEXT: ## xmm0 = xmm0[0,1],mem[0],xmm0[3] ; X64-AVX1-NEXT: retq ## encoding: [0xc3] ; ; X64-AVX512-LABEL: insertps_from_vector_load_offset: ; X64-AVX512: ## %bb.0: -; X64-AVX512-NEXT: vmovaps (%rdi), %xmm1 ## EVEX TO VEX Compression encoding: [0xc5,0xf8,0x28,0x0f] -; X64-AVX512-NEXT: vinsertps $96, %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe3,0x79,0x21,0xc1,0x60] -; X64-AVX512-NEXT: ## xmm0 = xmm0[0,1],xmm1[1],xmm0[3] +; X64-AVX512-NEXT: vinsertps $32, 4(%rdi), %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe3,0x79,0x21,0x47,0x04,0x20] +; X64-AVX512-NEXT: ## xmm0 = xmm0[0,1],mem[0],xmm0[3] ; X64-AVX512-NEXT: retq ## encoding: [0xc3] %1 = load 
<4 x float>, ptr %pb, align 16 %2 = tail call <4 x float> @llvm.x86.sse41.insertps(<4 x float> %a, <4 x float> %1, i32 96) @@ -1477,9 +1463,10 @@ define <4 x float> @insertps_from_vector_load_offset_2(<4 x float> %a, ptr nocap ; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %eax ## encoding: [0x8b,0x44,0x24,0x04] ; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %ecx ## encoding: [0x8b,0x4c,0x24,0x08] ; X86-SSE-NEXT: shll $4, %ecx ## encoding: [0xc1,0xe1,0x04] -; X86-SSE-NEXT: movaps (%eax,%ecx), %xmm1 ## encoding: [0x0f,0x28,0x0c,0x08] -; X86-SSE-NEXT: insertps $192, %xmm1, %xmm0 ## encoding: [0x66,0x0f,0x3a,0x21,0xc1,0xc0] -; X86-SSE-NEXT: ## xmm0 = xmm1[3],xmm0[1,2,3] +; X86-SSE-NEXT: movss 12(%eax,%ecx), %xmm1 ## xmm1 = mem[0],zero,zero,zero +; X86-SSE-NEXT: ## encoding: [0xf3,0x0f,0x10,0x4c,0x08,0x0c] +; X86-SSE-NEXT: blendps $1, %xmm1, %xmm0 ## encoding: [0x66,0x0f,0x3a,0x0c,0xc1,0x01] +; X86-SSE-NEXT: ## xmm0 = xmm1[0],xmm0[1,2,3] ; X86-SSE-NEXT: retl ## encoding: [0xc3] ; ; X86-AVX1-LABEL: insertps_from_vector_load_offset_2: @@ -1487,9 +1474,10 @@ define <4 x float> @insertps_from_vector_load_offset_2(<4 x float> %a, ptr nocap ; X86-AVX1-NEXT: movl {{[0-9]+}}(%esp), %eax ## encoding: [0x8b,0x44,0x24,0x04] ; X86-AVX1-NEXT: movl {{[0-9]+}}(%esp), %ecx ## encoding: [0x8b,0x4c,0x24,0x08] ; X86-AVX1-NEXT: shll $4, %ecx ## encoding: [0xc1,0xe1,0x04] -; X86-AVX1-NEXT: vmovaps (%eax,%ecx), %xmm1 ## encoding: [0xc5,0xf8,0x28,0x0c,0x08] -; X86-AVX1-NEXT: vinsertps $192, %xmm1, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x21,0xc1,0xc0] -; X86-AVX1-NEXT: ## xmm0 = xmm1[3],xmm0[1,2,3] +; X86-AVX1-NEXT: vmovss 12(%eax,%ecx), %xmm1 ## xmm1 = mem[0],zero,zero,zero +; X86-AVX1-NEXT: ## encoding: [0xc5,0xfa,0x10,0x4c,0x08,0x0c] +; X86-AVX1-NEXT: vblendps $1, %xmm1, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x0c,0xc1,0x01] +; X86-AVX1-NEXT: ## xmm0 = xmm1[0],xmm0[1,2,3] ; X86-AVX1-NEXT: retl ## encoding: [0xc3] ; ; X86-AVX512-LABEL: insertps_from_vector_load_offset_2: @@ -1497,33 +1485,37 @@ define <4 x float> @insertps_from_vector_load_offset_2(<4 x float> %a, ptr nocap ; X86-AVX512-NEXT: movl {{[0-9]+}}(%esp), %eax ## encoding: [0x8b,0x44,0x24,0x04] ; X86-AVX512-NEXT: movl {{[0-9]+}}(%esp), %ecx ## encoding: [0x8b,0x4c,0x24,0x08] ; X86-AVX512-NEXT: shll $4, %ecx ## encoding: [0xc1,0xe1,0x04] -; X86-AVX512-NEXT: vmovaps (%eax,%ecx), %xmm1 ## EVEX TO VEX Compression encoding: [0xc5,0xf8,0x28,0x0c,0x08] -; X86-AVX512-NEXT: vinsertps $192, %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe3,0x79,0x21,0xc1,0xc0] -; X86-AVX512-NEXT: ## xmm0 = xmm1[3],xmm0[1,2,3] +; X86-AVX512-NEXT: vmovss 12(%eax,%ecx), %xmm1 ## xmm1 = mem[0],zero,zero,zero +; X86-AVX512-NEXT: ## EVEX TO VEX Compression encoding: [0xc5,0xfa,0x10,0x4c,0x08,0x0c] +; X86-AVX512-NEXT: vblendps $1, %xmm1, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x0c,0xc1,0x01] +; X86-AVX512-NEXT: ## xmm0 = xmm1[0],xmm0[1,2,3] ; X86-AVX512-NEXT: retl ## encoding: [0xc3] ; ; X64-SSE-LABEL: insertps_from_vector_load_offset_2: ; X64-SSE: ## %bb.0: ; X64-SSE-NEXT: shlq $4, %rsi ## encoding: [0x48,0xc1,0xe6,0x04] -; X64-SSE-NEXT: movaps (%rdi,%rsi), %xmm1 ## encoding: [0x0f,0x28,0x0c,0x37] -; X64-SSE-NEXT: insertps $192, %xmm1, %xmm0 ## encoding: [0x66,0x0f,0x3a,0x21,0xc1,0xc0] -; X64-SSE-NEXT: ## xmm0 = xmm1[3],xmm0[1,2,3] +; X64-SSE-NEXT: movss 12(%rdi,%rsi), %xmm1 ## xmm1 = mem[0],zero,zero,zero +; X64-SSE-NEXT: ## encoding: [0xf3,0x0f,0x10,0x4c,0x37,0x0c] +; X64-SSE-NEXT: blendps $1, %xmm1, %xmm0 ## encoding: [0x66,0x0f,0x3a,0x0c,0xc1,0x01] +; X64-SSE-NEXT: ## 
xmm0 = xmm1[0],xmm0[1,2,3] ; X64-SSE-NEXT: retq ## encoding: [0xc3] ; ; X64-AVX1-LABEL: insertps_from_vector_load_offset_2: ; X64-AVX1: ## %bb.0: ; X64-AVX1-NEXT: shlq $4, %rsi ## encoding: [0x48,0xc1,0xe6,0x04] -; X64-AVX1-NEXT: vmovaps (%rdi,%rsi), %xmm1 ## encoding: [0xc5,0xf8,0x28,0x0c,0x37] -; X64-AVX1-NEXT: vinsertps $192, %xmm1, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x21,0xc1,0xc0] -; X64-AVX1-NEXT: ## xmm0 = xmm1[3],xmm0[1,2,3] +; X64-AVX1-NEXT: vmovss 12(%rdi,%rsi), %xmm1 ## xmm1 = mem[0],zero,zero,zero +; X64-AVX1-NEXT: ## encoding: [0xc5,0xfa,0x10,0x4c,0x37,0x0c] +; X64-AVX1-NEXT: vblendps $1, %xmm1, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x0c,0xc1,0x01] +; X64-AVX1-NEXT: ## xmm0 = xmm1[0],xmm0[1,2,3] ; X64-AVX1-NEXT: retq ## encoding: [0xc3] ; ; X64-AVX512-LABEL: insertps_from_vector_load_offset_2: ; X64-AVX512: ## %bb.0: ; X64-AVX512-NEXT: shlq $4, %rsi ## encoding: [0x48,0xc1,0xe6,0x04] -; X64-AVX512-NEXT: vmovaps (%rdi,%rsi), %xmm1 ## EVEX TO VEX Compression encoding: [0xc5,0xf8,0x28,0x0c,0x37] -; X64-AVX512-NEXT: vinsertps $192, %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe3,0x79,0x21,0xc1,0xc0] -; X64-AVX512-NEXT: ## xmm0 = xmm1[3],xmm0[1,2,3] +; X64-AVX512-NEXT: vmovss 12(%rdi,%rsi), %xmm1 ## xmm1 = mem[0],zero,zero,zero +; X64-AVX512-NEXT: ## EVEX TO VEX Compression encoding: [0xc5,0xfa,0x10,0x4c,0x37,0x0c] +; X64-AVX512-NEXT: vblendps $1, %xmm1, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x0c,0xc1,0x01] +; X64-AVX512-NEXT: ## xmm0 = xmm1[0],xmm0[1,2,3] ; X64-AVX512-NEXT: retq ## encoding: [0xc3] %1 = getelementptr inbounds <4 x float>, ptr %pb, i64 %index %2 = load <4 x float>, ptr %1, align 16 @@ -1587,9 +1579,8 @@ define <4 x float> @insertps_from_broadcast_loadv4f32(<4 x float> %a, ptr nocapt ; X86-SSE-LABEL: insertps_from_broadcast_loadv4f32: ; X86-SSE: ## %bb.0: ; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %eax ## encoding: [0x8b,0x44,0x24,0x04] -; X86-SSE-NEXT: movups (%eax), %xmm1 ## encoding: [0x0f,0x10,0x08] -; X86-SSE-NEXT: insertps $48, %xmm1, %xmm0 ## encoding: [0x66,0x0f,0x3a,0x21,0xc1,0x30] -; X86-SSE-NEXT: ## xmm0 = xmm0[0,1,2],xmm1[0] +; X86-SSE-NEXT: insertps $48, (%eax), %xmm0 ## encoding: [0x66,0x0f,0x3a,0x21,0x00,0x30] +; X86-SSE-NEXT: ## xmm0 = xmm0[0,1,2],mem[0] ; X86-SSE-NEXT: retl ## encoding: [0xc3] ; ; X86-AVX1-LABEL: insertps_from_broadcast_loadv4f32: @@ -1608,9 +1599,8 @@ define <4 x float> @insertps_from_broadcast_loadv4f32(<4 x float> %a, ptr nocapt ; ; X64-SSE-LABEL: insertps_from_broadcast_loadv4f32: ; X64-SSE: ## %bb.0: -; X64-SSE-NEXT: movups (%rdi), %xmm1 ## encoding: [0x0f,0x10,0x0f] -; X64-SSE-NEXT: insertps $48, %xmm1, %xmm0 ## encoding: [0x66,0x0f,0x3a,0x21,0xc1,0x30] -; X64-SSE-NEXT: ## xmm0 = xmm0[0,1,2],xmm1[0] +; X64-SSE-NEXT: insertps $48, (%rdi), %xmm0 ## encoding: [0x66,0x0f,0x3a,0x21,0x07,0x30] +; X64-SSE-NEXT: ## xmm0 = xmm0[0,1,2],mem[0] ; X64-SSE-NEXT: retq ## encoding: [0xc3] ; ; X64-AVX1-LABEL: insertps_from_broadcast_loadv4f32: @@ -1819,46 +1809,40 @@ define <4 x float> @pr20087(<4 x float> %a, ptr%ptr) { ; X86-SSE-LABEL: pr20087: ; X86-SSE: ## %bb.0: ; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %eax ## encoding: [0x8b,0x44,0x24,0x04] -; X86-SSE-NEXT: movaps (%eax), %xmm1 ## encoding: [0x0f,0x28,0x08] -; X86-SSE-NEXT: insertps $178, %xmm1, %xmm0 ## encoding: [0x66,0x0f,0x3a,0x21,0xc1,0xb2] -; X86-SSE-NEXT: ## xmm0 = xmm0[0],zero,xmm0[2],xmm1[2] +; X86-SSE-NEXT: insertps $50, 8(%eax), %xmm0 ## encoding: [0x66,0x0f,0x3a,0x21,0x40,0x08,0x32] +; X86-SSE-NEXT: ## xmm0 = xmm0[0],zero,xmm0[2],mem[0] ; 
X86-SSE-NEXT: retl ## encoding: [0xc3] ; ; X86-AVX1-LABEL: pr20087: ; X86-AVX1: ## %bb.0: ; X86-AVX1-NEXT: movl {{[0-9]+}}(%esp), %eax ## encoding: [0x8b,0x44,0x24,0x04] -; X86-AVX1-NEXT: vmovaps (%eax), %xmm1 ## encoding: [0xc5,0xf8,0x28,0x08] -; X86-AVX1-NEXT: vinsertps $178, %xmm1, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x21,0xc1,0xb2] -; X86-AVX1-NEXT: ## xmm0 = xmm0[0],zero,xmm0[2],xmm1[2] +; X86-AVX1-NEXT: vinsertps $50, 8(%eax), %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x21,0x40,0x08,0x32] +; X86-AVX1-NEXT: ## xmm0 = xmm0[0],zero,xmm0[2],mem[0] ; X86-AVX1-NEXT: retl ## encoding: [0xc3] ; ; X86-AVX512-LABEL: pr20087: ; X86-AVX512: ## %bb.0: ; X86-AVX512-NEXT: movl {{[0-9]+}}(%esp), %eax ## encoding: [0x8b,0x44,0x24,0x04] -; X86-AVX512-NEXT: vmovaps (%eax), %xmm1 ## EVEX TO VEX Compression encoding: [0xc5,0xf8,0x28,0x08] -; X86-AVX512-NEXT: vinsertps $178, %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe3,0x79,0x21,0xc1,0xb2] -; X86-AVX512-NEXT: ## xmm0 = xmm0[0],zero,xmm0[2],xmm1[2] +; X86-AVX512-NEXT: vinsertps $50, 8(%eax), %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe3,0x79,0x21,0x40,0x08,0x32] +; X86-AVX512-NEXT: ## xmm0 = xmm0[0],zero,xmm0[2],mem[0] ; X86-AVX512-NEXT: retl ## encoding: [0xc3] ; ; X64-SSE-LABEL: pr20087: ; X64-SSE: ## %bb.0: -; X64-SSE-NEXT: movaps (%rdi), %xmm1 ## encoding: [0x0f,0x28,0x0f] -; X64-SSE-NEXT: insertps $178, %xmm1, %xmm0 ## encoding: [0x66,0x0f,0x3a,0x21,0xc1,0xb2] -; X64-SSE-NEXT: ## xmm0 = xmm0[0],zero,xmm0[2],xmm1[2] +; X64-SSE-NEXT: insertps $50, 8(%rdi), %xmm0 ## encoding: [0x66,0x0f,0x3a,0x21,0x47,0x08,0x32] +; X64-SSE-NEXT: ## xmm0 = xmm0[0],zero,xmm0[2],mem[0] ; X64-SSE-NEXT: retq ## encoding: [0xc3] ; ; X64-AVX1-LABEL: pr20087: ; X64-AVX1: ## %bb.0: -; X64-AVX1-NEXT: vmovaps (%rdi), %xmm1 ## encoding: [0xc5,0xf8,0x28,0x0f] -; X64-AVX1-NEXT: vinsertps $178, %xmm1, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x21,0xc1,0xb2] -; X64-AVX1-NEXT: ## xmm0 = xmm0[0],zero,xmm0[2],xmm1[2] +; X64-AVX1-NEXT: vinsertps $50, 8(%rdi), %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x21,0x47,0x08,0x32] +; X64-AVX1-NEXT: ## xmm0 = xmm0[0],zero,xmm0[2],mem[0] ; X64-AVX1-NEXT: retq ## encoding: [0xc3] ; ; X64-AVX512-LABEL: pr20087: ; X64-AVX512: ## %bb.0: -; X64-AVX512-NEXT: vmovaps (%rdi), %xmm1 ## EVEX TO VEX Compression encoding: [0xc5,0xf8,0x28,0x0f] -; X64-AVX512-NEXT: vinsertps $178, %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe3,0x79,0x21,0xc1,0xb2] -; X64-AVX512-NEXT: ## xmm0 = xmm0[0],zero,xmm0[2],xmm1[2] +; X64-AVX512-NEXT: vinsertps $50, 8(%rdi), %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe3,0x79,0x21,0x47,0x08,0x32] +; X64-AVX512-NEXT: ## xmm0 = xmm0[0],zero,xmm0[2],mem[0] ; X64-AVX512-NEXT: retq ## encoding: [0xc3] %load = load <4 x float> , ptr%ptr %ret = shufflevector <4 x float> %load, <4 x float> %a, <4 x i32> diff --git a/llvm/test/CodeGen/X86/strict-fsub-combines.ll b/llvm/test/CodeGen/X86/strict-fsub-combines.ll index 774ea02ccd87a..be491bc330129 100644 --- a/llvm/test/CodeGen/X86/strict-fsub-combines.ll +++ b/llvm/test/CodeGen/X86/strict-fsub-combines.ll @@ -8,9 +8,10 @@ define float @fneg_strict_fsub_to_strict_fadd(float %x, float %y) nounwind stric ; X86: # %bb.0: ; X86-NEXT: pushl %eax ; X86-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero -; X86-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero -; X86-NEXT: xorps {{\.?LCPI[0-9]+_[0-9]+}}, %xmm1 -; X86-NEXT: subss %xmm1, %xmm0 +; X86-NEXT: movss {{.*#+}} xmm1 = [-0.0E+0,0.0E+0,0.0E+0,0.0E+0] +; X86-NEXT: movss {{.*#+}} 
xmm2 = mem[0],zero,zero,zero +; X86-NEXT: xorps %xmm1, %xmm2 +; X86-NEXT: subss %xmm2, %xmm0 ; X86-NEXT: movss %xmm0, (%esp) ; X86-NEXT: flds (%esp) ; X86-NEXT: wait @@ -19,8 +20,9 @@ define float @fneg_strict_fsub_to_strict_fadd(float %x, float %y) nounwind stric ; ; X64-LABEL: fneg_strict_fsub_to_strict_fadd: ; X64: # %bb.0: -; X64-NEXT: xorps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 -; X64-NEXT: subss %xmm1, %xmm0 +; X64-NEXT: movss {{.*#+}} xmm2 = [-0.0E+0,0.0E+0,0.0E+0,0.0E+0] +; X64-NEXT: xorps %xmm1, %xmm2 +; X64-NEXT: subss %xmm2, %xmm0 ; X64-NEXT: retq %neg = fneg float %y %sub = call float @llvm.experimental.constrained.fsub.f32(float %x, float %neg, metadata!"round.dynamic", metadata!"fpexcept.strict") @@ -48,8 +50,9 @@ define double @fneg_strict_fsub_to_strict_fadd_d(double %x, double %y) nounwind ; ; X64-LABEL: fneg_strict_fsub_to_strict_fadd_d: ; X64: # %bb.0: -; X64-NEXT: xorpd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 -; X64-NEXT: subsd %xmm1, %xmm0 +; X64-NEXT: movsd {{.*#+}} xmm2 = [-0.0E+0,0.0E+0] +; X64-NEXT: xorpd %xmm1, %xmm2 +; X64-NEXT: subsd %xmm2, %xmm0 ; X64-NEXT: retq %neg = fneg double %y %sub = call double @llvm.experimental.constrained.fsub.f64(double %x, double %neg, metadata!"round.dynamic", metadata!"fpexcept.strict") @@ -63,8 +66,9 @@ define float @strict_fsub_fneg_to_strict_fsub(float %x, float %y) nounwind stric ; X86-NEXT: pushl %eax ; X86-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero ; X86-NEXT: subss {{[0-9]+}}(%esp), %xmm0 -; X86-NEXT: xorps {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0 -; X86-NEXT: movss %xmm0, (%esp) +; X86-NEXT: movss {{.*#+}} xmm1 = [-0.0E+0,0.0E+0,0.0E+0,0.0E+0] +; X86-NEXT: xorps %xmm0, %xmm1 +; X86-NEXT: movss %xmm1, (%esp) ; X86-NEXT: flds (%esp) ; X86-NEXT: wait ; X86-NEXT: popl %eax @@ -73,7 +77,8 @@ define float @strict_fsub_fneg_to_strict_fsub(float %x, float %y) nounwind stric ; X64-LABEL: strict_fsub_fneg_to_strict_fsub: ; X64: # %bb.0: ; X64-NEXT: subss %xmm1, %xmm0 -; X64-NEXT: xorps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; X64-NEXT: movss {{.*#+}} xmm1 = [-0.0E+0,0.0E+0,0.0E+0,0.0E+0] +; X64-NEXT: xorps %xmm1, %xmm0 ; X64-NEXT: retq %sub = call float @llvm.experimental.constrained.fsub.f32(float %x, float %y, metadata!"round.dynamic", metadata!"fpexcept.strict") %neg = fneg float %sub @@ -101,7 +106,8 @@ define double @strict_fsub_fneg_to_strict_fsub_d(double %x, double %y) nounwind ; X64-LABEL: strict_fsub_fneg_to_strict_fsub_d: ; X64: # %bb.0: ; X64-NEXT: subsd %xmm1, %xmm0 -; X64-NEXT: xorpd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; X64-NEXT: movsd {{.*#+}} xmm1 = [-0.0E+0,0.0E+0] +; X64-NEXT: xorpd %xmm1, %xmm0 ; X64-NEXT: retq %sub = call double @llvm.experimental.constrained.fsub.f64(double %x, double %y, metadata!"round.dynamic", metadata!"fpexcept.strict") %neg = fneg double %sub diff --git a/llvm/test/CodeGen/X86/subvector-broadcast.ll b/llvm/test/CodeGen/X86/subvector-broadcast.ll index 76183ac5f8fa3..8d227493f3bbb 100644 --- a/llvm/test/CodeGen/X86/subvector-broadcast.ll +++ b/llvm/test/CodeGen/X86/subvector-broadcast.ll @@ -1647,13 +1647,13 @@ define <4 x double> @broadcast_v4f64_v2f64_4u61(ptr %vp, <4 x double> %default) ; X86-LABEL: broadcast_v4f64_v2f64_4u61: ; X86: # %bb.0: ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: vinsertf128 $1, (%eax), %ymm0, %ymm1 +; X86-NEXT: vbroadcastsd 8(%eax), %ymm1 ; X86-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] ; X86-NEXT: retl ; ; X64-LABEL: broadcast_v4f64_v2f64_4u61: ; X64: # %bb.0: -; X64-NEXT: vinsertf128 $1, (%rdi), %ymm0, %ymm1 +; X64-NEXT: vbroadcastsd 
8(%rdi), %ymm1 ; X64-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] ; X64-NEXT: retq %vec = load <2 x double>, ptr %vp diff --git a/llvm/test/CodeGen/X86/test-shrink-bug.ll b/llvm/test/CodeGen/X86/test-shrink-bug.ll index 953a0d65c5386..029c76a9f3ad3 100644 --- a/llvm/test/CodeGen/X86/test-shrink-bug.ll +++ b/llvm/test/CodeGen/X86/test-shrink-bug.ll @@ -66,8 +66,11 @@ define dso_local void @fail(i16 %a, <2 x i8> %b) { ; CHECK-X64-LABEL: fail: ; CHECK-X64: # %bb.0: ; CHECK-X64-NEXT: pslld $8, %xmm0 -; CHECK-X64-NEXT: pcmpeqb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; CHECK-X64-NEXT: pextrw $1, %xmm0, %eax +; CHECK-X64-NEXT: movzbl {{\.?LCPI[0-9]+_[0-9]+}}+2(%rip), %eax +; CHECK-X64-NEXT: movd %eax, %xmm1 +; CHECK-X64-NEXT: pslld $16, %xmm1 +; CHECK-X64-NEXT: pcmpeqb %xmm0, %xmm1 +; CHECK-X64-NEXT: pextrw $1, %xmm1, %eax ; CHECK-X64-NEXT: xorb $1, %al ; CHECK-X64-NEXT: testl $263, %edi # imm = 0x107 ; CHECK-X64-NEXT: setne %cl diff --git a/llvm/test/CodeGen/X86/tuning-shuffle-unpckpd-avx512.ll b/llvm/test/CodeGen/X86/tuning-shuffle-unpckpd-avx512.ll index 5ea991f85523e..e78b5e19c5dc2 100644 --- a/llvm/test/CodeGen/X86/tuning-shuffle-unpckpd-avx512.ll +++ b/llvm/test/CodeGen/X86/tuning-shuffle-unpckpd-avx512.ll @@ -663,70 +663,20 @@ define <8 x float> @transform_VUNPCKHPDYrm(<8 x float> %a, ptr %pb) nounwind { } define <4 x float> @transform_VUNPCKLPDrm(<4 x float> %a, ptr %pb) nounwind { -; CHECK-SKX-LABEL: transform_VUNPCKLPDrm: -; CHECK-SKX: # %bb.0: -; CHECK-SKX-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0],mem[0] -; CHECK-SKX-NEXT: retq -; -; CHECK-ICX-NO-BYPASS-DELAY-LABEL: transform_VUNPCKLPDrm: -; CHECK-ICX-NO-BYPASS-DELAY: # %bb.0: -; CHECK-ICX-NO-BYPASS-DELAY-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],mem[0] -; CHECK-ICX-NO-BYPASS-DELAY-NEXT: retq -; -; CHECK-ICX-BYPASS-DELAY-LABEL: transform_VUNPCKLPDrm: -; CHECK-ICX-BYPASS-DELAY: # %bb.0: -; CHECK-ICX-BYPASS-DELAY-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0],mem[0] -; CHECK-ICX-BYPASS-DELAY-NEXT: retq -; -; CHECK-V4-LABEL: transform_VUNPCKLPDrm: -; CHECK-V4: # %bb.0: -; CHECK-V4-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0],mem[0] -; CHECK-V4-NEXT: retq -; -; CHECK-AVX512-LABEL: transform_VUNPCKLPDrm: -; CHECK-AVX512: # %bb.0: -; CHECK-AVX512-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0],mem[0] -; CHECK-AVX512-NEXT: retq -; -; CHECK-ZNVER4-LABEL: transform_VUNPCKLPDrm: -; CHECK-ZNVER4: # %bb.0: -; CHECK-ZNVER4-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0],mem[0] -; CHECK-ZNVER4-NEXT: retq +; CHECK-LABEL: transform_VUNPCKLPDrm: +; CHECK: # %bb.0: +; CHECK-NEXT: vmovhps {{.*#+}} xmm0 = xmm0[0,1],mem[0,1] +; CHECK-NEXT: retq %b = load <4 x float>, ptr %pb %shufp = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> ret <4 x float> %shufp } define <4 x float> @transform_VUNPCKHPDrm(<4 x float> %a, ptr %pb) nounwind { -; CHECK-SKX-LABEL: transform_VUNPCKHPDrm: -; CHECK-SKX: # %bb.0: -; CHECK-SKX-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],mem[1] -; CHECK-SKX-NEXT: retq -; -; CHECK-ICX-NO-BYPASS-DELAY-LABEL: transform_VUNPCKHPDrm: -; CHECK-ICX-NO-BYPASS-DELAY: # %bb.0: -; CHECK-ICX-NO-BYPASS-DELAY-NEXT: vpunpckhqdq {{.*#+}} xmm0 = xmm0[1],mem[1] -; CHECK-ICX-NO-BYPASS-DELAY-NEXT: retq -; -; CHECK-ICX-BYPASS-DELAY-LABEL: transform_VUNPCKHPDrm: -; CHECK-ICX-BYPASS-DELAY: # %bb.0: -; CHECK-ICX-BYPASS-DELAY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],mem[1] -; CHECK-ICX-BYPASS-DELAY-NEXT: retq -; -; CHECK-V4-LABEL: transform_VUNPCKHPDrm: -; CHECK-V4: # %bb.0: -; CHECK-V4-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],mem[1] -; CHECK-V4-NEXT: retq -; -; 
CHECK-AVX512-LABEL: transform_VUNPCKHPDrm: -; CHECK-AVX512: # %bb.0: -; CHECK-AVX512-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],mem[1] -; CHECK-AVX512-NEXT: retq -; -; CHECK-ZNVER4-LABEL: transform_VUNPCKHPDrm: -; CHECK-ZNVER4: # %bb.0: -; CHECK-ZNVER4-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],mem[1] -; CHECK-ZNVER4-NEXT: retq +; CHECK-LABEL: transform_VUNPCKHPDrm: +; CHECK: # %bb.0: +; CHECK-NEXT: vunpckhpd 8(%rdi){1to2}, %xmm0, %xmm0 +; CHECK-NEXT: retq %b = load <4 x float>, ptr %pb %shufp = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> ret <4 x float> %shufp @@ -848,37 +798,43 @@ define <2 x double> @transform_VUNPCKLPDrmkz(<2 x double> %a, ptr %pb, i2 %mask_ ; CHECK-SKX-LABEL: transform_VUNPCKLPDrmkz: ; CHECK-SKX: # %bb.0: ; CHECK-SKX-NEXT: kmovd %esi, %k1 -; CHECK-SKX-NEXT: vunpcklpd {{.*#+}} xmm0 {%k1} {z} = xmm0[0],mem[0] +; CHECK-SKX-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero +; CHECK-SKX-NEXT: vunpcklpd {{.*#+}} xmm0 {%k1} {z} = xmm0[0],xmm1[0] ; CHECK-SKX-NEXT: retq ; ; CHECK-ICX-NO-BYPASS-DELAY-LABEL: transform_VUNPCKLPDrmkz: ; CHECK-ICX-NO-BYPASS-DELAY: # %bb.0: ; CHECK-ICX-NO-BYPASS-DELAY-NEXT: kmovd %esi, %k1 -; CHECK-ICX-NO-BYPASS-DELAY-NEXT: vpunpcklqdq {{.*#+}} xmm0 {%k1} {z} = xmm0[0],mem[0] +; CHECK-ICX-NO-BYPASS-DELAY-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero +; CHECK-ICX-NO-BYPASS-DELAY-NEXT: vpunpcklqdq {{.*#+}} xmm0 {%k1} {z} = xmm0[0],xmm1[0] ; CHECK-ICX-NO-BYPASS-DELAY-NEXT: retq ; ; CHECK-ICX-BYPASS-DELAY-LABEL: transform_VUNPCKLPDrmkz: ; CHECK-ICX-BYPASS-DELAY: # %bb.0: ; CHECK-ICX-BYPASS-DELAY-NEXT: kmovd %esi, %k1 -; CHECK-ICX-BYPASS-DELAY-NEXT: vunpcklpd {{.*#+}} xmm0 {%k1} {z} = xmm0[0],mem[0] +; CHECK-ICX-BYPASS-DELAY-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero +; CHECK-ICX-BYPASS-DELAY-NEXT: vshufpd {{.*#+}} xmm0 {%k1} {z} = xmm0[0],xmm1[0] ; CHECK-ICX-BYPASS-DELAY-NEXT: retq ; ; CHECK-V4-LABEL: transform_VUNPCKLPDrmkz: ; CHECK-V4: # %bb.0: ; CHECK-V4-NEXT: kmovd %esi, %k1 -; CHECK-V4-NEXT: vunpcklpd {{.*#+}} xmm0 {%k1} {z} = xmm0[0],mem[0] +; CHECK-V4-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero +; CHECK-V4-NEXT: vunpcklpd {{.*#+}} xmm0 {%k1} {z} = xmm0[0],xmm1[0] ; CHECK-V4-NEXT: retq ; ; CHECK-AVX512-LABEL: transform_VUNPCKLPDrmkz: ; CHECK-AVX512: # %bb.0: ; CHECK-AVX512-NEXT: kmovd %esi, %k1 -; CHECK-AVX512-NEXT: vunpcklpd {{.*#+}} xmm0 {%k1} {z} = xmm0[0],mem[0] +; CHECK-AVX512-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero +; CHECK-AVX512-NEXT: vunpcklpd {{.*#+}} xmm0 {%k1} {z} = xmm0[0],xmm1[0] ; CHECK-AVX512-NEXT: retq ; ; CHECK-ZNVER4-LABEL: transform_VUNPCKLPDrmkz: ; CHECK-ZNVER4: # %bb.0: +; CHECK-ZNVER4-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero ; CHECK-ZNVER4-NEXT: kmovd %esi, %k1 -; CHECK-ZNVER4-NEXT: vunpcklpd {{.*#+}} xmm0 {%k1} {z} = xmm0[0],mem[0] +; CHECK-ZNVER4-NEXT: vunpcklpd {{.*#+}} xmm0 {%k1} {z} = xmm0[0],xmm1[0] ; CHECK-ZNVER4-NEXT: retq %mask = bitcast i2 %mask_int to <2 x i1> %b = load <2 x double>, ptr %pb @@ -888,41 +844,11 @@ define <2 x double> @transform_VUNPCKLPDrmkz(<2 x double> %a, ptr %pb, i2 %mask_ } define <2 x double> @transform_VUNPCKHPDrmkz(<2 x double> %a, ptr %pb, i2 %mask_int) nounwind { -; CHECK-SKX-LABEL: transform_VUNPCKHPDrmkz: -; CHECK-SKX: # %bb.0: -; CHECK-SKX-NEXT: kmovd %esi, %k1 -; CHECK-SKX-NEXT: vunpckhpd {{.*#+}} xmm0 {%k1} {z} = xmm0[1],mem[1] -; CHECK-SKX-NEXT: retq -; -; CHECK-ICX-NO-BYPASS-DELAY-LABEL: transform_VUNPCKHPDrmkz: -; CHECK-ICX-NO-BYPASS-DELAY: # %bb.0: -; CHECK-ICX-NO-BYPASS-DELAY-NEXT: kmovd %esi, %k1 -; CHECK-ICX-NO-BYPASS-DELAY-NEXT: vpunpckhqdq {{.*#+}} xmm0 {%k1} {z} = xmm0[1],mem[1] -; 
CHECK-ICX-NO-BYPASS-DELAY-NEXT: retq -; -; CHECK-ICX-BYPASS-DELAY-LABEL: transform_VUNPCKHPDrmkz: -; CHECK-ICX-BYPASS-DELAY: # %bb.0: -; CHECK-ICX-BYPASS-DELAY-NEXT: kmovd %esi, %k1 -; CHECK-ICX-BYPASS-DELAY-NEXT: vunpckhpd {{.*#+}} xmm0 {%k1} {z} = xmm0[1],mem[1] -; CHECK-ICX-BYPASS-DELAY-NEXT: retq -; -; CHECK-V4-LABEL: transform_VUNPCKHPDrmkz: -; CHECK-V4: # %bb.0: -; CHECK-V4-NEXT: kmovd %esi, %k1 -; CHECK-V4-NEXT: vunpckhpd {{.*#+}} xmm0 {%k1} {z} = xmm0[1],mem[1] -; CHECK-V4-NEXT: retq -; -; CHECK-AVX512-LABEL: transform_VUNPCKHPDrmkz: -; CHECK-AVX512: # %bb.0: -; CHECK-AVX512-NEXT: kmovd %esi, %k1 -; CHECK-AVX512-NEXT: vunpckhpd {{.*#+}} xmm0 {%k1} {z} = xmm0[1],mem[1] -; CHECK-AVX512-NEXT: retq -; -; CHECK-ZNVER4-LABEL: transform_VUNPCKHPDrmkz: -; CHECK-ZNVER4: # %bb.0: -; CHECK-ZNVER4-NEXT: kmovd %esi, %k1 -; CHECK-ZNVER4-NEXT: vunpckhpd {{.*#+}} xmm0 {%k1} {z} = xmm0[1],mem[1] -; CHECK-ZNVER4-NEXT: retq +; CHECK-LABEL: transform_VUNPCKHPDrmkz: +; CHECK: # %bb.0: +; CHECK-NEXT: kmovd %esi, %k1 +; CHECK-NEXT: vunpckhpd 8(%rdi){1to2}, %xmm0, %xmm0 {%k1} {z} +; CHECK-NEXT: retq %mask = bitcast i2 %mask_int to <2 x i1> %b = load <2 x double>, ptr %pb %shufp = shufflevector <2 x double> %a, <2 x double> %b, <2 x i32> @@ -1060,42 +986,48 @@ define <2 x double> @transform_VUNPCKLPDrmk(<2 x double> %a, ptr %pb, <2 x doubl ; CHECK-SKX-LABEL: transform_VUNPCKLPDrmk: ; CHECK-SKX: # %bb.0: ; CHECK-SKX-NEXT: kmovd %esi, %k1 -; CHECK-SKX-NEXT: vunpcklpd {{.*#+}} xmm1 {%k1} = xmm0[0],mem[0] +; CHECK-SKX-NEXT: vmovsd {{.*#+}} xmm2 = mem[0],zero +; CHECK-SKX-NEXT: vunpcklpd {{.*#+}} xmm1 {%k1} = xmm0[0],xmm2[0] ; CHECK-SKX-NEXT: vmovapd %xmm1, %xmm0 ; CHECK-SKX-NEXT: retq ; ; CHECK-ICX-NO-BYPASS-DELAY-LABEL: transform_VUNPCKLPDrmk: ; CHECK-ICX-NO-BYPASS-DELAY: # %bb.0: ; CHECK-ICX-NO-BYPASS-DELAY-NEXT: kmovd %esi, %k1 -; CHECK-ICX-NO-BYPASS-DELAY-NEXT: vpunpcklqdq {{.*#+}} xmm1 {%k1} = xmm0[0],mem[0] +; CHECK-ICX-NO-BYPASS-DELAY-NEXT: vmovsd {{.*#+}} xmm2 = mem[0],zero +; CHECK-ICX-NO-BYPASS-DELAY-NEXT: vpunpcklqdq {{.*#+}} xmm1 {%k1} = xmm0[0],xmm2[0] ; CHECK-ICX-NO-BYPASS-DELAY-NEXT: vmovapd %xmm1, %xmm0 ; CHECK-ICX-NO-BYPASS-DELAY-NEXT: retq ; ; CHECK-ICX-BYPASS-DELAY-LABEL: transform_VUNPCKLPDrmk: ; CHECK-ICX-BYPASS-DELAY: # %bb.0: ; CHECK-ICX-BYPASS-DELAY-NEXT: kmovd %esi, %k1 -; CHECK-ICX-BYPASS-DELAY-NEXT: vunpcklpd {{.*#+}} xmm1 {%k1} = xmm0[0],mem[0] +; CHECK-ICX-BYPASS-DELAY-NEXT: vmovsd {{.*#+}} xmm2 = mem[0],zero +; CHECK-ICX-BYPASS-DELAY-NEXT: vshufpd {{.*#+}} xmm1 {%k1} = xmm0[0],xmm2[0] ; CHECK-ICX-BYPASS-DELAY-NEXT: vmovapd %xmm1, %xmm0 ; CHECK-ICX-BYPASS-DELAY-NEXT: retq ; ; CHECK-V4-LABEL: transform_VUNPCKLPDrmk: ; CHECK-V4: # %bb.0: ; CHECK-V4-NEXT: kmovd %esi, %k1 -; CHECK-V4-NEXT: vunpcklpd {{.*#+}} xmm1 {%k1} = xmm0[0],mem[0] +; CHECK-V4-NEXT: vmovsd {{.*#+}} xmm2 = mem[0],zero +; CHECK-V4-NEXT: vunpcklpd {{.*#+}} xmm1 {%k1} = xmm0[0],xmm2[0] ; CHECK-V4-NEXT: vmovapd %xmm1, %xmm0 ; CHECK-V4-NEXT: retq ; ; CHECK-AVX512-LABEL: transform_VUNPCKLPDrmk: ; CHECK-AVX512: # %bb.0: ; CHECK-AVX512-NEXT: kmovd %esi, %k1 -; CHECK-AVX512-NEXT: vunpcklpd {{.*#+}} xmm1 {%k1} = xmm0[0],mem[0] +; CHECK-AVX512-NEXT: vmovsd {{.*#+}} xmm2 = mem[0],zero +; CHECK-AVX512-NEXT: vunpcklpd {{.*#+}} xmm1 {%k1} = xmm0[0],xmm2[0] ; CHECK-AVX512-NEXT: vmovapd %xmm1, %xmm0 ; CHECK-AVX512-NEXT: retq ; ; CHECK-ZNVER4-LABEL: transform_VUNPCKLPDrmk: ; CHECK-ZNVER4: # %bb.0: +; CHECK-ZNVER4-NEXT: vmovsd {{.*#+}} xmm2 = mem[0],zero ; CHECK-ZNVER4-NEXT: kmovd %esi, %k1 -; CHECK-ZNVER4-NEXT: vunpcklpd {{.*#+}} 
xmm1 {%k1} = xmm0[0],mem[0] +; CHECK-ZNVER4-NEXT: vunpcklpd {{.*#+}} xmm1 {%k1} = xmm0[0],xmm2[0] ; CHECK-ZNVER4-NEXT: vmovapd %xmm1, %xmm0 ; CHECK-ZNVER4-NEXT: retq %mask = bitcast i2 %mask_int to <2 x i1> @@ -1106,47 +1038,12 @@ define <2 x double> @transform_VUNPCKLPDrmk(<2 x double> %a, ptr %pb, <2 x doubl } define <2 x double> @transform_VUNPCKHPDrmk(<2 x double> %a, ptr %pb, <2 x double> %c, i2 %mask_int) nounwind { -; CHECK-SKX-LABEL: transform_VUNPCKHPDrmk: -; CHECK-SKX: # %bb.0: -; CHECK-SKX-NEXT: kmovd %esi, %k1 -; CHECK-SKX-NEXT: vunpckhpd {{.*#+}} xmm1 {%k1} = xmm0[1],mem[1] -; CHECK-SKX-NEXT: vmovapd %xmm1, %xmm0 -; CHECK-SKX-NEXT: retq -; -; CHECK-ICX-NO-BYPASS-DELAY-LABEL: transform_VUNPCKHPDrmk: -; CHECK-ICX-NO-BYPASS-DELAY: # %bb.0: -; CHECK-ICX-NO-BYPASS-DELAY-NEXT: kmovd %esi, %k1 -; CHECK-ICX-NO-BYPASS-DELAY-NEXT: vpunpckhqdq {{.*#+}} xmm1 {%k1} = xmm0[1],mem[1] -; CHECK-ICX-NO-BYPASS-DELAY-NEXT: vmovapd %xmm1, %xmm0 -; CHECK-ICX-NO-BYPASS-DELAY-NEXT: retq -; -; CHECK-ICX-BYPASS-DELAY-LABEL: transform_VUNPCKHPDrmk: -; CHECK-ICX-BYPASS-DELAY: # %bb.0: -; CHECK-ICX-BYPASS-DELAY-NEXT: kmovd %esi, %k1 -; CHECK-ICX-BYPASS-DELAY-NEXT: vunpckhpd {{.*#+}} xmm1 {%k1} = xmm0[1],mem[1] -; CHECK-ICX-BYPASS-DELAY-NEXT: vmovapd %xmm1, %xmm0 -; CHECK-ICX-BYPASS-DELAY-NEXT: retq -; -; CHECK-V4-LABEL: transform_VUNPCKHPDrmk: -; CHECK-V4: # %bb.0: -; CHECK-V4-NEXT: kmovd %esi, %k1 -; CHECK-V4-NEXT: vunpckhpd {{.*#+}} xmm1 {%k1} = xmm0[1],mem[1] -; CHECK-V4-NEXT: vmovapd %xmm1, %xmm0 -; CHECK-V4-NEXT: retq -; -; CHECK-AVX512-LABEL: transform_VUNPCKHPDrmk: -; CHECK-AVX512: # %bb.0: -; CHECK-AVX512-NEXT: kmovd %esi, %k1 -; CHECK-AVX512-NEXT: vunpckhpd {{.*#+}} xmm1 {%k1} = xmm0[1],mem[1] -; CHECK-AVX512-NEXT: vmovapd %xmm1, %xmm0 -; CHECK-AVX512-NEXT: retq -; -; CHECK-ZNVER4-LABEL: transform_VUNPCKHPDrmk: -; CHECK-ZNVER4: # %bb.0: -; CHECK-ZNVER4-NEXT: kmovd %esi, %k1 -; CHECK-ZNVER4-NEXT: vunpckhpd {{.*#+}} xmm1 {%k1} = xmm0[1],mem[1] -; CHECK-ZNVER4-NEXT: vmovapd %xmm1, %xmm0 -; CHECK-ZNVER4-NEXT: retq +; CHECK-LABEL: transform_VUNPCKHPDrmk: +; CHECK: # %bb.0: +; CHECK-NEXT: kmovd %esi, %k1 +; CHECK-NEXT: vunpckhpd 8(%rdi){1to2}, %xmm0, %xmm1 {%k1} +; CHECK-NEXT: vmovapd %xmm1, %xmm0 +; CHECK-NEXT: retq %mask = bitcast i2 %mask_int to <2 x i1> %b = load <2 x double>, ptr %pb %shufp = shufflevector <2 x double> %a, <2 x double> %b, <2 x i32> diff --git a/llvm/test/CodeGen/X86/tuning-shuffle-unpckpd.ll b/llvm/test/CodeGen/X86/tuning-shuffle-unpckpd.ll index 6940c33c9d327..ff02527b05d2d 100644 --- a/llvm/test/CodeGen/X86/tuning-shuffle-unpckpd.ll +++ b/llvm/test/CodeGen/X86/tuning-shuffle-unpckpd.ll @@ -163,30 +163,10 @@ define <8 x float> @transform_VUNPCKHPDYrm(<8 x float> %a, ptr %pb) nounwind { } define <4 x float> @transform_VUNPCKLPDrm(<4 x float> %a, ptr %pb) nounwind { -; CHECK-AVX2-LABEL: transform_VUNPCKLPDrm: -; CHECK-AVX2: # %bb.0: -; CHECK-AVX2-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0],mem[0] -; CHECK-AVX2-NEXT: retq -; -; CHECK-ICX-NO-BYPASS-DELAY-LABEL: transform_VUNPCKLPDrm: -; CHECK-ICX-NO-BYPASS-DELAY: # %bb.0: -; CHECK-ICX-NO-BYPASS-DELAY-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],mem[0] -; CHECK-ICX-NO-BYPASS-DELAY-NEXT: retq -; -; CHECK-ICX-BYPASS-DELAY-LABEL: transform_VUNPCKLPDrm: -; CHECK-ICX-BYPASS-DELAY: # %bb.0: -; CHECK-ICX-BYPASS-DELAY-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0],mem[0] -; CHECK-ICX-BYPASS-DELAY-NEXT: retq -; -; CHECK-SNB-NO-BYPASS-DELAY-LABEL: transform_VUNPCKLPDrm: -; CHECK-SNB-NO-BYPASS-DELAY: # %bb.0: -; CHECK-SNB-NO-BYPASS-DELAY-NEXT: vpunpcklqdq 
{{.*#+}} xmm0 = xmm0[0],mem[0] -; CHECK-SNB-NO-BYPASS-DELAY-NEXT: retq -; -; CHECK-SNB-BYPASS-DELAY-LABEL: transform_VUNPCKLPDrm: -; CHECK-SNB-BYPASS-DELAY: # %bb.0: -; CHECK-SNB-BYPASS-DELAY-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0],mem[0] -; CHECK-SNB-BYPASS-DELAY-NEXT: retq +; CHECK-LABEL: transform_VUNPCKLPDrm: +; CHECK: # %bb.0: +; CHECK-NEXT: vmovhps {{.*#+}} xmm0 = xmm0[0,1],mem[0,1] +; CHECK-NEXT: retq %b = load <4 x float>, ptr %pb %shufp = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> ret <4 x float> %shufp @@ -195,34 +175,30 @@ define <4 x float> @transform_VUNPCKLPDrm(<4 x float> %a, ptr %pb) nounwind { define <4 x float> @transform_VUNPCKHPDrm(<4 x float> %a, ptr %pb) nounwind { ; CHECK-AVX2-LABEL: transform_VUNPCKHPDrm: ; CHECK-AVX2: # %bb.0: -; CHECK-AVX2-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],mem[1] +; CHECK-AVX2-NEXT: vmovddup {{.*#+}} xmm1 = mem[0,0] +; CHECK-AVX2-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1] ; CHECK-AVX2-NEXT: retq ; -; CHECK-ICX-NO-BYPASS-DELAY-LABEL: transform_VUNPCKHPDrm: -; CHECK-ICX-NO-BYPASS-DELAY: # %bb.0: -; CHECK-ICX-NO-BYPASS-DELAY-NEXT: vpunpckhqdq {{.*#+}} xmm0 = xmm0[1],mem[1] -; CHECK-ICX-NO-BYPASS-DELAY-NEXT: retq -; -; CHECK-ICX-BYPASS-DELAY-LABEL: transform_VUNPCKHPDrm: -; CHECK-ICX-BYPASS-DELAY: # %bb.0: -; CHECK-ICX-BYPASS-DELAY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],mem[1] -; CHECK-ICX-BYPASS-DELAY-NEXT: retq +; CHECK-ICX-LABEL: transform_VUNPCKHPDrm: +; CHECK-ICX: # %bb.0: +; CHECK-ICX-NEXT: vunpckhpd 8(%rdi){1to2}, %xmm0, %xmm0 +; CHECK-ICX-NEXT: retq ; ; CHECK-SNB-NO-BYPASS-DELAY-LABEL: transform_VUNPCKHPDrm: ; CHECK-SNB-NO-BYPASS-DELAY: # %bb.0: -; CHECK-SNB-NO-BYPASS-DELAY-NEXT: vpunpckhqdq {{.*#+}} xmm0 = xmm0[1],mem[1] +; CHECK-SNB-NO-BYPASS-DELAY-NEXT: vmovddup {{.*#+}} xmm1 = mem[0,0] +; CHECK-SNB-NO-BYPASS-DELAY-NEXT: vpunpckhqdq {{.*#+}} xmm0 = xmm0[1],xmm1[1] ; CHECK-SNB-NO-BYPASS-DELAY-NEXT: retq ; ; CHECK-SNB-BYPASS-DELAY-LABEL: transform_VUNPCKHPDrm: ; CHECK-SNB-BYPASS-DELAY: # %bb.0: -; CHECK-SNB-BYPASS-DELAY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],mem[1] +; CHECK-SNB-BYPASS-DELAY-NEXT: vmovddup {{.*#+}} xmm1 = mem[0,0] +; CHECK-SNB-BYPASS-DELAY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1] ; CHECK-SNB-BYPASS-DELAY-NEXT: retq %b = load <4 x float>, ptr %pb %shufp = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> ret <4 x float> %shufp } ;; NOTE: These prefixes are unused and the list is autogenerated. 
Do not add tests below this line: -; CHECK: {{.*}} -; CHECK-ICX: {{.*}} ; CHECK-SKL: {{.*}} ; CHECK-V3: {{.*}} diff --git a/llvm/test/CodeGen/X86/urem-seteq-vec-tautological.ll b/llvm/test/CodeGen/X86/urem-seteq-vec-tautological.ll index 36094fe56d577..b36592bf90ae7 100644 --- a/llvm/test/CodeGen/X86/urem-seteq-vec-tautological.ll +++ b/llvm/test/CodeGen/X86/urem-seteq-vec-tautological.ll @@ -243,7 +243,8 @@ define <2 x i1> @t3_wide(<2 x i64> %X) nounwind { ; CHECK-AVX1-NEXT: vpaddq %xmm1, %xmm0, %xmm0 ; CHECK-AVX1-NEXT: vpsllq $32, %xmm0, %xmm0 ; CHECK-AVX1-NEXT: vpaddq %xmm0, %xmm2, %xmm0 -; CHECK-AVX1-NEXT: vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; CHECK-AVX1-NEXT: vmovq {{.*#+}} xmm1 = [9223372036854775808,0] +; CHECK-AVX1-NEXT: vpxor %xmm1, %xmm0, %xmm0 ; CHECK-AVX1-NEXT: movabsq $-3074457345618258603, %rax # imm = 0xD555555555555555 ; CHECK-AVX1-NEXT: vmovq %rax, %xmm1 ; CHECK-AVX1-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm0 @@ -262,7 +263,8 @@ define <2 x i1> @t3_wide(<2 x i64> %X) nounwind { ; CHECK-AVX2-NEXT: vpaddq %xmm1, %xmm0, %xmm0 ; CHECK-AVX2-NEXT: vpsllq $32, %xmm0, %xmm0 ; CHECK-AVX2-NEXT: vpaddq %xmm0, %xmm2, %xmm0 -; CHECK-AVX2-NEXT: vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; CHECK-AVX2-NEXT: vmovq {{.*#+}} xmm1 = [9223372036854775808,0] +; CHECK-AVX2-NEXT: vpxor %xmm1, %xmm0, %xmm0 ; CHECK-AVX2-NEXT: movabsq $-3074457345618258603, %rax # imm = 0xD555555555555555 ; CHECK-AVX2-NEXT: vmovq %rax, %xmm1 ; CHECK-AVX2-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm0 diff --git a/llvm/test/CodeGen/X86/vec_insert-5.ll b/llvm/test/CodeGen/X86/vec_insert-5.ll index 91743898545ee..ace118ee17fad 100644 --- a/llvm/test/CodeGen/X86/vec_insert-5.ll +++ b/llvm/test/CodeGen/X86/vec_insert-5.ll @@ -34,18 +34,18 @@ define <4 x float> @t2(ptr %P) nounwind { ; X86-LABEL: t2: ; X86: # %bb.0: ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: xorps %xmm0, %xmm0 -; X86-NEXT: xorps %xmm1, %xmm1 -; X86-NEXT: unpcklpd {{.*#+}} xmm1 = xmm1[0],mem[0] -; X86-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0,2] +; X86-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero +; X86-NEXT: movq {{.*#+}} xmm1 = xmm0[0],zero +; X86-NEXT: pxor %xmm0, %xmm0 +; X86-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,0] ; X86-NEXT: retl ; ; X64-LABEL: t2: ; X64: # %bb.0: -; X64-NEXT: xorps %xmm0, %xmm0 -; X64-NEXT: xorps %xmm1, %xmm1 -; X64-NEXT: unpcklpd {{.*#+}} xmm1 = xmm1[0],mem[0] -; X64-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0,2] +; X64-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero +; X64-NEXT: movq {{.*#+}} xmm1 = xmm0[0],zero +; X64-NEXT: pxor %xmm0, %xmm0 +; X64-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,0] ; X64-NEXT: retq %tmp1 = load <4 x float>, ptr %P %tmp2 = shufflevector <4 x float> %tmp1, <4 x float> zeroinitializer, <4 x i32> < i32 4, i32 4, i32 4, i32 0 > @@ -56,14 +56,12 @@ define <4 x float> @t3(ptr %P) nounwind { ; X86-LABEL: t3: ; X86: # %bb.0: ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: xorps %xmm0, %xmm0 -; X86-NEXT: movlps {{.*#+}} xmm0 = mem[0,1],xmm0[2,3] +; X86-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero ; X86-NEXT: retl ; ; X64-LABEL: t3: ; X64: # %bb.0: -; X64-NEXT: xorps %xmm0, %xmm0 -; X64-NEXT: movlps {{.*#+}} xmm0 = mem[0,1],xmm0[2,3] +; X64-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero ; X64-NEXT: retq %tmp1 = load <4 x float>, ptr %P %tmp2 = shufflevector <4 x float> %tmp1, <4 x float> zeroinitializer, <4 x i32> < i32 2, i32 3, i32 4, i32 4 > @@ -74,18 +72,12 @@ define <4 x float> @t4(ptr %P) nounwind { ; X86-LABEL: t4: ; X86: # %bb.0: ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; 
X86-NEXT: xorps %xmm1, %xmm1 -; X86-NEXT: xorps %xmm0, %xmm0 -; X86-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,0],mem[3,0] -; X86-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm1[2,3] +; X86-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero ; X86-NEXT: retl ; ; X64-LABEL: t4: ; X64: # %bb.0: -; X64-NEXT: xorps %xmm1, %xmm1 -; X64-NEXT: xorps %xmm0, %xmm0 -; X64-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,0],mem[3,0] -; X64-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm1[2,3] +; X64-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero ; X64-NEXT: retq %tmp1 = load <4 x float>, ptr %P %tmp2 = shufflevector <4 x float> zeroinitializer, <4 x float> %tmp1, <4 x i32> < i32 7, i32 0, i32 0, i32 0 > @@ -96,27 +88,13 @@ define <4 x float> @t4_under_aligned(ptr %P) nounwind { ; X86-LABEL: t4_under_aligned: ; X86: # %bb.0: ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: movups (%eax), %xmm0 -; X86-NEXT: xorps %xmm1, %xmm1 -; X86-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,0],xmm1[1,0] -; X86-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[2,3] +; X86-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero ; X86-NEXT: retl ; -; ALIGN-LABEL: t4_under_aligned: -; ALIGN: # %bb.0: -; ALIGN-NEXT: movups (%rdi), %xmm0 -; ALIGN-NEXT: xorps %xmm1, %xmm1 -; ALIGN-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,0],xmm1[1,0] -; ALIGN-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[2,3] -; ALIGN-NEXT: retq -; -; UNALIGN-LABEL: t4_under_aligned: -; UNALIGN: # %bb.0: -; UNALIGN-NEXT: xorps %xmm1, %xmm1 -; UNALIGN-NEXT: xorps %xmm0, %xmm0 -; UNALIGN-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,0],mem[3,0] -; UNALIGN-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm1[2,3] -; UNALIGN-NEXT: retq +; X64-LABEL: t4_under_aligned: +; X64: # %bb.0: +; X64-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero +; X64-NEXT: retq %tmp1 = load <4 x float>, ptr %P, align 4 %tmp2 = shufflevector <4 x float> zeroinitializer, <4 x float> %tmp1, <4 x i32> < i32 7, i32 0, i32 0, i32 0 > ret <4 x float> %tmp2 @@ -191,3 +169,6 @@ define <16 x i8> @t9(<16 x i8> %x) nounwind { %s = shufflevector <16 x i8> %x, <16 x i8> undef, <16 x i32> ret <16 x i8> %s } +;; NOTE: These prefixes are unused and the list is autogenerated. 
Do not add tests below this line: +; ALIGN: {{.*}} +; UNALIGN: {{.*}} diff --git a/llvm/test/CodeGen/X86/vec_int_to_fp.ll b/llvm/test/CodeGen/X86/vec_int_to_fp.ll index af841cf38b24a..c524e8956f790 100644 --- a/llvm/test/CodeGen/X86/vec_int_to_fp.ll +++ b/llvm/test/CodeGen/X86/vec_int_to_fp.ll @@ -4119,26 +4119,14 @@ define <8 x float> @sitofp_load_8i8_to_8f32(ptr%a) { define <4 x float> @uitofp_load_4i64_to_4f32(ptr%a) { ; SSE2-LABEL: uitofp_load_4i64_to_4f32: ; SSE2: # %bb.0: -; SSE2-NEXT: movq 24(%rdi), %rax -; SSE2-NEXT: testq %rax, %rax +; SSE2-NEXT: movq 16(%rdi), %rax +; SSE2-NEXT: movq 24(%rdi), %rcx +; SSE2-NEXT: testq %rcx, %rcx ; SSE2-NEXT: js .LBB83_1 ; SSE2-NEXT: # %bb.2: -; SSE2-NEXT: cvtsi2ss %rax, %xmm0 -; SSE2-NEXT: jmp .LBB83_3 -; SSE2-NEXT: .LBB83_1: -; SSE2-NEXT: movq %rax, %rcx -; SSE2-NEXT: shrq %rcx -; SSE2-NEXT: andl $1, %eax -; SSE2-NEXT: orq %rcx, %rax -; SSE2-NEXT: cvtsi2ss %rax, %xmm0 -; SSE2-NEXT: addss %xmm0, %xmm0 -; SSE2-NEXT: .LBB83_3: -; SSE2-NEXT: movq 16(%rdi), %rax +; SSE2-NEXT: cvtsi2ss %rcx, %xmm0 ; SSE2-NEXT: testq %rax, %rax -; SSE2-NEXT: js .LBB83_4 -; SSE2-NEXT: # %bb.5: -; SSE2-NEXT: cvtsi2ss %rax, %xmm1 -; SSE2-NEXT: jmp .LBB83_6 +; SSE2-NEXT: jns .LBB83_5 ; SSE2-NEXT: .LBB83_4: ; SSE2-NEXT: movq %rax, %rcx ; SSE2-NEXT: shrq %rcx @@ -4146,6 +4134,18 @@ define <4 x float> @uitofp_load_4i64_to_4f32(ptr%a) { ; SSE2-NEXT: orq %rcx, %rax ; SSE2-NEXT: cvtsi2ss %rax, %xmm1 ; SSE2-NEXT: addss %xmm1, %xmm1 +; SSE2-NEXT: jmp .LBB83_6 +; SSE2-NEXT: .LBB83_1: +; SSE2-NEXT: movq %rcx, %rdx +; SSE2-NEXT: shrq %rdx +; SSE2-NEXT: andl $1, %ecx +; SSE2-NEXT: orq %rdx, %rcx +; SSE2-NEXT: cvtsi2ss %rcx, %xmm0 +; SSE2-NEXT: addss %xmm0, %xmm0 +; SSE2-NEXT: testq %rax, %rax +; SSE2-NEXT: js .LBB83_4 +; SSE2-NEXT: .LBB83_5: +; SSE2-NEXT: cvtsi2ss %rax, %xmm1 ; SSE2-NEXT: .LBB83_6: ; SSE2-NEXT: movq (%rdi), %rax ; SSE2-NEXT: movq 8(%rdi), %rcx @@ -4448,26 +4448,14 @@ define <4 x float> @uitofp_load_4i8_to_4f32(ptr%a) { define <8 x float> @uitofp_load_8i64_to_8f32(ptr%a) { ; SSE2-LABEL: uitofp_load_8i64_to_8f32: ; SSE2: # %bb.0: -; SSE2-NEXT: movq 24(%rdi), %rax -; SSE2-NEXT: testq %rax, %rax +; SSE2-NEXT: movq 16(%rdi), %rax +; SSE2-NEXT: movq 24(%rdi), %rcx +; SSE2-NEXT: testq %rcx, %rcx ; SSE2-NEXT: js .LBB87_1 ; SSE2-NEXT: # %bb.2: -; SSE2-NEXT: cvtsi2ss %rax, %xmm2 -; SSE2-NEXT: jmp .LBB87_3 -; SSE2-NEXT: .LBB87_1: -; SSE2-NEXT: movq %rax, %rcx -; SSE2-NEXT: shrq %rcx -; SSE2-NEXT: andl $1, %eax -; SSE2-NEXT: orq %rcx, %rax -; SSE2-NEXT: cvtsi2ss %rax, %xmm2 -; SSE2-NEXT: addss %xmm2, %xmm2 -; SSE2-NEXT: .LBB87_3: -; SSE2-NEXT: movq 16(%rdi), %rax +; SSE2-NEXT: cvtsi2ss %rcx, %xmm2 ; SSE2-NEXT: testq %rax, %rax -; SSE2-NEXT: js .LBB87_4 -; SSE2-NEXT: # %bb.5: -; SSE2-NEXT: cvtsi2ss %rax, %xmm1 -; SSE2-NEXT: jmp .LBB87_6 +; SSE2-NEXT: jns .LBB87_5 ; SSE2-NEXT: .LBB87_4: ; SSE2-NEXT: movq %rax, %rcx ; SSE2-NEXT: shrq %rcx @@ -4475,6 +4463,18 @@ define <8 x float> @uitofp_load_8i64_to_8f32(ptr%a) { ; SSE2-NEXT: orq %rcx, %rax ; SSE2-NEXT: cvtsi2ss %rax, %xmm1 ; SSE2-NEXT: addss %xmm1, %xmm1 +; SSE2-NEXT: jmp .LBB87_6 +; SSE2-NEXT: .LBB87_1: +; SSE2-NEXT: movq %rcx, %rdx +; SSE2-NEXT: shrq %rdx +; SSE2-NEXT: andl $1, %ecx +; SSE2-NEXT: orq %rdx, %rcx +; SSE2-NEXT: cvtsi2ss %rcx, %xmm2 +; SSE2-NEXT: addss %xmm2, %xmm2 +; SSE2-NEXT: testq %rax, %rax +; SSE2-NEXT: js .LBB87_4 +; SSE2-NEXT: .LBB87_5: +; SSE2-NEXT: cvtsi2ss %rax, %xmm1 ; SSE2-NEXT: .LBB87_6: ; SSE2-NEXT: movq (%rdi), %rax ; SSE2-NEXT: movq 8(%rdi), %rcx @@ -4504,26 +4504,14 @@ define <8 x float> 
@uitofp_load_8i64_to_8f32(ptr%a) { ; SSE2-NEXT: .LBB87_11: ; SSE2-NEXT: cvtsi2ss %rax, %xmm0 ; SSE2-NEXT: .LBB87_12: -; SSE2-NEXT: movq 56(%rdi), %rax -; SSE2-NEXT: testq %rax, %rax +; SSE2-NEXT: movq 48(%rdi), %rax +; SSE2-NEXT: movq 56(%rdi), %rcx +; SSE2-NEXT: testq %rcx, %rcx ; SSE2-NEXT: js .LBB87_13 ; SSE2-NEXT: # %bb.14: -; SSE2-NEXT: cvtsi2ss %rax, %xmm5 -; SSE2-NEXT: jmp .LBB87_15 -; SSE2-NEXT: .LBB87_13: -; SSE2-NEXT: movq %rax, %rcx -; SSE2-NEXT: shrq %rcx -; SSE2-NEXT: andl $1, %eax -; SSE2-NEXT: orq %rcx, %rax -; SSE2-NEXT: cvtsi2ss %rax, %xmm5 -; SSE2-NEXT: addss %xmm5, %xmm5 -; SSE2-NEXT: .LBB87_15: -; SSE2-NEXT: movq 48(%rdi), %rax +; SSE2-NEXT: cvtsi2ss %rcx, %xmm5 ; SSE2-NEXT: testq %rax, %rax -; SSE2-NEXT: js .LBB87_16 -; SSE2-NEXT: # %bb.17: -; SSE2-NEXT: cvtsi2ss %rax, %xmm4 -; SSE2-NEXT: jmp .LBB87_18 +; SSE2-NEXT: jns .LBB87_17 ; SSE2-NEXT: .LBB87_16: ; SSE2-NEXT: movq %rax, %rcx ; SSE2-NEXT: shrq %rcx @@ -4531,28 +4519,40 @@ define <8 x float> @uitofp_load_8i64_to_8f32(ptr%a) { ; SSE2-NEXT: orq %rcx, %rax ; SSE2-NEXT: cvtsi2ss %rax, %xmm4 ; SSE2-NEXT: addss %xmm4, %xmm4 +; SSE2-NEXT: jmp .LBB87_18 +; SSE2-NEXT: .LBB87_13: +; SSE2-NEXT: movq %rcx, %rdx +; SSE2-NEXT: shrq %rdx +; SSE2-NEXT: andl $1, %ecx +; SSE2-NEXT: orq %rdx, %rcx +; SSE2-NEXT: cvtsi2ss %rcx, %xmm5 +; SSE2-NEXT: addss %xmm5, %xmm5 +; SSE2-NEXT: testq %rax, %rax +; SSE2-NEXT: js .LBB87_16 +; SSE2-NEXT: .LBB87_17: +; SSE2-NEXT: cvtsi2ss %rax, %xmm4 ; SSE2-NEXT: .LBB87_18: ; SSE2-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] ; SSE2-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1] -; SSE2-NEXT: movq 40(%rdi), %rax -; SSE2-NEXT: testq %rax, %rax +; SSE2-NEXT: movq 32(%rdi), %rax +; SSE2-NEXT: movq 40(%rdi), %rcx +; SSE2-NEXT: testq %rcx, %rcx ; SSE2-NEXT: js .LBB87_19 ; SSE2-NEXT: # %bb.20: ; SSE2-NEXT: xorps %xmm2, %xmm2 -; SSE2-NEXT: cvtsi2ss %rax, %xmm2 +; SSE2-NEXT: cvtsi2ss %rcx, %xmm2 ; SSE2-NEXT: jmp .LBB87_21 ; SSE2-NEXT: .LBB87_19: -; SSE2-NEXT: movq %rax, %rcx -; SSE2-NEXT: shrq %rcx -; SSE2-NEXT: andl $1, %eax -; SSE2-NEXT: orq %rcx, %rax +; SSE2-NEXT: movq %rcx, %rdx +; SSE2-NEXT: shrq %rdx +; SSE2-NEXT: andl $1, %ecx +; SSE2-NEXT: orq %rdx, %rcx ; SSE2-NEXT: xorps %xmm2, %xmm2 -; SSE2-NEXT: cvtsi2ss %rax, %xmm2 +; SSE2-NEXT: cvtsi2ss %rcx, %xmm2 ; SSE2-NEXT: addss %xmm2, %xmm2 ; SSE2-NEXT: .LBB87_21: ; SSE2-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0] ; SSE2-NEXT: unpcklps {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1] -; SSE2-NEXT: movq 32(%rdi), %rax ; SSE2-NEXT: testq %rax, %rax ; SSE2-NEXT: js .LBB87_22 ; SSE2-NEXT: # %bb.23: diff --git a/llvm/test/CodeGen/X86/vec_shift5.ll b/llvm/test/CodeGen/X86/vec_shift5.ll index 2ab00ea96ada1..5f8da88eb354d 100644 --- a/llvm/test/CodeGen/X86/vec_shift5.ll +++ b/llvm/test/CodeGen/X86/vec_shift5.ll @@ -215,7 +215,7 @@ define <4 x i32> @test18(<4 x i32> %a0, ptr %dummy) { define <4 x i32> @extelt0_sub_pslli_v4i32(<4 x i32> %x, <4 x i32> %y){ ; CHECK-LABEL: extelt0_sub_pslli_v4i32: ; CHECK: # %bb.0: -; CHECK-NEXT: movdqa {{.*#+}} xmm2 = [32,32,32,32] +; CHECK-NEXT: movd {{.*#+}} xmm2 = [32,0,0,0] ; CHECK-NEXT: psubd %xmm1, %xmm2 ; CHECK-NEXT: pxor %xmm1, %xmm1 ; CHECK-NEXT: movss {{.*#+}} xmm1 = xmm2[0],xmm1[1,2,3] @@ -228,23 +228,15 @@ define <4 x i32> @extelt0_sub_pslli_v4i32(<4 x i32> %x, <4 x i32> %y){ } define <4 x i32> @extelt1_add_psrli_v4i32(<4 x i32> %x, <4 x i32> %y){ -; X86-LABEL: extelt1_add_psrli_v4i32: -; X86: # %bb.0: -; X86-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,1,1] -; X86-NEXT: paddd 
diff --git a/llvm/test/CodeGen/X86/vec_shift5.ll b/llvm/test/CodeGen/X86/vec_shift5.ll
index 2ab00ea96ada1..5f8da88eb354d 100644
--- a/llvm/test/CodeGen/X86/vec_shift5.ll
+++ b/llvm/test/CodeGen/X86/vec_shift5.ll
@@ -215,7 +215,7 @@ define <4 x i32> @test18(<4 x i32> %a0, ptr %dummy) {
define <4 x i32> @extelt0_sub_pslli_v4i32(<4 x i32> %x, <4 x i32> %y){
; CHECK-LABEL: extelt0_sub_pslli_v4i32:
; CHECK: # %bb.0:
-; CHECK-NEXT: movdqa {{.*#+}} xmm2 = [32,32,32,32]
+; CHECK-NEXT: movd {{.*#+}} xmm2 = [32,0,0,0]
; CHECK-NEXT: psubd %xmm1, %xmm2
; CHECK-NEXT: pxor %xmm1, %xmm1
; CHECK-NEXT: movss {{.*#+}} xmm1 = xmm2[0],xmm1[1,2,3]
@@ -228,23 +228,15 @@ define <4 x i32> @extelt0_sub_pslli_v4i32(<4 x i32> %x, <4 x i32> %y){
}
define <4 x i32> @extelt1_add_psrli_v4i32(<4 x i32> %x, <4 x i32> %y){
-; X86-LABEL: extelt1_add_psrli_v4i32:
-; X86: # %bb.0:
-; X86-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,1,1]
-; X86-NEXT: paddd {{\.?LCPI[0-9]+_[0-9]+}}, %xmm1
-; X86-NEXT: xorps %xmm2, %xmm2
-; X86-NEXT: movss {{.*#+}} xmm2 = xmm1[0],xmm2[1,2,3]
-; X86-NEXT: psrld %xmm2, %xmm0
-; X86-NEXT: retl
-;
-; X64-LABEL: extelt1_add_psrli_v4i32:
-; X64: # %bb.0:
-; X64-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,1,1]
-; X64-NEXT: paddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; X64-NEXT: xorps %xmm2, %xmm2
-; X64-NEXT: movss {{.*#+}} xmm2 = xmm1[0],xmm2[1,2,3]
-; X64-NEXT: psrld %xmm2, %xmm0
-; X64-NEXT: retq
+; CHECK-LABEL: extelt1_add_psrli_v4i32:
+; CHECK: # %bb.0:
+; CHECK-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,1,1]
+; CHECK-NEXT: movd {{.*#+}} xmm2 = [3,0,0,0]
+; CHECK-NEXT: paddd %xmm1, %xmm2
+; CHECK-NEXT: pxor %xmm1, %xmm1
+; CHECK-NEXT: movss {{.*#+}} xmm1 = xmm2[0],xmm1[1,2,3]
+; CHECK-NEXT: psrld %xmm1, %xmm0
+; CHECK-NEXT: ret{{[l|q]}}
%ext = extractelement <4 x i32> %y, i64 1
%bo = add i32 %ext, 3
%r = tail call <4 x i32> @llvm.x86.sse2.psrli.d(<4 x i32> %x, i32 %bo)
diff --git a/llvm/test/CodeGen/X86/vector-bitreverse.ll b/llvm/test/CodeGen/X86/vector-bitreverse.ll
index 5dcf19013f0b7..45c3b73a9948c 100644
--- a/llvm/test/CodeGen/X86/vector-bitreverse.ll
+++ b/llvm/test/CodeGen/X86/vector-bitreverse.ll
@@ -54,8 +54,9 @@ define i8 @test_bitreverse_i8(i8 %a) nounwind {
;
; XOP-LABEL: test_bitreverse_i8:
; XOP: # %bb.0:
-; XOP-NEXT: vmovd %edi, %xmm0
-; XOP-NEXT: vpperm {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0, %xmm0
+; XOP-NEXT: vpinsrb $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; XOP-NEXT: vmovd %edi, %xmm1
+; XOP-NEXT: vpperm %xmm0, %xmm1, %xmm0, %xmm0
; XOP-NEXT: vmovd %xmm0, %eax
; XOP-NEXT: # kill: def $al killed $al killed $eax
; XOP-NEXT: retq
diff --git a/llvm/test/CodeGen/X86/vector-constrained-fp-intrinsics-flags.ll b/llvm/test/CodeGen/X86/vector-constrained-fp-intrinsics-flags.ll
index 9c80720ae921a..818405349b716 100644
--- a/llvm/test/CodeGen/X86/vector-constrained-fp-intrinsics-flags.ll
+++ b/llvm/test/CodeGen/X86/vector-constrained-fp-intrinsics-flags.ll
@@ -1,30 +1,13 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
; RUN: llc -O3 -mtriple=x86_64-pc-linux -stop-after=finalize-isel < %s | FileCheck %s
define <1 x float> @constrained_vector_fadd_v1f32() #0 {
-; CHECK-LABEL: name: constrained_vector_fadd_v1f32
-; CHECK: [[MOVSSrm_alt:%[0-9]+]]:fr32 = MOVSSrm_alt $rip, 1, $noreg, %const.0, $noreg :: (load (s32) from constant-pool)
-; CHECK: [[ADDSSrm:%[0-9]+]]:fr32 = ADDSSrm [[MOVSSrm_alt]], $rip, 1, $noreg, %const.1, $noreg, implicit $mxcsr :: (load (s32) from constant-pool)
-; CHECK: $xmm0 = COPY [[ADDSSrm]]
-; CHECK: RET 0, $xmm0
entry:
%add = call <1 x float> @llvm.experimental.constrained.fadd.v1f32(<1 x float> , <1 x float> , metadata !"round.dynamic", metadata !"fpexcept.strict") #0
ret <1 x float> %add
}
define <3 x float> @constrained_vector_fadd_v3f32() #0 {
-; CHECK-LABEL: name: constrained_vector_fadd_v3f32
-; CHECK: [[FsFLD0SS:%[0-9]+]]:fr32 = FsFLD0SS
-; CHECK: [[MOVSSrm_alt:%[0-9]+]]:fr32 = MOVSSrm_alt $rip, 1, $noreg, %const.0, $noreg :: (load (s32) from constant-pool)
-; CHECK: [[ADDSSrr:%[0-9]+]]:fr32 = ADDSSrr [[MOVSSrm_alt]], killed [[FsFLD0SS]], implicit $mxcsr
-; CHECK: [[ADDSSrm:%[0-9]+]]:fr32 = ADDSSrm [[MOVSSrm_alt]], $rip, 1, $noreg, %const.1, $noreg, implicit $mxcsr :: (load (s32) from constant-pool)
-; CHECK: [[ADDSSrm1:%[0-9]+]]:fr32 = ADDSSrm [[MOVSSrm_alt]], $rip, 1, $noreg, %const.2, $noreg, implicit $mxcsr :: (load (s32) from constant-pool)
-; CHECK: [[COPY:%[0-9]+]]:vr128 = COPY [[ADDSSrm1]]
-; CHECK: [[COPY1:%[0-9]+]]:vr128 = COPY [[ADDSSrm]]
-; CHECK: [[UNPCKLPSrr:%[0-9]+]]:vr128 = UNPCKLPSrr [[COPY1]], killed [[COPY]]
-; CHECK: [[COPY2:%[0-9]+]]:vr128 = COPY [[ADDSSrr]]
-; CHECK: [[UNPCKLPDrr:%[0-9]+]]:vr128 = UNPCKLPDrr [[UNPCKLPSrr]], killed [[COPY2]]
-; CHECK: $xmm0 = COPY [[UNPCKLPDrr]]
-; CHECK: RET 0, $xmm0
entry:
%add = call <3 x float> @llvm.experimental.constrained.fadd.v3f32( <3 x float>
define <4 x double> @constrained_vector_fadd_v4f64() #0 {
-; CHECK-LABEL: name: constrained_vector_fadd_v4f64
-; CHECK: [[MOVAPDrm:%[0-9]+]]:vr128 = MOVAPDrm $rip, 1, $noreg, %const.0, $noreg :: (load (s128) from constant-pool)
-; CHECK: [[ADDPDrm:%[0-9]+]]:vr128 = ADDPDrm [[MOVAPDrm]], $rip, 1, $noreg, %const.1, $noreg, implicit $mxcsr :: (load (s128) from constant-pool)
-; CHECK: [[ADDPDrm1:%[0-9]+]]:vr128 = ADDPDrm [[MOVAPDrm]], $rip, 1, $noreg, %const.2, $noreg, implicit $mxcsr :: (load (s128) from constant-pool)
-; CHECK: $xmm0 = COPY [[ADDPDrm1]]
-; CHECK: $xmm1 = COPY [[ADDPDrm]]
-; CHECK: RET 0, $xmm0, $xmm1
entry:
%add = call <4 x double> @llvm.experimental.constrained.fadd.v4f64( <4 x double>
declare <1 x float> @llvm.experimental.constrained.fadd.v1f32(<1 x float>, <1 x float>, metadata, metadata)
declare <3 x float> @llvm.experimental.constrained.fadd.v3f32(<3 x float>, <3 x float>, metadata, metadata)
declare <4 x double> @llvm.experimental.constrained.fadd.v4f64(<4 x double>, <4 x double>, metadata, metadata)
+;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
+; CHECK: {{.*}}
diff --git a/llvm/test/CodeGen/X86/vector-constrained-fp-intrinsics.ll b/llvm/test/CodeGen/X86/vector-constrained-fp-intrinsics.ll
index 49062eaef3188..f25267f3dbd53 100644
--- a/llvm/test/CodeGen/X86/vector-constrained-fp-intrinsics.ll
+++ b/llvm/test/CodeGen/X86/vector-constrained-fp-intrinsics.ll
@@ -6319,31 +6319,28 @@ define <3 x double> @constrained_vector_round_v3f64_var(ptr %a) #0 {
;
; AVX-LABEL: constrained_vector_round_v3f64_var:
; AVX: # %bb.0: # %entry
-; AVX-NEXT: pushq %rbx
-; AVX-NEXT: .cfi_def_cfa_offset 16
-; AVX-NEXT: subq $48, %rsp
-; AVX-NEXT: .cfi_def_cfa_offset 64
-; AVX-NEXT: .cfi_offset %rbx, -16
-; AVX-NEXT: movq %rdi, %rbx
+; AVX-NEXT: subq $72, %rsp
+; AVX-NEXT: .cfi_def_cfa_offset 80
+; AVX-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
+; AVX-NEXT: vmovsd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
; AVX-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
; AVX-NEXT: vmovsd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
; AVX-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
; AVX-NEXT: callq round@PLT
-; AVX-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill
+; AVX-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX-NEXT: vmovsd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 8-byte Reload
; AVX-NEXT: # xmm0 = mem[0],zero
; AVX-NEXT: callq round@PLT
-; AVX-NEXT: vunpcklpd (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload
+; AVX-NEXT: vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
; AVX-NEXT: # xmm0 = xmm0[0],mem[0]
; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
+; AVX-NEXT: vmovsd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 8-byte Reload
+; AVX-NEXT: # xmm0 = mem[0],zero
; AVX-NEXT: vzeroupper
; AVX-NEXT: callq round@PLT
; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
; AVX-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
-; AVX-NEXT: addq $48, %rsp
-; AVX-NEXT: .cfi_def_cfa_offset 16
-; AVX-NEXT: popq %rbx
+; AVX-NEXT: addq $72, %rsp
; AVX-NEXT: .cfi_def_cfa_offset 8
; AVX-NEXT: retq
entry:
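;; NOTE: The vec_shift5.ll and vector-bitreverse.ll updates above show the same
;; effect: when only lane 0 of a constant-pool vector is demanded, the full
;; 16-byte load (movdqa of [32,32,32,32]) degrades to a scalar movd or vpinsrb
;; of [32,0,0,0]. A reduced sketch, mirroring extelt0_sub_pslli_v4i32 (assumed
;; body; the immediate-shift intrinsic only reads the scalar amount):
define <4 x i32> @extelt0_demands_one_lane(<4 x i32> %x, <4 x i32> %y) {
  ; only element 0 of the splat constant feeds the subtract that becomes the
  ; shift amount, so the other three lanes are never demanded
  %ext = extractelement <4 x i32> %y, i64 0
  %bo = sub i32 32, %ext
  %r = tail call <4 x i32> @llvm.x86.sse2.pslli.d(<4 x i32> %x, i32 %bo)
  ret <4 x i32> %r
}
declare <4 x i32> @llvm.x86.sse2.pslli.d(<4 x i32>, i32)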
diff --git a/llvm/test/CodeGen/X86/vector-fshl-256.ll b/llvm/test/CodeGen/X86/vector-fshl-256.ll
index 32ad72b2aa56a..5a0a476e830c1 100644
--- a/llvm/test/CodeGen/X86/vector-fshl-256.ll
+++ b/llvm/test/CodeGen/X86/vector-fshl-256.ll
@@ -999,17 +999,19 @@ define <16 x i16> @splatvar_funnnel_v16i16(<16 x i16> %x, <16 x i16> %y, <16 x i
; AVX1-LABEL: splatvar_funnnel_v16i16:
; AVX1: # %bb.0:
; AVX1-NEXT: vpmovsxbq {{.*#+}} xmm3 = [15,0]
-; AVX1-NEXT: vpandn %xmm3, %xmm2, %xmm4
-; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm5
-; AVX1-NEXT: vpsrlw $1, %xmm5, %xmm5
-; AVX1-NEXT: vpsrlw %xmm4, %xmm5, %xmm5
-; AVX1-NEXT: vpand %xmm3, %xmm2, %xmm2
-; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
-; AVX1-NEXT: vpsllw %xmm2, %xmm3, %xmm3
-; AVX1-NEXT: vpor %xmm5, %xmm3, %xmm3
+; AVX1-NEXT: vpand %xmm3, %xmm2, %xmm4
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm5
+; AVX1-NEXT: vpsllw %xmm4, %xmm5, %xmm5
+; AVX1-NEXT: vpinsrw $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm6
+; AVX1-NEXT: vpand %xmm6, %xmm2, %xmm2
+; AVX1-NEXT: vpandn %xmm3, %xmm2, %xmm2
+; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3
+; AVX1-NEXT: vpsrlw $1, %xmm3, %xmm3
+; AVX1-NEXT: vpsrlw %xmm2, %xmm3, %xmm3
+; AVX1-NEXT: vpor %xmm3, %xmm5, %xmm3
+; AVX1-NEXT: vpsllw %xmm4, %xmm0, %xmm0
; AVX1-NEXT: vpsrlw $1, %xmm1, %xmm1
-; AVX1-NEXT: vpsrlw %xmm4, %xmm1, %xmm1
-; AVX1-NEXT: vpsllw %xmm2, %xmm0, %xmm0
+; AVX1-NEXT: vpsrlw %xmm2, %xmm1, %xmm1
; AVX1-NEXT: vpor %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm0
; AVX1-NEXT: retq
@@ -1087,17 +1089,19 @@ define <16 x i16> @splatvar_funnnel_v16i16(<16 x i16> %x, <16 x i16> %y, <16 x i
; XOPAVX1-LABEL: splatvar_funnnel_v16i16:
; XOPAVX1: # %bb.0:
; XOPAVX1-NEXT: vpmovsxbq {{.*#+}} xmm3 = [15,0]
-; XOPAVX1-NEXT: vpandn %xmm3, %xmm2, %xmm4
-; XOPAVX1-NEXT: vextractf128 $1, %ymm1, %xmm5
-; XOPAVX1-NEXT: vpsrlw $1, %xmm5, %xmm5
-; XOPAVX1-NEXT: vpsrlw %xmm4, %xmm5, %xmm5
-; XOPAVX1-NEXT: vpand %xmm3, %xmm2, %xmm2
-; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
-; XOPAVX1-NEXT: vpsllw %xmm2, %xmm3, %xmm3
-; XOPAVX1-NEXT: vpor %xmm5, %xmm3, %xmm3
+; XOPAVX1-NEXT: vpand %xmm3, %xmm2, %xmm4
+; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm5
+; XOPAVX1-NEXT: vpsllw %xmm4, %xmm5, %xmm5
+; XOPAVX1-NEXT: vpinsrw $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm6
+; XOPAVX1-NEXT: vpand %xmm6, %xmm2, %xmm2
+; XOPAVX1-NEXT: vpandn %xmm3, %xmm2, %xmm2
+; XOPAVX1-NEXT: vextractf128 $1, %ymm1, %xmm3
+; XOPAVX1-NEXT: vpsrlw $1, %xmm3, %xmm3
+; XOPAVX1-NEXT: vpsrlw %xmm2, %xmm3, %xmm3
+; XOPAVX1-NEXT: vpor %xmm3, %xmm5, %xmm3
+; XOPAVX1-NEXT: vpsllw %xmm4, %xmm0, %xmm0
; XOPAVX1-NEXT: vpsrlw $1, %xmm1, %xmm1
-; XOPAVX1-NEXT: vpsrlw %xmm4, %xmm1, %xmm1
-; XOPAVX1-NEXT: vpsllw %xmm2, %xmm0, %xmm0
+; XOPAVX1-NEXT: vpsrlw %xmm2, %xmm1, %xmm1
; XOPAVX1-NEXT: vpor %xmm1, %xmm0, %xmm0
; XOPAVX1-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm0
; XOPAVX1-NEXT: retq
diff --git a/llvm/test/CodeGen/X86/vector-fshl-512.ll b/llvm/test/CodeGen/X86/vector-fshl-512.ll
index 1d807fa85ddc5..d1a77c0b543ab 100644
--- a/llvm/test/CodeGen/X86/vector-fshl-512.ll
+++ b/llvm/test/CodeGen/X86/vector-fshl-512.ll
@@ -544,6 +544,8 @@ define <16 x i32> @splatvar_funnnel_v16i32(<16 x i32> %x, <16 x i32> %y, <16 x i
define <32 x i16> @splatvar_funnnel_v32i16(<32 x i16> %x, <32 x i16> %y, <32 x i16> %amt) nounwind {
; AVX512F-LABEL: splatvar_funnnel_v32i16:
; AVX512F: # %bb.0:
+; AVX512F-NEXT: vpinsrw $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm3
+; AVX512F-NEXT: vpand %xmm3, %xmm2, %xmm2
; AVX512F-NEXT: vpbroadcastq {{.*#+}} xmm3 = [15,0,0,0,15,0,0,0]
; AVX512F-NEXT: vpandn %xmm3, %xmm2, %xmm4
; AVX512F-NEXT: vextracti64x4 $1, %zmm1, %ymm5
@@ -562,6 +564,8 @@ define <32 x i16> @splatvar_funnnel_v32i16(<32 x i16> %x, <32 x i16> %y, <32 x i
;
; AVX512VL-LABEL: splatvar_funnnel_v32i16:
; AVX512VL: # %bb.0:
+; AVX512VL-NEXT: vpinsrw $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm3
+; AVX512VL-NEXT: vpand %xmm3, %xmm2, %xmm2
; AVX512VL-NEXT: vpbroadcastq {{.*#+}} xmm3 = [15,0,0,0,15,0,0,0]
; AVX512VL-NEXT: vpandn %xmm3, %xmm2, %xmm4
; AVX512VL-NEXT: vextracti64x4 $1, %zmm1, %ymm5
@@ -622,6 +626,8 @@ define <64 x i8> @splatvar_funnnel_v64i8(<64 x i8> %x, <64 x i8> %y, <64 x i8> %
; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm3
; AVX512F-NEXT: vextracti64x4 $1, %zmm1, %ymm4
; AVX512F-NEXT: vpunpckhbw {{.*#+}} ymm5 = ymm4[8],ymm3[8],ymm4[9],ymm3[9],ymm4[10],ymm3[10],ymm4[11],ymm3[11],ymm4[12],ymm3[12],ymm4[13],ymm3[13],ymm4[14],ymm3[14],ymm4[15],ymm3[15],ymm4[24],ymm3[24],ymm4[25],ymm3[25],ymm4[26],ymm3[26],ymm4[27],ymm3[27],ymm4[28],ymm3[28],ymm4[29],ymm3[29],ymm4[30],ymm3[30],ymm4[31],ymm3[31]
+; AVX512F-NEXT: vpinsrb $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm6
+; AVX512F-NEXT: vpand %xmm6, %xmm2, %xmm2
; AVX512F-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2
; AVX512F-NEXT: vpsllw %xmm2, %ymm5, %ymm5
; AVX512F-NEXT: vpsrlw $8, %ymm5, %ymm5
@@ -641,24 +647,25 @@ define <64 x i8> @splatvar_funnnel_v64i8(<64 x i8> %x, <64 x i8> %y, <64 x i8> %
;
; AVX512VL-LABEL: splatvar_funnnel_v64i8:
; AVX512VL: # %bb.0:
-; AVX512VL-NEXT: vextracti64x4 $1, %zmm0, %ymm3
+; AVX512VL-NEXT: vpinsrb $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm3
+; AVX512VL-NEXT: vpternlogq {{.*#+}} xmm3 = xmm3 & xmm2 & mem
+; AVX512VL-NEXT: vextracti64x4 $1, %zmm0, %ymm2
; AVX512VL-NEXT: vextracti64x4 $1, %zmm1, %ymm4
-; AVX512VL-NEXT: vpunpckhbw {{.*#+}} ymm5 = ymm4[8],ymm3[8],ymm4[9],ymm3[9],ymm4[10],ymm3[10],ymm4[11],ymm3[11],ymm4[12],ymm3[12],ymm4[13],ymm3[13],ymm4[14],ymm3[14],ymm4[15],ymm3[15],ymm4[24],ymm3[24],ymm4[25],ymm3[25],ymm4[26],ymm3[26],ymm4[27],ymm3[27],ymm4[28],ymm3[28],ymm4[29],ymm3[29],ymm4[30],ymm3[30],ymm4[31],ymm3[31]
-; AVX512VL-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2
-; AVX512VL-NEXT: vpsllw %xmm2, %ymm5, %ymm5
+; AVX512VL-NEXT: vpunpckhbw {{.*#+}} ymm5 = ymm4[8],ymm2[8],ymm4[9],ymm2[9],ymm4[10],ymm2[10],ymm4[11],ymm2[11],ymm4[12],ymm2[12],ymm4[13],ymm2[13],ymm4[14],ymm2[14],ymm4[15],ymm2[15],ymm4[24],ymm2[24],ymm4[25],ymm2[25],ymm4[26],ymm2[26],ymm4[27],ymm2[27],ymm4[28],ymm2[28],ymm4[29],ymm2[29],ymm4[30],ymm2[30],ymm4[31],ymm2[31]
+; AVX512VL-NEXT: vpsllw %xmm3, %ymm5, %ymm5
; AVX512VL-NEXT: vpsrlw $8, %ymm5, %ymm5
-; AVX512VL-NEXT: vpunpcklbw {{.*#+}} ymm3 = ymm4[0],ymm3[0],ymm4[1],ymm3[1],ymm4[2],ymm3[2],ymm4[3],ymm3[3],ymm4[4],ymm3[4],ymm4[5],ymm3[5],ymm4[6],ymm3[6],ymm4[7],ymm3[7],ymm4[16],ymm3[16],ymm4[17],ymm3[17],ymm4[18],ymm3[18],ymm4[19],ymm3[19],ymm4[20],ymm3[20],ymm4[21],ymm3[21],ymm4[22],ymm3[22],ymm4[23],ymm3[23]
-; AVX512VL-NEXT: vpsllw %xmm2, %ymm3, %ymm3
-; AVX512VL-NEXT: vpsrlw $8, %ymm3, %ymm3
-; AVX512VL-NEXT: vpackuswb %ymm5, %ymm3, %ymm3
+; AVX512VL-NEXT: vpunpcklbw {{.*#+}} ymm2 = ymm4[0],ymm2[0],ymm4[1],ymm2[1],ymm4[2],ymm2[2],ymm4[3],ymm2[3],ymm4[4],ymm2[4],ymm4[5],ymm2[5],ymm4[6],ymm2[6],ymm4[7],ymm2[7],ymm4[16],ymm2[16],ymm4[17],ymm2[17],ymm4[18],ymm2[18],ymm4[19],ymm2[19],ymm4[20],ymm2[20],ymm4[21],ymm2[21],ymm4[22],ymm2[22],ymm4[23],ymm2[23]
+; AVX512VL-NEXT: vpsllw %xmm3, %ymm2, %ymm2
+; AVX512VL-NEXT: vpsrlw $8, %ymm2, %ymm2
+; AVX512VL-NEXT: vpackuswb %ymm5, %ymm2, %ymm2
; AVX512VL-NEXT: vpunpckhbw {{.*#+}} ymm4 = ymm1[8],ymm0[8],ymm1[9],ymm0[9],ymm1[10],ymm0[10],ymm1[11],ymm0[11],ymm1[12],ymm0[12],ymm1[13],ymm0[13],ymm1[14],ymm0[14],ymm1[15],ymm0[15],ymm1[24],ymm0[24],ymm1[25],ymm0[25],ymm1[26],ymm0[26],ymm1[27],ymm0[27],ymm1[28],ymm0[28],ymm1[29],ymm0[29],ymm1[30],ymm0[30],ymm1[31],ymm0[31]
-; AVX512VL-NEXT: vpsllw %xmm2, %ymm4, %ymm4
+; AVX512VL-NEXT: vpsllw %xmm3, %ymm4, %ymm4
; AVX512VL-NEXT: vpsrlw $8, %ymm4, %ymm4
; AVX512VL-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[2],ymm0[2],ymm1[3],ymm0[3],ymm1[4],ymm0[4],ymm1[5],ymm0[5],ymm1[6],ymm0[6],ymm1[7],ymm0[7],ymm1[16],ymm0[16],ymm1[17],ymm0[17],ymm1[18],ymm0[18],ymm1[19],ymm0[19],ymm1[20],ymm0[20],ymm1[21],ymm0[21],ymm1[22],ymm0[22],ymm1[23],ymm0[23]
-; AVX512VL-NEXT: vpsllw %xmm2, %ymm0, %ymm0
+; AVX512VL-NEXT: vpsllw %xmm3, %ymm0, %ymm0
; AVX512VL-NEXT: vpsrlw $8, %ymm0, %ymm0
; AVX512VL-NEXT: vpackuswb %ymm4, %ymm0, %ymm0
-; AVX512VL-NEXT: vinserti64x4 $1, %ymm3, %zmm0, %zmm0
+; AVX512VL-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0
; AVX512VL-NEXT: retq
;
; AVX512BW-LABEL: splatvar_funnnel_v64i8:
diff --git a/llvm/test/CodeGen/X86/vector-fshr-256.ll b/llvm/test/CodeGen/X86/vector-fshr-256.ll
index 9c259ed38321d..58fb5ebc1d612 100644
--- a/llvm/test/CodeGen/X86/vector-fshr-256.ll
+++ b/llvm/test/CodeGen/X86/vector-fshr-256.ll
@@ -1034,6 +1034,8 @@ define <16 x i16> @splatvar_funnnel_v16i16(<16 x i16> %x, <16 x i16> %y, <16 x i
; AVX1-NEXT: vpand %xmm3, %xmm2, %xmm4
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm5
; AVX1-NEXT: vpsrlw %xmm4, %xmm5, %xmm5
+; AVX1-NEXT: vpinsrw $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm6
+; AVX1-NEXT: vpand %xmm6, %xmm2, %xmm2
; AVX1-NEXT: vpandn %xmm3, %xmm2, %xmm2
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
; AVX1-NEXT: vpaddw %xmm3, %xmm3, %xmm3
@@ -1123,6 +1125,8 @@ define <16 x i16> @splatvar_funnnel_v16i16(<16 x i16> %x, <16 x i16> %y, <16 x i
; XOPAVX1-NEXT: vpand %xmm3, %xmm2, %xmm4
; XOPAVX1-NEXT: vextractf128 $1, %ymm1, %xmm5
; XOPAVX1-NEXT: vpsrlw %xmm4, %xmm5, %xmm5
+; XOPAVX1-NEXT: vpinsrw $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm6
+; XOPAVX1-NEXT: vpand %xmm6, %xmm2, %xmm2
; XOPAVX1-NEXT: vpandn %xmm3, %xmm2, %xmm2
; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
; XOPAVX1-NEXT: vpaddw %xmm3, %xmm3, %xmm3
diff --git a/llvm/test/CodeGen/X86/vector-fshr-512.ll b/llvm/test/CodeGen/X86/vector-fshr-512.ll
index 665223167fbb4..9c4c1ff789396 100644
--- a/llvm/test/CodeGen/X86/vector-fshr-512.ll
+++ b/llvm/test/CodeGen/X86/vector-fshr-512.ll
@@ -546,6 +546,8 @@ define <16 x i32> @splatvar_funnnel_v16i32(<16 x i32> %x, <16 x i32> %y, <16 x i
define <32 x i16> @splatvar_funnnel_v32i16(<32 x i16> %x, <32 x i16> %y, <32 x i16> %amt) nounwind {
; AVX512F-LABEL: splatvar_funnnel_v32i16:
; AVX512F: # %bb.0:
+; AVX512F-NEXT: vpinsrw $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm3
+; AVX512F-NEXT: vpand %xmm3, %xmm2, %xmm2
; AVX512F-NEXT: vpbroadcastq {{.*#+}} xmm3 = [15,0,0,0,15,0,0,0]
; AVX512F-NEXT: vpand %xmm3, %xmm2, %xmm4
; AVX512F-NEXT: vextracti64x4 $1, %zmm1, %ymm5
@@ -564,6 +566,8 @@ define <32 x i16> @splatvar_funnnel_v32i16(<32 x i16> %x, <32 x i16> %y, <32 x i
;
; AVX512VL-LABEL: splatvar_funnnel_v32i16:
; AVX512VL: # %bb.0:
+; AVX512VL-NEXT: vpinsrw $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm3
+; AVX512VL-NEXT: vpand %xmm3, %xmm2, %xmm2
; AVX512VL-NEXT: vpbroadcastq {{.*#+}} xmm3 = [15,0,0,0,15,0,0,0]
; AVX512VL-NEXT: vpand %xmm3, %xmm2, %xmm4
; AVX512VL-NEXT: vextracti64x4 $1, %zmm1, %ymm5
@@ -626,6 +630,8 @@ define <64 x i8> @splatvar_funnnel_v64i8(<64 x i8> %x, <64 x i8> %y, <64 x i8> %
; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm3
; AVX512F-NEXT: vextracti64x4 $1, %zmm1, %ymm4
; AVX512F-NEXT: vpunpckhbw {{.*#+}} ymm5 = ymm4[8],ymm3[8],ymm4[9],ymm3[9],ymm4[10],ymm3[10],ymm4[11],ymm3[11],ymm4[12],ymm3[12],ymm4[13],ymm3[13],ymm4[14],ymm3[14],ymm4[15],ymm3[15],ymm4[24],ymm3[24],ymm4[25],ymm3[25],ymm4[26],ymm3[26],ymm4[27],ymm3[27],ymm4[28],ymm3[28],ymm4[29],ymm3[29],ymm4[30],ymm3[30],ymm4[31],ymm3[31]
+; AVX512F-NEXT: vpinsrb $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm6
+; AVX512F-NEXT: vpand %xmm6, %xmm2, %xmm2
; AVX512F-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2
; AVX512F-NEXT: vpsrlw %xmm2, %ymm5, %ymm5
; AVX512F-NEXT: vpbroadcastw {{.*#+}} ymm6 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
@@ -646,25 +652,26 @@ define <64 x i8> @splatvar_funnnel_v64i8(<64 x i8> %x, <64 x i8> %y, <64 x i8> %
;
; AVX512VL-LABEL: splatvar_funnnel_v64i8:
; AVX512VL: # %bb.0:
-; AVX512VL-NEXT: vextracti64x4 $1, %zmm0, %ymm3
+; AVX512VL-NEXT: vpinsrb $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm3
+; AVX512VL-NEXT: vpternlogq {{.*#+}} xmm3 = xmm3 & xmm2 & mem
+; AVX512VL-NEXT: vextracti64x4 $1, %zmm0, %ymm2
; AVX512VL-NEXT: vextracti64x4 $1, %zmm1, %ymm4
-; AVX512VL-NEXT: vpunpckhbw {{.*#+}} ymm5 = ymm4[8],ymm2[8],ymm4[9],ymm2[9],ymm4[10],ymm2[10],ymm4[11],ymm2[11],ymm4[12],ymm2[12],ymm4[13],ymm2[13],ymm4[14],ymm2[14],ymm4[15],ymm2[15],ymm4[24],ymm2[24],ymm4[25],ymm2[25],ymm4[26],ymm2[26],ymm4[27],ymm2[27],ymm4[28],ymm2[28],ymm4[29],ymm2[29],ymm4[30],ymm2[30],ymm4[31],ymm2[31]
-; AVX512VL-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2
-; AVX512VL-NEXT: vpsrlw %xmm2, %ymm5, %ymm5
+; AVX512VL-NEXT: vpunpckhbw {{.*#+}} ymm5 = ymm4[8],ymm2[8],ymm4[9],ymm2[9],ymm4[10],ymm2[10],ymm4[11],ymm2[11],ymm4[12],ymm2[12],ymm4[13],ymm2[13],ymm4[14],ymm2[14],ymm4[15],ymm2[15],ymm4[24],ymm2[24],ymm4[25],ymm2[25],ymm4[26],ymm2[26],ymm4[27],ymm2[27],ymm4[28],ymm2[28],ymm4[29],ymm2[29],ymm4[30],ymm2[30],ymm4[31],ymm2[31]
+; AVX512VL-NEXT: vpsrlw %xmm3, %ymm5, %ymm5
; AVX512VL-NEXT: vpbroadcastd {{.*#+}} ymm6 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
; AVX512VL-NEXT: vpand %ymm6, %ymm5, %ymm5
-; AVX512VL-NEXT: vpunpcklbw {{.*#+}} ymm3 = ymm4[0],ymm3[0],ymm4[1],ymm3[1],ymm4[2],ymm3[2],ymm4[3],ymm3[3],ymm4[4],ymm3[4],ymm4[5],ymm3[5],ymm4[6],ymm3[6],ymm4[7],ymm3[7],ymm4[16],ymm3[16],ymm4[17],ymm3[17],ymm4[18],ymm3[18],ymm4[19],ymm3[19],ymm4[20],ymm3[20],ymm4[21],ymm3[21],ymm4[22],ymm3[22],ymm4[23],ymm3[23]
-; AVX512VL-NEXT: vpsrlw %xmm2, %ymm3, %ymm3
-; AVX512VL-NEXT: vpand %ymm6, %ymm3, %ymm3
-; AVX512VL-NEXT: vpackuswb %ymm5, %ymm3, %ymm3
+; AVX512VL-NEXT: vpunpcklbw {{.*#+}} ymm2 = ymm4[0],ymm2[0],ymm4[1],ymm2[1],ymm4[2],ymm2[2],ymm4[3],ymm2[3],ymm4[4],ymm2[4],ymm4[5],ymm2[5],ymm4[6],ymm2[6],ymm4[7],ymm2[7],ymm4[16],ymm2[16],ymm4[17],ymm2[17],ymm4[18],ymm2[18],ymm4[19],ymm2[19],ymm4[20],ymm2[20],ymm4[21],ymm2[21],ymm4[22],ymm2[22],ymm4[23],ymm2[23]
+; AVX512VL-NEXT: vpsrlw %xmm3, %ymm2, %ymm2
+; AVX512VL-NEXT: vpand %ymm6, %ymm2, %ymm2
+; AVX512VL-NEXT: vpackuswb %ymm5, %ymm2, %ymm2
; AVX512VL-NEXT: vpunpckhbw {{.*#+}} ymm4 = ymm1[8],ymm0[8],ymm1[9],ymm0[9],ymm1[10],ymm0[10],ymm1[11],ymm0[11],ymm1[12],ymm0[12],ymm1[13],ymm0[13],ymm1[14],ymm0[14],ymm1[15],ymm0[15],ymm1[24],ymm0[24],ymm1[25],ymm0[25],ymm1[26],ymm0[26],ymm1[27],ymm0[27],ymm1[28],ymm0[28],ymm1[29],ymm0[29],ymm1[30],ymm0[30],ymm1[31],ymm0[31]
-; AVX512VL-NEXT: vpsrlw %xmm2, %ymm4, %ymm4
+; AVX512VL-NEXT: vpsrlw %xmm3, %ymm4, %ymm4
; AVX512VL-NEXT: vpand %ymm6, %ymm4, %ymm4
; AVX512VL-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[2],ymm0[2],ymm1[3],ymm0[3],ymm1[4],ymm0[4],ymm1[5],ymm0[5],ymm1[6],ymm0[6],ymm1[7],ymm0[7],ymm1[16],ymm0[16],ymm1[17],ymm0[17],ymm1[18],ymm0[18],ymm1[19],ymm0[19],ymm1[20],ymm0[20],ymm1[21],ymm0[21],ymm1[22],ymm0[22],ymm1[23],ymm0[23]
-; AVX512VL-NEXT: vpsrlw %xmm2, %ymm0, %ymm0
+; AVX512VL-NEXT: vpsrlw %xmm3, %ymm0, %ymm0
; AVX512VL-NEXT: vpand %ymm6, %ymm0, %ymm0
; AVX512VL-NEXT: vpackuswb %ymm4, %ymm0, %ymm0
-; AVX512VL-NEXT: vinserti64x4 $1, %ymm3, %zmm0, %zmm0
+; AVX512VL-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0
; AVX512VL-NEXT: retq
;
; AVX512BW-LABEL: splatvar_funnnel_v64i8:
diff --git a/llvm/test/CodeGen/X86/vector-reduce-fmax-nnan.ll b/llvm/test/CodeGen/X86/vector-reduce-fmax-nnan.ll
index eafee9e65345f..07d15b834452a 100644
--- a/llvm/test/CodeGen/X86/vector-reduce-fmax-nnan.ll
+++ b/llvm/test/CodeGen/X86/vector-reduce-fmax-nnan.ll
@@ -216,25 +216,15 @@ define double @test_v2f64(<2 x double> %a0) {
}
define double @test_v3f64(<3 x double> %a0) {
-; SSE2-LABEL: test_v3f64:
-; SSE2: # %bb.0:
-; SSE2-NEXT: unpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0]
-; SSE2-NEXT: shufpd {{.*#+}} xmm2 = xmm2[0],mem[1]
-; SSE2-NEXT: maxpd %xmm2, %xmm0
-; SSE2-NEXT: movapd %xmm0, %xmm1
-; SSE2-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
-; SSE2-NEXT: maxsd %xmm1, %xmm0
-; SSE2-NEXT: retq
-;
-; SSE41-LABEL: test_v3f64:
-; SSE41: # %bb.0:
-; SSE41-NEXT: unpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0]
-; SSE41-NEXT: blendpd {{.*#+}} xmm2 = xmm2[0],mem[1]
-; SSE41-NEXT: maxpd %xmm2, %xmm0
-; SSE41-NEXT: movapd %xmm0, %xmm1
-; SSE41-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
-; SSE41-NEXT: maxsd %xmm1, %xmm0
-; SSE41-NEXT: retq
+; SSE-LABEL: test_v3f64:
+; SSE: # %bb.0:
+; SSE-NEXT: unpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; SSE-NEXT: movhpd {{.*#+}} xmm2 = xmm2[0],mem[0]
+; SSE-NEXT: maxpd %xmm2, %xmm0
+; SSE-NEXT: movapd %xmm0, %xmm1
+; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
+; SSE-NEXT: maxsd %xmm1, %xmm0
+; SSE-NEXT: retq
;
; AVX-LABEL: test_v3f64:
; AVX: # %bb.0:
diff --git a/llvm/test/CodeGen/X86/vector-reduce-fmin.ll b/llvm/test/CodeGen/X86/vector-reduce-fmin.ll
index 5ae9e552d0dcd..c21959e2fbabe 100644
--- a/llvm/test/CodeGen/X86/vector-reduce-fmin.ll
+++ b/llvm/test/CodeGen/X86/vector-reduce-fmin.ll
@@ -635,7 +635,7 @@ define double @test_v3f64(<3 x double> %a0) {
; SSE2-LABEL: test_v3f64:
; SSE2: # %bb.0:
; SSE2-NEXT: unpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0]
-; SSE2-NEXT: shufpd {{.*#+}} xmm2 = xmm2[0],mem[1]
+; SSE2-NEXT: movhpd {{.*#+}} xmm2 = xmm2[0],mem[0]
; SSE2-NEXT: movapd %xmm2, %xmm1
; SSE2-NEXT: minpd %xmm0, %xmm1
; SSE2-NEXT: cmpunordpd %xmm0, %xmm0
@@ -656,7 +656,7 @@ define double @test_v3f64(<3 x double> %a0) {
; SSE41-LABEL: test_v3f64:
; SSE41: # %bb.0:
; SSE41-NEXT: unpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0]
-; SSE41-NEXT: blendpd {{.*#+}} xmm2 = xmm2[0],mem[1]
+; SSE41-NEXT: movhpd {{.*#+}} xmm2 = xmm2[0],mem[0]
; SSE41-NEXT: movapd %xmm2, %xmm1
; SSE41-NEXT: minpd %xmm0, %xmm1
; SSE41-NEXT: cmpunordpd %xmm0, %xmm0
diff --git a/llvm/test/CodeGen/X86/vector-rotate-128.ll b/llvm/test/CodeGen/X86/vector-rotate-128.ll
index b114cba14cb6c..fb5df6933ccf4 100644
--- a/llvm/test/CodeGen/X86/vector-rotate-128.ll
+++ b/llvm/test/CodeGen/X86/vector-rotate-128.ll
@@ -660,29 +660,19 @@ define <16 x i8> @var_rotate_v16i8(<16 x i8> %a, <16 x i8> %b) nounwind {
;
define <2 x i64> @splatvar_rotate_v2i64(<2 x i64> %a, <2 x i64> %b) nounwind {
-; SSE2-LABEL: splatvar_rotate_v2i64:
-; SSE2: # %bb.0:
-; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [64,64]
-; SSE2-NEXT: psubq %xmm1, %xmm2
-; SSE2-NEXT: movdqa %xmm0, %xmm3
-; SSE2-NEXT: psllq %xmm1, %xmm3
-; SSE2-NEXT: psrlq %xmm2, %xmm0
-; SSE2-NEXT: por %xmm3, %xmm0
-; SSE2-NEXT: retq
-;
-; SSE41-LABEL: splatvar_rotate_v2i64:
-; SSE41: # %bb.0:
-; SSE41-NEXT: pmovsxbq {{.*#+}} xmm2 = [64,64]
-; SSE41-NEXT: psubq %xmm1, %xmm2
-; SSE41-NEXT: movdqa %xmm0, %xmm3
-; SSE41-NEXT: psllq %xmm1, %xmm3
-; SSE41-NEXT: psrlq %xmm2, %xmm0
-; SSE41-NEXT: por %xmm3, %xmm0
-; SSE41-NEXT: retq
+; SSE-LABEL: splatvar_rotate_v2i64:
+; SSE: # %bb.0:
+; SSE-NEXT: movq {{.*#+}} xmm2 = [64,0]
+; SSE-NEXT: psubq %xmm1, %xmm2
+; SSE-NEXT: movdqa %xmm0, %xmm3
+; SSE-NEXT: psllq %xmm1, %xmm3
+; SSE-NEXT: psrlq %xmm2, %xmm0
+; SSE-NEXT: por %xmm3, %xmm0
+; SSE-NEXT: retq
;
; AVX-LABEL: splatvar_rotate_v2i64:
; AVX: # %bb.0:
-; AVX-NEXT: vpmovsxbq {{.*#+}} xmm2 = [64,64]
+; AVX-NEXT: vmovq {{.*#+}} xmm2 = [64,0]
; AVX-NEXT: vpsubq %xmm1, %xmm2, %xmm2
; AVX-NEXT: vpsllq %xmm1, %xmm0, %xmm1
; AVX-NEXT: vpsrlq %xmm2, %xmm0, %xmm0
diff --git a/llvm/test/CodeGen/X86/vector-rotate-256.ll b/llvm/test/CodeGen/X86/vector-rotate-256.ll
index 86c4d79a28c89..49ad50a994515 100644
--- a/llvm/test/CodeGen/X86/vector-rotate-256.ll
+++ b/llvm/test/CodeGen/X86/vector-rotate-256.ll
@@ -517,7 +517,7 @@ define <32 x i8> @var_rotate_v32i8(<32 x i8> %a, <32 x i8> %b) nounwind {
define <4 x i64> @splatvar_rotate_v4i64(<4 x i64> %a, <4 x i64> %b) nounwind {
; AVX1-LABEL: splatvar_rotate_v4i64:
; AVX1: # %bb.0:
-; AVX1-NEXT: vpmovsxbq {{.*#+}} xmm2 = [64,64]
+; AVX1-NEXT: vmovq {{.*#+}} xmm2 = [64,0]
; AVX1-NEXT: vpsubq %xmm1, %xmm2, %xmm2
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
; AVX1-NEXT: vpsllq %xmm1, %xmm3, %xmm4
@@ -532,7 +532,7 @@ define <4 x i64> @splatvar_rotate_v4i64(<4 x i64> %a, <4 x i64> %b) nounwind {
; AVX2-LABEL: splatvar_rotate_v4i64:
; AVX2: # %bb.0:
; AVX2-NEXT: vpsllq %xmm1, %ymm0, %ymm2
-; AVX2-NEXT: vpmovsxbq {{.*#+}} xmm3 = [64,64]
+; AVX2-NEXT: vmovq {{.*#+}} xmm3 = [64,0]
; AVX2-NEXT: vpsubq %xmm1, %xmm3, %xmm1
; AVX2-NEXT: vpsrlq %xmm1, %ymm0, %ymm0
; AVX2-NEXT: vpor %ymm0, %ymm2, %ymm0
diff --git a/llvm/test/CodeGen/X86/vector-shift-ashr-128.ll b/llvm/test/CodeGen/X86/vector-shift-ashr-128.ll
index 54056461bff8c..4373620d130eb 100644
--- a/llvm/test/CodeGen/X86/vector-shift-ashr-128.ll
+++ b/llvm/test/CodeGen/X86/vector-shift-ashr-128.ll
@@ -936,17 +936,19 @@ define <16 x i8> @splatvar_shift_v16i8(<16 x i8> %a, <16 x i8> %b) nounwind {
define <2 x i64> @splatvar_modulo_shift_v2i64(<2 x i64> %a, <2 x i64> %b) nounwind {
; SSE-LABEL: splatvar_modulo_shift_v2i64:
; SSE: # %bb.0:
-; SSE-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; SSE-NEXT: movdqa {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808]
-; SSE-NEXT: psrlq %xmm1, %xmm2
-; SSE-NEXT: psrlq %xmm1, %xmm0
-; SSE-NEXT: pxor %xmm2, %xmm0
-; SSE-NEXT: psubq %xmm2, %xmm0
+; SSE-NEXT: movq {{.*#+}} xmm2 = [63,0]
+; SSE-NEXT: pand %xmm1, %xmm2
+; SSE-NEXT: movdqa {{.*#+}} xmm1 = [9223372036854775808,9223372036854775808]
+; SSE-NEXT: psrlq %xmm2, %xmm1
+; SSE-NEXT: psrlq %xmm2, %xmm0
+; SSE-NEXT: pxor %xmm1, %xmm0
+; SSE-NEXT: psubq %xmm1, %xmm0
; SSE-NEXT: retq
;
; AVX1-LABEL: splatvar_modulo_shift_v2i64:
; AVX1: # %bb.0:
-; AVX1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
+; AVX1-NEXT: vmovq {{.*#+}} xmm2 = [63,0]
+; AVX1-NEXT: vpand %xmm2, %xmm1, %xmm1
; AVX1-NEXT: vmovddup {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808]
; AVX1-NEXT: # xmm2 = mem[0,0]
; AVX1-NEXT: vpsrlq %xmm1, %xmm2, %xmm2
@@ -957,7 +959,8 @@ define <2 x i64> @splatvar_modulo_shift_v2i64(<2 x i64> %a, <2 x i64> %b) nounwi
;
; AVX2-LABEL: splatvar_modulo_shift_v2i64:
; AVX2: # %bb.0:
-; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
+; AVX2-NEXT: vmovq {{.*#+}} xmm2 = [63,0]
+; AVX2-NEXT: vpand %xmm2, %xmm1, %xmm1
; AVX2-NEXT: vpbroadcastq {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808]
; AVX2-NEXT: vpsrlq %xmm1, %xmm2, %xmm2
; AVX2-NEXT: vpsrlq %xmm1, %xmm0, %xmm0
@@ -967,7 +970,8 @@ define <2 x i64> @splatvar_modulo_shift_v2i64(<2 x i64> %a, <2 x i64> %b) nounwi
;
; XOPAVX1-LABEL: splatvar_modulo_shift_v2i64:
; XOPAVX1: # %bb.0:
-; XOPAVX1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
+; XOPAVX1-NEXT: vmovq {{.*#+}} xmm2 = [63,0]
+; XOPAVX1-NEXT: vpand %xmm2, %xmm1, %xmm1
; XOPAVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2
; XOPAVX1-NEXT: vpsubq %xmm1, %xmm2, %xmm1
; XOPAVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,1,0,1]
@@ -976,8 +980,9 @@ define <2 x i64> @splatvar_modulo_shift_v2i64(<2 x i64> %a, <2 x i64> %b) nounwi
;
; XOPAVX2-LABEL: splatvar_modulo_shift_v2i64:
; XOPAVX2: # %bb.0:
+; XOPAVX2-NEXT: vmovq {{.*#+}} xmm2 = [63,0]
+; XOPAVX2-NEXT: vpand %xmm2, %xmm1, %xmm1
; XOPAVX2-NEXT: vpbroadcastq %xmm1, %xmm1
-; XOPAVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
; XOPAVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2
; XOPAVX2-NEXT: vpsubq %xmm1, %xmm2, %xmm1
; XOPAVX2-NEXT: vpshaq %xmm1, %xmm0, %xmm0
@@ -986,7 +991,8 @@ define <2 x i64> @splatvar_modulo_shift_v2i64(<2 x i64> %a, <2 x i64> %b) nounwi
; AVX512-LABEL: splatvar_modulo_shift_v2i64:
; AVX512: # %bb.0:
; AVX512-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0
-; AVX512-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
+; AVX512-NEXT: vmovq {{.*#+}} xmm2 = [63,0]
+; AVX512-NEXT: vpand %xmm2, %xmm1, %xmm1
; AVX512-NEXT: vpsraq %xmm1, %zmm0, %zmm0
; AVX512-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
; AVX512-NEXT: vzeroupper
@@ -1160,17 +1166,19 @@ define <16 x i8> @splatvar_modulo_shift_v16i8(<16 x i8> %a, <16 x i8> %b) nounwi
;
; XOPAVX1-LABEL: splatvar_modulo_shift_v16i8:
; XOPAVX1: # %bb.0:
+; XOPAVX1-NEXT: vpinsrb $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm2
+; XOPAVX1-NEXT: vpand %xmm2, %xmm1, %xmm1
; XOPAVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2
; XOPAVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1
-; XOPAVX1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
; XOPAVX1-NEXT: vpsubb %xmm1, %xmm2, %xmm1
; XOPAVX1-NEXT: vpshab %xmm1, %xmm0, %xmm0
; XOPAVX1-NEXT: retq
;
; XOPAVX2-LABEL: splatvar_modulo_shift_v16i8:
; XOPAVX2: # %bb.0:
+; XOPAVX2-NEXT: vpinsrb $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm2
+; XOPAVX2-NEXT: vpand %xmm2, %xmm1, %xmm1
; XOPAVX2-NEXT: vpbroadcastb %xmm1, %xmm1
-; XOPAVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
; XOPAVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2
; XOPAVX2-NEXT: vpsubb %xmm1, %xmm2, %xmm1
; XOPAVX2-NEXT: vpshab %xmm1, %xmm0, %xmm0
@@ -1178,9 +1186,10 @@ define <16 x i8> @splatvar_modulo_shift_v16i8(<16 x i8> %a, <16 x i8> %b) nounwi
;
; AVX512DQ-LABEL: splatvar_modulo_shift_v16i8:
; AVX512DQ: # %bb.0:
-; AVX512DQ-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
-; AVX512DQ-NEXT: vpmovsxbd %xmm0, %zmm0
+; AVX512DQ-NEXT: vpinsrb $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm2
+; AVX512DQ-NEXT: vpand %xmm2, %xmm1, %xmm1
; AVX512DQ-NEXT: vpmovzxbq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero
+; AVX512DQ-NEXT: vpmovsxbd %xmm0, %zmm0
; AVX512DQ-NEXT: vpsrad %xmm1, %zmm0, %zmm0
; AVX512DQ-NEXT: vpmovdb %zmm0, %xmm0
; AVX512DQ-NEXT: vzeroupper
@@ -1188,9 +1197,10 @@ define <16 x i8> @splatvar_modulo_shift_v16i8(<16 x i8> %a, <16 x i8> %b) nounwi
;
; AVX512BW-LABEL: splatvar_modulo_shift_v16i8:
; AVX512BW: # %bb.0:
-; AVX512BW-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
-; AVX512BW-NEXT: vpmovsxbw %xmm0, %ymm0
+; AVX512BW-NEXT: vpinsrb $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm2
+; AVX512BW-NEXT: vpand %xmm2, %xmm1, %xmm1
; AVX512BW-NEXT: vpmovzxbq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero
+; AVX512BW-NEXT: vpmovsxbw %xmm0, %ymm0
; AVX512BW-NEXT: vpsraw %xmm1, %ymm0, %ymm0
; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0
; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
@@ -1199,9 +1209,10 @@ define <16 x i8> @splatvar_modulo_shift_v16i8(<16 x i8> %a, <16 x i8> %b) nounwi
;
; AVX512DQVL-LABEL: splatvar_modulo_shift_v16i8:
; AVX512DQVL: # %bb.0:
-; AVX512DQVL-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm1, %xmm1
-; AVX512DQVL-NEXT: vpmovsxbd %xmm0, %zmm0
+; AVX512DQVL-NEXT: vpinsrb $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm2
+; AVX512DQVL-NEXT: vpand %xmm2, %xmm1, %xmm1
; AVX512DQVL-NEXT: vpmovzxbq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero
+; AVX512DQVL-NEXT: vpmovsxbd %xmm0, %zmm0
; AVX512DQVL-NEXT: vpsrad %xmm1, %zmm0, %zmm0
; AVX512DQVL-NEXT: vpmovdb %zmm0, %xmm0
; AVX512DQVL-NEXT: vzeroupper
@@ -1209,9 +1220,10 @@ define <16 x i8> @splatvar_modulo_shift_v16i8(<16 x i8> %a, <16 x i8> %b) nounwi
;
; AVX512BWVL-LABEL: splatvar_modulo_shift_v16i8:
; AVX512BWVL: # %bb.0:
-; AVX512BWVL-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm1, %xmm1
-; AVX512BWVL-NEXT: vpmovsxbw %xmm0, %ymm0
+; AVX512BWVL-NEXT: vpinsrb $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm2
+; AVX512BWVL-NEXT: vpand %xmm2, %xmm1, %xmm1
; AVX512BWVL-NEXT: vpmovzxbq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero
+; AVX512BWVL-NEXT: vpmovsxbw %xmm0, %ymm0
; AVX512BWVL-NEXT: vpsraw %xmm1, %ymm0, %ymm0
; AVX512BWVL-NEXT: vpmovwb %ymm0, %xmm0
; AVX512BWVL-NEXT: vzeroupper
diff --git a/llvm/test/CodeGen/X86/vector-shift-ashr-256.ll b/llvm/test/CodeGen/X86/vector-shift-ashr-256.ll
index abd81a0e9f99a..5afb48bea3f50 100644
--- a/llvm/test/CodeGen/X86/vector-shift-ashr-256.ll
+++ b/llvm/test/CodeGen/X86/vector-shift-ashr-256.ll
@@ -1010,7 +1010,8 @@ define <32 x i8> @splatvar_shift_v32i8(<32 x i8> %a, <32 x i8> %b) nounwind {
define <4 x i64> @splatvar_modulo_shift_v4i64(<4 x i64> %a, <4 x i64> %b) nounwind {
; AVX1-LABEL: splatvar_modulo_shift_v4i64:
; AVX1: # %bb.0:
-; AVX1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
+; AVX1-NEXT: vmovq {{.*#+}} xmm2 = [63,0]
+; AVX1-NEXT: vpand %xmm2, %xmm1, %xmm1
; AVX1-NEXT: vmovddup {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808]
; AVX1-NEXT: # xmm2 = mem[0,0]
; AVX1-NEXT: vpsrlq %xmm1, %xmm2, %xmm2
@@ -1026,7 +1027,8 @@ define <4 x i64> @splatvar_modulo_shift_v4i64(<4 x i64> %a, <4 x i64> %b) nounwi
;
; AVX2-LABEL: splatvar_modulo_shift_v4i64:
; AVX2: # %bb.0:
-; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
+; AVX2-NEXT: vmovq {{.*#+}} xmm2 = [63,0]
+; AVX2-NEXT: vpand %xmm2, %xmm1, %xmm1
; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm2 = [9223372036854775808,9223372036854775808,9223372036854775808,9223372036854775808]
; AVX2-NEXT: vpsrlq %xmm1, %ymm2, %ymm2
; AVX2-NEXT: vpsrlq %xmm1, %ymm0, %ymm0
@@ -1036,7 +1038,8 @@ define <4 x i64> @splatvar_modulo_shift_v4i64(<4 x i64> %a, <4 x i64> %b) nounwi
;
; XOPAVX1-LABEL: splatvar_modulo_shift_v4i64:
; XOPAVX1: # %bb.0:
-; XOPAVX1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
+; XOPAVX1-NEXT: vmovq {{.*#+}} xmm2 = [63,0]
+; XOPAVX1-NEXT: vpand %xmm2, %xmm1, %xmm1
; XOPAVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2
; XOPAVX1-NEXT: vpsubq %xmm1, %xmm2, %xmm1
; XOPAVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,1,0,1]
@@ -1048,7 +1051,8 @@ define <4 x i64> @splatvar_modulo_shift_v4i64(<4 x i64> %a, <4 x i64> %b) nounwi
;
; XOPAVX2-LABEL: splatvar_modulo_shift_v4i64:
; XOPAVX2: # %bb.0:
-; XOPAVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
+; XOPAVX2-NEXT: vmovq {{.*#+}} xmm2 = [63,0]
+; XOPAVX2-NEXT: vpand %xmm2, %xmm1, %xmm1
; XOPAVX2-NEXT: vpbroadcastq {{.*#+}} ymm2 = [9223372036854775808,9223372036854775808,9223372036854775808,9223372036854775808]
; XOPAVX2-NEXT: vpsrlq %xmm1, %ymm2, %ymm2
; XOPAVX2-NEXT: vpsrlq %xmm1, %ymm0, %ymm0
@@ -1059,7 +1063,8 @@ define <4 x i64> @splatvar_modulo_shift_v4i64(<4 x i64> %a, <4 x i64> %b) nounwi
; AVX512-LABEL: splatvar_modulo_shift_v4i64:
; AVX512: # %bb.0:
; AVX512-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
-; AVX512-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
+; AVX512-NEXT: vmovq {{.*#+}} xmm2 = [63,0]
+; AVX512-NEXT: vpand %xmm2, %xmm1, %xmm1
; AVX512-NEXT: vpsraq %xmm1, %zmm0, %zmm0
; AVX512-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0
; AVX512-NEXT: retq
@@ -1265,9 +1270,10 @@ define <32 x i8> @splatvar_modulo_shift_v32i8(<32 x i8> %a, <32 x i8> %b) nounwi
;
; XOPAVX1-LABEL: splatvar_modulo_shift_v32i8:
; XOPAVX1: # %bb.0:
+; XOPAVX1-NEXT: vpinsrb $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm2
+; XOPAVX1-NEXT: vpand %xmm2, %xmm1, %xmm1
; XOPAVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2
; XOPAVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1
-; XOPAVX1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
; XOPAVX1-NEXT: vpsubb %xmm1, %xmm2, %xmm1
; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
; XOPAVX1-NEXT: vpshab %xmm1, %xmm2, %xmm2
@@ -1277,8 +1283,9 @@ define <32 x i8> @splatvar_modulo_shift_v32i8(<32 x i8> %a, <32 x i8> %b) nounwi
;
; XOPAVX2-LABEL: splatvar_modulo_shift_v32i8:
; XOPAVX2: # %bb.0:
+; XOPAVX2-NEXT: vpinsrb $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm2
+; XOPAVX2-NEXT: vpand %xmm2, %xmm1, %xmm1
; XOPAVX2-NEXT: vpbroadcastb %xmm1, %xmm1
-; XOPAVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
; XOPAVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2
; XOPAVX2-NEXT: vpsubb %xmm1, %xmm2, %xmm1
; XOPAVX2-NEXT: vextracti128 $1, %ymm0, %xmm2
@@ -1304,9 +1311,10 @@ define <32 x i8> @splatvar_modulo_shift_v32i8(<32 x i8> %a, <32 x i8> %b) nounwi
;
; AVX512BW-LABEL: splatvar_modulo_shift_v32i8:
; AVX512BW: # %bb.0:
-; AVX512BW-NEXT: vpmovsxbw %ymm0, %zmm0
-; AVX512BW-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
+; AVX512BW-NEXT: vpinsrb $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm2
+; AVX512BW-NEXT: vpand %xmm2, %xmm1, %xmm1
; AVX512BW-NEXT: vpmovzxbq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero
+; AVX512BW-NEXT: vpmovsxbw %ymm0, %zmm0
; AVX512BW-NEXT: vpsraw %xmm1, %zmm0, %zmm0
; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0
; AVX512BW-NEXT: retq
@@ -1327,9 +1335,10 @@ define <32 x i8> @splatvar_modulo_shift_v32i8(<32 x i8> %a, <32 x i8> %b) nounwi
;
; AVX512BWVL-LABEL: splatvar_modulo_shift_v32i8:
; AVX512BWVL: # %bb.0:
-; AVX512BWVL-NEXT: vpmovsxbw %ymm0, %zmm0
-; AVX512BWVL-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm1, %xmm1
+; AVX512BWVL-NEXT: vpinsrb $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm2
+; AVX512BWVL-NEXT: vpand %xmm2, %xmm1, %xmm1
; AVX512BWVL-NEXT: vpmovzxbq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero
+; AVX512BWVL-NEXT: vpmovsxbw %ymm0, %zmm0
; AVX512BWVL-NEXT: vpsraw %xmm1, %zmm0, %zmm0
; AVX512BWVL-NEXT: vpmovwb %zmm0, %ymm0
; AVX512BWVL-NEXT: retq
diff --git a/llvm/test/CodeGen/X86/vector-shift-ashr-512.ll b/llvm/test/CodeGen/X86/vector-shift-ashr-512.ll
index 74dbee5e5d2ca..67ff078014e02 100644
--- a/llvm/test/CodeGen/X86/vector-shift-ashr-512.ll
+++ b/llvm/test/CodeGen/X86/vector-shift-ashr-512.ll
@@ -233,7 +233,8 @@ define <64 x i8> @splatvar_shift_v64i8(<64 x i8> %a, <64 x i8> %b) nounwind {
define <8 x i64> @splatvar_modulo_shift_v8i64(<8 x i64> %a, <8 x i64> %b) nounwind {
; ALL-LABEL: splatvar_modulo_shift_v8i64:
; ALL: # %bb.0:
-; ALL-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
+; ALL-NEXT: vmovq {{.*#+}} xmm2 = [63,0]
+; ALL-NEXT: vpand %xmm2, %xmm1, %xmm1
; ALL-NEXT: vpsraq %xmm1, %zmm0, %zmm0
; ALL-NEXT: retq
%mod = and <8 x i64> %b,
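;; NOTE: The recurring pattern in the shift files above and below: the shift
;; amount is splatted from lane 0, so only lane 0 of the <2 x i64> [63,63]
;; mask constant is demanded, and the 16-byte pand-with-memory becomes a movq
;; load of [63,0] plus a register pand. A reduced sketch, mirroring
;; splatvar_modulo_shift_v2i64 (hypothetical function name):
define <2 x i64> @modulo_shift_sketch(<2 x i64> %a, <2 x i64> %b) {
  ; psrlq reads only the low 64 bits of its amount operand, so lane 1 of
  ; %mod is never demanded
  %mod = and <2 x i64> %b, <i64 63, i64 63>
  %splat = shufflevector <2 x i64> %mod, <2 x i64> poison, <2 x i32> zeroinitializer
  %shr = lshr <2 x i64> %a, %splat
  ret <2 x i64> %shr
}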
diff --git a/llvm/test/CodeGen/X86/vector-shift-lshr-128.ll b/llvm/test/CodeGen/X86/vector-shift-lshr-128.ll
index 467c1574180da..1efbf0d0f0ca4 100644
--- a/llvm/test/CodeGen/X86/vector-shift-lshr-128.ll
+++ b/llvm/test/CodeGen/X86/vector-shift-lshr-128.ll
@@ -771,25 +771,29 @@ define <16 x i8> @splatvar_shift_v16i8(<16 x i8> %a, <16 x i8> %b) nounwind {
define <2 x i64> @splatvar_modulo_shift_v2i64(<2 x i64> %a, <2 x i64> %b) nounwind {
; SSE-LABEL: splatvar_modulo_shift_v2i64:
; SSE: # %bb.0:
-; SSE-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; SSE-NEXT: psrlq %xmm1, %xmm0
+; SSE-NEXT: movq {{.*#+}} xmm2 = [63,0]
+; SSE-NEXT: pand %xmm1, %xmm2
+; SSE-NEXT: psrlq %xmm2, %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: splatvar_modulo_shift_v2i64:
; AVX: # %bb.0:
-; AVX-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
+; AVX-NEXT: vmovq {{.*#+}} xmm2 = [63,0]
+; AVX-NEXT: vpand %xmm2, %xmm1, %xmm1
; AVX-NEXT: vpsrlq %xmm1, %xmm0, %xmm0
; AVX-NEXT: retq
;
; XOP-LABEL: splatvar_modulo_shift_v2i64:
; XOP: # %bb.0:
-; XOP-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
+; XOP-NEXT: vmovq {{.*#+}} xmm2 = [63,0]
+; XOP-NEXT: vpand %xmm2, %xmm1, %xmm1
; XOP-NEXT: vpsrlq %xmm1, %xmm0, %xmm0
; XOP-NEXT: retq
;
; AVX512-LABEL: splatvar_modulo_shift_v2i64:
; AVX512: # %bb.0:
-; AVX512-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
+; AVX512-NEXT: vmovq {{.*#+}} xmm2 = [63,0]
+; AVX512-NEXT: vpand %xmm2, %xmm1, %xmm1
; AVX512-NEXT: vpsrlq %xmm1, %xmm0, %xmm0
; AVX512-NEXT: retq
;
@@ -941,17 +945,19 @@ define <16 x i8> @splatvar_modulo_shift_v16i8(<16 x i8> %a, <16 x i8> %b) nounwi
;
; XOPAVX1-LABEL: splatvar_modulo_shift_v16i8:
; XOPAVX1: # %bb.0:
+; XOPAVX1-NEXT: vpinsrb $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm2
+; XOPAVX1-NEXT: vpand %xmm2, %xmm1, %xmm1
; XOPAVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2
; XOPAVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1
-; XOPAVX1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
; XOPAVX1-NEXT: vpsubb %xmm1, %xmm2, %xmm1
; XOPAVX1-NEXT: vpshlb %xmm1, %xmm0, %xmm0
; XOPAVX1-NEXT: retq
;
; XOPAVX2-LABEL: splatvar_modulo_shift_v16i8:
; XOPAVX2: # %bb.0:
+; XOPAVX2-NEXT: vpinsrb $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm2
+; XOPAVX2-NEXT: vpand %xmm2, %xmm1, %xmm1
; XOPAVX2-NEXT: vpbroadcastb %xmm1, %xmm1
-; XOPAVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
; XOPAVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2
; XOPAVX2-NEXT: vpsubb %xmm1, %xmm2, %xmm1
; XOPAVX2-NEXT: vpshlb %xmm1, %xmm0, %xmm0
@@ -959,9 +965,10 @@ define <16 x i8> @splatvar_modulo_shift_v16i8(<16 x i8> %a, <16 x i8> %b) nounwi
;
; AVX512DQ-LABEL: splatvar_modulo_shift_v16i8:
; AVX512DQ: # %bb.0:
-; AVX512DQ-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
-; AVX512DQ-NEXT: vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero
+; AVX512DQ-NEXT: vpinsrb $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm2
+; AVX512DQ-NEXT: vpand %xmm2, %xmm1, %xmm1
; AVX512DQ-NEXT: vpmovzxbq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero
+; AVX512DQ-NEXT: vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero
; AVX512DQ-NEXT: vpsrld %xmm1, %zmm0, %zmm0
; AVX512DQ-NEXT: vpmovdb %zmm0, %xmm0
; AVX512DQ-NEXT: vzeroupper
@@ -969,9 +976,10 @@ define <16 x i8> @splatvar_modulo_shift_v16i8(<16 x i8> %a, <16 x i8> %b) nounwi
;
; AVX512BW-LABEL: splatvar_modulo_shift_v16i8:
; AVX512BW: # %bb.0:
-; AVX512BW-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
-; AVX512BW-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
+; AVX512BW-NEXT: vpinsrb $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm2
+; AVX512BW-NEXT: vpand %xmm2, %xmm1, %xmm1
; AVX512BW-NEXT: vpmovzxbq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero
+; AVX512BW-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
; AVX512BW-NEXT: vpsrlw %xmm1, %ymm0, %ymm0
; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0
; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
@@ -980,9 +988,10 @@ define <16 x i8> @splatvar_modulo_shift_v16i8(<16 x i8> %a, <16 x i8> %b) nounwi
;
; AVX512DQVL-LABEL: splatvar_modulo_shift_v16i8:
; AVX512DQVL: # %bb.0:
-; AVX512DQVL-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm1, %xmm1
-; AVX512DQVL-NEXT: vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero
+; AVX512DQVL-NEXT: vpinsrb $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm2
+; AVX512DQVL-NEXT: vpand %xmm2, %xmm1, %xmm1
; AVX512DQVL-NEXT: vpmovzxbq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero
+; AVX512DQVL-NEXT: vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero
; AVX512DQVL-NEXT: vpsrld %xmm1, %zmm0, %zmm0
; AVX512DQVL-NEXT: vpmovdb %zmm0, %xmm0
; AVX512DQVL-NEXT: vzeroupper
@@ -990,9 +999,10 @@ define <16 x i8> @splatvar_modulo_shift_v16i8(<16 x i8> %a, <16 x i8> %b) nounwi
;
; AVX512BWVL-LABEL: splatvar_modulo_shift_v16i8:
; AVX512BWVL: # %bb.0:
-; AVX512BWVL-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm1, %xmm1
-; AVX512BWVL-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
+; AVX512BWVL-NEXT: vpinsrb $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm2
+; AVX512BWVL-NEXT: vpand %xmm2, %xmm1, %xmm1
; AVX512BWVL-NEXT: vpmovzxbq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero
+; AVX512BWVL-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
; AVX512BWVL-NEXT: vpsrlw %xmm1, %ymm0, %ymm0
; AVX512BWVL-NEXT: vpmovwb %ymm0, %xmm0
; AVX512BWVL-NEXT: vzeroupper
diff --git a/llvm/test/CodeGen/X86/vector-shift-lshr-256.ll b/llvm/test/CodeGen/X86/vector-shift-lshr-256.ll
index ca303b4c7ebf6..64db9c6d33f9b 100644
--- a/llvm/test/CodeGen/X86/vector-shift-lshr-256.ll
+++ b/llvm/test/CodeGen/X86/vector-shift-lshr-256.ll
@@ -823,8 +823,9 @@ define <32 x i8> @splatvar_shift_v32i8(<32 x i8> %a, <32 x i8> %b) nounwind {
define <4 x i64> @splatvar_modulo_shift_v4i64(<4 x i64> %a, <4 x i64> %b) nounwind {
; AVX1-LABEL: splatvar_modulo_shift_v4i64:
; AVX1: # %bb.0:
+; AVX1-NEXT: vmovq {{.*#+}} xmm2 = [63,0]
+; AVX1-NEXT: vpand %xmm2, %xmm1, %xmm1
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
-; AVX1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
; AVX1-NEXT: vpsrlq %xmm1, %xmm2, %xmm2
; AVX1-NEXT: vpsrlq %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
@@ -832,14 +833,16 @@ define <4 x i64> @splatvar_modulo_shift_v4i64(<4 x i64> %a, <4 x i64> %b) nounwi
;
; AVX2-LABEL: splatvar_modulo_shift_v4i64:
; AVX2: # %bb.0:
-; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
+; AVX2-NEXT: vmovq {{.*#+}} xmm2 = [63,0]
+; AVX2-NEXT: vpand %xmm2, %xmm1, %xmm1
; AVX2-NEXT: vpsrlq %xmm1, %ymm0, %ymm0
; AVX2-NEXT: retq
;
; XOPAVX1-LABEL: splatvar_modulo_shift_v4i64:
; XOPAVX1: # %bb.0:
+; XOPAVX1-NEXT: vmovq {{.*#+}} xmm2 = [63,0]
+; XOPAVX1-NEXT: vpand %xmm2, %xmm1, %xmm1
; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
-; XOPAVX1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
; XOPAVX1-NEXT: vpsrlq %xmm1, %xmm2, %xmm2
; XOPAVX1-NEXT: vpsrlq %xmm1, %xmm0, %xmm0
; XOPAVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
@@ -847,13 +850,15 @@ define <4 x i64> @splatvar_modulo_shift_v4i64(<4 x i64> %a, <4 x i64> %b) nounwi
;
; XOPAVX2-LABEL: splatvar_modulo_shift_v4i64:
; XOPAVX2: # %bb.0:
-; XOPAVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
+; XOPAVX2-NEXT: vmovq {{.*#+}} xmm2 = [63,0]
+; XOPAVX2-NEXT: vpand %xmm2, %xmm1, %xmm1
; XOPAVX2-NEXT: vpsrlq %xmm1, %ymm0, %ymm0
; XOPAVX2-NEXT: retq
;
; AVX512-LABEL: splatvar_modulo_shift_v4i64:
; AVX512: # %bb.0:
-; AVX512-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
+; AVX512-NEXT: vmovq {{.*#+}} xmm2 = [63,0]
+; AVX512-NEXT: vpand %xmm2, %xmm1, %xmm1
; AVX512-NEXT: vpsrlq %xmm1, %ymm0, %ymm0
; AVX512-NEXT: retq
;
@@ -1037,9 +1042,10 @@ define <32 x i8> @splatvar_modulo_shift_v32i8(<32 x i8> %a, <32 x i8> %b) nounwi
;
; XOPAVX1-LABEL: splatvar_modulo_shift_v32i8:
; XOPAVX1: # %bb.0:
+; XOPAVX1-NEXT: vpinsrb $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm2
+; XOPAVX1-NEXT: vpand %xmm2, %xmm1, %xmm1
; XOPAVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2
; XOPAVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1
-; XOPAVX1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
; XOPAVX1-NEXT: vpsubb %xmm1, %xmm2, %xmm1
; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
; XOPAVX1-NEXT: vpshlb %xmm1, %xmm2, %xmm2
@@ -1049,8 +1055,9 @@ define <32 x i8> @splatvar_modulo_shift_v32i8(<32 x i8> %a, <32 x i8> %b) nounwi
;
; XOPAVX2-LABEL: splatvar_modulo_shift_v32i8:
; XOPAVX2: # %bb.0:
+; XOPAVX2-NEXT: vpinsrb $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm2
+; XOPAVX2-NEXT: vpand %xmm2, %xmm1, %xmm1
; XOPAVX2-NEXT: vpbroadcastb %xmm1, %xmm1
-; XOPAVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
; XOPAVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2
; XOPAVX2-NEXT: vpsubb %xmm1, %xmm2, %xmm1
; XOPAVX2-NEXT: vextracti128 $1, %ymm0, %xmm2
@@ -1072,9 +1079,10 @@ define <32 x i8> @splatvar_modulo_shift_v32i8(<32 x i8> %a, <32 x i8> %b) nounwi
;
; AVX512BW-LABEL: splatvar_modulo_shift_v32i8:
; AVX512BW: # %bb.0:
-; AVX512BW-NEXT: vpmovzxbw {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero,ymm0[16],zero,ymm0[17],zero,ymm0[18],zero,ymm0[19],zero,ymm0[20],zero,ymm0[21],zero,ymm0[22],zero,ymm0[23],zero,ymm0[24],zero,ymm0[25],zero,ymm0[26],zero,ymm0[27],zero,ymm0[28],zero,ymm0[29],zero,ymm0[30],zero,ymm0[31],zero
-; AVX512BW-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
+; AVX512BW-NEXT: vpinsrb $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm2
+; AVX512BW-NEXT: vpand %xmm2, %xmm1, %xmm1
; AVX512BW-NEXT: vpmovzxbq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero
+; AVX512BW-NEXT: vpmovzxbw {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero,ymm0[16],zero,ymm0[17],zero,ymm0[18],zero,ymm0[19],zero,ymm0[20],zero,ymm0[21],zero,ymm0[22],zero,ymm0[23],zero,ymm0[24],zero,ymm0[25],zero,ymm0[26],zero,ymm0[27],zero,ymm0[28],zero,ymm0[29],zero,ymm0[30],zero,ymm0[31],zero
; AVX512BW-NEXT: vpsrlw %xmm1, %zmm0, %zmm0
; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0
; AVX512BW-NEXT: retq
@@ -1092,9 +1100,10 @@ define <32 x i8> @splatvar_modulo_shift_v32i8(<32 x i8> %a, <32 x i8> %b) nounwi
;
; AVX512BWVL-LABEL: splatvar_modulo_shift_v32i8:
; AVX512BWVL: # %bb.0:
-; AVX512BWVL-NEXT: vpmovzxbw {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero,ymm0[16],zero,ymm0[17],zero,ymm0[18],zero,ymm0[19],zero,ymm0[20],zero,ymm0[21],zero,ymm0[22],zero,ymm0[23],zero,ymm0[24],zero,ymm0[25],zero,ymm0[26],zero,ymm0[27],zero,ymm0[28],zero,ymm0[29],zero,ymm0[30],zero,ymm0[31],zero
-; AVX512BWVL-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm1, %xmm1
+; AVX512BWVL-NEXT: vpinsrb $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm2
+; AVX512BWVL-NEXT: vpand %xmm2, %xmm1, %xmm1
; AVX512BWVL-NEXT: vpmovzxbq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero
+; AVX512BWVL-NEXT: vpmovzxbw {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero,ymm0[16],zero,ymm0[17],zero,ymm0[18],zero,ymm0[19],zero,ymm0[20],zero,ymm0[21],zero,ymm0[22],zero,ymm0[23],zero,ymm0[24],zero,ymm0[25],zero,ymm0[26],zero,ymm0[27],zero,ymm0[28],zero,ymm0[29],zero,ymm0[30],zero,ymm0[31],zero
; AVX512BWVL-NEXT: vpsrlw %xmm1, %zmm0, %zmm0
; AVX512BWVL-NEXT: vpmovwb %zmm0, %ymm0
; AVX512BWVL-NEXT: retq
diff --git a/llvm/test/CodeGen/X86/vector-shift-lshr-512.ll b/llvm/test/CodeGen/X86/vector-shift-lshr-512.ll
index 103d5702fb93a..6640dfb13f4d1 100644
--- a/llvm/test/CodeGen/X86/vector-shift-lshr-512.ll
+++ b/llvm/test/CodeGen/X86/vector-shift-lshr-512.ll
@@ -188,7 +188,8 @@ define <64 x i8> @splatvar_shift_v64i8(<64 x i8> %a, <64 x i8> %b) nounwind {
define <8 x i64> @splatvar_modulo_shift_v8i64(<8 x i64> %a, <8 x i64> %b) nounwind {
; ALL-LABEL: splatvar_modulo_shift_v8i64:
; ALL: # %bb.0:
-; ALL-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
+; ALL-NEXT: vmovq {{.*#+}} xmm2 = [63,0]
+; ALL-NEXT: vpand %xmm2, %xmm1, %xmm1
; ALL-NEXT: vpsrlq %xmm1, %zmm0, %zmm0
; ALL-NEXT: retq
%mod = and <8 x i64> %b,
diff --git a/llvm/test/CodeGen/X86/vector-shift-shl-128.ll b/llvm/test/CodeGen/X86/vector-shift-shl-128.ll
index 4dda9ff09cc62..8408179ebee07 100644
--- a/llvm/test/CodeGen/X86/vector-shift-shl-128.ll
+++ b/llvm/test/CodeGen/X86/vector-shift-shl-128.ll
@@ -678,25 +678,29 @@ define <16 x i8> @splatvar_shift_v16i8(<16 x i8> %a, <16 x i8> %b) nounwind {
define <2 x i64> @splatvar_modulo_shift_v2i64(<2 x i64> %a, <2 x i64> %b) nounwind {
; SSE-LABEL: splatvar_modulo_shift_v2i64:
; SSE: # %bb.0:
-; SSE-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; SSE-NEXT: psllq %xmm1, %xmm0
+; SSE-NEXT: movq {{.*#+}} xmm2 = [63,0]
+; SSE-NEXT: pand %xmm1, %xmm2
+; SSE-NEXT: psllq %xmm2, %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: splatvar_modulo_shift_v2i64:
; AVX: # %bb.0:
-; AVX-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
+; AVX-NEXT: vmovq {{.*#+}} xmm2 = [63,0]
+; AVX-NEXT: vpand %xmm2, %xmm1, %xmm1
; AVX-NEXT: vpsllq %xmm1, %xmm0, %xmm0
; AVX-NEXT: retq
;
; XOP-LABEL: splatvar_modulo_shift_v2i64:
; XOP: # %bb.0:
-; XOP-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
+; XOP-NEXT: vmovq {{.*#+}} xmm2 = [63,0]
+; XOP-NEXT: vpand %xmm2, %xmm1, %xmm1
; XOP-NEXT: vpsllq %xmm1, %xmm0, %xmm0
; XOP-NEXT: retq
;
; AVX512-LABEL: splatvar_modulo_shift_v2i64:
; AVX512: # %bb.0:
-; AVX512-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
+; AVX512-NEXT: vmovq {{.*#+}} xmm2 = [63,0]
+; AVX512-NEXT: vpand %xmm2, %xmm1, %xmm1
; AVX512-NEXT: vpsllq %xmm1, %xmm0, %xmm0
; AVX512-NEXT: retq
;
@@ -848,24 +852,27 @@ define <16 x i8> @splatvar_modulo_shift_v16i8(<16 x i8> %a, <16 x i8> %b) nounwi
;
; XOPAVX1-LABEL: splatvar_modulo_shift_v16i8:
; XOPAVX1: # %bb.0:
+; XOPAVX1-NEXT: vpinsrb $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm2
+; XOPAVX1-NEXT: vpand %xmm2, %xmm1, %xmm1
; XOPAVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2
; XOPAVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1
-; XOPAVX1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
; XOPAVX1-NEXT: vpshlb %xmm1, %xmm0, %xmm0
; XOPAVX1-NEXT: retq
;
; XOPAVX2-LABEL: splatvar_modulo_shift_v16i8:
; XOPAVX2: # %bb.0:
+; XOPAVX2-NEXT: vpinsrb $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm2
+; XOPAVX2-NEXT: vpand %xmm2, %xmm1, %xmm1
; XOPAVX2-NEXT: vpbroadcastb %xmm1, %xmm1
-; XOPAVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
; XOPAVX2-NEXT: vpshlb %xmm1, %xmm0, %xmm0
; XOPAVX2-NEXT: retq
;
; AVX512DQ-LABEL: splatvar_modulo_shift_v16i8:
; AVX512DQ: # %bb.0:
-; AVX512DQ-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
-; AVX512DQ-NEXT: vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero
+; AVX512DQ-NEXT: vpinsrb $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm2
+; AVX512DQ-NEXT: vpand %xmm2, %xmm1, %xmm1
; AVX512DQ-NEXT: vpmovzxbq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero
+; AVX512DQ-NEXT: vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero
; AVX512DQ-NEXT: vpslld %xmm1, %zmm0, %zmm0
; AVX512DQ-NEXT: vpmovdb %zmm0, %xmm0
; AVX512DQ-NEXT: vzeroupper
@@ -873,9 +880,10 @@ define <16 x i8> @splatvar_modulo_shift_v16i8(<16 x i8> %a, <16 x i8> %b) nounwi
;
; AVX512BW-LABEL: splatvar_modulo_shift_v16i8:
; AVX512BW: # %bb.0:
-; AVX512BW-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
-; AVX512BW-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
+; AVX512BW-NEXT: vpinsrb $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm2
+; AVX512BW-NEXT: vpand %xmm2, %xmm1, %xmm1
; AVX512BW-NEXT: vpmovzxbq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero
+; AVX512BW-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
; AVX512BW-NEXT: vpsllw %xmm1, %ymm0, %ymm0
; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0
; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
@@ -884,9 +892,10 @@ define <16 x i8> @splatvar_modulo_shift_v16i8(<16 x i8> %a, <16 x i8> %b) nounwi
;
; AVX512DQVL-LABEL: splatvar_modulo_shift_v16i8:
; AVX512DQVL: # %bb.0:
-; AVX512DQVL-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm1, %xmm1
-; AVX512DQVL-NEXT: vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero
+; AVX512DQVL-NEXT: vpinsrb $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm2
+; AVX512DQVL-NEXT: vpand %xmm2, %xmm1, %xmm1
; AVX512DQVL-NEXT: vpmovzxbq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero
+; AVX512DQVL-NEXT: vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero
; AVX512DQVL-NEXT: vpslld %xmm1, %zmm0, %zmm0
; AVX512DQVL-NEXT: vpmovdb %zmm0, %xmm0
; AVX512DQVL-NEXT: vzeroupper
@@ -894,9 +903,10 @@ define <16 x i8> @splatvar_modulo_shift_v16i8(<16 x i8> %a, <16 x i8> %b) nounwi
;
; AVX512BWVL-LABEL: splatvar_modulo_shift_v16i8:
; AVX512BWVL: # %bb.0:
-; AVX512BWVL-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm1, %xmm1
-; AVX512BWVL-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
+; AVX512BWVL-NEXT: vpinsrb $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm2
+; AVX512BWVL-NEXT: vpand %xmm2, %xmm1, %xmm1
; AVX512BWVL-NEXT: vpmovzxbq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero
+; AVX512BWVL-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
; AVX512BWVL-NEXT: vpsllw %xmm1, %ymm0, %ymm0
; AVX512BWVL-NEXT: vpmovwb %ymm0, %xmm0
; AVX512BWVL-NEXT: vzeroupper
diff --git a/llvm/test/CodeGen/X86/vector-shift-shl-256.ll b/llvm/test/CodeGen/X86/vector-shift-shl-256.ll
index c80f24ad57773..7f0e3388944e0 100644
--- a/llvm/test/CodeGen/X86/vector-shift-shl-256.ll
+++ b/llvm/test/CodeGen/X86/vector-shift-shl-256.ll
@@ -748,8 +748,9 @@ define <32 x i8> @splatvar_shift_v32i8(<32 x i8> %a, <32 x i8> %b) nounwind {
define <4 x i64> @splatvar_modulo_shift_v4i64(<4 x i64> %a, <4 x i64> %b) nounwind {
; AVX1-LABEL: splatvar_modulo_shift_v4i64:
; AVX1: # %bb.0:
+; AVX1-NEXT: vmovq {{.*#+}} xmm2 = [63,0]
+; AVX1-NEXT: vpand %xmm2, %xmm1, %xmm1
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
-; AVX1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
; AVX1-NEXT: vpsllq %xmm1, %xmm2, %xmm2
; AVX1-NEXT: vpsllq %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
@@ -757,14 +758,16 @@ define <4 x i64> @splatvar_modulo_shift_v4i64(<4 x i64> %a, <4 x i64> %b) nounwi
;
; AVX2-LABEL: splatvar_modulo_shift_v4i64:
; AVX2: # %bb.0:
-; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
+; AVX2-NEXT: vmovq {{.*#+}} xmm2 = [63,0]
+; AVX2-NEXT: vpand %xmm2, %xmm1, %xmm1
; AVX2-NEXT: vpsllq %xmm1, %ymm0, %ymm0
; AVX2-NEXT: retq
;
; XOPAVX1-LABEL: splatvar_modulo_shift_v4i64:
; XOPAVX1: # %bb.0:
+; XOPAVX1-NEXT: vmovq {{.*#+}} xmm2 = [63,0]
+; XOPAVX1-NEXT: vpand %xmm2, %xmm1, %xmm1
; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
-; XOPAVX1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
; XOPAVX1-NEXT: vpsllq %xmm1, %xmm2, %xmm2
; XOPAVX1-NEXT: vpsllq %xmm1, %xmm0, %xmm0
; XOPAVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
@@ -772,13 +775,15 @@ define <4 x i64> @splatvar_modulo_shift_v4i64(<4 x i64> %a, <4 x i64> %b) nounwi
;
; XOPAVX2-LABEL: splatvar_modulo_shift_v4i64:
; XOPAVX2: # %bb.0:
-; XOPAVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
+; XOPAVX2-NEXT: vmovq {{.*#+}} xmm2 = [63,0]
+; XOPAVX2-NEXT: vpand %xmm2, %xmm1, %xmm1
; XOPAVX2-NEXT: vpsllq %xmm1, %ymm0, %ymm0
; XOPAVX2-NEXT: retq
;
; AVX512-LABEL: splatvar_modulo_shift_v4i64:
; AVX512: # %bb.0:
-; AVX512-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
+; AVX512-NEXT: vmovq {{.*#+}} xmm2 = [63,0]
+; AVX512-NEXT: vpand %xmm2, %xmm1, %xmm1
; AVX512-NEXT: vpsllq %xmm1, %ymm0, %ymm0
; AVX512-NEXT: retq
;
@@ -962,9 +967,10 @@ define <32 x i8> @splatvar_modulo_shift_v32i8(<32 x i8> %a, <32 x i8> %b) nounwi
;
; XOPAVX1-LABEL: splatvar_modulo_shift_v32i8:
; XOPAVX1: # %bb.0:
+; XOPAVX1-NEXT: vpinsrb $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm2
+; XOPAVX1-NEXT: vpand %xmm2, %xmm1, %xmm1
; XOPAVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2
; XOPAVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1
-; XOPAVX1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
; XOPAVX1-NEXT: vpshlb %xmm1, %xmm2, %xmm2
; XOPAVX1-NEXT: vpshlb %xmm1, %xmm0, %xmm0
@@ -973,9 +979,10 @@ define <32 x i8> @splatvar_modulo_shift_v32i8(<32 x i8> %a, <32 x i8> %b) nounwi
;
; XOPAVX2-LABEL: splatvar_modulo_shift_v32i8:
; XOPAVX2: # %bb.0:
-; XOPAVX2-NEXT: vextracti128 $1, %ymm0, %xmm2
+; XOPAVX2-NEXT: vpinsrb $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm2
+; XOPAVX2-NEXT: vpand %xmm2, %xmm1, %xmm1
; XOPAVX2-NEXT: vpbroadcastb %xmm1, %xmm1
-; XOPAVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
+; XOPAVX2-NEXT: vextracti128 $1, %ymm0, %xmm2
; XOPAVX2-NEXT: vpshlb %xmm1, %xmm2, %xmm2
; XOPAVX2-NEXT: vpshlb %xmm1, %xmm0, %xmm0
; XOPAVX2-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0
@@ -993,9 +1000,10 @@ define <32 x i8> @splatvar_modulo_shift_v32i8(<32 x i8> %a, <32 x i8> %b) nounwi
;
; AVX512BW-LABEL: splatvar_modulo_shift_v32i8:
; AVX512BW: # %bb.0:
-; AVX512BW-NEXT: vpmovzxbw {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero,ymm0[16],zero,ymm0[17],zero,ymm0[18],zero,ymm0[19],zero,ymm0[20],zero,ymm0[21],zero,ymm0[22],zero,ymm0[23],zero,ymm0[24],zero,ymm0[25],zero,ymm0[26],zero,ymm0[27],zero,ymm0[28],zero,ymm0[29],zero,ymm0[30],zero,ymm0[31],zero
-; AVX512BW-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
+; AVX512BW-NEXT: vpinsrb $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm2
+; AVX512BW-NEXT: vpand %xmm2, %xmm1, %xmm1
; AVX512BW-NEXT: vpmovzxbq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero
+; AVX512BW-NEXT: vpmovzxbw {{.*#+}} zmm0 =
ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero,ymm0[16],zero,ymm0[17],zero,ymm0[18],zero,ymm0[19],zero,ymm0[20],zero,ymm0[21],zero,ymm0[22],zero,ymm0[23],zero,ymm0[24],zero,ymm0[25],zero,ymm0[26],zero,ymm0[27],zero,ymm0[28],zero,ymm0[29],zero,ymm0[30],zero,ymm0[31],zero ; AVX512BW-NEXT: vpsllw %xmm1, %zmm0, %zmm0 ; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0 ; AVX512BW-NEXT: retq @@ -1012,9 +1020,10 @@ define <32 x i8> @splatvar_modulo_shift_v32i8(<32 x i8> %a, <32 x i8> %b) nounwi ; ; AVX512BWVL-LABEL: splatvar_modulo_shift_v32i8: ; AVX512BWVL: # %bb.0: -; AVX512BWVL-NEXT: vpmovzxbw {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero,ymm0[16],zero,ymm0[17],zero,ymm0[18],zero,ymm0[19],zero,ymm0[20],zero,ymm0[21],zero,ymm0[22],zero,ymm0[23],zero,ymm0[24],zero,ymm0[25],zero,ymm0[26],zero,ymm0[27],zero,ymm0[28],zero,ymm0[29],zero,ymm0[30],zero,ymm0[31],zero -; AVX512BWVL-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm1, %xmm1 +; AVX512BWVL-NEXT: vpinsrb $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm2 +; AVX512BWVL-NEXT: vpand %xmm2, %xmm1, %xmm1 ; AVX512BWVL-NEXT: vpmovzxbq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero +; AVX512BWVL-NEXT: vpmovzxbw {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero,ymm0[16],zero,ymm0[17],zero,ymm0[18],zero,ymm0[19],zero,ymm0[20],zero,ymm0[21],zero,ymm0[22],zero,ymm0[23],zero,ymm0[24],zero,ymm0[25],zero,ymm0[26],zero,ymm0[27],zero,ymm0[28],zero,ymm0[29],zero,ymm0[30],zero,ymm0[31],zero ; AVX512BWVL-NEXT: vpsllw %xmm1, %zmm0, %zmm0 ; AVX512BWVL-NEXT: vpmovwb %zmm0, %ymm0 ; AVX512BWVL-NEXT: retq diff --git a/llvm/test/CodeGen/X86/vector-shift-shl-512.ll b/llvm/test/CodeGen/X86/vector-shift-shl-512.ll index a42056be895e7..540bab0cdc33a 100644 --- a/llvm/test/CodeGen/X86/vector-shift-shl-512.ll +++ b/llvm/test/CodeGen/X86/vector-shift-shl-512.ll @@ -181,7 +181,8 @@ define <64 x i8> @splatvar_shift_v64i8(<64 x i8> %a, <64 x i8> %b) nounwind { define <8 x i64> @splatvar_modulo_shift_v8i64(<8 x i64> %a, <8 x i64> %b) nounwind { ; ALL-LABEL: splatvar_modulo_shift_v8i64: ; ALL: # %bb.0: -; ALL-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 +; ALL-NEXT: vmovq {{.*#+}} xmm2 = [63,0] +; ALL-NEXT: vpand %xmm2, %xmm1, %xmm1 ; ALL-NEXT: vpsllq %xmm1, %zmm0, %zmm0 ; ALL-NEXT: retq %mod = and <8 x i64> %b, <i64 63, i64 63, i64 63, i64 63, i64 63, i64 63, i64 63, i64 63> diff --git a/llvm/test/CodeGen/X86/vector-shuffle-128-v2.ll b/llvm/test/CodeGen/X86/vector-shuffle-128-v2.ll index 1d389f9817229..fdab5b797f3fa 100644 --- a/llvm/test/CodeGen/X86/vector-shuffle-128-v2.ll +++ b/llvm/test/CodeGen/X86/vector-shuffle-128-v2.ll @@ -1222,7 +1222,7 @@ define <2 x double> @insert_dup_mem_v2f64(ptr %ptr) { define <2 x double> @insert_dup_mem128_v2f64(ptr %ptr) nounwind { ; SSE2-LABEL: insert_dup_mem128_v2f64: ; SSE2: # %bb.0: -; SSE2-NEXT: movaps (%rdi), %xmm0 +; SSE2-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero ; SSE2-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0,0] ; SSE2-NEXT: retq ; @@ -1308,7 +1308,7 @@ define <2 x double> @shuffle_mem_v2f64_02(<2 x double> %a, ptr %pb) { ; ;
AVX-LABEL: shuffle_mem_v2f64_02: ; AVX: # %bb.0: -; AVX-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0],mem[0] +; AVX-NEXT: vmovhps {{.*#+}} xmm0 = xmm0[0,1],mem[0,1] ; AVX-NEXT: retq %b = load <2 x double>, ptr %pb, align 1 %shuffle = shufflevector <2 x double> %a, <2 x double> %b, <2 x i32> <i32 0, i32 2> @@ -1316,30 +1316,14 @@ define <2 x double> @shuffle_mem_v2f64_02(<2 x double> %a, ptr %pb) { } define <2 x double> @shuffle_mem_v2f64_21(<2 x double> %a, ptr %pb) { -; SSE2-LABEL: shuffle_mem_v2f64_21: -; SSE2: # %bb.0: -; SSE2-NEXT: movlps {{.*#+}} xmm0 = mem[0,1],xmm0[2,3] -; SSE2-NEXT: retq -; -; SSE3-LABEL: shuffle_mem_v2f64_21: -; SSE3: # %bb.0: -; SSE3-NEXT: movlps {{.*#+}} xmm0 = mem[0,1],xmm0[2,3] -; SSE3-NEXT: retq -; -; SSSE3-LABEL: shuffle_mem_v2f64_21: -; SSSE3: # %bb.0: -; SSSE3-NEXT: movlps {{.*#+}} xmm0 = mem[0,1],xmm0[2,3] -; SSSE3-NEXT: retq -; -; SSE41-LABEL: shuffle_mem_v2f64_21: -; SSE41: # %bb.0: -; SSE41-NEXT: movups (%rdi), %xmm1 -; SSE41-NEXT: blendps {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3] -; SSE41-NEXT: retq +; SSE-LABEL: shuffle_mem_v2f64_21: +; SSE: # %bb.0: +; SSE-NEXT: movlps {{.*#+}} xmm0 = mem[0,1],xmm0[2,3] +; SSE-NEXT: retq ; ; AVX-LABEL: shuffle_mem_v2f64_21: ; AVX: # %bb.0: -; AVX-NEXT: vblendps {{.*#+}} xmm0 = mem[0,1],xmm0[2,3] +; AVX-NEXT: vmovlps {{.*#+}} xmm0 = mem[0,1],xmm0[2,3] ; AVX-NEXT: retq %b = load <2 x double>, ptr %pb, align 1 %shuffle = shufflevector <2 x double> %a, <2 x double> %b, <2 x i32> <i32 2, i32 1> diff --git a/llvm/test/CodeGen/X86/vector-shuffle-128-v4.ll b/llvm/test/CodeGen/X86/vector-shuffle-128-v4.ll index a79b109feec72..428dffcdda576 100644 --- a/llvm/test/CodeGen/X86/vector-shuffle-128-v4.ll +++ b/llvm/test/CodeGen/X86/vector-shuffle-128-v4.ll @@ -2463,7 +2463,7 @@ define <4 x float> @shuffle_mem_v4f32_0145(<4 x float> %a, ptr %pb) { ; ; AVX-LABEL: shuffle_mem_v4f32_0145: ; AVX: # %bb.0: -; AVX-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0],mem[0] +; AVX-NEXT: vmovhps {{.*#+}} xmm0 = xmm0[0,1],mem[0,1] ; AVX-NEXT: retq %b = load <4 x float>, ptr %pb, align 1 %shuffle = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 0, i32 1, i32 4, i32 5> @@ -2471,30 +2471,14 @@ define <4 x float> @shuffle_mem_v4f32_0145(<4 x float> %a, ptr %pb) { } define <4 x float> @shuffle_mem_v4f32_4523(<4 x float> %a, ptr %pb) { -; SSE2-LABEL: shuffle_mem_v4f32_4523: -; SSE2: # %bb.0: -; SSE2-NEXT: movlps {{.*#+}} xmm0 = mem[0,1],xmm0[2,3] -; SSE2-NEXT: retq -; -; SSE3-LABEL: shuffle_mem_v4f32_4523: -; SSE3: # %bb.0: -; SSE3-NEXT: movlps {{.*#+}} xmm0 = mem[0,1],xmm0[2,3] -; SSE3-NEXT: retq -; -; SSSE3-LABEL: shuffle_mem_v4f32_4523: -; SSSE3: # %bb.0: -; SSSE3-NEXT: movlps {{.*#+}} xmm0 = mem[0,1],xmm0[2,3] -; SSSE3-NEXT: retq -; -; SSE41-LABEL: shuffle_mem_v4f32_4523: -; SSE41: # %bb.0: -; SSE41-NEXT: movups (%rdi), %xmm1 -; SSE41-NEXT: blendps {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3] -; SSE41-NEXT: retq +; SSE-LABEL: shuffle_mem_v4f32_4523: +; SSE: # %bb.0: +; SSE-NEXT: movlps {{.*#+}} xmm0 = mem[0,1],xmm0[2,3] +; SSE-NEXT: retq ; ; AVX-LABEL: shuffle_mem_v4f32_4523: ; AVX: # %bb.0: -; AVX-NEXT: vblendps {{.*#+}} xmm0 = mem[0,1],xmm0[2,3] +; AVX-NEXT: vmovlps {{.*#+}} xmm0 = mem[0,1],xmm0[2,3] ; AVX-NEXT: retq %b = load <4 x float>, ptr %pb, align 1 %shuffle = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 4, i32 5, i32 2, i32 3> @@ -2525,23 +2509,46 @@ define <4 x float> @shuffle_mem_v4f32_0624(<4 x float> %a0, ptr %a1) { } define <4 x float> @shuffle_mem_v4f32_4760(<4 x float> %a0, ptr %a1) { -; SSE-LABEL: shuffle_mem_v4f32_4760: -; SSE: # %bb.0: -; SSE-NEXT: movaps %xmm0, %xmm1 -; SSE-NEXT: shufps {{.*#+}} xmm1 =
xmm1[2,0],mem[0,0] -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,3],xmm1[0,2] -; SSE-NEXT: retq +; SSE2-LABEL: shuffle_mem_v4f32_4760: +; SSE2: # %bb.0: +; SSE2-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero +; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,3] +; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,3],xmm1[2,0] +; SSE2-NEXT: retq +; +; SSE3-LABEL: shuffle_mem_v4f32_4760: +; SSE3: # %bb.0: +; SSE3-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero +; SSE3-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,3] +; SSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,3],xmm1[2,0] +; SSE3-NEXT: retq +; +; SSSE3-LABEL: shuffle_mem_v4f32_4760: +; SSSE3: # %bb.0: +; SSSE3-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero +; SSSE3-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,3] +; SSSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,3],xmm1[2,0] +; SSSE3-NEXT: retq +; +; SSE41-LABEL: shuffle_mem_v4f32_4760: +; SSE41: # %bb.0: +; SSE41-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero +; SSE41-NEXT: blendps {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,3] +; SSE41-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,3],xmm1[2,0] +; SSE41-NEXT: retq ; ; AVX1OR2-LABEL: shuffle_mem_v4f32_4760: ; AVX1OR2: # %bb.0: -; AVX1OR2-NEXT: vshufps {{.*#+}} xmm1 = xmm0[2,0],mem[0,0] -; AVX1OR2-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,3],xmm1[0,2] +; AVX1OR2-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero +; AVX1OR2-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,3] +; AVX1OR2-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,3],xmm1[2,0] ; AVX1OR2-NEXT: retq ; ; AVX512VL-LABEL: shuffle_mem_v4f32_4760: ; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vpmovsxbd {{.*#+}} xmm1 = [0,3,2,4] -; AVX512VL-NEXT: vpermt2ps (%rdi), %xmm1, %xmm0 +; AVX512VL-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero +; AVX512VL-NEXT: vpmovsxbd {{.*#+}} xmm2 = [0,3,2,4] +; AVX512VL-NEXT: vpermt2ps %xmm1, %xmm2, %xmm0 ; AVX512VL-NEXT: retq %1 = load <4 x float>, ptr %a1 %2 = shufflevector <4 x float> %1, <4 x float> %a0, <4 x i32> <i32 0, i32 3, i32 2, i32 4> diff --git a/llvm/test/CodeGen/X86/vector-shuffle-combining-avx2.ll b/llvm/test/CodeGen/X86/vector-shuffle-combining-avx2.ll index ec54b75513582..feefa8fb875e5 100644 --- a/llvm/test/CodeGen/X86/vector-shuffle-combining-avx2.ll +++ b/llvm/test/CodeGen/X86/vector-shuffle-combining-avx2.ll @@ -914,16 +914,15 @@ define void @PR63030(ptr %p0) { ; ; X64-AVX2-LABEL: PR63030: ; X64-AVX2: # %bb.0: -; X64-AVX2-NEXT: vmovaps (%rdi), %xmm0 -; X64-AVX2-NEXT: vmovddup {{.*#+}} xmm1 = [3,3] -; X64-AVX2-NEXT: # xmm1 = mem[0,0] -; X64-AVX2-NEXT: vpermpd {{.*#+}} ymm2 = ymm0[1,1,0,0] -; X64-AVX2-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1],ymm1[2,3],ymm2[4,5,6,7] -; X64-AVX2-NEXT: vmovaps {{.*#+}} xmm2 = [3,2] -; X64-AVX2-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,1,1,1] -; X64-AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm2[2,3],ymm0[4,5,6,7] -; X64-AVX2-NEXT: vmovaps %ymm0, (%rax) -; X64-AVX2-NEXT: vmovaps %ymm1, (%rax) +; X64-AVX2-NEXT: vmovdqa (%rdi), %xmm0 +; X64-AVX2-NEXT: vpbroadcastq {{.*#+}} xmm1 = [3,3] +; X64-AVX2-NEXT: vpermq {{.*#+}} ymm2 = ymm0[1,1,0,0] +; X64-AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1],ymm1[2,3],ymm2[4,5,6,7] +; X64-AVX2-NEXT: vpinsrq $1, {{\.?LCPI[0-9]+_[0-9]+}}+8(%rip), %xmm0, %xmm2 +; X64-AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,1,1] +; X64-AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm2[2,3],ymm0[4,5,6,7] +; X64-AVX2-NEXT: vmovdqa %ymm0, (%rax) +; X64-AVX2-NEXT: vmovdqa %ymm1, (%rax) ; X64-AVX2-NEXT: vzeroupper ; X64-AVX2-NEXT: retq ; diff --git a/llvm/test/CodeGen/X86/vector-shuffle-combining-ssse3.ll b/llvm/test/CodeGen/X86/vector-shuffle-combining-ssse3.ll
index 5c035346415b0..514523efef2a9 100644 --- a/llvm/test/CodeGen/X86/vector-shuffle-combining-ssse3.ll +++ b/llvm/test/CodeGen/X86/vector-shuffle-combining-ssse3.ll @@ -796,21 +796,26 @@ define <16 x i8> @constant_fold_pshufb_2() { define i32 @mask_zzz3_v16i8(<16 x i8> %a0) { ; SSSE3-LABEL: mask_zzz3_v16i8: ; SSSE3: # %bb.0: -; SSSE3-NEXT: psrldq {{.*#+}} xmm0 = xmm0[11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; SSSE3-NEXT: movzbl {{\.?LCPI[0-9]+_[0-9]+}}+3(%rip), %eax +; SSSE3-NEXT: movd %eax, %xmm1 +; SSSE3-NEXT: pslld $24, %xmm1 +; SSSE3-NEXT: pshufb %xmm1, %xmm0 ; SSSE3-NEXT: movd %xmm0, %eax ; SSSE3-NEXT: andl $-16777216, %eax # imm = 0xFF000000 ; SSSE3-NEXT: retq ; ; SSE41-LABEL: mask_zzz3_v16i8: ; SSE41: # %bb.0: -; SSE41-NEXT: psllw $8, %xmm0 +; SSE41-NEXT: pinsrb $15, {{\.?LCPI[0-9]+_[0-9]+}}+15(%rip), %xmm1 +; SSE41-NEXT: pshufb %xmm1, %xmm0 ; SSE41-NEXT: pextrd $3, %xmm0, %eax ; SSE41-NEXT: andl $-16777216, %eax # imm = 0xFF000000 ; SSE41-NEXT: retq ; ; AVX-LABEL: mask_zzz3_v16i8: ; AVX: # %bb.0: -; AVX-NEXT: vpsllw $8, %xmm0, %xmm0 +; AVX-NEXT: vpinsrb $15, {{\.?LCPI[0-9]+_[0-9]+}}+15(%rip), %xmm0, %xmm1 +; AVX-NEXT: vpshufb %xmm1, %xmm0, %xmm0 ; AVX-NEXT: vpextrd $3, %xmm0, %eax ; AVX-NEXT: andl $-16777216, %eax # imm = 0xFF000000 ; AVX-NEXT: retq diff --git a/llvm/test/CodeGen/X86/vector-shuffle-combining.ll b/llvm/test/CodeGen/X86/vector-shuffle-combining.ll index 07c770abc65d6..a7210cffc80c0 100644 --- a/llvm/test/CodeGen/X86/vector-shuffle-combining.ll +++ b/llvm/test/CodeGen/X86/vector-shuffle-combining.ll @@ -1749,13 +1749,21 @@ define <4 x i8> @combine_test1c(ptr %a, ptr %b) { ; SSE41-NEXT: movdqa %xmm1, %xmm0 ; SSE41-NEXT: retq ; -; AVX-LABEL: combine_test1c: -; AVX: # %bb.0: -; AVX-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero -; AVX-NEXT: vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero -; AVX-NEXT: vmovd {{.*#+}} xmm2 = [0,255,255,255,0,0,0,0,0,0,0,0,0,0,0,0] -; AVX-NEXT: vpblendvb %xmm2, %xmm1, %xmm0, %xmm0 -; AVX-NEXT: retq +; AVX1-LABEL: combine_test1c: +; AVX1: # %bb.0: +; AVX1-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero +; AVX1-NEXT: vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero +; AVX1-NEXT: vmovd {{.*#+}} xmm2 = [0,255,255,255,0,0,0,0,0,0,0,0,0,0,0,0] +; AVX1-NEXT: vpblendvb %xmm2, %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: combine_test1c: +; AVX2: # %bb.0: +; AVX2-NEXT: vpbroadcastb (%rdi), %xmm0 +; AVX2-NEXT: vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero +; AVX2-NEXT: vmovd {{.*#+}} xmm2 = [0,255,255,255,0,0,0,0,0,0,0,0,0,0,0,0] +; AVX2-NEXT: vpblendvb %xmm2, %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: retq %A = load <4 x i8>, ptr %a %B = load <4 x i8>, ptr %b %1 = shufflevector <4 x i8> %A, <4 x i8> %B, <4 x i32> @@ -1810,17 +1818,21 @@ define <4 x i8> @combine_test3c(ptr %a, ptr %b) { define <4 x i8> @combine_test4c(ptr %a, ptr %b) { ; SSE2-LABEL: combine_test4c: ; SSE2: # %bb.0: -; SSE2-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero -; SSE2-NEXT: movss {{.*#+}} xmm2 = mem[0],zero,zero,zero -; SSE2-NEXT: movaps {{.*#+}} xmm0 = [255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255] -; SSE2-NEXT: andps %xmm0, %xmm2 -; SSE2-NEXT: andnps %xmm1, %xmm0 -; SSE2-NEXT: orps %xmm2, %xmm0 +; SSE2-NEXT: movzbl 1(%rdi), %eax +; SSE2-NEXT: movd %eax, %xmm1 +; SSE2-NEXT: psllw $8, %xmm1 +; SSE2-NEXT: movd {{.*#+}} xmm2 = mem[0],zero,zero,zero +; SSE2-NEXT: movdqa {{.*#+}} xmm0 = [255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255] +; SSE2-NEXT: pand %xmm0, %xmm2 +; SSE2-NEXT: pandn %xmm1, %xmm0 +; SSE2-NEXT: por 
%xmm2, %xmm0 ; SSE2-NEXT: retq ; ; SSSE3-LABEL: combine_test4c: ; SSSE3: # %bb.0: -; SSSE3-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero +; SSSE3-NEXT: movzbl 1(%rdi), %eax +; SSSE3-NEXT: movd %eax, %xmm1 +; SSSE3-NEXT: psllw $8, %xmm1 ; SSSE3-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero ; SSSE3-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] ; SSSE3-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0,3,4,6,u,u,u,u,u,u,u,u,u,u,u,u] @@ -1828,20 +1840,32 @@ define <4 x i8> @combine_test4c(ptr %a, ptr %b) { ; ; SSE41-LABEL: combine_test4c: ; SSE41: # %bb.0: -; SSE41-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero +; SSE41-NEXT: movzbl 1(%rdi), %eax +; SSE41-NEXT: movd %eax, %xmm1 +; SSE41-NEXT: psllw $8, %xmm1 ; SSE41-NEXT: movd {{.*#+}} xmm2 = mem[0],zero,zero,zero ; SSE41-NEXT: movss {{.*#+}} xmm0 = [255,0,255,255,0,0,0,0,0,0,0,0,0,0,0,0] ; SSE41-NEXT: pblendvb %xmm0, %xmm2, %xmm1 ; SSE41-NEXT: movdqa %xmm1, %xmm0 ; SSE41-NEXT: retq ; -; AVX-LABEL: combine_test4c: -; AVX: # %bb.0: -; AVX-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero -; AVX-NEXT: vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero -; AVX-NEXT: vmovd {{.*#+}} xmm2 = [255,0,255,255,0,0,0,0,0,0,0,0,0,0,0,0] -; AVX-NEXT: vpblendvb %xmm2, %xmm1, %xmm0, %xmm0 -; AVX-NEXT: retq +; AVX1-LABEL: combine_test4c: +; AVX1: # %bb.0: +; AVX1-NEXT: movzbl 1(%rdi), %eax +; AVX1-NEXT: vmovd %eax, %xmm0 +; AVX1-NEXT: vpsllw $8, %xmm0, %xmm0 +; AVX1-NEXT: vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero +; AVX1-NEXT: vmovd {{.*#+}} xmm2 = [255,0,255,255,0,0,0,0,0,0,0,0,0,0,0,0] +; AVX1-NEXT: vpblendvb %xmm2, %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: combine_test4c: +; AVX2: # %bb.0: +; AVX2-NEXT: vpbroadcastb 1(%rdi), %xmm0 +; AVX2-NEXT: vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero +; AVX2-NEXT: vmovd {{.*#+}} xmm2 = [255,0,255,255,0,0,0,0,0,0,0,0,0,0,0,0] +; AVX2-NEXT: vpblendvb %xmm2, %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: retq %A = load <4 x i8>, ptr %a %B = load <4 x i8>, ptr %b %1 = shufflevector <4 x i8> %A, <4 x i8> %B, <4 x i32> @@ -2821,16 +2845,16 @@ define <4 x float> @PR30264(<4 x float> %x) { ; SSE2: # %bb.0: ; SSE2-NEXT: xorps %xmm1, %xmm1 ; SSE2-NEXT: movss {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3] -; SSE2-NEXT: shufpd {{.*#+}} xmm1 = xmm1[0],mem[1] -; SSE2-NEXT: movapd %xmm1, %xmm0 +; SSE2-NEXT: movhps {{.*#+}} xmm1 = xmm1[0,1],mem[0,1] +; SSE2-NEXT: movaps %xmm1, %xmm0 ; SSE2-NEXT: retq ; ; SSSE3-LABEL: PR30264: ; SSSE3: # %bb.0: ; SSSE3-NEXT: xorps %xmm1, %xmm1 ; SSSE3-NEXT: movss {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3] -; SSSE3-NEXT: shufpd {{.*#+}} xmm1 = xmm1[0],mem[1] -; SSSE3-NEXT: movapd %xmm1, %xmm0 +; SSSE3-NEXT: movhps {{.*#+}} xmm1 = xmm1[0,1],mem[0,1] +; SSSE3-NEXT: movaps %xmm1, %xmm0 ; SSSE3-NEXT: retq ; ; SSE41-LABEL: PR30264: @@ -3051,17 +3075,18 @@ define <8 x i16> @shuffle_scalar_to_vector_extract(ptr %p0, ptr %p1, ptr %p2) { ; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] ; SSE2-NEXT: psraw $8, %xmm1 ; SSE2-NEXT: pextrw $7, %xmm1, %eax -; SSE2-NEXT: movd %eax, %xmm2 -; SSE2-NEXT: movsbl (%rsi), %eax -; SSE2-NEXT: movd %eax, %xmm0 -; SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3] -; SSE2-NEXT: movsbl (%rdx), %eax -; SSE2-NEXT: movd %eax, %xmm0 -; SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] ; SSE2-NEXT: pxor %xmm0, %xmm0 ; 
SSE2-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] -; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] -; SSE2-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0] +; SSE2-NEXT: movd {{.*#+}} xmm1 = [65531,0,0,0] +; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; SSE2-NEXT: movd %eax, %xmm1 +; SSE2-NEXT: movsbl (%rsi), %eax +; SSE2-NEXT: movd %eax, %xmm2 +; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3] +; SSE2-NEXT: movsbl (%rdx), %eax +; SSE2-NEXT: movd %eax, %xmm2 +; SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] +; SSE2-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] ; SSE2-NEXT: retq ; ; SSSE3-LABEL: shuffle_scalar_to_vector_extract: @@ -3077,7 +3102,8 @@ define <8 x i16> @shuffle_scalar_to_vector_extract(ptr %p0, ptr %p1, ptr %p2) { ; SSSE3-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] ; SSSE3-NEXT: pxor %xmm0, %xmm0 ; SSSE3-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] -; SSSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] +; SSSE3-NEXT: movd {{.*#+}} xmm1 = [65531,0,0,0] +; SSSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] ; SSSE3-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0] ; SSSE3-NEXT: retq ; @@ -3555,14 +3581,15 @@ define void @SpinningCube() { ; SSE2: # %bb.0: # %entry ; SSE2-NEXT: movl $1065353216, (%rax) # imm = 0x3F800000 ; SSE2-NEXT: movaps {{.*#+}} xmm0 = [u,u,u,1.0E+0] -; SSE2-NEXT: movss {{.*#+}} xmm1 = [NaN,0.0E+0,0.0E+0,0.0E+0] -; SSE2-NEXT: movapd {{.*#+}} xmm2 = [u,u,-2.0E+0,u] -; SSE2-NEXT: movsd {{.*#+}} xmm2 = xmm1[0],xmm2[1] -; SSE2-NEXT: xorps %xmm3, %xmm3 -; SSE2-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,1],xmm2[2,0] -; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,0],xmm0[2,3] -; SSE2-NEXT: addps %xmm3, %xmm1 -; SSE2-NEXT: movaps %xmm1, (%rax) +; SSE2-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero +; SSE2-NEXT: movss {{.*#+}} xmm2 = [NaN,0.0E+0,0.0E+0,0.0E+0] +; SSE2-NEXT: movaps %xmm2, %xmm3 +; SSE2-NEXT: movlhps {{.*#+}} xmm3 = xmm3[0],xmm1[0] +; SSE2-NEXT: xorps %xmm1, %xmm1 +; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm3[2,0] +; SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,0],xmm0[2,3] +; SSE2-NEXT: addps %xmm1, %xmm2 +; SSE2-NEXT: movaps %xmm2, (%rax) ; SSE2-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero ; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,0,0,0] ; SSE2-NEXT: mulps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 @@ -3574,14 +3601,15 @@ define void @SpinningCube() { ; SSSE3: # %bb.0: # %entry ; SSSE3-NEXT: movl $1065353216, (%rax) # imm = 0x3F800000 ; SSSE3-NEXT: movaps {{.*#+}} xmm0 = [u,u,u,1.0E+0] -; SSSE3-NEXT: movss {{.*#+}} xmm1 = [NaN,0.0E+0,0.0E+0,0.0E+0] -; SSSE3-NEXT: movapd {{.*#+}} xmm2 = [u,u,-2.0E+0,u] -; SSSE3-NEXT: movsd {{.*#+}} xmm2 = xmm1[0],xmm2[1] -; SSSE3-NEXT: xorps %xmm3, %xmm3 -; SSSE3-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,1],xmm2[2,0] -; SSSE3-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,0],xmm0[2,3] -; SSSE3-NEXT: addps %xmm3, %xmm1 -; SSSE3-NEXT: movaps %xmm1, (%rax) +; SSSE3-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero +; SSSE3-NEXT: movss {{.*#+}} xmm2 = [NaN,0.0E+0,0.0E+0,0.0E+0] +; SSSE3-NEXT: movaps %xmm2, %xmm3 +; SSSE3-NEXT: movlhps {{.*#+}} xmm3 = xmm3[0],xmm1[0] +; SSSE3-NEXT: xorps %xmm1, %xmm1 +; SSSE3-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm3[2,0] +; SSSE3-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,0],xmm0[2,3] +; SSSE3-NEXT: addps %xmm1, %xmm2 +; SSSE3-NEXT: movaps 
%xmm2, (%rax) ; SSSE3-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero ; SSSE3-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,0,0,2] ; SSSE3-NEXT: mulps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 diff --git a/llvm/test/CodeGen/X86/vector-shuffle-v1.ll b/llvm/test/CodeGen/X86/vector-shuffle-v1.ll index 2b89590a0bb41..ecdea22d7b5a4 100644 --- a/llvm/test/CodeGen/X86/vector-shuffle-v1.ll +++ b/llvm/test/CodeGen/X86/vector-shuffle-v1.ll @@ -42,7 +42,7 @@ define <2 x i1> @shuf2i1_1_2(<2 x i1> %a) { ; AVX512F-NEXT: vpsllq $63, %xmm0, %xmm0 ; AVX512F-NEXT: vptestmq %zmm0, %zmm0, %k1 ; AVX512F-NEXT: vpternlogq {{.*#+}} zmm0 {%k1} {z} = -1 -; AVX512F-NEXT: vpmovsxbq {{.*#+}} xmm1 = [18446744073709551615,0] +; AVX512F-NEXT: vmovq {{.*#+}} xmm1 = [18446744073709551615,0] ; AVX512F-NEXT: vpalignr {{.*#+}} xmm0 = xmm0[8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7] ; AVX512F-NEXT: vptestmq %zmm0, %zmm0, %k1 ; AVX512F-NEXT: vpternlogq {{.*#+}} zmm0 {%k1} {z} = -1 @@ -56,7 +56,7 @@ define <2 x i1> @shuf2i1_1_2(<2 x i1> %a) { ; AVX512VL-NEXT: vptestmq %xmm0, %xmm0, %k1 ; AVX512VL-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0 ; AVX512VL-NEXT: vmovdqa64 %xmm0, %xmm1 {%k1} {z} -; AVX512VL-NEXT: vpmovsxbq {{.*#+}} xmm2 = [18446744073709551615,0] +; AVX512VL-NEXT: vmovq {{.*#+}} xmm2 = [18446744073709551615,0] ; AVX512VL-NEXT: vpalignr {{.*#+}} xmm1 = xmm1[8,9,10,11,12,13,14,15],xmm2[0,1,2,3,4,5,6,7] ; AVX512VL-NEXT: vptestmq %xmm1, %xmm1, %k1 ; AVX512VL-NEXT: vmovdqa64 %xmm0, %xmm0 {%k1} {z} @@ -67,7 +67,7 @@ define <2 x i1> @shuf2i1_1_2(<2 x i1> %a) { ; VL_BW_DQ-NEXT: vpsllq $63, %xmm0, %xmm0 ; VL_BW_DQ-NEXT: vpmovq2m %xmm0, %k0 ; VL_BW_DQ-NEXT: vpmovm2q %k0, %xmm0 -; VL_BW_DQ-NEXT: vpmovsxbq {{.*#+}} xmm1 = [18446744073709551615,0] +; VL_BW_DQ-NEXT: vmovq {{.*#+}} xmm1 = [18446744073709551615,0] ; VL_BW_DQ-NEXT: vpalignr {{.*#+}} xmm0 = xmm0[8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7] ; VL_BW_DQ-NEXT: vpmovq2m %xmm0, %k0 ; VL_BW_DQ-NEXT: vpmovm2q %k0, %xmm0 diff --git a/llvm/test/CodeGen/X86/vector-shuffle-v192.ll b/llvm/test/CodeGen/X86/vector-shuffle-v192.ll index e83c1e8482773..fea59d9657612 100644 --- a/llvm/test/CodeGen/X86/vector-shuffle-v192.ll +++ b/llvm/test/CodeGen/X86/vector-shuffle-v192.ll @@ -69,7 +69,7 @@ define <64 x i8> @f1(ptr %p0) { ; AVX512F-NEXT: vpshufb {{.*#+}} ymm5 = ymm5[u,u,u,u,u,u,u,u,u,u,u,1,5,7,11,13,17,19,23,25,29,31,u,u,u,u,u,u,u,u,u,u] ; AVX512F-NEXT: vpblendw {{.*#+}} ymm2 = ymm5[0,1,2],ymm2[3,4,5,6,7],ymm5[8,9,10],ymm2[11,12,13,14,15] ; AVX512F-NEXT: vpblendd {{.*#+}} ymm2 = ymm5[0,1,2,3],ymm2[4,5,6,7] -; AVX512F-NEXT: vpternlogq $228, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm2 +; AVX512F-NEXT: vpternlogq {{.*#+}} ymm2 = ymm0 ^ (mem & (ymm2 ^ ymm0)) ; AVX512F-NEXT: vmovdqa 80(%rdi), %xmm0 ; AVX512F-NEXT: vpshufb %xmm4, %xmm0, %xmm0 ; AVX512F-NEXT: vmovdqa 64(%rdi), %xmm4 @@ -83,7 +83,7 @@ define <64 x i8> @f1(ptr %p0) { ; AVX512F-NEXT: vpor %xmm1, %xmm3, %xmm1 ; AVX512F-NEXT: vmovdqa 32(%rdi), %ymm3 ; AVX512F-NEXT: vpshufb {{.*#+}} ymm3 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm3[1,5,7,11,13,17,19,23,25,29,31],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; AVX512F-NEXT: vpternlogq $248, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm3 +; AVX512F-NEXT: vpternlogq {{.*#+}} ymm3 = ymm3 | (ymm1 & mem) ; AVX512F-NEXT: vpblendw {{.*#+}} ymm0 = ymm3[0,1,2],ymm0[3,4,5,6,7],ymm3[8,9,10],ymm0[11,12,13,14,15] ; AVX512F-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm0[4,5,6,7] ; AVX512F-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0 @@ -214,7 +214,7 @@ define <64 x i8> @f2(ptr %p0) { ; 
AVX512F-NEXT: vmovdqa 128(%rdi), %ymm4 ; AVX512F-NEXT: vpshufb {{.*#+}} ymm4 = ymm4[u,u,u,u,u,u,u,u,u,u,u,3,5,9,11,15,17,21,23,27,29],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; AVX512F-NEXT: vinserti64x4 $1, %ymm4, %zmm2, %zmm2 -; AVX512F-NEXT: vpternlogq $248, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm2 +; AVX512F-NEXT: vpternlogq {{.*#+}} zmm2 = zmm2 | (zmm0 & mem) ; AVX512F-NEXT: vmovdqa 96(%rdi), %xmm0 ; AVX512F-NEXT: vpshufb %xmm5, %xmm0, %xmm0 ; AVX512F-NEXT: vmovdqa 112(%rdi), %xmm4 @@ -228,7 +228,7 @@ define <64 x i8> @f2(ptr %p0) { ; AVX512F-NEXT: vpor %xmm1, %xmm3, %xmm1 ; AVX512F-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 ; AVX512F-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm1[0,1,2,3],zmm0[4,5,6,7] -; AVX512F-NEXT: vpternlogd $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm2, %zmm0 +; AVX512F-NEXT: vpternlogd {{.*#+}} zmm0 = zmm0 ^ (mem & (zmm0 ^ zmm2)) ; AVX512F-NEXT: retq ; ; AVX512BW-LABEL: f2: @@ -344,7 +344,7 @@ define <64 x i8> @f3(ptr %p0) { ; AVX512F-NEXT: # ymm4 = mem[0,1,0,1] ; AVX512F-NEXT: vpshufb %ymm4, %ymm2, %ymm2 ; AVX512F-NEXT: vpmovsxwd {{.*#+}} ymm5 = [4294967295,4294967295,4294967295,4294967295,4294967295,255,0,0] -; AVX512F-NEXT: vpternlogq $216, %ymm5, %ymm2, %ymm0 +; AVX512F-NEXT: vpternlogq {{.*#+}} ymm0 = ymm0 ^ (ymm5 & (ymm0 ^ ymm2)) ; AVX512F-NEXT: vmovdqa 112(%rdi), %xmm6 ; AVX512F-NEXT: vmovdqa {{.*#+}} xmm7 = [128,128,128,128,128,0,4,6,10,12,u,u,u,u,u,u] ; AVX512F-NEXT: vpshufb %xmm7, %xmm6, %xmm6 @@ -369,7 +369,7 @@ define <64 x i8> @f3(ptr %p0) { ; AVX512F-NEXT: vpshufb %ymm4, %ymm3, %ymm3 ; AVX512F-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3,4],xmm3[5,6,7] ; AVX512F-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7] -; AVX512F-NEXT: vpternlogq $226, %ymm1, %ymm5, %ymm2 +; AVX512F-NEXT: vpternlogq {{.*#+}} ymm2 = ymm1 ^ (ymm5 & (ymm2 ^ ymm1)) ; AVX512F-NEXT: vinserti64x4 $1, %ymm0, %zmm2, %zmm0 ; AVX512F-NEXT: retq ; @@ -497,7 +497,7 @@ define <64 x i8> @f4(ptr %p0) { ; AVX512F-NEXT: vmovdqa 128(%rdi), %ymm4 ; AVX512F-NEXT: vpshufb {{.*#+}} ymm4 = ymm4[u,u,u,u,u,u,u,u,u,u,u,2,4,8,10,14,16,20,22,26,28],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; AVX512F-NEXT: vinserti64x4 $1, %ymm4, %zmm2, %zmm2 -; AVX512F-NEXT: vpternlogq $248, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm2 +; AVX512F-NEXT: vpternlogq {{.*#+}} zmm2 = zmm2 | (zmm0 & mem) ; AVX512F-NEXT: vmovdqa 96(%rdi), %xmm0 ; AVX512F-NEXT: vpshufb %xmm5, %xmm0, %xmm0 ; AVX512F-NEXT: vmovdqa 112(%rdi), %xmm4 @@ -511,7 +511,7 @@ define <64 x i8> @f4(ptr %p0) { ; AVX512F-NEXT: vpor %xmm1, %xmm3, %xmm1 ; AVX512F-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 ; AVX512F-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm1[0,1,2,3],zmm0[4,5,6,7] -; AVX512F-NEXT: vpternlogd $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm2, %zmm0 +; AVX512F-NEXT: vpternlogd {{.*#+}} zmm0 = zmm0 ^ (mem & (zmm0 ^ zmm2)) ; AVX512F-NEXT: retq ; ; AVX512BW-LABEL: f4: diff --git a/llvm/test/CodeGen/X86/vector-shuffle-v48.ll b/llvm/test/CodeGen/X86/vector-shuffle-v48.ll index 0efbe018764d2..ec81ecf6faa85 100644 --- a/llvm/test/CodeGen/X86/vector-shuffle-v48.ll +++ b/llvm/test/CodeGen/X86/vector-shuffle-v48.ll @@ -35,7 +35,7 @@ define <32 x i8> @foo(ptr %x0) { ; AVX2: # %bb.0: ; AVX2-NEXT: vmovdqu 32(%rdi), %xmm0 ; AVX2-NEXT: vmovdqu (%rdi), %ymm1 -; AVX2-NEXT: vmovdqu 16(%rdi), %xmm2 +; AVX2-NEXT: vmovq {{.*#+}} xmm2 = mem[0],zero ; AVX2-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[u,u,u,u,u,u,u,u,u,u,u,0,2,3,5,6] ; AVX2-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[0,1,3,4,6,7,9,10,12,13,15,u,u,u,u,u,24,25,27,28,30,31,u,u,u,u,u,u,u,u,u,u] ; AVX2-NEXT: 
vbroadcasti128 {{.*#+}} ymm3 = [255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0] @@ -53,7 +53,7 @@ define <32 x i8> @foo(ptr %x0) { ; AVX512F-NEXT: vmovdqu 32(%rdi), %xmm1 ; AVX512F-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[u,u,u,u,u,u,1,2,4,5,7,8,10,11,13,14] ; AVX512F-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 -; AVX512F-NEXT: vmovdqu 16(%rdi), %xmm2 +; AVX512F-NEXT: vmovq {{.*#+}} xmm2 = mem[0],zero ; AVX512F-NEXT: vpshufb {{.*#+}} ymm2 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm2[0,2,3,5,6],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; AVX512F-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,3,4,6,7,9,10,12,13,15],zero,zero,zero,zero,zero,ymm0[24,25,27,28,30,31,u,u,u,u,u,u,u,u,u,u] ; AVX512F-NEXT: vpor %ymm2, %ymm0, %ymm0 @@ -65,7 +65,7 @@ define <32 x i8> @foo(ptr %x0) { ; AVX512BW: # %bb.0: ; AVX512BW-NEXT: vmovdqu 32(%rdi), %xmm0 ; AVX512BW-NEXT: vmovdqu (%rdi), %ymm1 -; AVX512BW-NEXT: vmovdqu 16(%rdi), %xmm2 +; AVX512BW-NEXT: vmovq {{.*#+}} xmm2 = mem[0],zero ; AVX512BW-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[u,u,u,u,u,u,u,u,u,u,u,0,2,3,5,6] ; AVX512BW-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[0,1,3,4,6,7,9,10,12,13,15,u,u,u,u,u,24,25,27,28,30,31,u,u,u,u,u,u,u,u,u,u] ; AVX512BW-NEXT: movl $63488, %eax # imm = 0xF800 diff --git a/llvm/test/CodeGen/X86/vselect.ll b/llvm/test/CodeGen/X86/vselect.ll index be6ee8f689958..494d216dfa0ed 100644 --- a/llvm/test/CodeGen/X86/vselect.ll +++ b/llvm/test/CodeGen/X86/vselect.ll @@ -761,19 +761,31 @@ define void @vselect_allzeros_LHS_multiple_use_setcc(<4 x i32> %x, <4 x i32> %y, ; This test case previously crashed after r363802, r363850, and r363856 due to ; any_extend_vector_inreg not being handled by the X86 backend. define i64 @vselect_any_extend_vector_inreg_crash(ptr %x) { -; SSE-LABEL: vselect_any_extend_vector_inreg_crash: -; SSE: # %bb.0: -; SSE-NEXT: movq {{.*#+}} xmm0 = mem[0],zero -; SSE-NEXT: pcmpeqb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; SSE-NEXT: movd %xmm0, %eax -; SSE-NEXT: andl $1, %eax -; SSE-NEXT: shll $15, %eax -; SSE-NEXT: retq +; SSE2-LABEL: vselect_any_extend_vector_inreg_crash: +; SSE2: # %bb.0: +; SSE2-NEXT: movq {{.*#+}} xmm0 = mem[0],zero +; SSE2-NEXT: movd {{.*#+}} xmm1 = [49,0,0,0] +; SSE2-NEXT: pcmpeqb %xmm0, %xmm1 +; SSE2-NEXT: movd %xmm1, %eax +; SSE2-NEXT: andl $1, %eax +; SSE2-NEXT: shll $15, %eax +; SSE2-NEXT: retq +; +; SSE41-LABEL: vselect_any_extend_vector_inreg_crash: +; SSE41: # %bb.0: +; SSE41-NEXT: movq {{.*#+}} xmm0 = mem[0],zero +; SSE41-NEXT: pinsrb $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 +; SSE41-NEXT: pcmpeqb %xmm0, %xmm1 +; SSE41-NEXT: movd %xmm1, %eax +; SSE41-NEXT: andl $1, %eax +; SSE41-NEXT: shll $15, %eax +; SSE41-NEXT: retq ; ; AVX1-LABEL: vselect_any_extend_vector_inreg_crash: ; AVX1: # %bb.0: ; AVX1-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero -; AVX1-NEXT: vpcmpeqb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; AVX1-NEXT: vpinsrb $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 +; AVX1-NEXT: vpcmpeqb %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vmovd %xmm0, %eax ; AVX1-NEXT: andl $1, %eax ; AVX1-NEXT: shll $15, %eax diff --git a/llvm/test/CodeGen/X86/widened-broadcast.ll b/llvm/test/CodeGen/X86/widened-broadcast.ll index 8e47ed67bdcff..3c8d358413eca 100644 --- a/llvm/test/CodeGen/X86/widened-broadcast.ll +++ b/llvm/test/CodeGen/X86/widened-broadcast.ll @@ -140,23 +140,14 @@ entry: define <8 x i16> @load_splat_8i16_8i16_01010101(ptr %ptr) nounwind uwtable readnone ssp { ; SSE-LABEL: load_splat_8i16_8i16_01010101: ; SSE: #
%bb.0: # %entry -; SSE-NEXT: pshufd {{.*#+}} xmm0 = mem[0,0,0,0] +; SSE-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] ; SSE-NEXT: retq ; -; AVX1-LABEL: load_splat_8i16_8i16_01010101: -; AVX1: # %bb.0: # %entry -; AVX1-NEXT: vpermilps {{.*#+}} xmm0 = mem[0,0,0,0] -; AVX1-NEXT: retq -; -; AVX2-LABEL: load_splat_8i16_8i16_01010101: -; AVX2: # %bb.0: # %entry -; AVX2-NEXT: vbroadcastss (%rdi), %xmm0 -; AVX2-NEXT: retq -; -; AVX512-LABEL: load_splat_8i16_8i16_01010101: -; AVX512: # %bb.0: # %entry -; AVX512-NEXT: vbroadcastss (%rdi), %xmm0 -; AVX512-NEXT: retq +; AVX-LABEL: load_splat_8i16_8i16_01010101: +; AVX: # %bb.0: # %entry +; AVX-NEXT: vbroadcastss (%rdi), %xmm0 +; AVX-NEXT: retq entry: %ld = load <8 x i16>, ptr %ptr %ret = shufflevector <8 x i16> %ld, <8 x i16> undef, <8 x i32> <i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1> @@ -192,7 +183,8 @@ define <16 x i16> @load_splat_16i16_8i16_0101010101010101(ptr %ptr) nounwind uwtable readnone ssp { ; SSE-LABEL: load_splat_16i16_8i16_0101010101010101: ; SSE: # %bb.0: # %entry -; SSE-NEXT: pshufd {{.*#+}} xmm0 = mem[0,0,0,0] +; SSE-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] ; SSE-NEXT: movdqa %xmm0, %xmm1 ; SSE-NEXT: retq ; @@ -226,7 +218,8 @@ entry: define <16 x i16> @load_splat_16i16_16i16_0101010101010101(ptr %ptr) nounwind uwtable readnone ssp { ; SSE-LABEL: load_splat_16i16_16i16_0101010101010101: ; SSE: # %bb.0: # %entry -; SSE-NEXT: pshufd {{.*#+}} xmm0 = mem[0,0,0,0] +; SSE-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] ; SSE-NEXT: movdqa %xmm0, %xmm1 ; SSE-NEXT: retq ; @@ -288,23 +281,14 @@ entry: define <16 x i8> @load_splat_16i8_16i8_0123012301230123(ptr %ptr) nounwind uwtable readnone ssp { ; SSE-LABEL: load_splat_16i8_16i8_0123012301230123: ; SSE: # %bb.0: # %entry -; SSE-NEXT: pshufd {{.*#+}} xmm0 = mem[0,0,0,0] +; SSE-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] ; SSE-NEXT: retq ; -; AVX1-LABEL: load_splat_16i8_16i8_0123012301230123: -; AVX1: # %bb.0: # %entry -; AVX1-NEXT: vpermilps {{.*#+}} xmm0 = mem[0,0,0,0] -; AVX1-NEXT: retq -; -; AVX2-LABEL: load_splat_16i8_16i8_0123012301230123: -; AVX2: # %bb.0: # %entry -; AVX2-NEXT: vbroadcastss (%rdi), %xmm0 -; AVX2-NEXT: retq -; -; AVX512-LABEL: load_splat_16i8_16i8_0123012301230123: -; AVX512: # %bb.0: # %entry -; AVX512-NEXT: vbroadcastss (%rdi), %xmm0 -; AVX512-NEXT: retq +; AVX-LABEL: load_splat_16i8_16i8_0123012301230123: +; AVX: # %bb.0: # %entry +; AVX-NEXT: vbroadcastss (%rdi), %xmm0 +; AVX-NEXT: retq entry: %ld = load <16 x i8>, ptr %ptr %ret = shufflevector <16 x i8> %ld, <16 x i8> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3> @@ -347,7 +331,8 @@ define <32 x i8> @load_splat_32i8_16i8_01010101010101010101010101010101(ptr %ptr ; ; AVX1-LABEL: load_splat_32i8_16i8_01010101010101010101010101010101: ; AVX1: # %bb.0: # %entry -; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = mem[0,0,0,0,4,5,6,7] +; AVX1-NEXT: vpinsrw $0, (%rdi), %xmm0, %xmm0 +; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7] ; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 ; AVX1-NEXT: retq @@ -370,7 +355,8 @@ entry: define <32 x i8> @load_splat_32i8_16i8_01230123012301230123012301230123(ptr %ptr) nounwind uwtable readnone ssp { ; SSE-LABEL: load_splat_32i8_16i8_01230123012301230123012301230123: ; SSE: # %bb.0: # %entry -; SSE-NEXT: pshufd {{.*#+}} xmm0 = mem[0,0,0,0] +; SSE-NEXT: movd {{.*#+}} xmm0 =
mem[0],zero,zero,zero +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] ; SSE-NEXT: movdqa %xmm0, %xmm1 ; SSE-NEXT: retq ; @@ -411,7 +397,8 @@ define <32 x i8> @load_splat_32i8_32i8_01010101010101010101010101010101(ptr %ptr ; ; AVX1-LABEL: load_splat_32i8_32i8_01010101010101010101010101010101: ; AVX1: # %bb.0: # %entry -; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = mem[0,0,0,0,4,5,6,7] +; AVX1-NEXT: vpinsrw $0, (%rdi), %xmm0, %xmm0 +; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7] ; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 ; AVX1-NEXT: retq @@ -434,7 +421,8 @@ entry: define <32 x i8> @load_splat_32i8_32i8_01230123012301230123012301230123(ptr %ptr) nounwind uwtable readnone ssp { ; SSE-LABEL: load_splat_32i8_32i8_01230123012301230123012301230123: ; SSE: # %bb.0: # %entry -; SSE-NEXT: pshufd {{.*#+}} xmm0 = mem[0,0,0,0] +; SSE-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] ; SSE-NEXT: movdqa %xmm0, %xmm1 ; SSE-NEXT: retq ; @@ -468,7 +456,7 @@ entry: define <4 x float> @load_splat_4f32_8f32_0000(ptr %ptr) nounwind uwtable readnone ssp { ; SSE-LABEL: load_splat_4f32_8f32_0000: ; SSE: # %bb.0: # %entry -; SSE-NEXT: movaps (%rdi), %xmm0 +; SSE-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,0,0,0] ; SSE-NEXT: retq ; diff --git a/llvm/test/CodeGen/X86/x86-interleaved-access.ll b/llvm/test/CodeGen/X86/x86-interleaved-access.ll index 3e76bffb77a66..91831d2326bbb 100644 --- a/llvm/test/CodeGen/X86/x86-interleaved-access.ll +++ b/llvm/test/CodeGen/X86/x86-interleaved-access.ll @@ -53,10 +53,12 @@ define <4 x double> @load_factorf64_2(ptr %ptr) nounwind { ; AVX1: # %bb.0: ; AVX1-NEXT: vmovupd (%rdi), %ymm0 ; AVX1-NEXT: vmovupd 32(%rdi), %ymm1 -; AVX1-NEXT: vmovupd 64(%rdi), %ymm2 -; AVX1-NEXT: vmovupd 96(%rdi), %ymm3 -; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm4 -; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm1, %ymm5 +; AVX1-NEXT: vbroadcastsd 88(%rdi), %ymm2 +; AVX1-NEXT: vbroadcastsd 120(%rdi), %ymm3 +; AVX1-NEXT: vmovsd 64(%rdi), %xmm4 # xmm4 = mem[0],zero +; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm4 +; AVX1-NEXT: vmovsd 96(%rdi), %xmm5 # xmm5 = mem[0],zero +; AVX1-NEXT: vinsertf128 $1, %xmm5, %ymm1, %ymm5 ; AVX1-NEXT: vunpcklpd {{.*#+}} ymm4 = ymm4[0],ymm5[0],ymm4[2],ymm5[2] ; AVX1-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm2[2,3] ; AVX1-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm1[2,3],ymm3[2,3] @@ -88,21 +90,21 @@ define <4 x double> @load_factorf64_2(ptr %ptr) nounwind { define <4 x double> @load_factorf64_1(ptr %ptr) nounwind { ; AVX1-LABEL: load_factorf64_1: ; AVX1: # %bb.0: -; AVX1-NEXT: vmovups (%rdi), %ymm0 -; AVX1-NEXT: vmovups 32(%rdi), %ymm1 -; AVX1-NEXT: vinsertf128 $1, 64(%rdi), %ymm0, %ymm0 -; AVX1-NEXT: vinsertf128 $1, 96(%rdi), %ymm1, %ymm1 -; AVX1-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2] +; AVX1-NEXT: vmovsd (%rdi), %xmm0 # xmm0 = mem[0],zero +; AVX1-NEXT: vmovsd 64(%rdi), %xmm1 # xmm1 = mem[0],zero +; AVX1-NEXT: vmovhps 96(%rdi), %xmm1, %xmm1 # xmm1 = xmm1[0,1],mem[0,1] +; AVX1-NEXT: vmovhps 32(%rdi), %xmm0, %xmm0 # xmm0 = xmm0[0,1],mem[0,1] +; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 ; AVX1-NEXT: vmulpd %ymm0, %ymm0, %ymm0 ; AVX1-NEXT: retq ; ; AVX2OR512-LABEL: load_factorf64_1: ; AVX2OR512: # %bb.0: -; AVX2OR512-NEXT: vmovupd (%rdi), %ymm0 -; AVX2OR512-NEXT: vmovupd 32(%rdi), %ymm1 -; AVX2OR512-NEXT: vperm2f128 $32, 64(%rdi), %ymm0, %ymm0 # ymm0 = ymm0[0,1],mem[0,1] -; AVX2OR512-NEXT: vperm2f128 
$32, 96(%rdi), %ymm1, %ymm1 # ymm1 = ymm1[0,1],mem[0,1] -; AVX2OR512-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2] +; AVX2OR512-NEXT: vmovsd (%rdi), %xmm0 # xmm0 = mem[0],zero +; AVX2OR512-NEXT: vmovsd 64(%rdi), %xmm1 # xmm1 = mem[0],zero +; AVX2OR512-NEXT: vmovhpd 96(%rdi), %xmm1, %xmm1 # xmm1 = xmm1[0],mem[0] +; AVX2OR512-NEXT: vmovhpd 32(%rdi), %xmm0, %xmm0 # xmm0 = xmm0[0],mem[0] +; AVX2OR512-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 ; AVX2OR512-NEXT: vmulpd %ymm0, %ymm0, %ymm0 ; AVX2OR512-NEXT: retq %wide.vec = load <16 x double>, ptr %ptr, align 16 @@ -1873,8 +1875,9 @@ define void @splat4_v4i64_load_store(ptr %s, ptr %d) nounwind { define <2 x i64> @PR37616(ptr %a0) nounwind { ; AVX-LABEL: PR37616: ; AVX: # %bb.0: -; AVX-NEXT: vmovaps 16(%rdi), %xmm0 -; AVX-NEXT: vunpcklpd 48(%rdi), %xmm0, %xmm0 # xmm0 = xmm0[0],mem[0] +; AVX-NEXT: vmovsd 48(%rdi), %xmm0 # xmm0 = mem[0],zero +; AVX-NEXT: vmovsd 16(%rdi), %xmm1 # xmm1 = mem[0],zero +; AVX-NEXT: vmovlhps {{.*#+}} xmm0 = xmm1[0],xmm0[0] ; AVX-NEXT: retq %load = load <16 x i64>, ptr %a0, align 128 %shuffle = shufflevector <16 x i64> %load, <16 x i64> undef, <2 x i32> <i32 2, i32 6> diff --git a/llvm/test/CodeGen/X86/xop-shifts.ll b/llvm/test/CodeGen/X86/xop-shifts.ll index 83dcf9ce0d1e9..1512a488846a8 100644 --- a/llvm/test/CodeGen/X86/xop-shifts.ll +++ b/llvm/test/CodeGen/X86/xop-shifts.ll @@ -8,9 +8,12 @@ define <16 x i8> @demandedelts_vpshab(<16 x i8> %a0, <16 x i8> %a1) { ; CHECK-LABEL: demandedelts_vpshab: ; CHECK: # %bb.0: +; CHECK-NEXT: vpinsrb $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm2 +; CHECK-NEXT: vpshufb %xmm2, %xmm0, %xmm0 +; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; CHECK-NEXT: vpshufb %xmm2, %xmm1, %xmm1 +; CHECK-NEXT: vpshufb %xmm2, %xmm0, %xmm0 ; CHECK-NEXT: vpshab %xmm1, %xmm0, %xmm0 -; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; CHECK-NEXT: vpshufb %xmm1, %xmm0, %xmm0 ; CHECK-NEXT: retq %shuffle = shufflevector <16 x i8> %a0, <16 x i8> undef, <16 x i32> %shift = call <16 x i8> @llvm.x86.xop.vpshab(<16 x i8> %shuffle, <16 x i8> %a1) diff --git a/llvm/test/CodeGen/X86/xor.ll b/llvm/test/CodeGen/X86/xor.ll index 2bef66825d8c0..f5c879b2011cd 100644 --- a/llvm/test/CodeGen/X86/xor.ll +++ b/llvm/test/CodeGen/X86/xor.ll @@ -405,8 +405,11 @@ define i32 @PR17487(i1 %tobool) { ; X64-LIN: # %bb.0: ; X64-LIN-NEXT: movd %edi, %xmm0 ; X64-LIN-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,1] -; X64-LIN-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; X64-LIN-NEXT: pextrw $4, %xmm0, %eax +; X64-LIN-NEXT: movq {{.*#+}} xmm1 = mem[0],zero +; X64-LIN-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,0,1] +; X64-LIN-NEXT: pand %xmm0, %xmm1 +; X64-LIN-NEXT: pextrw $4, %xmm1, %eax +; X64-LIN-NEXT: movzbl %al, %eax ; X64-LIN-NEXT: retq ; ; X64-WIN-LABEL: PR17487: @@ -414,8 +417,11 @@ define i32 @PR17487(i1 %tobool) { ; X64-WIN-NEXT: movzbl %cl, %eax ; X64-WIN-NEXT: movd %eax, %xmm0 ; X64-WIN-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,1] -; X64-WIN-NEXT: pand __xmm@00000000000000010000000000000001(%rip), %xmm0 -; X64-WIN-NEXT: pextrw $4, %xmm0, %eax +; X64-WIN-NEXT: movq {{.*#+}} xmm1 = mem[0],zero +; X64-WIN-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,0,1] +; X64-WIN-NEXT: pand %xmm0, %xmm1 +; X64-WIN-NEXT: pextrw $4, %xmm1, %eax +; X64-WIN-NEXT: movzbl %al, %eax ; X64-WIN-NEXT: retq %tmp = insertelement <2 x i1> undef, i1 %tobool, i32 1 %tmp1 = zext <2 x i1> %tmp to <2 x i64>