From 2ec06c7094cff66d31d05e07565cdfa937b03494 Mon Sep 17 00:00:00 2001
From: Wang Pengcheng
Date: Thu, 5 Dec 2024 18:59:26 +0800
Subject: =?UTF-8?q?[=F0=9D=98=80=F0=9D=97=BD=F0=9D=97=BF]=20in?=
 =?UTF-8?q?itial=20version?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Fold RegisterClassInfo::computePSetLimit into
TargetRegisterInfo::getRegPressureSetLimit, which now adjusts the limit
for reserved registers itself, counting them via
MachineRegisterInfo::getReservedRegs instead of through
RegisterClassInfo's allocation-order computation. The unadjusted,
TableGen-generated value is exposed as getRawRegPressureSetLimit.
RegisterClassInfo now simply caches the TargetRegisterInfo result, and
MachinePipeliner no longer needs to subtract the weight of fixed
registers by hand.

Created using spr 1.3.6-beta.1
---
 llvm/include/llvm/CodeGen/RegisterClassInfo.h |   7 +-
 .../include/llvm/CodeGen/TargetRegisterInfo.h |   9 +-
 llvm/lib/CodeGen/MachinePipeliner.cpp         |  41 -
 llvm/lib/CodeGen/RegisterClassInfo.cpp        |  37 -
 llvm/lib/CodeGen/TargetRegisterInfo.cpp       |  44 +
 llvm/test/CodeGen/LoongArch/jr-without-ra.ll  | 112 +--
 llvm/test/CodeGen/NVPTX/misched_func_call.ll  |   7 +-
 llvm/test/CodeGen/PowerPC/aix-csr-alloc.mir   |   1 -
 llvm/test/CodeGen/PowerPC/aix64-csr-alloc.mir |   1 -
 .../CodeGen/PowerPC/compute-regpressure.ll    |   4 +-
 .../RISCV/rvv/vxrm-insert-out-of-loop.ll      |   5 +-
 .../test/CodeGen/Thumb2/mve-blockplacement.ll | 124 ++-
 .../CodeGen/Thumb2/mve-gather-increment.ll    | 788 +++++++++---------
 .../Thumb2/mve-gather-scatter-optimisation.ll | 140 ++--
 llvm/test/CodeGen/Thumb2/mve-pipelineloops.ll |  75 +-
 llvm/test/CodeGen/X86/avx512-regcall-Mask.ll  |   4 +-
 .../test/CodeGen/X86/avx512-regcall-NoMask.ll |   8 +-
 llvm/test/CodeGen/X86/sse-regcall.ll          |   8 +-
 llvm/test/CodeGen/X86/sse-regcall4.ll         |   8 +-
 .../subvectorwise-store-of-vector-splat.ll    | 335 ++++----
 ...unfold-masked-merge-vector-variablemask.ll | 556 ++++++------
 .../CodeGen/X86/x86-64-flags-intrinsics.ll    |  16 +-
 llvm/test/TableGen/bare-minimum-psets.td      |   2 +-
 llvm/test/TableGen/inhibit-pset.td            |   2 +-
 llvm/unittests/CodeGen/MFCommon.inc           |   4 +-
 llvm/utils/TableGen/RegisterInfoEmitter.cpp   |   7 +-
 26 files changed, 1155 insertions(+), 1190 deletions(-)

diff --git a/llvm/include/llvm/CodeGen/RegisterClassInfo.h b/llvm/include/llvm/CodeGen/RegisterClassInfo.h
index 800bebea0dddb..417a1e40d02b9 100644
--- a/llvm/include/llvm/CodeGen/RegisterClassInfo.h
+++ b/llvm/include/llvm/CodeGen/RegisterClassInfo.h
@@ -141,16 +141,11 @@ class RegisterClassInfo {
   }
 
   /// Get the register unit limit for the given pressure set index.
-  ///
-  /// RegisterClassInfo adjusts this limit for reserved registers.
   unsigned getRegPressureSetLimit(unsigned Idx) const {
     if (!PSetLimits[Idx])
-      PSetLimits[Idx] = computePSetLimit(Idx);
+      PSetLimits[Idx] = TRI->getRegPressureSetLimit(*MF, Idx);
     return PSetLimits[Idx];
   }
-
-protected:
-  unsigned computePSetLimit(unsigned Idx) const;
 };
 
 } // end namespace llvm
diff --git a/llvm/include/llvm/CodeGen/TargetRegisterInfo.h b/llvm/include/llvm/CodeGen/TargetRegisterInfo.h
index 292fa3c94969b..f7cd7cfe1aa15 100644
--- a/llvm/include/llvm/CodeGen/TargetRegisterInfo.h
+++ b/llvm/include/llvm/CodeGen/TargetRegisterInfo.h
@@ -913,9 +913,14 @@ class TargetRegisterInfo : public MCRegisterInfo {
   virtual const char *getRegPressureSetName(unsigned Idx) const = 0;
 
   /// Get the register unit pressure limit for this dimension.
-  /// This limit must be adjusted dynamically for reserved registers.
+  /// TargetRegisterInfo adjusts this limit for reserved registers.
   virtual unsigned getRegPressureSetLimit(const MachineFunction &MF,
-                                          unsigned Idx) const = 0;
+                                          unsigned Idx) const;
+
+  /// Get the raw register unit pressure limit for this dimension.
+  /// This limit must be adjusted dynamically for reserved registers.
+  virtual unsigned getRawRegPressureSetLimit(const MachineFunction &MF,
+                                             unsigned Idx) const = 0;
 
   /// Get the dimensions of register pressure impacted by this register class.
   /// Returns a -1 terminated array of pressure set IDs.
diff --git a/llvm/lib/CodeGen/MachinePipeliner.cpp b/llvm/lib/CodeGen/MachinePipeliner.cpp
index 7a10bd39e2695..3ee0ba1fea507 100644
--- a/llvm/lib/CodeGen/MachinePipeliner.cpp
+++ b/llvm/lib/CodeGen/MachinePipeliner.cpp
@@ -1327,47 +1327,6 @@ class HighRegisterPressureDetector {
   void computePressureSetLimit(const RegisterClassInfo &RCI) {
     for (unsigned PSet = 0; PSet < PSetNum; PSet++)
       PressureSetLimit[PSet] = TRI->getRegPressureSetLimit(MF, PSet);
-
-    // We assume fixed registers, such as stack pointer, are already in use.
-    // Therefore subtracting the weight of the fixed registers from the limit of
-    // each pressure set in advance.
-    SmallDenseSet<Register, 8> FixedRegs;
-    for (const TargetRegisterClass *TRC : TRI->regclasses()) {
-      for (const MCPhysReg Reg : *TRC)
-        if (isFixedRegister(Reg))
-          FixedRegs.insert(Reg);
-    }
-
-    LLVM_DEBUG({
-      for (auto Reg : FixedRegs) {
-        dbgs() << printReg(Reg, TRI, 0, &MRI) << ": [";
-        for (MCRegUnit Unit : TRI->regunits(Reg)) {
-          const int *Sets = TRI->getRegUnitPressureSets(Unit);
-          for (; *Sets != -1; Sets++) {
-            dbgs() << TRI->getRegPressureSetName(*Sets) << ", ";
-          }
-        }
-        dbgs() << "]\n";
-      }
-    });
-
-    for (auto Reg : FixedRegs) {
-      LLVM_DEBUG(dbgs() << "fixed register: " << printReg(Reg, TRI, 0, &MRI)
-                        << "\n");
-      for (MCRegUnit Unit : TRI->regunits(Reg)) {
-        auto PSetIter = MRI.getPressureSets(Unit);
-        unsigned Weight = PSetIter.getWeight();
-        for (; PSetIter.isValid(); ++PSetIter) {
-          unsigned &Limit = PressureSetLimit[*PSetIter];
-          assert(
-              Limit >= Weight &&
-              "register pressure limit must be greater than or equal weight");
-          Limit -= Weight;
-          LLVM_DEBUG(dbgs() << "PSet=" << *PSetIter << " Limit=" << Limit
-                            << " (decreased by " << Weight << ")\n");
-        }
-      }
-    }
   }
 
   // There are two patterns of last-use.
diff --git a/llvm/lib/CodeGen/RegisterClassInfo.cpp b/llvm/lib/CodeGen/RegisterClassInfo.cpp
index 9312bc03bc522..976d41a54da56 100644
--- a/llvm/lib/CodeGen/RegisterClassInfo.cpp
+++ b/llvm/lib/CodeGen/RegisterClassInfo.cpp
@@ -195,40 +195,3 @@ void RegisterClassInfo::compute(const TargetRegisterClass *RC) const {
   // RCI is now up-to-date.
   RCI.Tag = Tag;
 }
-
-/// This is not accurate because two overlapping register sets may have some
-/// nonoverlapping reserved registers. However, computing the allocation order
-/// for all register classes would be too expensive.
-unsigned RegisterClassInfo::computePSetLimit(unsigned Idx) const {
-  const TargetRegisterClass *RC = nullptr;
-  unsigned NumRCUnits = 0;
-  for (const TargetRegisterClass *C : TRI->regclasses()) {
-    const int *PSetID = TRI->getRegClassPressureSets(C);
-    for (; *PSetID != -1; ++PSetID) {
-      if ((unsigned)*PSetID == Idx)
-        break;
-    }
-    if (*PSetID == -1)
-      continue;
-
-    // Found a register class that counts against this pressure set.
-    // For efficiency, only compute the set order for the largest set.
-    unsigned NUnits = TRI->getRegClassWeight(C).WeightLimit;
-    if (!RC || NUnits > NumRCUnits) {
-      RC = C;
-      NumRCUnits = NUnits;
-    }
-  }
-  assert(RC && "Failed to find register class");
-  compute(RC);
-  unsigned NAllocatableRegs = getNumAllocatableRegs(RC);
-  unsigned RegPressureSetLimit = TRI->getRegPressureSetLimit(*MF, Idx);
-  // If all the regs are reserved, return raw RegPressureSetLimit.
-  // One example is VRSAVERC in PowerPC.
-  // Avoid returning zero, getRegPressureSetLimit(Idx) assumes computePSetLimit
-  // return non-zero value.
-  if (NAllocatableRegs == 0)
-    return RegPressureSetLimit;
-  unsigned NReserved = RC->getNumRegs() - NAllocatableRegs;
-  return RegPressureSetLimit - TRI->getRegClassWeight(RC).RegWeight * NReserved;
-}
diff --git a/llvm/lib/CodeGen/TargetRegisterInfo.cpp b/llvm/lib/CodeGen/TargetRegisterInfo.cpp
index 032f1a33e75c4..4cede283a7232 100644
--- a/llvm/lib/CodeGen/TargetRegisterInfo.cpp
+++ b/llvm/lib/CodeGen/TargetRegisterInfo.cpp
@@ -674,6 +674,50 @@ TargetRegisterInfo::prependOffsetExpression(const DIExpression *Expr,
                                       PrependFlags & DIExpression::EntryValue);
 }
 
+unsigned TargetRegisterInfo::getRegPressureSetLimit(const MachineFunction &MF,
+                                                    unsigned Idx) const {
+  const TargetRegisterClass *RC = nullptr;
+  unsigned NumRCUnits = 0;
+  for (const TargetRegisterClass *C : regclasses()) {
+    const int *PSetID = getRegClassPressureSets(C);
+    for (; *PSetID != -1; ++PSetID) {
+      if ((unsigned)*PSetID == Idx)
+        break;
+    }
+    if (*PSetID == -1)
+      continue;
+
+    // Found a register class that counts against this pressure set.
+    // For efficiency, only compute the set order for the largest set.
+    unsigned NUnits = getRegClassWeight(C).WeightLimit;
+    if (!RC || NUnits > NumRCUnits) {
+      RC = C;
+      NumRCUnits = NUnits;
+    }
+  }
+  assert(RC && "Failed to find register class");
+
+  unsigned NReserved = 0;
+  const BitVector &Reserved = MF.getRegInfo().getReservedRegs();
+  for (unsigned PhysReg : RC->getRawAllocationOrder(MF))
+    if (Reserved.test(PhysReg))
+      NReserved++;
+
+  unsigned NAllocatableRegs = RC->getNumRegs() - NReserved;
+  unsigned RegPressureSetLimit = getRawRegPressureSetLimit(MF, Idx);
+  // If all the regs are reserved, return the raw RegPressureSetLimit.
+  // One example is VRSAVERC in PowerPC.
+  // Avoid returning zero; RegisterClassInfo::getRegPressureSetLimit(Idx)
+  // assumes this returns a non-zero value.
+ if (NAllocatableRegs == 0) { + LLVM_DEBUG({ + dbgs() << "All registers of " << getRegClassName(RC) << " are reserved!"; + }); + return RegPressureSetLimit; + } + return RegPressureSetLimit - getRegClassWeight(RC).RegWeight * NReserved; +} + #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) LLVM_DUMP_METHOD void TargetRegisterInfo::dumpReg(Register Reg, unsigned SubRegIndex, diff --git a/llvm/test/CodeGen/LoongArch/jr-without-ra.ll b/llvm/test/CodeGen/LoongArch/jr-without-ra.ll index d1c4459aaa6ee..2bd89dacb2b37 100644 --- a/llvm/test/CodeGen/LoongArch/jr-without-ra.ll +++ b/llvm/test/CodeGen/LoongArch/jr-without-ra.ll @@ -20,101 +20,101 @@ define void @jr_without_ra(ptr %rtwdev, ptr %chan, ptr %h2c, i8 %.pre, i1 %cmp.i ; CHECK-NEXT: st.d $s6, $sp, 24 # 8-byte Folded Spill ; CHECK-NEXT: st.d $s7, $sp, 16 # 8-byte Folded Spill ; CHECK-NEXT: st.d $s8, $sp, 8 # 8-byte Folded Spill -; CHECK-NEXT: move $s7, $zero -; CHECK-NEXT: move $s0, $zero +; CHECK-NEXT: move $s6, $zero +; CHECK-NEXT: move $s1, $zero ; CHECK-NEXT: ld.d $t0, $sp, 184 -; CHECK-NEXT: ld.d $s2, $sp, 176 -; CHECK-NEXT: ld.d $s1, $sp, 168 -; CHECK-NEXT: ld.d $t1, $sp, 160 -; CHECK-NEXT: ld.d $t2, $sp, 152 -; CHECK-NEXT: ld.d $t3, $sp, 144 -; CHECK-NEXT: ld.d $t4, $sp, 136 -; CHECK-NEXT: ld.d $t5, $sp, 128 -; CHECK-NEXT: ld.d $t6, $sp, 120 -; CHECK-NEXT: ld.d $t7, $sp, 112 -; CHECK-NEXT: ld.d $t8, $sp, 104 -; CHECK-NEXT: ld.d $fp, $sp, 96 +; CHECK-NEXT: ld.d $t1, $sp, 176 +; CHECK-NEXT: ld.d $s2, $sp, 168 +; CHECK-NEXT: ld.d $t2, $sp, 160 +; CHECK-NEXT: ld.d $t3, $sp, 152 +; CHECK-NEXT: ld.d $t4, $sp, 144 +; CHECK-NEXT: ld.d $t5, $sp, 136 +; CHECK-NEXT: ld.d $t6, $sp, 128 +; CHECK-NEXT: ld.d $t7, $sp, 120 +; CHECK-NEXT: ld.d $t8, $sp, 112 +; CHECK-NEXT: ld.d $fp, $sp, 104 +; CHECK-NEXT: ld.d $s0, $sp, 96 ; CHECK-NEXT: andi $a4, $a4, 1 -; CHECK-NEXT: alsl.d $a6, $a6, $s1, 4 -; CHECK-NEXT: pcalau12i $s1, %pc_hi20(.LJTI0_0) -; CHECK-NEXT: addi.d $s1, $s1, %pc_lo12(.LJTI0_0) -; CHECK-NEXT: slli.d $s3, $s2, 2 -; CHECK-NEXT: alsl.d $s2, $s2, $s3, 1 -; CHECK-NEXT: add.d $s2, $t5, $s2 -; CHECK-NEXT: addi.w $s4, $zero, -41 +; CHECK-NEXT: alsl.d $a6, $a6, $s2, 4 +; CHECK-NEXT: pcalau12i $s2, %pc_hi20(.LJTI0_0) +; CHECK-NEXT: addi.d $s2, $s2, %pc_lo12(.LJTI0_0) ; CHECK-NEXT: ori $s3, $zero, 1 -; CHECK-NEXT: slli.d $s4, $s4, 3 -; CHECK-NEXT: ori $s6, $zero, 3 -; CHECK-NEXT: lu32i.d $s6, 262144 +; CHECK-NEXT: ori $s4, $zero, 50 +; CHECK-NEXT: ori $s5, $zero, 3 +; CHECK-NEXT: lu32i.d $s5, 262144 ; CHECK-NEXT: b .LBB0_4 ; CHECK-NEXT: .p2align 4, , 16 ; CHECK-NEXT: .LBB0_1: # %sw.bb27.i.i ; CHECK-NEXT: # in Loop: Header=BB0_4 Depth=1 -; CHECK-NEXT: ori $s8, $zero, 1 +; CHECK-NEXT: ori $s7, $zero, 1 ; CHECK-NEXT: .LBB0_2: # %if.else.i106 ; CHECK-NEXT: # in Loop: Header=BB0_4 Depth=1 -; CHECK-NEXT: alsl.d $s5, $s0, $s0, 3 -; CHECK-NEXT: alsl.d $s0, $s5, $s0, 1 -; CHECK-NEXT: add.d $s0, $t0, $s0 -; CHECK-NEXT: ldx.bu $s8, $s0, $s8 +; CHECK-NEXT: alsl.d $s8, $s1, $s1, 3 +; CHECK-NEXT: alsl.d $s1, $s8, $s1, 1 +; CHECK-NEXT: add.d $s1, $t0, $s1 +; CHECK-NEXT: ldx.bu $s7, $s1, $s7 ; CHECK-NEXT: .LBB0_3: # %phy_tssi_get_ofdm_de.exit ; CHECK-NEXT: # in Loop: Header=BB0_4 Depth=1 -; CHECK-NEXT: st.b $zero, $t5, 0 -; CHECK-NEXT: st.b $s7, $t3, 0 -; CHECK-NEXT: st.b $zero, $t8, 0 -; CHECK-NEXT: st.b $zero, $t1, 0 -; CHECK-NEXT: st.b $zero, $a1, 0 +; CHECK-NEXT: st.b $zero, $t6, 0 +; CHECK-NEXT: st.b $s6, $t4, 0 +; CHECK-NEXT: st.b $zero, $fp, 0 ; CHECK-NEXT: st.b $zero, $t2, 0 -; CHECK-NEXT: st.b $s8, $a5, 0 -; CHECK-NEXT: ori $s0, $zero, 1 -; CHECK-NEXT: 
move $s7, $a3 +; CHECK-NEXT: st.b $zero, $a1, 0 +; CHECK-NEXT: st.b $zero, $t3, 0 +; CHECK-NEXT: st.b $s7, $a5, 0 +; CHECK-NEXT: ori $s1, $zero, 1 +; CHECK-NEXT: move $s6, $a3 ; CHECK-NEXT: .LBB0_4: # %for.body ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: beqz $a4, .LBB0_9 ; CHECK-NEXT: # %bb.5: # %calc_6g.i ; CHECK-NEXT: # in Loop: Header=BB0_4 Depth=1 -; CHECK-NEXT: move $s7, $zero +; CHECK-NEXT: move $s6, $zero ; CHECK-NEXT: bnez $zero, .LBB0_8 ; CHECK-NEXT: # %bb.6: # %calc_6g.i ; CHECK-NEXT: # in Loop: Header=BB0_4 Depth=1 -; CHECK-NEXT: slli.d $s8, $zero, 3 -; CHECK-NEXT: ldx.d $s8, $s8, $s1 -; CHECK-NEXT: jr $s8 +; CHECK-NEXT: slli.d $s7, $zero, 3 +; CHECK-NEXT: ldx.d $s7, $s7, $s2 +; CHECK-NEXT: jr $s7 ; CHECK-NEXT: .LBB0_7: # %sw.bb12.i.i ; CHECK-NEXT: # in Loop: Header=BB0_4 Depth=1 -; CHECK-NEXT: ori $s7, $zero, 1 +; CHECK-NEXT: ori $s6, $zero, 1 ; CHECK-NEXT: .LBB0_8: # %if.else58.i ; CHECK-NEXT: # in Loop: Header=BB0_4 Depth=1 -; CHECK-NEXT: ldx.bu $s7, $a6, $s7 +; CHECK-NEXT: ldx.bu $s6, $a6, $s6 ; CHECK-NEXT: b .LBB0_11 ; CHECK-NEXT: .p2align 4, , 16 ; CHECK-NEXT: .LBB0_9: # %if.end.i ; CHECK-NEXT: # in Loop: Header=BB0_4 Depth=1 -; CHECK-NEXT: andi $s7, $s7, 255 -; CHECK-NEXT: ori $s5, $zero, 50 -; CHECK-NEXT: bltu $s5, $s7, .LBB0_15 +; CHECK-NEXT: andi $s6, $s6, 255 +; CHECK-NEXT: bltu $s4, $s6, .LBB0_15 ; CHECK-NEXT: # %bb.10: # %if.end.i ; CHECK-NEXT: # in Loop: Header=BB0_4 Depth=1 -; CHECK-NEXT: sll.d $s7, $s3, $s7 -; CHECK-NEXT: and $s8, $s7, $s6 -; CHECK-NEXT: move $s7, $fp -; CHECK-NEXT: beqz $s8, .LBB0_15 +; CHECK-NEXT: sll.d $s6, $s3, $s6 +; CHECK-NEXT: and $s7, $s6, $s5 +; CHECK-NEXT: move $s6, $s0 +; CHECK-NEXT: beqz $s7, .LBB0_15 ; CHECK-NEXT: .LBB0_11: # %phy_tssi_get_ofdm_trim_de.exit ; CHECK-NEXT: # in Loop: Header=BB0_4 Depth=1 -; CHECK-NEXT: move $s8, $zero -; CHECK-NEXT: st.b $zero, $t7, 0 -; CHECK-NEXT: ldx.b $ra, $s2, $t4 +; CHECK-NEXT: move $s7, $zero +; CHECK-NEXT: st.b $zero, $t8, 0 +; CHECK-NEXT: slli.d $s8, $t1, 2 +; CHECK-NEXT: alsl.d $s8, $t1, $s8, 1 +; CHECK-NEXT: add.d $s8, $t6, $s8 +; CHECK-NEXT: ldx.b $s8, $s8, $t5 ; CHECK-NEXT: st.b $zero, $a2, 0 ; CHECK-NEXT: st.b $zero, $a7, 0 -; CHECK-NEXT: st.b $zero, $t6, 0 -; CHECK-NEXT: st.b $ra, $a0, 0 +; CHECK-NEXT: st.b $zero, $t7, 0 +; CHECK-NEXT: st.b $s8, $a0, 0 ; CHECK-NEXT: bnez $s3, .LBB0_13 ; CHECK-NEXT: # %bb.12: # %phy_tssi_get_ofdm_trim_de.exit ; CHECK-NEXT: # in Loop: Header=BB0_4 Depth=1 +; CHECK-NEXT: addi.w $s8, $zero, -41 +; CHECK-NEXT: slli.d $s8, $s8, 3 ; CHECK-NEXT: pcalau12i $ra, %pc_hi20(.LJTI0_1) ; CHECK-NEXT: addi.d $ra, $ra, %pc_lo12(.LJTI0_1) -; CHECK-NEXT: ldx.d $s5, $s4, $ra -; CHECK-NEXT: jr $s5 +; CHECK-NEXT: ldx.d $s8, $s8, $ra +; CHECK-NEXT: jr $s8 ; CHECK-NEXT: .LBB0_13: # %phy_tssi_get_ofdm_trim_de.exit ; CHECK-NEXT: # in Loop: Header=BB0_4 Depth=1 ; CHECK-NEXT: bnez $s3, .LBB0_1 diff --git a/llvm/test/CodeGen/NVPTX/misched_func_call.ll b/llvm/test/CodeGen/NVPTX/misched_func_call.ll index e036753ce9030..ee6b5869111c6 100644 --- a/llvm/test/CodeGen/NVPTX/misched_func_call.ll +++ b/llvm/test/CodeGen/NVPTX/misched_func_call.ll @@ -17,7 +17,6 @@ define ptx_kernel void @wombat(i32 %arg, i32 %arg1, i32 %arg2) { ; CHECK-NEXT: ld.param.u32 %r2, [wombat_param_0]; ; CHECK-NEXT: mov.b32 %r10, 0; ; CHECK-NEXT: mov.u64 %rd1, 0; -; CHECK-NEXT: mov.b32 %r6, 1; ; CHECK-NEXT: $L__BB0_1: // %bb3 ; CHECK-NEXT: // =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: { // callseq 0, 0 @@ -29,16 +28,16 @@ define ptx_kernel void @wombat(i32 %arg, i32 %arg1, i32 %arg2) { ; 
CHECK-NEXT: ( ; CHECK-NEXT: param0 ; CHECK-NEXT: ); +; CHECK-NEXT: ld.param.f64 %fd1, [retval0]; +; CHECK-NEXT: } // callseq 0 ; CHECK-NEXT: mul.lo.s32 %r7, %r10, %r3; ; CHECK-NEXT: or.b32 %r8, %r4, %r7; ; CHECK-NEXT: mul.lo.s32 %r9, %r2, %r8; ; CHECK-NEXT: cvt.rn.f64.s32 %fd3, %r9; -; CHECK-NEXT: ld.param.f64 %fd1, [retval0]; -; CHECK-NEXT: } // callseq 0 ; CHECK-NEXT: cvt.rn.f64.u32 %fd4, %r10; ; CHECK-NEXT: add.rn.f64 %fd5, %fd4, %fd3; ; CHECK-NEXT: st.global.f64 [%rd1], %fd5; -; CHECK-NEXT: mov.u32 %r10, %r6; +; CHECK-NEXT: mov.b32 %r10, 1; ; CHECK-NEXT: bra.uni $L__BB0_1; bb: br label %bb3 diff --git a/llvm/test/CodeGen/PowerPC/aix-csr-alloc.mir b/llvm/test/CodeGen/PowerPC/aix-csr-alloc.mir index fba410dc0dafc..7c8a5848b402f 100644 --- a/llvm/test/CodeGen/PowerPC/aix-csr-alloc.mir +++ b/llvm/test/CodeGen/PowerPC/aix-csr-alloc.mir @@ -17,5 +17,4 @@ body: | ... # CHECK-DAG: AllocationOrder(GPRC) = [ $r3 $r4 $r5 $r6 $r7 $r8 $r9 $r10 $r11 $r12 $r0 $r31 $r30 $r29 $r28 $r27 $r26 $r25 $r24 $r23 $r22 $r21 $r20 $r19 $r18 $r17 $r16 $r15 $r14 $r13 ] -# CHECK-DAG: AllocationOrder(F4RC) = [ $f0 $f1 $f2 $f3 $f4 $f5 $f6 $f7 $f8 $f9 $f10 $f11 $f12 $f13 $f31 $f30 $f29 $f28 $f27 $f26 $f25 $f24 $f23 $f22 $f21 $f20 $f19 $f18 $f17 $f16 $f15 $f14 ] # CHECK-DAG: AllocationOrder(GPRC_and_GPRC_NOR0) = [ $r3 $r4 $r5 $r6 $r7 $r8 $r9 $r10 $r11 $r12 $r31 $r30 $r29 $r28 $r27 $r26 $r25 $r24 $r23 $r22 $r21 $r20 $r19 $r18 $r17 $r16 $r15 $r14 $r13 ] diff --git a/llvm/test/CodeGen/PowerPC/aix64-csr-alloc.mir b/llvm/test/CodeGen/PowerPC/aix64-csr-alloc.mir index 584b6b0ad46dd..3617b95b2a6af 100644 --- a/llvm/test/CodeGen/PowerPC/aix64-csr-alloc.mir +++ b/llvm/test/CodeGen/PowerPC/aix64-csr-alloc.mir @@ -16,6 +16,5 @@ body: | $f1 = COPY %2 BLR8 implicit $lr8, implicit undef $rm, implicit $x3, implicit $f1 ... -# CHECK-DAG: AllocationOrder(VFRC) = [ $vf2 $vf3 $vf4 $vf5 $vf0 $vf1 $vf6 $vf7 $vf8 $vf9 $vf10 $vf11 $vf12 $vf13 $vf14 $vf15 $vf16 $vf17 $vf18 $vf19 $vf31 $vf30 $vf29 $vf28 $vf27 $vf26 $vf25 $vf24 $vf23 $vf22 $vf21 $vf20 ] # CHECK-DAG: AllocationOrder(G8RC_and_G8RC_NOX0) = [ $x3 $x4 $x5 $x6 $x7 $x8 $x9 $x10 $x11 $x12 $x2 $x31 $x30 $x29 $x28 $x27 $x26 $x25 $x24 $x23 $x22 $x21 $x20 $x19 $x18 $x17 $x16 $x15 $x14 ] # CHECK-DAG: AllocationOrder(F8RC) = [ $f0 $f1 $f2 $f3 $f4 $f5 $f6 $f7 $f8 $f9 $f10 $f11 $f12 $f13 $f31 $f30 $f29 $f28 $f27 $f26 $f25 $f24 $f23 $f22 $f21 $f20 $f19 $f18 $f17 $f16 $f15 $f14 ] diff --git a/llvm/test/CodeGen/PowerPC/compute-regpressure.ll b/llvm/test/CodeGen/PowerPC/compute-regpressure.ll index 9a1b057c2e38d..9d893b8dbebee 100644 --- a/llvm/test/CodeGen/PowerPC/compute-regpressure.ll +++ b/llvm/test/CodeGen/PowerPC/compute-regpressure.ll @@ -1,7 +1,7 @@ ; REQUIRES: asserts -; RUN: llc -debug-only=regalloc < %s 2>&1 |FileCheck %s --check-prefix=DEBUG +; RUN: llc -debug-only=target-reg-info < %s 2>&1 |FileCheck %s --check-prefix=DEBUG -; DEBUG-COUNT-1: AllocationOrder(VRSAVERC) = [ ] +; DEBUG-COUNT-1: All registers of VRSAVERC are reserved! 
target triple = "powerpc64le-unknown-linux-gnu" diff --git a/llvm/test/CodeGen/RISCV/rvv/vxrm-insert-out-of-loop.ll b/llvm/test/CodeGen/RISCV/rvv/vxrm-insert-out-of-loop.ll index c35f05be304cc..ec2448cb3965f 100644 --- a/llvm/test/CodeGen/RISCV/rvv/vxrm-insert-out-of-loop.ll +++ b/llvm/test/CodeGen/RISCV/rvv/vxrm-insert-out-of-loop.ll @@ -489,8 +489,9 @@ define void @test1(ptr nocapture noundef writeonly %dst, i32 noundef signext %i_ ; RV64-NEXT: j .LBB0_11 ; RV64-NEXT: .LBB0_8: # %vector.ph ; RV64-NEXT: # in Loop: Header=BB0_6 Depth=1 -; RV64-NEXT: slli t6, t0, 28 -; RV64-NEXT: sub t6, t6, t1 +; RV64-NEXT: slli t6, t0, 1 +; RV64-NEXT: slli s0, t0, 28 +; RV64-NEXT: sub t6, s0, t6 ; RV64-NEXT: and t6, t6, a6 ; RV64-NEXT: csrwi vxrm, 0 ; RV64-NEXT: mv s0, a2 diff --git a/llvm/test/CodeGen/Thumb2/mve-blockplacement.ll b/llvm/test/CodeGen/Thumb2/mve-blockplacement.ll index 7087041e8dace..6d082802f9cd7 100644 --- a/llvm/test/CodeGen/Thumb2/mve-blockplacement.ll +++ b/llvm/test/CodeGen/Thumb2/mve-blockplacement.ll @@ -353,8 +353,8 @@ define i32 @d(i64 %e, i32 %f, i64 %g, i32 %h) { ; CHECK-NEXT: push.w {r4, r5, r6, r7, r8, r9, r10, r11, lr} ; CHECK-NEXT: .pad #4 ; CHECK-NEXT: sub sp, #4 -; CHECK-NEXT: .vsave {d8, d9, d10, d11, d12, d13, d14, d15} -; CHECK-NEXT: vpush {d8, d9, d10, d11, d12, d13, d14, d15} +; CHECK-NEXT: .vsave {d8, d9, d10, d11, d12, d13} +; CHECK-NEXT: vpush {d8, d9, d10, d11, d12, d13} ; CHECK-NEXT: .pad #16 ; CHECK-NEXT: sub sp, #16 ; CHECK-NEXT: mov lr, r0 @@ -364,50 +364,48 @@ define i32 @d(i64 %e, i32 %f, i64 %g, i32 %h) { ; CHECK-NEXT: @ %bb.1: @ %for.cond2.preheader.lr.ph ; CHECK-NEXT: movs r0, #1 ; CHECK-NEXT: cmp r2, #1 -; CHECK-NEXT: csel r7, r2, r0, lt +; CHECK-NEXT: csel r3, r2, r0, lt ; CHECK-NEXT: mov r12, r1 -; CHECK-NEXT: mov r1, r7 -; CHECK-NEXT: cmp r7, #3 +; CHECK-NEXT: mov r1, r3 +; CHECK-NEXT: cmp r3, #3 ; CHECK-NEXT: it ls ; CHECK-NEXT: movls r1, #3 ; CHECK-NEXT: mov r4, r2 -; CHECK-NEXT: subs r1, r1, r7 +; CHECK-NEXT: subs r1, r1, r3 ; CHECK-NEXT: movw r2, #43691 ; CHECK-NEXT: adds r1, #2 ; CHECK-NEXT: movt r2, #43690 -; CHECK-NEXT: ldr r6, [sp, #128] -; CHECK-NEXT: movw r8, :lower16:c +; CHECK-NEXT: ldr r6, [sp, #112] +; CHECK-NEXT: movw r9, :lower16:c ; CHECK-NEXT: umull r1, r2, r1, r2 -; CHECK-NEXT: movt r8, :upper16:c +; CHECK-NEXT: adr.w r8, .LCPI1_1 ; CHECK-NEXT: movs r1, #4 -; CHECK-NEXT: @ implicit-def: $r10 ; CHECK-NEXT: @ implicit-def: $r5 ; CHECK-NEXT: @ implicit-def: $r11 -; CHECK-NEXT: mov.w r9, #12 -; CHECK-NEXT: str r4, [sp, #12] @ 4-byte Spill +; CHECK-NEXT: @ implicit-def: $r7 +; CHECK-NEXT: movt r9, :upper16:c +; CHECK-NEXT: mov.w r10, #12 +; CHECK-NEXT: str r3, [sp, #12] @ 4-byte Spill ; CHECK-NEXT: add.w r1, r1, r2, lsr #1 ; CHECK-NEXT: add.w r0, r0, r2, lsr #1 -; CHECK-NEXT: bic r3, r1, #3 +; CHECK-NEXT: bic r2, r1, #3 ; CHECK-NEXT: adr r1, .LCPI1_0 ; CHECK-NEXT: vldrw.u32 q0, [r1] -; CHECK-NEXT: adr r1, .LCPI1_1 -; CHECK-NEXT: vldrw.u32 q5, [r1] +; CHECK-NEXT: vdup.32 q5, r0 ; CHECK-NEXT: vdup.32 q6, r0 -; CHECK-NEXT: vadd.i32 q4, q0, r7 -; CHECK-NEXT: vdup.32 q7, r0 -; CHECK-NEXT: strd r3, r7, [sp, #4] @ 8-byte Folded Spill +; CHECK-NEXT: strd r2, r4, [sp, #4] @ 8-byte Folded Spill +; CHECK-NEXT: vadd.i32 q4, q0, r3 ; CHECK-NEXT: b .LBB1_6 ; CHECK-NEXT: .LBB1_2: @ %for.body6.preheader ; CHECK-NEXT: @ in Loop: Header=BB1_6 Depth=1 -; CHECK-NEXT: mov r0, r11 -; CHECK-NEXT: cmn.w r11, #4 +; CHECK-NEXT: mov r0, r7 +; CHECK-NEXT: cmn.w r7, #4 ; CHECK-NEXT: it le ; CHECK-NEXT: mvnle r0, #3 ; CHECK-NEXT: movw r2, #18725 ; CHECK-NEXT: adds 
r0, #6 ; CHECK-NEXT: movt r2, #9362 -; CHECK-NEXT: sub.w r1, r0, r11 -; CHECK-NEXT: mov r10, r3 +; CHECK-NEXT: subs r1, r0, r7 ; CHECK-NEXT: umull r2, r3, r1, r2 ; CHECK-NEXT: subs r2, r1, r3 ; CHECK-NEXT: add.w r2, r3, r2, lsr #1 @@ -415,19 +413,18 @@ define i32 @d(i64 %e, i32 %f, i64 %g, i32 %h) { ; CHECK-NEXT: lsls r3, r3, #3 ; CHECK-NEXT: sub.w r2, r3, r2, lsr #2 ; CHECK-NEXT: subs r1, r2, r1 -; CHECK-NEXT: mov r3, r10 ; CHECK-NEXT: add r0, r1 ; CHECK-NEXT: .LBB1_3: @ %for.cond.cleanup5.loopexit134.split.loop.exit139 ; CHECK-NEXT: @ in Loop: Header=BB1_6 Depth=1 -; CHECK-NEXT: add.w r11, r0, #7 +; CHECK-NEXT: adds r7, r0, #7 ; CHECK-NEXT: .LBB1_4: @ %for.cond.cleanup5 ; CHECK-NEXT: @ in Loop: Header=BB1_6 Depth=1 -; CHECK-NEXT: mov.w r10, #0 +; CHECK-NEXT: movs r5, #0 ; CHECK-NEXT: .LBB1_5: @ %for.cond.cleanup5 ; CHECK-NEXT: @ in Loop: Header=BB1_6 Depth=1 -; CHECK-NEXT: adds r5, #2 -; CHECK-NEXT: subs.w r1, r5, lr -; CHECK-NEXT: asr.w r0, r5, #31 +; CHECK-NEXT: add.w r11, r11, #2 +; CHECK-NEXT: subs.w r1, r11, lr +; CHECK-NEXT: asr.w r0, r11, #31 ; CHECK-NEXT: sbcs.w r0, r0, r12 ; CHECK-NEXT: bge.w .LBB1_28 ; CHECK-NEXT: .LBB1_6: @ %for.cond2.preheader @@ -436,36 +433,35 @@ define i32 @d(i64 %e, i32 %f, i64 %g, i32 %h) { ; CHECK-NEXT: @ Child Loop BB1_10 Depth 2 ; CHECK-NEXT: @ Child Loop BB1_12 Depth 3 ; CHECK-NEXT: @ Child Loop BB1_14 Depth 3 -; CHECK-NEXT: cmp.w r11, #2 +; CHECK-NEXT: cmp r7, #2 ; CHECK-NEXT: bgt .LBB1_5 ; CHECK-NEXT: @ %bb.7: @ %for.body6.lr.ph ; CHECK-NEXT: @ in Loop: Header=BB1_6 Depth=1 -; CHECK-NEXT: cmp r7, #5 +; CHECK-NEXT: ldr r0, [sp, #12] @ 4-byte Reload +; CHECK-NEXT: cmp r0, #5 ; CHECK-NEXT: bhi .LBB1_17 ; CHECK-NEXT: @ %bb.8: @ %for.body6.us.preheader ; CHECK-NEXT: @ in Loop: Header=BB1_6 Depth=1 -; CHECK-NEXT: ldrd r2, r3, [sp, #120] +; CHECK-NEXT: ldrd r2, r3, [sp, #104] ; CHECK-NEXT: movs r0, #32 ; CHECK-NEXT: movs r1, #0 -; CHECK-NEXT: mov r4, r6 -; CHECK-NEXT: mov r7, r12 -; CHECK-NEXT: mov r6, lr +; CHECK-NEXT: mov r6, r12 +; CHECK-NEXT: mov r4, lr ; CHECK-NEXT: bl __aeabi_ldivmod -; CHECK-NEXT: mov lr, r6 -; CHECK-NEXT: mov r6, r4 -; CHECK-NEXT: mov r12, r7 -; CHECK-NEXT: ldr r3, [sp, #4] @ 4-byte Reload -; CHECK-NEXT: ldr r4, [sp, #12] @ 4-byte Reload +; CHECK-NEXT: mov lr, r4 +; CHECK-NEXT: mov r12, r6 +; CHECK-NEXT: ldr r4, [sp, #8] @ 4-byte Reload ; CHECK-NEXT: vdup.32 q0, r2 -; CHECK-NEXT: ldr r7, [sp, #8] @ 4-byte Reload -; CHECK-NEXT: mov r0, r11 +; CHECK-NEXT: ldr r6, [sp, #112] +; CHECK-NEXT: mov r0, r7 +; CHECK-NEXT: ldr r3, [sp, #4] @ 4-byte Reload ; CHECK-NEXT: b .LBB1_10 ; CHECK-NEXT: .LBB1_9: @ %for.cond.cleanup17.us ; CHECK-NEXT: @ in Loop: Header=BB1_10 Depth=2 -; CHECK-NEXT: add.w r11, r0, #7 +; CHECK-NEXT: adds r7, r0, #7 ; CHECK-NEXT: cmn.w r0, #4 -; CHECK-NEXT: mov.w r10, #0 -; CHECK-NEXT: mov r0, r11 +; CHECK-NEXT: mov.w r5, #0 +; CHECK-NEXT: mov r0, r7 ; CHECK-NEXT: bge .LBB1_5 ; CHECK-NEXT: .LBB1_10: @ %for.body6.us ; CHECK-NEXT: @ Parent Loop BB1_6 Depth=1 @@ -488,13 +484,14 @@ define i32 @d(i64 %e, i32 %f, i64 %g, i32 %h) { ; CHECK-NEXT: @ Parent Loop BB1_6 Depth=1 ; CHECK-NEXT: @ Parent Loop BB1_10 Depth=2 ; CHECK-NEXT: @ => This Inner Loop Header: Depth=3 -; CHECK-NEXT: vqadd.u32 q2, q5, r1 +; CHECK-NEXT: vldrw.u32 q2, [r8] ; CHECK-NEXT: subs r2, #4 -; CHECK-NEXT: vcmp.u32 hi, q7, q2 -; CHECK-NEXT: vshl.i32 q2, q1, #2 +; CHECK-NEXT: vqadd.u32 q2, q2, r1 ; CHECK-NEXT: add.w r1, r1, #4 -; CHECK-NEXT: vadd.i32 q2, q2, r8 -; CHECK-NEXT: vadd.i32 q1, q1, r9 +; CHECK-NEXT: vcmp.u32 hi, q6, q2 +; CHECK-NEXT: vshl.i32 q2, q1, 
#2 +; CHECK-NEXT: vadd.i32 q2, q2, r9 +; CHECK-NEXT: vadd.i32 q1, q1, r10 ; CHECK-NEXT: vpst ; CHECK-NEXT: vstrwt.32 q0, [q2] ; CHECK-NEXT: bne .LBB1_12 @@ -507,13 +504,14 @@ define i32 @d(i64 %e, i32 %f, i64 %g, i32 %h) { ; CHECK-NEXT: @ Parent Loop BB1_6 Depth=1 ; CHECK-NEXT: @ Parent Loop BB1_10 Depth=2 ; CHECK-NEXT: @ => This Inner Loop Header: Depth=3 -; CHECK-NEXT: vqadd.u32 q2, q5, r1 +; CHECK-NEXT: vldrw.u32 q2, [r8] ; CHECK-NEXT: subs r2, #4 -; CHECK-NEXT: vcmp.u32 hi, q6, q2 -; CHECK-NEXT: vshl.i32 q2, q1, #2 +; CHECK-NEXT: vqadd.u32 q2, q2, r1 ; CHECK-NEXT: add.w r1, r1, #4 -; CHECK-NEXT: vadd.i32 q2, q2, r8 -; CHECK-NEXT: vadd.i32 q1, q1, r9 +; CHECK-NEXT: vcmp.u32 hi, q5, q2 +; CHECK-NEXT: vshl.i32 q2, q1, #2 +; CHECK-NEXT: vadd.i32 q2, q2, r9 +; CHECK-NEXT: vadd.i32 q1, q1, r10 ; CHECK-NEXT: vpst ; CHECK-NEXT: vstrwt.32 q0, [q2] ; CHECK-NEXT: bne .LBB1_14 @@ -523,7 +521,7 @@ define i32 @d(i64 %e, i32 %f, i64 %g, i32 %h) { ; CHECK-NEXT: beq .LBB1_9 ; CHECK-NEXT: @ %bb.16: @ %for.cond9.for.cond15.preheader_crit_edge.us ; CHECK-NEXT: @ in Loop: Header=BB1_10 Depth=2 -; CHECK-NEXT: eor r1, r10, #1 +; CHECK-NEXT: eor r1, r5, #1 ; CHECK-NEXT: lsls r1, r1, #31 ; CHECK-NEXT: bne .LBB1_9 ; CHECK-NEXT: b .LBB1_26 @@ -532,11 +530,11 @@ define i32 @d(i64 %e, i32 %f, i64 %g, i32 %h) { ; CHECK-NEXT: cmp r6, #0 ; CHECK-NEXT: beq.w .LBB1_2 ; CHECK-NEXT: @ %bb.18: @ in Loop: Header=BB1_6 Depth=1 -; CHECK-NEXT: mov r0, r11 +; CHECK-NEXT: mov r0, r7 ; CHECK-NEXT: .LBB1_19: @ %for.body6.us60 ; CHECK-NEXT: @ Parent Loop BB1_6 Depth=1 ; CHECK-NEXT: @ => This Inner Loop Header: Depth=2 -; CHECK-NEXT: lsls.w r1, r10, #31 +; CHECK-NEXT: lsls r1, r5, #31 ; CHECK-NEXT: bne .LBB1_27 ; CHECK-NEXT: @ %bb.20: @ %for.cond.cleanup17.us63 ; CHECK-NEXT: @ in Loop: Header=BB1_19 Depth=2 @@ -552,19 +550,19 @@ define i32 @d(i64 %e, i32 %f, i64 %g, i32 %h) { ; CHECK-NEXT: bgt .LBB1_25 ; CHECK-NEXT: @ %bb.23: @ %for.cond.cleanup17.us63.3 ; CHECK-NEXT: @ in Loop: Header=BB1_19 Depth=2 -; CHECK-NEXT: add.w r11, r0, #28 +; CHECK-NEXT: add.w r7, r0, #28 ; CHECK-NEXT: cmn.w r0, #25 -; CHECK-NEXT: mov.w r10, #0 -; CHECK-NEXT: mov r0, r11 +; CHECK-NEXT: mov.w r5, #0 +; CHECK-NEXT: mov r0, r7 ; CHECK-NEXT: blt .LBB1_19 ; CHECK-NEXT: b .LBB1_5 ; CHECK-NEXT: .LBB1_24: @ %for.cond.cleanup5.loopexit134.split.loop.exit137 ; CHECK-NEXT: @ in Loop: Header=BB1_6 Depth=1 -; CHECK-NEXT: add.w r11, r0, #14 +; CHECK-NEXT: add.w r7, r0, #14 ; CHECK-NEXT: b .LBB1_4 ; CHECK-NEXT: .LBB1_25: @ %for.cond.cleanup5.loopexit134.split.loop.exit135 ; CHECK-NEXT: @ in Loop: Header=BB1_6 Depth=1 -; CHECK-NEXT: add.w r11, r0, #21 +; CHECK-NEXT: add.w r7, r0, #21 ; CHECK-NEXT: b .LBB1_4 ; CHECK-NEXT: .LBB1_26: @ %for.inc19.us ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 @@ -574,7 +572,7 @@ define i32 @d(i64 %e, i32 %f, i64 %g, i32 %h) { ; CHECK-NEXT: b .LBB1_27 ; CHECK-NEXT: .LBB1_28: @ %for.cond.cleanup ; CHECK-NEXT: add sp, #16 -; CHECK-NEXT: vpop {d8, d9, d10, d11, d12, d13, d14, d15} +; CHECK-NEXT: vpop {d8, d9, d10, d11, d12, d13} ; CHECK-NEXT: add sp, #4 ; CHECK-NEXT: pop.w {r4, r5, r6, r7, r8, r9, r10, r11, pc} ; CHECK-NEXT: .p2align 4 diff --git a/llvm/test/CodeGen/Thumb2/mve-gather-increment.ll b/llvm/test/CodeGen/Thumb2/mve-gather-increment.ll index 93cab25c2cb72..aded1eb99d892 100644 --- a/llvm/test/CodeGen/Thumb2/mve-gather-increment.ll +++ b/llvm/test/CodeGen/Thumb2/mve-gather-increment.ll @@ -539,71 +539,71 @@ define arm_aapcs_vfpcc void @gather_inc_v8i16_simple(ptr noalias nocapture reado ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: 
.save {r4, r5, r6, r7, r8, r9, r10, r11, lr} ; CHECK-NEXT: push.w {r4, r5, r6, r7, r8, r9, r10, r11, lr} -; CHECK-NEXT: .pad #28 -; CHECK-NEXT: sub sp, #28 +; CHECK-NEXT: .pad #32 +; CHECK-NEXT: sub sp, #32 ; CHECK-NEXT: cmp r2, #1 -; CHECK-NEXT: strd r1, r2, [sp, #4] @ 8-byte Folded Spill +; CHECK-NEXT: strd r1, r2, [sp, #8] @ 8-byte Folded Spill ; CHECK-NEXT: blt .LBB11_5 ; CHECK-NEXT: @ %bb.1: @ %vector.ph.preheader -; CHECK-NEXT: ldr r1, [sp, #8] @ 4-byte Reload +; CHECK-NEXT: ldr r1, [sp, #12] @ 4-byte Reload +; CHECK-NEXT: mov r9, r0 ; CHECK-NEXT: movs r6, #1 -; CHECK-NEXT: add r2, sp, #12 -; CHECK-NEXT: mov.w r9, #8 +; CHECK-NEXT: mov.w r10, #8 ; CHECK-NEXT: bic r1, r1, #7 -; CHECK-NEXT: str r1, [sp] @ 4-byte Spill -; CHECK-NEXT: sub.w r3, r1, #8 -; CHECK-NEXT: add.w r8, r6, r3, lsr #3 -; CHECK-NEXT: adr r3, .LCPI11_0 -; CHECK-NEXT: vldrw.u32 q0, [r3] +; CHECK-NEXT: str r1, [sp, #4] @ 4-byte Spill +; CHECK-NEXT: sub.w r7, r1, #8 +; CHECK-NEXT: add.w r0, r6, r7, lsr #3 +; CHECK-NEXT: str r0, [sp] @ 4-byte Spill +; CHECK-NEXT: add r0, sp, #16 ; CHECK-NEXT: .LBB11_2: @ %vector.ph ; CHECK-NEXT: @ =>This Loop Header: Depth=1 ; CHECK-NEXT: @ Child Loop BB11_3 Depth 2 -; CHECK-NEXT: dls lr, r8 -; CHECK-NEXT: vmov q1, q0 -; CHECK-NEXT: ldr r6, [sp, #4] @ 4-byte Reload +; CHECK-NEXT: ldr r1, [sp] @ 4-byte Reload +; CHECK-NEXT: dls lr, r1 +; CHECK-NEXT: adr r1, .LCPI11_0 +; CHECK-NEXT: ldr r7, [sp, #8] @ 4-byte Reload +; CHECK-NEXT: vldrw.u32 q0, [r1] ; CHECK-NEXT: .LBB11_3: @ %vector.body ; CHECK-NEXT: @ Parent Loop BB11_2 Depth=1 ; CHECK-NEXT: @ => This Inner Loop Header: Depth=2 -; CHECK-NEXT: vstrw.32 q1, [r2] -; CHECK-NEXT: mov r12, r2 -; CHECK-NEXT: vldrh.s32 q2, [r2, #8] -; CHECK-NEXT: vadd.i16 q1, q1, r9 -; CHECK-NEXT: vshl.i32 q2, q2, #1 -; CHECK-NEXT: vadd.i32 q2, q2, r0 -; CHECK-NEXT: vmov r7, r5, d5 -; CHECK-NEXT: vmov r3, r4, d4 -; CHECK-NEXT: vldrh.s32 q2, [r2] -; CHECK-NEXT: vshl.i32 q2, q2, #1 -; CHECK-NEXT: vadd.i32 q2, q2, r0 -; CHECK-NEXT: vmov r1, r10, d5 -; CHECK-NEXT: ldrh r7, [r7] +; CHECK-NEXT: vstrw.32 q0, [r0] +; CHECK-NEXT: vadd.i16 q0, q0, r10 +; CHECK-NEXT: vldrh.s32 q1, [r0, #8] +; CHECK-NEXT: vshl.i32 q1, q1, #1 +; CHECK-NEXT: vadd.i32 q1, q1, r9 +; CHECK-NEXT: vmov r3, r6, d3 +; CHECK-NEXT: vmov r5, r4, d2 +; CHECK-NEXT: vldrh.s32 q1, [r0] +; CHECK-NEXT: vshl.i32 q1, q1, #1 +; CHECK-NEXT: vadd.i32 q1, q1, r9 +; CHECK-NEXT: vmov r12, r11, d3 +; CHECK-NEXT: ldrh.w r8, [r6] +; CHECK-NEXT: vmov r2, r6, d2 ; CHECK-NEXT: ldrh r4, [r4] -; CHECK-NEXT: ldrh r5, [r5] -; CHECK-NEXT: ldrh.w r2, [r10] -; CHECK-NEXT: ldrh.w r10, [r3] -; CHECK-NEXT: vmov r3, r11, d4 -; CHECK-NEXT: ldrh r1, [r1] ; CHECK-NEXT: ldrh r3, [r3] -; CHECK-NEXT: ldrh.w r11, [r11] -; CHECK-NEXT: vmov.16 q2[0], r3 -; CHECK-NEXT: vmov.16 q2[1], r11 -; CHECK-NEXT: vmov.16 q2[2], r1 -; CHECK-NEXT: vmov.16 q2[3], r2 -; CHECK-NEXT: mov r2, r12 -; CHECK-NEXT: vmov.16 q2[4], r10 -; CHECK-NEXT: vmov.16 q2[5], r4 -; CHECK-NEXT: vmov.16 q2[6], r7 -; CHECK-NEXT: vmov.16 q2[7], r5 -; CHECK-NEXT: vstrb.8 q2, [r6], #16 +; CHECK-NEXT: ldrh.w r1, [r11] +; CHECK-NEXT: ldrh.w r11, [r5] +; CHECK-NEXT: ldrh.w r5, [r12] +; CHECK-NEXT: ldrh r2, [r2] +; CHECK-NEXT: ldrh r6, [r6] +; CHECK-NEXT: vmov.16 q1[0], r2 +; CHECK-NEXT: vmov.16 q1[1], r6 +; CHECK-NEXT: vmov.16 q1[2], r5 +; CHECK-NEXT: vmov.16 q1[3], r1 +; CHECK-NEXT: vmov.16 q1[4], r11 +; CHECK-NEXT: vmov.16 q1[5], r4 +; CHECK-NEXT: vmov.16 q1[6], r3 +; CHECK-NEXT: vmov.16 q1[7], r8 +; CHECK-NEXT: vstrb.8 q1, [r7], #16 ; CHECK-NEXT: le lr, .LBB11_3 ; CHECK-NEXT: @ 
%bb.4: @ %middle.block ; CHECK-NEXT: @ in Loop: Header=BB11_2 Depth=1 -; CHECK-NEXT: ldr r1, [sp, #8] @ 4-byte Reload -; CHECK-NEXT: ldr r3, [sp] @ 4-byte Reload -; CHECK-NEXT: cmp r3, r1 +; CHECK-NEXT: ldr r1, [sp, #12] @ 4-byte Reload +; CHECK-NEXT: ldr r2, [sp, #4] @ 4-byte Reload +; CHECK-NEXT: cmp r2, r1 ; CHECK-NEXT: bne .LBB11_2 ; CHECK-NEXT: .LBB11_5: @ %for.cond.cleanup -; CHECK-NEXT: add sp, #28 +; CHECK-NEXT: add sp, #32 ; CHECK-NEXT: pop.w {r4, r5, r6, r7, r8, r9, r10, r11, pc} ; CHECK-NEXT: .p2align 4 ; CHECK-NEXT: @ %bb.6: @@ -656,145 +656,144 @@ define arm_aapcs_vfpcc void @gather_inc_v8i16_complex(ptr noalias nocapture read ; CHECK-NEXT: sub sp, #4 ; CHECK-NEXT: .vsave {d8, d9, d10, d11, d12, d13, d14, d15} ; CHECK-NEXT: vpush {d8, d9, d10, d11, d12, d13, d14, d15} -; CHECK-NEXT: .pad #136 -; CHECK-NEXT: sub sp, #136 +; CHECK-NEXT: .pad #88 +; CHECK-NEXT: sub sp, #88 ; CHECK-NEXT: cmp r2, #1 -; CHECK-NEXT: strd r1, r2, [sp, #64] @ 8-byte Folded Spill +; CHECK-NEXT: strd r1, r2, [sp, #8] @ 8-byte Folded Spill ; CHECK-NEXT: blt.w .LBB12_5 ; CHECK-NEXT: @ %bb.1: @ %vector.ph.preheader -; CHECK-NEXT: ldr r1, [sp, #68] @ 4-byte Reload -; CHECK-NEXT: adr r3, .LCPI12_2 -; CHECK-NEXT: vldrw.u32 q0, [r3] -; CHECK-NEXT: movs r2, #1 +; CHECK-NEXT: ldr r1, [sp, #12] @ 4-byte Reload +; CHECK-NEXT: add r4, sp, #72 +; CHECK-NEXT: add r7, sp, #40 +; CHECK-NEXT: add r5, sp, #56 ; CHECK-NEXT: bic r1, r1, #7 ; CHECK-NEXT: str r1, [sp, #4] @ 4-byte Spill ; CHECK-NEXT: subs r1, #8 -; CHECK-NEXT: vstrw.32 q0, [sp, #40] @ 16-byte Spill -; CHECK-NEXT: vmov.i16 q2, #0x18 -; CHECK-NEXT: add.w r1, r2, r1, lsr #3 -; CHECK-NEXT: str r1, [sp, #60] @ 4-byte Spill -; CHECK-NEXT: adr r1, .LCPI12_0 -; CHECK-NEXT: adr r2, .LCPI12_1 -; CHECK-NEXT: vldrw.u32 q0, [r1] -; CHECK-NEXT: vstrw.32 q2, [sp, #72] @ 16-byte Spill -; CHECK-NEXT: vstrw.32 q0, [sp, #24] @ 16-byte Spill -; CHECK-NEXT: vldrw.u32 q0, [r2] -; CHECK-NEXT: add r2, sp, #120 -; CHECK-NEXT: vstrw.32 q0, [sp, #8] @ 16-byte Spill +; CHECK-NEXT: vmov.i16 q6, #0x18 +; CHECK-NEXT: movs r3, #1 +; CHECK-NEXT: vstrw.32 q6, [sp, #16] @ 16-byte Spill +; CHECK-NEXT: add.w r1, r3, r1, lsr #3 +; CHECK-NEXT: str r1, [sp] @ 4-byte Spill ; CHECK-NEXT: .LBB12_2: @ %vector.ph ; CHECK-NEXT: @ =>This Loop Header: Depth=1 ; CHECK-NEXT: @ Child Loop BB12_3 Depth 2 -; CHECK-NEXT: ldr r1, [sp, #60] @ 4-byte Reload -; CHECK-NEXT: add.w r10, sp, #104 +; CHECK-NEXT: ldr r1, [sp] @ 4-byte Reload ; CHECK-NEXT: dls lr, r1 -; CHECK-NEXT: ldr r7, [sp, #64] @ 4-byte Reload -; CHECK-NEXT: vldrw.u32 q4, [sp, #24] @ 16-byte Reload -; CHECK-NEXT: vldrw.u32 q5, [sp, #40] @ 16-byte Reload -; CHECK-NEXT: vldrw.u32 q6, [sp, #8] @ 16-byte Reload +; CHECK-NEXT: adr r1, .LCPI12_2 +; CHECK-NEXT: vldrw.u32 q1, [r1] +; CHECK-NEXT: adr r1, .LCPI12_0 +; CHECK-NEXT: vldrw.u32 q2, [r1] +; CHECK-NEXT: adr r1, .LCPI12_1 +; CHECK-NEXT: ldr.w r12, [sp, #8] @ 4-byte Reload +; CHECK-NEXT: vldrw.u32 q3, [r1] ; CHECK-NEXT: .LBB12_3: @ %vector.body ; CHECK-NEXT: @ Parent Loop BB12_2 Depth=1 ; CHECK-NEXT: @ => This Inner Loop Header: Depth=2 -; CHECK-NEXT: vstrw.32 q5, [r2] -; CHECK-NEXT: mov r8, r2 -; CHECK-NEXT: vldrh.s32 q0, [r2, #8] +; CHECK-NEXT: vstrw.32 q1, [r4] +; CHECK-NEXT: mov r1, r5 +; CHECK-NEXT: vldrh.s32 q0, [r4, #8] +; CHECK-NEXT: mov r11, r4 +; CHECK-NEXT: mov r5, r7 ; CHECK-NEXT: vshl.i32 q0, q0, #1 ; CHECK-NEXT: vadd.i32 q0, q0, r0 -; CHECK-NEXT: vmov r1, r3, d0 -; CHECK-NEXT: vmov r4, r5, d1 -; CHECK-NEXT: vldrh.s32 q0, [r2] -; CHECK-NEXT: vshl.i32 q0, q0, #1 -; CHECK-NEXT: vadd.i32 q2, 
q0, r0 -; CHECK-NEXT: vmov r6, r2, d4 -; CHECK-NEXT: ldrh r1, [r1] -; CHECK-NEXT: ldrh.w r12, [r4] -; CHECK-NEXT: add r4, sp, #88 -; CHECK-NEXT: ldrh.w r11, [r5] -; CHECK-NEXT: ldrh r3, [r3] -; CHECK-NEXT: ldrh r5, [r6] -; CHECK-NEXT: ldrh r2, [r2] -; CHECK-NEXT: vstrw.32 q6, [r4] +; CHECK-NEXT: vmov r2, r3, d0 +; CHECK-NEXT: vmov r6, r10, d1 ; CHECK-NEXT: vldrh.s32 q0, [r4] -; CHECK-NEXT: vmov.16 q7[0], r5 -; CHECK-NEXT: vmov.16 q7[1], r2 +; CHECK-NEXT: vshl.i32 q0, q0, #1 +; CHECK-NEXT: vadd.i32 q6, q0, r0 +; CHECK-NEXT: vmov r7, r4, d12 +; CHECK-NEXT: ldrh.w r9, [r2] +; CHECK-NEXT: ldrh.w r2, [r10] +; CHECK-NEXT: str r2, [sp, #36] @ 4-byte Spill +; CHECK-NEXT: ldrh.w r8, [r3] +; CHECK-NEXT: ldrh r3, [r6] +; CHECK-NEXT: ldrh r2, [r7] +; CHECK-NEXT: mov r7, r5 +; CHECK-NEXT: ldrh r4, [r4] +; CHECK-NEXT: vstrw.32 q3, [r7] +; CHECK-NEXT: vldrh.s32 q0, [r7] +; CHECK-NEXT: vmov.16 q4[0], r2 +; CHECK-NEXT: vmov.16 q4[1], r4 +; CHECK-NEXT: mov r5, r1 ; CHECK-NEXT: vshl.i32 q0, q0, #1 ; CHECK-NEXT: vadd.i32 q0, q0, r0 -; CHECK-NEXT: vmov r6, r9, d0 -; CHECK-NEXT: vmov r2, r5, d1 -; CHECK-NEXT: vldrh.s32 q0, [r4, #8] +; CHECK-NEXT: vmov r4, r6, d0 +; CHECK-NEXT: vmov r1, r2, d1 +; CHECK-NEXT: vldrh.s32 q0, [r7, #8] ; CHECK-NEXT: vshl.i32 q0, q0, #1 ; CHECK-NEXT: vadd.i32 q0, q0, r0 -; CHECK-NEXT: ldrh r6, [r6] -; CHECK-NEXT: ldrh r2, [r2] -; CHECK-NEXT: vmov.16 q1[0], r6 -; CHECK-NEXT: ldrh.w r6, [r9] -; CHECK-NEXT: ldrh r5, [r5] -; CHECK-NEXT: vmov.16 q1[1], r6 -; CHECK-NEXT: vmov.16 q1[2], r2 -; CHECK-NEXT: vmov r2, r6, d0 -; CHECK-NEXT: vmov.16 q1[3], r5 +; CHECK-NEXT: ldrh r4, [r4] +; CHECK-NEXT: ldrh r1, [r1] +; CHECK-NEXT: vmov.16 q5[0], r4 +; CHECK-NEXT: ldrh r4, [r6] ; CHECK-NEXT: ldrh r2, [r2] -; CHECK-NEXT: ldrh r6, [r6] -; CHECK-NEXT: vmov.16 q1[4], r2 -; CHECK-NEXT: vmov r2, r5, d1 -; CHECK-NEXT: vmov.16 q1[5], r6 -; CHECK-NEXT: mov r6, r10 +; CHECK-NEXT: vmov.16 q5[1], r4 +; CHECK-NEXT: vmov.16 q5[2], r1 +; CHECK-NEXT: vmov r1, r4, d0 +; CHECK-NEXT: vmov.16 q5[3], r2 +; CHECK-NEXT: ldrh r1, [r1] +; CHECK-NEXT: ldrh r4, [r4] +; CHECK-NEXT: vmov.16 q5[4], r1 +; CHECK-NEXT: vmov r1, r2, d1 +; CHECK-NEXT: vmov.16 q5[5], r4 +; CHECK-NEXT: ldrh r1, [r1] ; CHECK-NEXT: ldrh r2, [r2] -; CHECK-NEXT: ldrh r5, [r5] -; CHECK-NEXT: vstrw.32 q4, [r10] -; CHECK-NEXT: vldrh.s32 q0, [r6] -; CHECK-NEXT: vmov.16 q1[6], r2 -; CHECK-NEXT: vmov.16 q1[7], r5 +; CHECK-NEXT: vstrw.32 q2, [r5] +; CHECK-NEXT: vldrh.s32 q0, [r5] +; CHECK-NEXT: vmov.16 q5[6], r1 +; CHECK-NEXT: vmov.16 q5[7], r2 ; CHECK-NEXT: vshl.i32 q0, q0, #1 ; CHECK-NEXT: vadd.i32 q0, q0, r0 -; CHECK-NEXT: vmov r2, r5, d0 +; CHECK-NEXT: vmov r1, r2, d0 +; CHECK-NEXT: ldrh r1, [r1] ; CHECK-NEXT: ldrh r2, [r2] -; CHECK-NEXT: ldrh r5, [r5] -; CHECK-NEXT: vmov.16 q3[0], r2 -; CHECK-NEXT: vmov.16 q3[1], r5 -; CHECK-NEXT: vmov r2, r5, d5 -; CHECK-NEXT: vldrw.u32 q2, [sp, #72] @ 16-byte Reload -; CHECK-NEXT: vadd.i16 q6, q6, q2 -; CHECK-NEXT: vadd.i16 q5, q5, q2 -; CHECK-NEXT: vadd.i16 q4, q4, q2 -; CHECK-NEXT: ldrh.w r9, [r2] +; CHECK-NEXT: vmov.16 q7[0], r1 +; CHECK-NEXT: vmov.16 q7[1], r2 +; CHECK-NEXT: vmov r1, r2, d13 +; CHECK-NEXT: vldrw.u32 q6, [sp, #16] @ 16-byte Reload +; CHECK-NEXT: vadd.i16 q3, q3, q6 +; CHECK-NEXT: vadd.i16 q1, q1, q6 +; CHECK-NEXT: vadd.i16 q2, q2, q6 +; CHECK-NEXT: ldrh.w r10, [r2] ; CHECK-NEXT: vmov r2, r4, d1 -; CHECK-NEXT: vldrh.s32 q0, [r6, #8] -; CHECK-NEXT: ldrh r5, [r5] -; CHECK-NEXT: vmov.16 q7[2], r9 +; CHECK-NEXT: vldrh.s32 q0, [r5, #8] +; CHECK-NEXT: ldrh r1, [r1] ; CHECK-NEXT: vshl.i32 q0, q0, #1 -; 
CHECK-NEXT: vmov.16 q7[3], r5 +; CHECK-NEXT: vmov.16 q4[2], r1 ; CHECK-NEXT: vadd.i32 q0, q0, r0 -; CHECK-NEXT: vmov.16 q7[4], r1 -; CHECK-NEXT: vmov.16 q7[5], r3 -; CHECK-NEXT: vmov.16 q7[6], r12 -; CHECK-NEXT: vmov.16 q7[7], r11 +; CHECK-NEXT: vmov.16 q4[3], r10 +; CHECK-NEXT: vmov.16 q4[4], r9 +; CHECK-NEXT: ldr r1, [sp, #36] @ 4-byte Reload +; CHECK-NEXT: vmov.16 q4[5], r8 +; CHECK-NEXT: vmov.16 q4[6], r3 +; CHECK-NEXT: vmov.16 q4[7], r1 ; CHECK-NEXT: ldrh r2, [r2] ; CHECK-NEXT: ldrh r4, [r4] -; CHECK-NEXT: vmov.16 q3[2], r2 -; CHECK-NEXT: vmov.16 q3[3], r4 +; CHECK-NEXT: vmov.16 q7[2], r2 +; CHECK-NEXT: vmov.16 q7[3], r4 ; CHECK-NEXT: vmov r2, r4, d0 ; CHECK-NEXT: ldrh r2, [r2] ; CHECK-NEXT: ldrh r4, [r4] -; CHECK-NEXT: vmov.16 q3[4], r2 -; CHECK-NEXT: vmov.16 q3[5], r4 +; CHECK-NEXT: vmov.16 q7[4], r2 +; CHECK-NEXT: vmov.16 q7[5], r4 ; CHECK-NEXT: vmov r2, r4, d1 ; CHECK-NEXT: ldrh r2, [r2] ; CHECK-NEXT: ldrh r4, [r4] -; CHECK-NEXT: vmov.16 q3[6], r2 -; CHECK-NEXT: mov r2, r8 -; CHECK-NEXT: vmov.16 q3[7], r4 -; CHECK-NEXT: vadd.i16 q0, q3, q1 -; CHECK-NEXT: vadd.i16 q0, q0, q7 -; CHECK-NEXT: vstrb.8 q0, [r7], #16 +; CHECK-NEXT: vmov.16 q7[6], r2 +; CHECK-NEXT: vmov.16 q7[7], r4 +; CHECK-NEXT: mov r4, r11 +; CHECK-NEXT: vadd.i16 q0, q7, q5 +; CHECK-NEXT: vadd.i16 q0, q0, q4 +; CHECK-NEXT: vstrb.8 q0, [r12], #16 ; CHECK-NEXT: le lr, .LBB12_3 ; CHECK-NEXT: @ %bb.4: @ %middle.block ; CHECK-NEXT: @ in Loop: Header=BB12_2 Depth=1 ; CHECK-NEXT: ldr r1, [sp, #4] @ 4-byte Reload -; CHECK-NEXT: ldr r3, [sp, #68] @ 4-byte Reload -; CHECK-NEXT: cmp r1, r3 +; CHECK-NEXT: ldr r2, [sp, #12] @ 4-byte Reload +; CHECK-NEXT: cmp r1, r2 ; CHECK-NEXT: bne.w .LBB12_2 ; CHECK-NEXT: .LBB12_5: @ %for.cond.cleanup -; CHECK-NEXT: add sp, #136 +; CHECK-NEXT: add sp, #88 ; CHECK-NEXT: vpop {d8, d9, d10, d11, d12, d13, d14, d15} ; CHECK-NEXT: add sp, #4 ; CHECK-NEXT: pop.w {r4, r5, r6, r7, r8, r9, r10, r11, pc} @@ -870,260 +869,246 @@ for.cond.cleanup: ; preds = %for.body, %middle.b define arm_aapcs_vfpcc void @gather_inc_v16i8_complex(ptr noalias nocapture readonly %data, ptr noalias nocapture %dst, i32 %n) { ; CHECK-LABEL: gather_inc_v16i8_complex: ; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: cmp r2, #1 +; CHECK-NEXT: it lt +; CHECK-NEXT: bxlt lr +; CHECK-NEXT: .LBB13_1: @ %vector.ph.preheader ; CHECK-NEXT: .save {r4, r5, r6, r7, r8, r9, r10, r11, lr} ; CHECK-NEXT: push.w {r4, r5, r6, r7, r8, r9, r10, r11, lr} ; CHECK-NEXT: .pad #4 ; CHECK-NEXT: sub sp, #4 ; CHECK-NEXT: .vsave {d8, d9, d10, d11, d12, d13, d14, d15} ; CHECK-NEXT: vpush {d8, d9, d10, d11, d12, d13, d14, d15} -; CHECK-NEXT: .pad #312 -; CHECK-NEXT: sub sp, #312 -; CHECK-NEXT: cmp r2, #1 -; CHECK-NEXT: str r1, [sp, #116] @ 4-byte Spill -; CHECK-NEXT: blt.w .LBB13_5 -; CHECK-NEXT: @ %bb.1: @ %vector.ph.preheader -; CHECK-NEXT: adr r1, .LCPI13_0 -; CHECK-NEXT: adr r6, .LCPI13_8 -; CHECK-NEXT: vldrw.u32 q0, [r1] -; CHECK-NEXT: adr r1, .LCPI13_1 -; CHECK-NEXT: adr r7, .LCPI13_7 -; CHECK-NEXT: adr r3, .LCPI13_6 -; CHECK-NEXT: vstrw.32 q0, [sp, #96] @ 16-byte Spill -; CHECK-NEXT: vldrw.u32 q0, [r1] -; CHECK-NEXT: adr r1, .LCPI13_5 -; CHECK-NEXT: bic r10, r2, #7 -; CHECK-NEXT: vstrw.32 q0, [sp, #80] @ 16-byte Spill -; CHECK-NEXT: vldrw.u32 q0, [r6] -; CHECK-NEXT: adr r6, .LCPI13_9 -; CHECK-NEXT: vmov.i32 q2, #0x30 -; CHECK-NEXT: vstrw.32 q0, [sp, #64] @ 16-byte Spill -; CHECK-NEXT: vldrw.u32 q0, [r7] -; CHECK-NEXT: vstrw.32 q0, [sp, #48] @ 16-byte Spill -; CHECK-NEXT: vldrw.u32 q0, [r6] -; CHECK-NEXT: vstrw.32 q0, [sp, #32] @ 16-byte Spill -; CHECK-NEXT: 
vldrw.u32 q0, [r1] -; CHECK-NEXT: vstrw.32 q0, [sp, #16] @ 16-byte Spill -; CHECK-NEXT: vldrw.u32 q0, [r3] -; CHECK-NEXT: vstrw.32 q0, [sp] @ 16-byte Spill +; CHECK-NEXT: .pad #160 +; CHECK-NEXT: sub sp, #160 +; CHECK-NEXT: bic lr, r2, #7 +; CHECK-NEXT: mov r12, r1 +; CHECK-NEXT: vmov.i32 q0, #0x30 ; CHECK-NEXT: .LBB13_2: @ %vector.ph ; CHECK-NEXT: @ =>This Loop Header: Depth=1 ; CHECK-NEXT: @ Child Loop BB13_3 Depth 2 +; CHECK-NEXT: adr r1, .LCPI13_0 +; CHECK-NEXT: mov r8, r12 +; CHECK-NEXT: vldrw.u32 q2, [r1] +; CHECK-NEXT: adr r1, .LCPI13_1 +; CHECK-NEXT: vldrw.u32 q1, [r1] +; CHECK-NEXT: adr r1, .LCPI13_8 +; CHECK-NEXT: vldrw.u32 q4, [r1] +; CHECK-NEXT: adr r1, .LCPI13_7 +; CHECK-NEXT: vldrw.u32 q5, [r1] +; CHECK-NEXT: adr r1, .LCPI13_9 +; CHECK-NEXT: vstrw.32 q1, [sp, #80] @ 16-byte Spill +; CHECK-NEXT: vldrw.u32 q1, [r1] +; CHECK-NEXT: adr r1, .LCPI13_5 +; CHECK-NEXT: mov r9, lr +; CHECK-NEXT: vstrw.32 q1, [sp, #144] @ 16-byte Spill +; CHECK-NEXT: vldrw.u32 q1, [r1] +; CHECK-NEXT: adr r1, .LCPI13_6 +; CHECK-NEXT: vstrw.32 q1, [sp, #128] @ 16-byte Spill +; CHECK-NEXT: vldrw.u32 q1, [r1] ; CHECK-NEXT: adr r1, .LCPI13_3 -; CHECK-NEXT: vldrw.u32 q6, [sp, #16] @ 16-byte Reload -; CHECK-NEXT: vldrw.u32 q0, [r1] +; CHECK-NEXT: vldrw.u32 q6, [r1] ; CHECK-NEXT: adr r1, .LCPI13_4 -; CHECK-NEXT: vldrw.u32 q5, [r1] +; CHECK-NEXT: vldrw.u32 q7, [r1] ; CHECK-NEXT: adr r1, .LCPI13_2 -; CHECK-NEXT: vldrw.u32 q3, [r1] +; CHECK-NEXT: vstrw.32 q7, [sp, #48] @ 16-byte Spill +; CHECK-NEXT: vldrw.u32 q7, [r1] ; CHECK-NEXT: adr r1, .LCPI13_10 -; CHECK-NEXT: vstrw.32 q6, [sp, #280] @ 16-byte Spill -; CHECK-NEXT: vldrw.u32 q6, [sp, #32] @ 16-byte Reload -; CHECK-NEXT: vstrw.32 q3, [sp, #296] @ 16-byte Spill -; CHECK-NEXT: vldrw.u32 q3, [r1] -; CHECK-NEXT: adr r1, .LCPI13_11 -; CHECK-NEXT: ldr.w r8, [sp, #116] @ 4-byte Reload -; CHECK-NEXT: vstrw.32 q3, [sp, #248] @ 16-byte Spill -; CHECK-NEXT: vldrw.u32 q3, [sp, #80] @ 16-byte Reload -; CHECK-NEXT: vstrw.32 q6, [sp, #264] @ 16-byte Spill -; CHECK-NEXT: vldrw.u32 q6, [sp, #48] @ 16-byte Reload -; CHECK-NEXT: vstrw.32 q3, [sp, #216] @ 16-byte Spill -; CHECK-NEXT: vldrw.u32 q3, [sp, #64] @ 16-byte Reload +; CHECK-NEXT: vstrw.32 q7, [sp, #112] @ 16-byte Spill ; CHECK-NEXT: vldrw.u32 q7, [r1] -; CHECK-NEXT: vldrw.u32 q1, [sp] @ 16-byte Reload -; CHECK-NEXT: vstrw.32 q3, [sp, #200] @ 16-byte Spill -; CHECK-NEXT: vldrw.u32 q3, [sp, #96] @ 16-byte Reload -; CHECK-NEXT: mov r11, r10 -; CHECK-NEXT: vstrw.32 q6, [sp, #232] @ 16-byte Spill -; CHECK-NEXT: vstrw.32 q3, [sp, #184] @ 16-byte Spill +; CHECK-NEXT: adr r1, .LCPI13_11 +; CHECK-NEXT: vldrw.u32 q3, [r1] +; CHECK-NEXT: vstrw.32 q7, [sp, #96] @ 16-byte Spill ; CHECK-NEXT: .LBB13_3: @ %vector.body ; CHECK-NEXT: @ Parent Loop BB13_2 Depth=1 ; CHECK-NEXT: @ => This Inner Loop Header: Depth=2 -; CHECK-NEXT: vadd.i32 q4, q1, r0 -; CHECK-NEXT: vstrw.32 q7, [sp, #136] @ 16-byte Spill -; CHECK-NEXT: vmov r1, lr, d8 -; CHECK-NEXT: vadd.i32 q7, q7, r0 -; CHECK-NEXT: vmov r5, r4, d15 -; CHECK-NEXT: vadd.i32 q6, q0, r0 -; CHECK-NEXT: vmov r6, r7, d13 -; CHECK-NEXT: vstrw.32 q1, [sp, #152] @ 16-byte Spill -; CHECK-NEXT: vldrw.u32 q1, [sp, #296] @ 16-byte Reload -; CHECK-NEXT: vstrw.32 q0, [sp, #168] @ 16-byte Spill -; CHECK-NEXT: vldrw.u32 q0, [sp, #248] @ 16-byte Reload -; CHECK-NEXT: vldrw.u32 q3, [sp, #216] @ 16-byte Reload +; CHECK-NEXT: vadd.i32 q7, q6, r0 +; CHECK-NEXT: vstrw.32 q6, [sp, #16] @ 16-byte Spill +; CHECK-NEXT: vadd.i32 q6, q3, r0 +; CHECK-NEXT: vstrw.32 q3, [sp] @ 16-byte Spill +; CHECK-NEXT: vadd.i32 q3, q1, 
r0 +; CHECK-NEXT: vmov r10, r1, d15 +; CHECK-NEXT: vmov r7, r11, d6 +; CHECK-NEXT: vstrw.32 q2, [sp, #64] @ 16-byte Spill +; CHECK-NEXT: vmov r5, r3, d13 +; CHECK-NEXT: vldrw.u32 q2, [sp, #112] @ 16-byte Reload +; CHECK-NEXT: vstrw.32 q1, [sp, #32] @ 16-byte Spill +; CHECK-NEXT: vmov q1, q0 +; CHECK-NEXT: vadd.i32 q2, q2, r0 +; CHECK-NEXT: vmov q0, q1 +; CHECK-NEXT: vldrw.u32 q1, [sp, #96] @ 16-byte Reload +; CHECK-NEXT: subs.w r9, r9, #16 ; CHECK-NEXT: vadd.i32 q1, q1, r0 -; CHECK-NEXT: vstrw.32 q5, [sp, #120] @ 16-byte Spill -; CHECK-NEXT: vadd.i32 q0, q0, r0 -; CHECK-NEXT: subs.w r11, r11, #16 -; CHECK-NEXT: ldrb.w r9, [r1] -; CHECK-NEXT: vmov r1, r3, d14 +; CHECK-NEXT: ldrb r6, [r1] +; CHECK-NEXT: ldrb r1, [r7] +; CHECK-NEXT: vmov r7, r4, d12 ; CHECK-NEXT: ldrb r5, [r5] +; CHECK-NEXT: ldrb r3, [r3] ; CHECK-NEXT: ldrb r7, [r7] -; CHECK-NEXT: ldrb r1, [r1] -; CHECK-NEXT: vmov.8 q7[0], r1 -; CHECK-NEXT: ldrb r1, [r3] -; CHECK-NEXT: vmov.8 q7[1], r1 -; CHECK-NEXT: vmov r1, r3, d12 +; CHECK-NEXT: ldrb r4, [r4] +; CHECK-NEXT: vmov.8 q6[0], r7 +; CHECK-NEXT: vmov.8 q6[1], r4 +; CHECK-NEXT: vmov.8 q6[2], r5 +; CHECK-NEXT: vmov r4, r5, d14 +; CHECK-NEXT: vmov.8 q6[3], r3 +; CHECK-NEXT: ldrb r4, [r4] +; CHECK-NEXT: ldrb r7, [r5] +; CHECK-NEXT: vmov.8 q7[0], r4 +; CHECK-NEXT: ldrb.w r5, [r10] +; CHECK-NEXT: vmov.8 q7[1], r7 +; CHECK-NEXT: ldrb.w r7, [r11] ; CHECK-NEXT: vmov.8 q7[2], r5 -; CHECK-NEXT: ldrb r5, [r6] -; CHECK-NEXT: ldrb r6, [r4] +; CHECK-NEXT: vmov r5, r10, d5 ; CHECK-NEXT: vmov.8 q7[3], r6 -; CHECK-NEXT: ldrb r1, [r1] -; CHECK-NEXT: ldrb r3, [r3] -; CHECK-NEXT: vmov.8 q6[0], r1 -; CHECK-NEXT: vmov r6, r1, d2 -; CHECK-NEXT: vmov.8 q6[1], r3 -; CHECK-NEXT: vmov.8 q6[2], r5 -; CHECK-NEXT: vmov.8 q6[3], r7 -; CHECK-NEXT: ldrb.w r7, [lr] -; CHECK-NEXT: vmov.8 q6[4], r9 -; CHECK-NEXT: vmov.8 q6[5], r7 -; CHECK-NEXT: ldrb r4, [r1] -; CHECK-NEXT: vmov r1, r5, d3 -; CHECK-NEXT: vldrw.u32 q1, [sp, #232] @ 16-byte Reload -; CHECK-NEXT: ldrb.w r12, [r1] -; CHECK-NEXT: vmov r1, r3, d9 -; CHECK-NEXT: ldrb r5, [r5] -; CHECK-NEXT: vldrw.u32 q4, [sp, #184] @ 16-byte Reload -; CHECK-NEXT: ldrb r1, [r1] -; CHECK-NEXT: ldrb r3, [r3] -; CHECK-NEXT: vmov.8 q6[6], r1 -; CHECK-NEXT: vmov r1, r7, d0 -; CHECK-NEXT: vmov.8 q6[7], r3 -; CHECK-NEXT: ldrb r1, [r1] -; CHECK-NEXT: ldrb r7, [r7] +; CHECK-NEXT: vmov r3, r4, d4 ; CHECK-NEXT: vmov.8 q7[4], r1 -; CHECK-NEXT: vmov r1, r3, d1 -; CHECK-NEXT: vldrw.u32 q0, [sp, #264] @ 16-byte Reload +; CHECK-NEXT: vldrw.u32 q2, [sp, #48] @ 16-byte Reload ; CHECK-NEXT: vmov.8 q7[5], r7 -; CHECK-NEXT: vadd.i32 q0, q0, r0 +; CHECK-NEXT: ldrb r6, [r5] +; CHECK-NEXT: vmov r1, r5, d7 +; CHECK-NEXT: ldrb r4, [r4] +; CHECK-NEXT: vldrw.u32 q3, [sp] @ 16-byte Reload +; CHECK-NEXT: vadd.i32 q3, q3, q0 ; CHECK-NEXT: ldrb r1, [r1] -; CHECK-NEXT: ldrb r3, [r3] +; CHECK-NEXT: ldrb r5, [r5] ; CHECK-NEXT: vmov.8 q7[6], r1 -; CHECK-NEXT: ldrb r1, [r6] -; CHECK-NEXT: vmov r7, r6, d0 -; CHECK-NEXT: vmov.8 q7[7], r3 -; CHECK-NEXT: vmov r3, lr, d1 -; CHECK-NEXT: vldrw.u32 q0, [sp, #280] @ 16-byte Reload -; CHECK-NEXT: vmov.8 q7[8], r1 -; CHECK-NEXT: vadd.i32 q0, q0, r0 -; CHECK-NEXT: vmov.8 q7[9], r4 -; CHECK-NEXT: vmov r4, r1, d0 -; CHECK-NEXT: vmov.8 q7[10], r12 -; CHECK-NEXT: vmov.8 q7[11], r5 +; CHECK-NEXT: vmov r1, r7, d2 +; CHECK-NEXT: vmov.8 q7[7], r5 +; CHECK-NEXT: ldrb r1, [r1] ; CHECK-NEXT: ldrb r7, [r7] -; CHECK-NEXT: ldrb r6, [r6] +; CHECK-NEXT: vmov.8 q6[4], r1 +; CHECK-NEXT: vmov r1, r5, d3 +; CHECK-NEXT: vldrw.u32 q1, [sp, #144] @ 16-byte Reload +; CHECK-NEXT: vmov.8 q6[5], r7 
+; CHECK-NEXT: vadd.i32 q1, q1, r0 +; CHECK-NEXT: ldrb r1, [r1] +; CHECK-NEXT: ldrb r5, [r5] +; CHECK-NEXT: vmov.8 q6[6], r1 +; CHECK-NEXT: ldrb r1, [r3] +; CHECK-NEXT: vmov.8 q6[7], r5 +; CHECK-NEXT: vmov r3, r7, d2 +; CHECK-NEXT: vmov.8 q6[8], r1 +; CHECK-NEXT: vmov r1, r11, d3 +; CHECK-NEXT: vldrw.u32 q1, [sp, #128] @ 16-byte Reload +; CHECK-NEXT: vmov.8 q6[9], r4 +; CHECK-NEXT: vmov.8 q6[10], r6 +; CHECK-NEXT: vadd.i32 q1, q1, r0 +; CHECK-NEXT: vmov r5, r6, d2 +; CHECK-NEXT: ldrb r4, [r7] +; CHECK-NEXT: ldrb.w r7, [r10] ; CHECK-NEXT: ldrb r3, [r3] -; CHECK-NEXT: ldrb r4, [r4] +; CHECK-NEXT: vmov.8 q6[11], r7 ; CHECK-NEXT: ldrb r1, [r1] -; CHECK-NEXT: vmov.8 q6[8], r4 -; CHECK-NEXT: vmov r5, r4, d1 -; CHECK-NEXT: vmov.8 q6[9], r1 -; CHECK-NEXT: vadd.i32 q0, q5, r0 -; CHECK-NEXT: vldrw.u32 q5, [sp, #200] @ 16-byte Reload ; CHECK-NEXT: ldrb r5, [r5] -; CHECK-NEXT: ldrb r4, [r4] -; CHECK-NEXT: vmov.8 q6[10], r5 -; CHECK-NEXT: vmov.8 q6[11], r4 -; CHECK-NEXT: vmov.8 q6[12], r7 -; CHECK-NEXT: vmov.8 q6[13], r6 -; CHECK-NEXT: vmov.8 q6[14], r3 -; CHECK-NEXT: vmov r1, r3, d0 +; CHECK-NEXT: ldrb r6, [r6] +; CHECK-NEXT: vmov.8 q7[8], r5 +; CHECK-NEXT: vmov r5, r7, d3 +; CHECK-NEXT: vmov.8 q7[9], r6 +; CHECK-NEXT: vadd.i32 q1, q2, r0 +; CHECK-NEXT: vadd.i32 q2, q2, q0 +; CHECK-NEXT: vstrw.32 q2, [sp, #48] @ 16-byte Spill +; CHECK-NEXT: vldrw.u32 q2, [sp, #64] @ 16-byte Reload +; CHECK-NEXT: vadd.i32 q2, q2, q0 +; CHECK-NEXT: ldrb r5, [r5] +; CHECK-NEXT: ldrb r7, [r7] +; CHECK-NEXT: vmov.8 q7[10], r5 +; CHECK-NEXT: vmov.8 q7[11], r7 +; CHECK-NEXT: vmov.8 q7[12], r3 +; CHECK-NEXT: vmov.8 q7[13], r4 +; CHECK-NEXT: vmov.8 q7[14], r1 +; CHECK-NEXT: vmov r1, r3, d2 ; CHECK-NEXT: ldrb r1, [r1] -; CHECK-NEXT: vmov.8 q7[12], r1 +; CHECK-NEXT: vmov.8 q6[12], r1 ; CHECK-NEXT: ldrb r1, [r3] -; CHECK-NEXT: vmov.8 q7[13], r1 -; CHECK-NEXT: vmov r1, r3, d1 -; CHECK-NEXT: vadd.i32 q0, q1, r0 -; CHECK-NEXT: vadd.i32 q1, q1, q2 -; CHECK-NEXT: vstrw.32 q1, [sp, #232] @ 16-byte Spill -; CHECK-NEXT: vldrw.u32 q1, [sp, #248] @ 16-byte Reload -; CHECK-NEXT: vadd.i32 q1, q1, q2 -; CHECK-NEXT: vstrw.32 q1, [sp, #248] @ 16-byte Spill -; CHECK-NEXT: vldrw.u32 q1, [sp, #152] @ 16-byte Reload -; CHECK-NEXT: vadd.i32 q1, q1, q2 +; CHECK-NEXT: vmov.8 q6[13], r1 +; CHECK-NEXT: vmov r1, r3, d3 +; CHECK-NEXT: vadd.i32 q1, q5, r0 +; CHECK-NEXT: vadd.i32 q5, q5, q0 ; CHECK-NEXT: ldrb r1, [r1] -; CHECK-NEXT: vmov.8 q7[14], r1 +; CHECK-NEXT: vmov.8 q6[14], r1 ; CHECK-NEXT: ldrb r1, [r3] -; CHECK-NEXT: vmov.8 q7[15], r1 -; CHECK-NEXT: ldrb.w r1, [lr] ; CHECK-NEXT: vmov.8 q6[15], r1 -; CHECK-NEXT: vmov r1, r3, d0 -; CHECK-NEXT: vadd.i8 q6, q6, q7 +; CHECK-NEXT: ldrb.w r1, [r11] +; CHECK-NEXT: vmov.8 q7[15], r1 +; CHECK-NEXT: vmov r1, r3, d2 +; CHECK-NEXT: vadd.i8 q6, q7, q6 ; CHECK-NEXT: ldrb r1, [r1] ; CHECK-NEXT: ldrb r3, [r3] ; CHECK-NEXT: vmov.8 q7[0], r1 ; CHECK-NEXT: vmov.8 q7[1], r3 -; CHECK-NEXT: vmov r1, r3, d1 -; CHECK-NEXT: vadd.i32 q0, q3, r0 -; CHECK-NEXT: vadd.i32 q3, q3, q2 -; CHECK-NEXT: vstrw.32 q3, [sp, #216] @ 16-byte Spill -; CHECK-NEXT: vldrw.u32 q3, [sp, #296] @ 16-byte Reload -; CHECK-NEXT: vadd.i32 q3, q3, q2 -; CHECK-NEXT: vstrw.32 q3, [sp, #296] @ 16-byte Spill -; CHECK-NEXT: vldrw.u32 q3, [sp, #280] @ 16-byte Reload -; CHECK-NEXT: vadd.i32 q3, q3, q2 -; CHECK-NEXT: vstrw.32 q3, [sp, #280] @ 16-byte Spill -; CHECK-NEXT: vldrw.u32 q3, [sp, #264] @ 16-byte Reload -; CHECK-NEXT: vadd.i32 q3, q3, q2 -; CHECK-NEXT: vstrw.32 q3, [sp, #264] @ 16-byte Spill +; CHECK-NEXT: vmov r1, r3, d3 +; CHECK-NEXT: vldrw.u32 q1, 
[sp, #80] @ 16-byte Reload +; CHECK-NEXT: vadd.i32 q1, q1, r0 ; CHECK-NEXT: ldrb r1, [r1] ; CHECK-NEXT: vmov.8 q7[2], r1 ; CHECK-NEXT: ldrb r1, [r3] ; CHECK-NEXT: vmov.8 q7[3], r1 -; CHECK-NEXT: vmov r1, r3, d0 +; CHECK-NEXT: vmov r1, r3, d2 ; CHECK-NEXT: ldrb r1, [r1] ; CHECK-NEXT: vmov.8 q7[4], r1 ; CHECK-NEXT: ldrb r1, [r3] ; CHECK-NEXT: vmov.8 q7[5], r1 -; CHECK-NEXT: vmov r1, r3, d1 -; CHECK-NEXT: vadd.i32 q0, q5, r0 -; CHECK-NEXT: vadd.i32 q5, q5, q2 -; CHECK-NEXT: vstrw.32 q5, [sp, #200] @ 16-byte Spill -; CHECK-NEXT: vldrw.u32 q5, [sp, #120] @ 16-byte Reload -; CHECK-NEXT: vadd.i32 q5, q5, q2 +; CHECK-NEXT: vmov r1, r3, d3 +; CHECK-NEXT: vadd.i32 q1, q4, r0 +; CHECK-NEXT: vadd.i32 q4, q4, q0 ; CHECK-NEXT: ldrb r1, [r1] ; CHECK-NEXT: vmov.8 q7[6], r1 ; CHECK-NEXT: ldrb r1, [r3] ; CHECK-NEXT: vmov.8 q7[7], r1 -; CHECK-NEXT: vmov r1, r3, d0 +; CHECK-NEXT: vmov r1, r3, d2 ; CHECK-NEXT: ldrb r1, [r1] ; CHECK-NEXT: vmov.8 q7[8], r1 ; CHECK-NEXT: ldrb r1, [r3] ; CHECK-NEXT: vmov.8 q7[9], r1 -; CHECK-NEXT: vmov r1, r3, d1 -; CHECK-NEXT: vadd.i32 q0, q4, r0 -; CHECK-NEXT: vadd.i32 q4, q4, q2 -; CHECK-NEXT: vstrw.32 q4, [sp, #184] @ 16-byte Spill +; CHECK-NEXT: vmov r1, r3, d3 +; CHECK-NEXT: vldrw.u32 q1, [sp, #64] @ 16-byte Reload +; CHECK-NEXT: vadd.i32 q1, q1, r0 ; CHECK-NEXT: ldrb r1, [r1] ; CHECK-NEXT: vmov.8 q7[10], r1 ; CHECK-NEXT: ldrb r1, [r3] ; CHECK-NEXT: vmov.8 q7[11], r1 -; CHECK-NEXT: vmov r1, r3, d0 +; CHECK-NEXT: vmov r1, r3, d2 ; CHECK-NEXT: ldrb r1, [r1] ; CHECK-NEXT: vmov.8 q7[12], r1 ; CHECK-NEXT: ldrb r1, [r3] ; CHECK-NEXT: vmov.8 q7[13], r1 -; CHECK-NEXT: vmov r1, r3, d1 +; CHECK-NEXT: vmov r1, r3, d3 ; CHECK-NEXT: ldrb r1, [r1] ; CHECK-NEXT: vmov.8 q7[14], r1 ; CHECK-NEXT: ldrb r1, [r3] ; CHECK-NEXT: vmov.8 q7[15], r1 -; CHECK-NEXT: vadd.i8 q0, q6, q7 -; CHECK-NEXT: vldrw.u32 q7, [sp, #136] @ 16-byte Reload -; CHECK-NEXT: vstrb.8 q0, [r8], #16 -; CHECK-NEXT: vldrw.u32 q0, [sp, #168] @ 16-byte Reload -; CHECK-NEXT: vadd.i32 q7, q7, q2 -; CHECK-NEXT: vadd.i32 q0, q0, q2 +; CHECK-NEXT: vadd.i8 q1, q6, q7 +; CHECK-NEXT: vldrw.u32 q7, [sp, #96] @ 16-byte Reload +; CHECK-NEXT: vldrw.u32 q6, [sp, #16] @ 16-byte Reload +; CHECK-NEXT: vstrb.8 q1, [r8], #16 +; CHECK-NEXT: vadd.i32 q7, q7, q0 +; CHECK-NEXT: vldrw.u32 q1, [sp, #32] @ 16-byte Reload +; CHECK-NEXT: vstrw.32 q7, [sp, #96] @ 16-byte Spill +; CHECK-NEXT: vldrw.u32 q7, [sp, #80] @ 16-byte Reload +; CHECK-NEXT: vadd.i32 q6, q6, q0 +; CHECK-NEXT: vadd.i32 q1, q1, q0 +; CHECK-NEXT: vadd.i32 q7, q7, q0 +; CHECK-NEXT: vstrw.32 q7, [sp, #80] @ 16-byte Spill +; CHECK-NEXT: vldrw.u32 q7, [sp, #112] @ 16-byte Reload +; CHECK-NEXT: vadd.i32 q7, q7, q0 +; CHECK-NEXT: vstrw.32 q7, [sp, #112] @ 16-byte Spill +; CHECK-NEXT: vldrw.u32 q7, [sp, #128] @ 16-byte Reload +; CHECK-NEXT: vadd.i32 q7, q7, q0 +; CHECK-NEXT: vstrw.32 q7, [sp, #128] @ 16-byte Spill +; CHECK-NEXT: vldrw.u32 q7, [sp, #144] @ 16-byte Reload +; CHECK-NEXT: vadd.i32 q7, q7, q0 +; CHECK-NEXT: vstrw.32 q7, [sp, #144] @ 16-byte Spill ; CHECK-NEXT: bne.w .LBB13_3 ; CHECK-NEXT: @ %bb.4: @ %middle.block ; CHECK-NEXT: @ in Loop: Header=BB13_2 Depth=1 -; CHECK-NEXT: cmp r10, r2 +; CHECK-NEXT: cmp lr, r2 ; CHECK-NEXT: bne.w .LBB13_2 -; CHECK-NEXT: .LBB13_5: @ %for.cond.cleanup -; CHECK-NEXT: add sp, #312 +; CHECK-NEXT: @ %bb.5: +; CHECK-NEXT: add sp, #160 ; CHECK-NEXT: vpop {d8, d9, d10, d11, d12, d13, d14, d15} ; CHECK-NEXT: add sp, #4 -; CHECK-NEXT: pop.w {r4, r5, r6, r7, r8, r9, r10, r11, pc} +; CHECK-NEXT: pop.w {r4, r5, r6, r7, r8, r9, r10, r11, lr} +; CHECK-NEXT: bx lr 
; CHECK-NEXT: .p2align 4 ; CHECK-NEXT: @ %bb.6: ; CHECK-NEXT: .LCPI13_0: @@ -1232,102 +1217,95 @@ define arm_aapcs_vfpcc void @gather_inc_v16i8_simple(ptr noalias nocapture reado ; CHECK-NEXT: push.w {r4, r5, r6, r7, r8, r9, r10, r11, lr} ; CHECK-NEXT: .pad #4 ; CHECK-NEXT: sub sp, #4 -; CHECK-NEXT: .vsave {d8, d9, d10, d11, d12, d13, d14, d15} -; CHECK-NEXT: vpush {d8, d9, d10, d11, d12, d13, d14, d15} -; CHECK-NEXT: .pad #64 -; CHECK-NEXT: sub sp, #64 +; CHECK-NEXT: .vsave {d8, d9, d10, d11, d12, d13} +; CHECK-NEXT: vpush {d8, d9, d10, d11, d12, d13} +; CHECK-NEXT: .pad #16 +; CHECK-NEXT: sub sp, #16 ; CHECK-NEXT: cmp r2, #1 -; CHECK-NEXT: strd r1, r2, [sp, #56] @ 8-byte Folded Spill -; CHECK-NEXT: blt.w .LBB14_5 +; CHECK-NEXT: strd r1, r2, [sp, #8] @ 8-byte Folded Spill +; CHECK-NEXT: blt .LBB14_5 ; CHECK-NEXT: @ %bb.1: @ %vector.ph.preheader -; CHECK-NEXT: adr r5, .LCPI14_3 -; CHECK-NEXT: adr r7, .LCPI14_1 -; CHECK-NEXT: vldrw.u32 q0, [r5] -; CHECK-NEXT: ldr r1, [sp, #60] @ 4-byte Reload -; CHECK-NEXT: adr r3, .LCPI14_0 -; CHECK-NEXT: adr r6, .LCPI14_2 -; CHECK-NEXT: vstrw.32 q0, [sp, #32] @ 16-byte Spill -; CHECK-NEXT: vldrw.u32 q0, [r7] -; CHECK-NEXT: bic r9, r1, #7 -; CHECK-NEXT: vldrw.u32 q3, [r3] -; CHECK-NEXT: vstrw.32 q0, [sp, #16] @ 16-byte Spill -; CHECK-NEXT: vldrw.u32 q0, [r6] -; CHECK-NEXT: mov.w lr, #16 -; CHECK-NEXT: str.w r9, [sp, #52] @ 4-byte Spill -; CHECK-NEXT: vstrw.32 q0, [sp] @ 16-byte Spill +; CHECK-NEXT: ldr r1, [sp, #12] @ 4-byte Reload +; CHECK-NEXT: mov.w r11, #16 +; CHECK-NEXT: bic r3, r1, #7 +; CHECK-NEXT: str r3, [sp, #4] @ 4-byte Spill ; CHECK-NEXT: .LBB14_2: @ %vector.ph ; CHECK-NEXT: @ =>This Loop Header: Depth=1 ; CHECK-NEXT: @ Child Loop BB14_3 Depth 2 -; CHECK-NEXT: ldr.w r8, [sp, #56] @ 4-byte Reload -; CHECK-NEXT: vldrw.u32 q5, [sp] @ 16-byte Reload -; CHECK-NEXT: vldrw.u32 q0, [sp, #16] @ 16-byte Reload -; CHECK-NEXT: vldrw.u32 q7, [sp, #32] @ 16-byte Reload -; CHECK-NEXT: vmov q4, q3 +; CHECK-NEXT: adr r1, .LCPI14_3 +; CHECK-NEXT: ldr r7, [sp, #8] @ 4-byte Reload +; CHECK-NEXT: vldrw.u32 q5, [r1] +; CHECK-NEXT: adr r1, .LCPI14_1 +; CHECK-NEXT: vldrw.u32 q4, [r1] +; CHECK-NEXT: adr r1, .LCPI14_2 +; CHECK-NEXT: vldrw.u32 q0, [r1] +; CHECK-NEXT: adr r1, .LCPI14_0 +; CHECK-NEXT: vldrw.u32 q1, [r1] ; CHECK-NEXT: .LBB14_3: @ %vector.body ; CHECK-NEXT: @ Parent Loop BB14_2 Depth=1 ; CHECK-NEXT: @ => This Inner Loop Header: Depth=2 -; CHECK-NEXT: vadd.i32 q1, q5, r0 -; CHECK-NEXT: vadd.i32 q2, q4, r0 -; CHECK-NEXT: vmov r7, r3, d3 -; CHECK-NEXT: vadd.i32 q6, q0, lr -; CHECK-NEXT: vmov r5, r6, d5 -; CHECK-NEXT: subs.w r9, r9, #16 -; CHECK-NEXT: vmov r4, r10, d2 -; CHECK-NEXT: vadd.i32 q1, q7, lr -; CHECK-NEXT: vadd.i32 q4, q4, lr -; CHECK-NEXT: vadd.i32 q5, q5, lr -; CHECK-NEXT: ldrb.w r11, [r3] -; CHECK-NEXT: ldrb r3, [r7] -; CHECK-NEXT: vmov r7, r12, d4 -; CHECK-NEXT: vadd.i32 q2, q7, r0 -; CHECK-NEXT: vadd.i32 q7, q0, r0 +; CHECK-NEXT: vadd.i32 q6, q1, r0 +; CHECK-NEXT: vadd.i32 q2, q0, r0 +; CHECK-NEXT: vmov r4, r5, d13 +; CHECK-NEXT: vadd.i32 q3, q5, r11 +; CHECK-NEXT: vmov lr, r8, d4 +; CHECK-NEXT: subs r3, #16 +; CHECK-NEXT: vmov r6, r12, d5 +; CHECK-NEXT: vadd.i32 q2, q4, r11 +; CHECK-NEXT: vadd.i32 q1, q1, r11 +; CHECK-NEXT: vadd.i32 q0, q0, r11 +; CHECK-NEXT: ldrb.w r10, [r5] +; CHECK-NEXT: vmov r2, r5, d12 +; CHECK-NEXT: vadd.i32 q6, q5, r0 +; CHECK-NEXT: vadd.i32 q5, q4, r0 +; CHECK-NEXT: ldrb.w r1, [r8] +; CHECK-NEXT: ldrb.w r9, [r4] +; CHECK-NEXT: ldrb r4, [r6] +; CHECK-NEXT: ldrb.w r6, [lr] +; CHECK-NEXT: ldrb.w r12, [r12] +; CHECK-NEXT: 
ldrb r2, [r2] ; CHECK-NEXT: ldrb r5, [r5] -; CHECK-NEXT: ldrb r6, [r6] +; CHECK-NEXT: vmov.8 q4[0], r2 +; CHECK-NEXT: vmov.8 q4[1], r5 +; CHECK-NEXT: vmov r8, r5, d11 +; CHECK-NEXT: vmov.8 q4[2], r9 +; CHECK-NEXT: vmov.8 q4[3], r10 +; CHECK-NEXT: vmov.8 q4[4], r6 +; CHECK-NEXT: vmov.8 q4[5], r1 +; CHECK-NEXT: vmov.8 q4[6], r4 +; CHECK-NEXT: vmov r4, r6, d10 +; CHECK-NEXT: vmov.8 q4[7], r12 +; CHECK-NEXT: vmov q5, q3 +; CHECK-NEXT: ldrb.w lr, [r5] +; CHECK-NEXT: vmov r5, r2, d13 ; CHECK-NEXT: ldrb r4, [r4] -; CHECK-NEXT: ldrb.w r10, [r10] -; CHECK-NEXT: ldrb r7, [r7] -; CHECK-NEXT: ldrb.w r1, [r12] -; CHECK-NEXT: vmov.8 q0[0], r7 -; CHECK-NEXT: vmov.8 q0[1], r1 -; CHECK-NEXT: vmov r1, r7, d15 -; CHECK-NEXT: vmov.8 q0[2], r5 -; CHECK-NEXT: vmov.8 q0[3], r6 -; CHECK-NEXT: vmov.8 q0[4], r4 -; CHECK-NEXT: vmov r4, r2, d4 -; CHECK-NEXT: vmov.8 q0[5], r10 -; CHECK-NEXT: vmov.8 q0[6], r3 -; CHECK-NEXT: vmov.8 q0[7], r11 -; CHECK-NEXT: ldrb r6, [r7] -; CHECK-NEXT: vmov r5, r7, d5 +; CHECK-NEXT: ldrb r6, [r6] +; CHECK-NEXT: vmov.8 q4[8], r4 +; CHECK-NEXT: vmov.8 q4[9], r6 +; CHECK-NEXT: ldrb.w r9, [r2] +; CHECK-NEXT: vmov r1, r2, d12 +; CHECK-NEXT: ldrb r5, [r5] +; CHECK-NEXT: ldrb.w r10, [r2] +; CHECK-NEXT: ldrb.w r2, [r8] ; CHECK-NEXT: ldrb r1, [r1] -; CHECK-NEXT: ldrb r2, [r2] -; CHECK-NEXT: ldrb r3, [r5] -; CHECK-NEXT: ldrb.w r12, [r7] -; CHECK-NEXT: ldrb r5, [r4] -; CHECK-NEXT: vmov r4, r7, d14 -; CHECK-NEXT: vmov q7, q1 -; CHECK-NEXT: ldrb r4, [r4] -; CHECK-NEXT: ldrb r7, [r7] -; CHECK-NEXT: vmov.8 q0[8], r4 -; CHECK-NEXT: vmov.8 q0[9], r7 -; CHECK-NEXT: vmov.8 q0[10], r1 -; CHECK-NEXT: vmov.8 q0[11], r6 -; CHECK-NEXT: vmov.8 q0[12], r5 -; CHECK-NEXT: vmov.8 q0[13], r2 -; CHECK-NEXT: vmov.8 q0[14], r3 -; CHECK-NEXT: vmov.8 q0[15], r12 -; CHECK-NEXT: vstrb.8 q0, [r8], #16 -; CHECK-NEXT: vmov q0, q6 +; CHECK-NEXT: vmov.8 q4[10], r2 +; CHECK-NEXT: vmov.8 q4[11], lr +; CHECK-NEXT: vmov.8 q4[12], r1 +; CHECK-NEXT: vmov.8 q4[13], r10 +; CHECK-NEXT: vmov.8 q4[14], r5 +; CHECK-NEXT: vmov.8 q4[15], r9 +; CHECK-NEXT: vstrb.8 q4, [r7], #16 +; CHECK-NEXT: vmov q4, q2 ; CHECK-NEXT: bne .LBB14_3 ; CHECK-NEXT: @ %bb.4: @ %middle.block ; CHECK-NEXT: @ in Loop: Header=BB14_2 Depth=1 -; CHECK-NEXT: ldr r1, [sp, #60] @ 4-byte Reload -; CHECK-NEXT: ldr.w r9, [sp, #52] @ 4-byte Reload -; CHECK-NEXT: cmp r9, r1 +; CHECK-NEXT: ldr r1, [sp, #12] @ 4-byte Reload +; CHECK-NEXT: ldr r3, [sp, #4] @ 4-byte Reload +; CHECK-NEXT: cmp r3, r1 ; CHECK-NEXT: bne .LBB14_2 ; CHECK-NEXT: .LBB14_5: @ %for.cond.cleanup -; CHECK-NEXT: add sp, #64 -; CHECK-NEXT: vpop {d8, d9, d10, d11, d12, d13, d14, d15} +; CHECK-NEXT: add sp, #16 +; CHECK-NEXT: vpop {d8, d9, d10, d11, d12, d13} ; CHECK-NEXT: add sp, #4 ; CHECK-NEXT: pop.w {r4, r5, r6, r7, r8, r9, r10, r11, pc} ; CHECK-NEXT: .p2align 4 diff --git a/llvm/test/CodeGen/Thumb2/mve-gather-scatter-optimisation.ll b/llvm/test/CodeGen/Thumb2/mve-gather-scatter-optimisation.ll index 7b8b884576d13..82ec62ec9f7a1 100644 --- a/llvm/test/CodeGen/Thumb2/mve-gather-scatter-optimisation.ll +++ b/llvm/test/CodeGen/Thumb2/mve-gather-scatter-optimisation.ll @@ -602,60 +602,57 @@ define dso_local void @arm_mat_mult_q15(ptr noalias nocapture readonly %A, ptr n ; CHECK-NEXT: push.w {r4, r5, r6, r7, r8, r9, r10, r11, lr} ; CHECK-NEXT: .pad #4 ; CHECK-NEXT: sub sp, #4 -; CHECK-NEXT: .vsave {d8, d9, d10, d11, d12, d13, d14, d15} -; CHECK-NEXT: vpush {d8, d9, d10, d11, d12, d13, d14, d15} -; CHECK-NEXT: .pad #32 -; CHECK-NEXT: sub sp, #32 -; CHECK-NEXT: strd r0, r2, [sp, #24] @ 8-byte Folded Spill +; 
CHECK-NEXT: .vsave {d8, d9, d10, d11, d12, d13} +; CHECK-NEXT: vpush {d8, d9, d10, d11, d12, d13} +; CHECK-NEXT: .pad #24 +; CHECK-NEXT: sub sp, #24 +; CHECK-NEXT: str r0, [sp, #20] @ 4-byte Spill ; CHECK-NEXT: cmp r3, #0 -; CHECK-NEXT: str r3, [sp, #8] @ 4-byte Spill +; CHECK-NEXT: str r3, [sp, #4] @ 4-byte Spill ; CHECK-NEXT: itt ne -; CHECK-NEXT: ldrne r0, [sp, #136] +; CHECK-NEXT: ldrne r0, [sp, #112] ; CHECK-NEXT: cmpne r0, #0 ; CHECK-NEXT: bne .LBB10_2 ; CHECK-NEXT: .LBB10_1: @ %for.cond.cleanup -; CHECK-NEXT: add sp, #32 -; CHECK-NEXT: vpop {d8, d9, d10, d11, d12, d13, d14, d15} +; CHECK-NEXT: add sp, #24 +; CHECK-NEXT: vpop {d8, d9, d10, d11, d12, d13} ; CHECK-NEXT: add sp, #4 ; CHECK-NEXT: pop.w {r4, r5, r6, r7, r8, r9, r10, r11, pc} ; CHECK-NEXT: .LBB10_2: @ %for.cond1.preheader.us.preheader -; CHECK-NEXT: ldr.w r12, [sp, #140] +; CHECK-NEXT: ldr.w r12, [sp, #116] ; CHECK-NEXT: movs r7, #1 -; CHECK-NEXT: mov.w r11, #0 ; CHECK-NEXT: vmov.i32 q0, #0x0 -; CHECK-NEXT: bic r2, r12, #3 -; CHECK-NEXT: subs r3, r2, #4 -; CHECK-NEXT: add.w r0, r7, r3, lsr #2 -; CHECK-NEXT: ldr r7, [sp, #136] -; CHECK-NEXT: adr r3, .LCPI10_0 -; CHECK-NEXT: str r0, [sp, #16] @ 4-byte Spill -; CHECK-NEXT: lsl.w r0, r12, #1 -; CHECK-NEXT: vdup.32 q1, r7 -; CHECK-NEXT: vldrw.u32 q2, [r3] -; CHECK-NEXT: str r0, [sp, #4] @ 4-byte Spill -; CHECK-NEXT: ldr r0, [sp, #24] @ 4-byte Reload -; CHECK-NEXT: lsls r6, r7, #1 -; CHECK-NEXT: vshl.i32 q3, q1, #2 -; CHECK-NEXT: movs r3, #0 -; CHECK-NEXT: str r0, [sp, #20] @ 4-byte Spill +; CHECK-NEXT: movs r5, #0 +; CHECK-NEXT: bic r0, r12, #3 +; CHECK-NEXT: subs r3, r0, #4 +; CHECK-NEXT: add.w r3, r7, r3, lsr #2 +; CHECK-NEXT: str r3, [sp, #12] @ 4-byte Spill +; CHECK-NEXT: ldr r3, [sp, #112] +; CHECK-NEXT: lsl.w r7, r12, #1 +; CHECK-NEXT: str r7, [sp] @ 4-byte Spill +; CHECK-NEXT: movs r7, #0 +; CHECK-NEXT: vdup.32 q1, r3 +; CHECK-NEXT: lsls r6, r3, #1 +; CHECK-NEXT: vshl.i32 q2, q1, #2 +; CHECK-NEXT: ldr r3, [sp, #20] @ 4-byte Reload +; CHECK-NEXT: str r3, [sp, #16] @ 4-byte Spill ; CHECK-NEXT: b .LBB10_5 ; CHECK-NEXT: .LBB10_3: @ %for.cond5.preheader.us73.preheader ; CHECK-NEXT: @ in Loop: Header=BB10_5 Depth=1 -; CHECK-NEXT: ldr r0, [sp, #28] @ 4-byte Reload -; CHECK-NEXT: add.w r3, r0, r5, lsl #1 +; CHECK-NEXT: add.w r3, r2, r8, lsl #1 ; CHECK-NEXT: wlstp.8 lr, r6, .LBB10_4 ; CHECK-NEXT: b .LBB10_15 ; CHECK-NEXT: .LBB10_4: @ %for.cond1.for.cond.cleanup3_crit_edge.us ; CHECK-NEXT: @ in Loop: Header=BB10_5 Depth=1 -; CHECK-NEXT: ldr r0, [sp, #4] @ 4-byte Reload -; CHECK-NEXT: add r11, r12 -; CHECK-NEXT: ldr r3, [sp, #20] @ 4-byte Reload -; CHECK-NEXT: add r3, r0 -; CHECK-NEXT: str r3, [sp, #20] @ 4-byte Spill -; CHECK-NEXT: ldr r3, [sp, #12] @ 4-byte Reload -; CHECK-NEXT: ldr r0, [sp, #8] @ 4-byte Reload -; CHECK-NEXT: adds r3, #1 -; CHECK-NEXT: cmp r3, r0 +; CHECK-NEXT: ldr r3, [sp] @ 4-byte Reload +; CHECK-NEXT: add r7, r12 +; CHECK-NEXT: ldr r5, [sp, #16] @ 4-byte Reload +; CHECK-NEXT: add r5, r3 +; CHECK-NEXT: str r5, [sp, #16] @ 4-byte Spill +; CHECK-NEXT: ldr r5, [sp, #8] @ 4-byte Reload +; CHECK-NEXT: ldr r3, [sp, #4] @ 4-byte Reload +; CHECK-NEXT: adds r5, #1 +; CHECK-NEXT: cmp r5, r3 ; CHECK-NEXT: beq .LBB10_1 ; CHECK-NEXT: .LBB10_5: @ %for.cond1.preheader.us ; CHECK-NEXT: @ =>This Loop Header: Depth=1 @@ -663,21 +660,22 @@ define dso_local void @arm_mat_mult_q15(ptr noalias nocapture readonly %A, ptr n ; CHECK-NEXT: @ Child Loop BB10_11 Depth 3 ; CHECK-NEXT: @ Child Loop BB10_14 Depth 3 ; CHECK-NEXT: @ Child Loop BB10_15 Depth 2 -; CHECK-NEXT: mul r5, r3, r7 +; 
CHECK-NEXT: ldr r3, [sp, #112] ; CHECK-NEXT: cmp.w r12, #0 -; CHECK-NEXT: str r3, [sp, #12] @ 4-byte Spill +; CHECK-NEXT: str r5, [sp, #8] @ 4-byte Spill +; CHECK-NEXT: mul r8, r5, r3 ; CHECK-NEXT: beq .LBB10_3 ; CHECK-NEXT: @ %bb.6: @ %for.cond5.preheader.us.us.preheader ; CHECK-NEXT: @ in Loop: Header=BB10_5 Depth=1 -; CHECK-NEXT: mov.w r8, #0 +; CHECK-NEXT: mov.w r9, #0 ; CHECK-NEXT: b .LBB10_8 ; CHECK-NEXT: .LBB10_7: @ %for.cond5.for.cond.cleanup7_crit_edge.us.us ; CHECK-NEXT: @ in Loop: Header=BB10_8 Depth=2 -; CHECK-NEXT: ldr r3, [sp, #28] @ 4-byte Reload -; CHECK-NEXT: add.w r0, r8, r5 -; CHECK-NEXT: add.w r8, r8, #1 -; CHECK-NEXT: cmp r8, r7 -; CHECK-NEXT: strh.w r10, [r3, r0, lsl #1] +; CHECK-NEXT: add.w r3, r9, r8 +; CHECK-NEXT: add.w r9, r9, #1 +; CHECK-NEXT: strh.w r10, [r2, r3, lsl #1] +; CHECK-NEXT: ldr r3, [sp, #112] +; CHECK-NEXT: cmp r9, r3 ; CHECK-NEXT: beq .LBB10_4 ; CHECK-NEXT: .LBB10_8: @ %for.cond5.preheader.us.us ; CHECK-NEXT: @ Parent Loop BB10_5 Depth=1 @@ -692,46 +690,48 @@ define dso_local void @arm_mat_mult_q15(ptr noalias nocapture readonly %A, ptr n ; CHECK-NEXT: b .LBB10_13 ; CHECK-NEXT: .LBB10_10: @ %vector.ph ; CHECK-NEXT: @ in Loop: Header=BB10_8 Depth=2 -; CHECK-NEXT: ldr r0, [sp, #16] @ 4-byte Reload -; CHECK-NEXT: vmov q5, q1 -; CHECK-NEXT: vmov.i32 q4, #0x0 -; CHECK-NEXT: vmlas.i32 q5, q2, r8 -; CHECK-NEXT: dls lr, r0 -; CHECK-NEXT: ldr r3, [sp, #20] @ 4-byte Reload +; CHECK-NEXT: adr r3, .LCPI10_0 +; CHECK-NEXT: vmov q4, q1 +; CHECK-NEXT: vldrw.u32 q5, [r3] +; CHECK-NEXT: ldr r3, [sp, #12] @ 4-byte Reload +; CHECK-NEXT: vmov.i32 q3, #0x0 +; CHECK-NEXT: dls lr, r3 +; CHECK-NEXT: vmlas.i32 q4, q5, r9 +; CHECK-NEXT: ldr r3, [sp, #16] @ 4-byte Reload ; CHECK-NEXT: .LBB10_11: @ %vector.body ; CHECK-NEXT: @ Parent Loop BB10_5 Depth=1 ; CHECK-NEXT: @ Parent Loop BB10_8 Depth=2 ; CHECK-NEXT: @ => This Inner Loop Header: Depth=3 -; CHECK-NEXT: vadd.i32 q6, q5, q3 -; CHECK-NEXT: vldrh.s32 q7, [r1, q5, uxtw #1] -; CHECK-NEXT: vldrh.s32 q5, [r3], #8 -; CHECK-NEXT: vmul.i32 q5, q7, q5 -; CHECK-NEXT: vadd.i32 q4, q5, q4 -; CHECK-NEXT: vmov q5, q6 +; CHECK-NEXT: vadd.i32 q5, q4, q2 +; CHECK-NEXT: vldrh.s32 q6, [r1, q4, uxtw #1] +; CHECK-NEXT: vldrh.s32 q4, [r3], #8 +; CHECK-NEXT: vmul.i32 q4, q6, q4 +; CHECK-NEXT: vadd.i32 q3, q4, q3 +; CHECK-NEXT: vmov q4, q5 ; CHECK-NEXT: le lr, .LBB10_11 ; CHECK-NEXT: @ %bb.12: @ %middle.block ; CHECK-NEXT: @ in Loop: Header=BB10_8 Depth=2 -; CHECK-NEXT: vaddv.u32 r10, q4 -; CHECK-NEXT: cmp r2, r12 -; CHECK-NEXT: mov r4, r2 +; CHECK-NEXT: vaddv.u32 r10, q3 +; CHECK-NEXT: cmp r0, r12 +; CHECK-NEXT: mov r4, r0 ; CHECK-NEXT: beq .LBB10_7 ; CHECK-NEXT: .LBB10_13: @ %for.body8.us.us.preheader ; CHECK-NEXT: @ in Loop: Header=BB10_8 Depth=2 -; CHECK-NEXT: mla r3, r7, r4, r8 -; CHECK-NEXT: add.w r0, r11, r4 -; CHECK-NEXT: ldr r7, [sp, #24] @ 4-byte Reload +; CHECK-NEXT: ldr r3, [sp, #112] ; CHECK-NEXT: sub.w lr, r12, r4 -; CHECK-NEXT: add.w r9, r7, r0, lsl #1 -; CHECK-NEXT: ldr r7, [sp, #136] -; CHECK-NEXT: add.w r3, r1, r3, lsl #1 +; CHECK-NEXT: ldr r5, [sp, #20] @ 4-byte Reload +; CHECK-NEXT: mla r3, r3, r4, r9 +; CHECK-NEXT: add.w r11, r1, r3, lsl #1 +; CHECK-NEXT: adds r3, r7, r4 +; CHECK-NEXT: add.w r3, r5, r3, lsl #1 ; CHECK-NEXT: .LBB10_14: @ %for.body8.us.us ; CHECK-NEXT: @ Parent Loop BB10_5 Depth=1 ; CHECK-NEXT: @ Parent Loop BB10_8 Depth=2 ; CHECK-NEXT: @ => This Inner Loop Header: Depth=3 -; CHECK-NEXT: ldrsh.w r4, [r3] -; CHECK-NEXT: add r3, r6 -; CHECK-NEXT: ldrsh r0, [r9], #2 -; CHECK-NEXT: smlabb r10, r4, r0, r10 +; 
CHECK-NEXT: ldrsh.w r5, [r11] +; CHECK-NEXT: add r11, r6 +; CHECK-NEXT: ldrsh r4, [r3], #2 +; CHECK-NEXT: smlabb r10, r5, r4, r10 ; CHECK-NEXT: le lr, .LBB10_14 ; CHECK-NEXT: b .LBB10_7 ; CHECK-NEXT: .LBB10_15: @ Parent Loop BB10_5 Depth=1 diff --git a/llvm/test/CodeGen/Thumb2/mve-pipelineloops.ll b/llvm/test/CodeGen/Thumb2/mve-pipelineloops.ll index 70957ca950d71..ba910d62362dd 100644 --- a/llvm/test/CodeGen/Thumb2/mve-pipelineloops.ll +++ b/llvm/test/CodeGen/Thumb2/mve-pipelineloops.ll @@ -17,51 +17,40 @@ define void @arm_cmplx_dot_prod_q15(ptr noundef %pSrcA, ptr noundef %pSrcB, i32 ; CHECK-NEXT: cmp r7, #2 ; CHECK-NEXT: csel r7, r6, r5, hs ; CHECK-NEXT: add.w lr, r7, #1 -; CHECK-NEXT: mov r4, r5 -; CHECK-NEXT: vldrh.u16 q0, [r0], #32 +; CHECK-NEXT: mov r6, r5 +; CHECK-NEXT: vldrh.u16 q1, [r0], #32 ; CHECK-NEXT: movs r7, #0 ; CHECK-NEXT: mov r8, r5 -; CHECK-NEXT: vldrh.u16 q1, [r1], #32 -; CHECK-NEXT: vmlsldava.s16 r4, r7, q0, q1 -; CHECK-NEXT: vldrh.u16 q2, [r0, #-16] -; CHECK-NEXT: vmlaldavax.s16 r8, r5, q0, q1 -; CHECK-NEXT: vldrh.u16 q3, [r1, #-16] -; CHECK-NEXT: vmlsldava.s16 r4, r7, q2, q3 ; CHECK-NEXT: vldrh.u16 q0, [r1], #32 -; CHECK-NEXT: sub.w lr, lr, #1 -; CHECK-NEXT: cmp.w lr, #0 -; CHECK-NEXT: vldrh.u16 q1, [r0], #32 -; CHECK-NEXT: beq .LBB0_3 ; CHECK-NEXT: .p2align 2 ; CHECK-NEXT: .LBB0_2: @ %while.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: vmlaldavax.s16 r8, r5, q2, q3 -; CHECK-NEXT: vldrh.u16 q3, [r1, #-16] -; CHECK-NEXT: vmlsldava.s16 r4, r7, q1, q0 +; CHECK-NEXT: vmlsldava.s16 r8, r7, q1, q0 ; CHECK-NEXT: vldrh.u16 q2, [r0, #-16] -; CHECK-NEXT: vmlaldavax.s16 r8, r5, q1, q0 -; CHECK-NEXT: vldrh.u16 q1, [r0], #32 -; CHECK-NEXT: vmlsldava.s16 r4, r7, q2, q3 +; CHECK-NEXT: vmlaldavax.s16 r6, r5, q1, q0 +; CHECK-NEXT: vldrh.u16 q1, [r1, #-16] +; CHECK-NEXT: vmlsldava.s16 r8, r7, q2, q1 ; CHECK-NEXT: vldrh.u16 q0, [r1], #32 +; CHECK-NEXT: vmlaldavax.s16 r6, r5, q2, q1 +; CHECK-NEXT: vldrh.u16 q1, [r0], #32 ; CHECK-NEXT: le lr, .LBB0_2 -; CHECK-NEXT: .LBB0_3: -; CHECK-NEXT: vmlaldavax.s16 r8, r5, q2, q3 -; CHECK-NEXT: movs r6, #14 -; CHECK-NEXT: and.w r2, r6, r2, lsl #1 -; CHECK-NEXT: vmlaldavax.s16 r8, r5, q1, q0 +; CHECK-NEXT: @ %bb.3: @ %do.body +; CHECK-NEXT: movs r4, #14 +; CHECK-NEXT: and.w r2, r4, r2, lsl #1 +; CHECK-NEXT: vmlaldavax.s16 r6, r5, q1, q0 ; CHECK-NEXT: vldrh.u16 q2, [r0, #-16] -; CHECK-NEXT: vmlsldava.s16 r4, r7, q1, q0 +; CHECK-NEXT: vmlsldava.s16 r8, r7, q1, q0 ; CHECK-NEXT: vldrh.u16 q0, [r1, #-16] -; CHECK-NEXT: vmlaldavax.s16 r8, r5, q2, q0 +; CHECK-NEXT: vmlaldavax.s16 r6, r5, q2, q0 ; CHECK-NEXT: vctp.16 r2 -; CHECK-NEXT: vmlsldava.s16 r4, r7, q2, q0 +; CHECK-NEXT: vmlsldava.s16 r8, r7, q2, q0 ; CHECK-NEXT: vpst ; CHECK-NEXT: vldrht.u16 q1, [r0] ; CHECK-NEXT: cmp r2, #9 ; CHECK-NEXT: vpsttt ; CHECK-NEXT: vldrht.u16 q0, [r1] -; CHECK-NEXT: vmlsldavat.s16 r4, r7, q1, q0 -; CHECK-NEXT: vmlaldavaxt.s16 r8, r5, q1, q0 +; CHECK-NEXT: vmlsldavat.s16 r8, r7, q1, q0 +; CHECK-NEXT: vmlaldavaxt.s16 r6, r5, q1, q0 ; CHECK-NEXT: blo .LBB0_10 ; CHECK-NEXT: @ %bb.4: @ %do.body.1 ; CHECK-NEXT: subs r2, #8 @@ -69,17 +58,17 @@ define void @arm_cmplx_dot_prod_q15(ptr noundef %pSrcA, ptr noundef %pSrcB, i32 ; CHECK-NEXT: vpstttt ; CHECK-NEXT: vldrht.u16 q0, [r0, #16] ; CHECK-NEXT: vldrht.u16 q1, [r1, #16] -; CHECK-NEXT: vmlsldavat.s16 r4, r7, q0, q1 -; CHECK-NEXT: vmlaldavaxt.s16 r8, r5, q0, q1 +; CHECK-NEXT: vmlsldavat.s16 r8, r7, q0, q1 +; CHECK-NEXT: vmlaldavaxt.s16 r6, r5, q0, q1 ; CHECK-NEXT: b .LBB0_10 ; CHECK-NEXT: .p2align 2 ; 
CHECK-NEXT: .LBB0_5: @ %if.else -; CHECK-NEXT: mov.w r4, #0 +; CHECK-NEXT: mov.w r8, #0 ; CHECK-NEXT: cbz r2, .LBB0_9 ; CHECK-NEXT: @ %bb.6: @ %while.body14.preheader ; CHECK-NEXT: lsls r6, r2, #1 -; CHECK-NEXT: mov r5, r4 -; CHECK-NEXT: mov r7, r4 +; CHECK-NEXT: mov r5, r8 +; CHECK-NEXT: mov r7, r8 ; CHECK-NEXT: movs r2, #0 ; CHECK-NEXT: dlstp.16 lr, r6 ; CHECK-NEXT: .p2align 2 @@ -88,22 +77,22 @@ define void @arm_cmplx_dot_prod_q15(ptr noundef %pSrcA, ptr noundef %pSrcB, i32 ; CHECK-NEXT: vldrh.u16 q0, [r0], #16 ; CHECK-NEXT: vldrh.u16 q1, [r1], #16 ; CHECK-NEXT: vmlsldava.s16 r2, r7, q0, q1 -; CHECK-NEXT: vmlaldavax.s16 r4, r5, q0, q1 +; CHECK-NEXT: vmlaldavax.s16 r8, r5, q0, q1 ; CHECK-NEXT: letp lr, .LBB0_7 ; CHECK-NEXT: @ %bb.8: @ %if.end.loopexit177 -; CHECK-NEXT: mov r8, r4 -; CHECK-NEXT: mov r4, r2 +; CHECK-NEXT: mov r6, r8 +; CHECK-NEXT: mov r8, r2 ; CHECK-NEXT: b .LBB0_10 ; CHECK-NEXT: .p2align 2 ; CHECK-NEXT: .LBB0_9: -; CHECK-NEXT: mov r7, r4 -; CHECK-NEXT: mov.w r8, #0 -; CHECK-NEXT: mov r5, r4 +; CHECK-NEXT: mov r7, r8 +; CHECK-NEXT: movs r6, #0 +; CHECK-NEXT: mov r5, r8 ; CHECK-NEXT: .LBB0_10: @ %if.end -; CHECK-NEXT: asrl r4, r7, #6 -; CHECK-NEXT: asrl r8, r5, #6 -; CHECK-NEXT: str r4, [r3] -; CHECK-NEXT: str.w r8, [r12] +; CHECK-NEXT: asrl r8, r7, #6 +; CHECK-NEXT: asrl r6, r5, #6 +; CHECK-NEXT: str.w r8, [r3] +; CHECK-NEXT: str.w r6, [r12] ; CHECK-NEXT: pop.w {r4, r5, r6, r7, r8, pc} entry: %cmp = icmp ugt i32 %numSamples, 15 diff --git a/llvm/test/CodeGen/X86/avx512-regcall-Mask.ll b/llvm/test/CodeGen/X86/avx512-regcall-Mask.ll index b3a0c7dffae11..85d302abfd1ae 100644 --- a/llvm/test/CodeGen/X86/avx512-regcall-Mask.ll +++ b/llvm/test/CodeGen/X86/avx512-regcall-Mask.ll @@ -138,6 +138,7 @@ define dso_local i64 @caller_argv64i1() #0 { ; WIN64-NEXT: movq %rax, %rcx ; WIN64-NEXT: movq %rax, %rdx ; WIN64-NEXT: movq %rax, %rdi +; WIN64-NEXT: movq %rax, %rsi ; WIN64-NEXT: movq %rax, %r8 ; WIN64-NEXT: movq %rax, %r9 ; WIN64-NEXT: movq %rax, %r10 @@ -145,7 +146,6 @@ define dso_local i64 @caller_argv64i1() #0 { ; WIN64-NEXT: movq %rax, %r12 ; WIN64-NEXT: movq %rax, %r14 ; WIN64-NEXT: movq %rax, %r15 -; WIN64-NEXT: movq %rax, %rsi ; WIN64-NEXT: callq test_argv64i1 ; WIN64-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload ; WIN64-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload @@ -178,13 +178,13 @@ define dso_local i64 @caller_argv64i1() #0 { ; LINUXOSX64-NEXT: movq %rax, %rcx ; LINUXOSX64-NEXT: movq %rax, %rdx ; LINUXOSX64-NEXT: movq %rax, %rdi +; LINUXOSX64-NEXT: movq %rax, %rsi ; LINUXOSX64-NEXT: movq %rax, %r8 ; LINUXOSX64-NEXT: movq %rax, %r9 ; LINUXOSX64-NEXT: movq %rax, %r12 ; LINUXOSX64-NEXT: movq %rax, %r13 ; LINUXOSX64-NEXT: movq %rax, %r14 ; LINUXOSX64-NEXT: movq %rax, %r15 -; LINUXOSX64-NEXT: movq %rax, %rsi ; LINUXOSX64-NEXT: pushq %rax ; LINUXOSX64-NEXT: .cfi_adjust_cfa_offset 8 ; LINUXOSX64-NEXT: pushq %rax diff --git a/llvm/test/CodeGen/X86/avx512-regcall-NoMask.ll b/llvm/test/CodeGen/X86/avx512-regcall-NoMask.ll index 2081d201704f3..3aed15caa4ea7 100644 --- a/llvm/test/CodeGen/X86/avx512-regcall-NoMask.ll +++ b/llvm/test/CodeGen/X86/avx512-regcall-NoMask.ll @@ -987,8 +987,6 @@ define dso_local x86_regcallcc i32 @testi32_inp(i32 %a1, i32 %a2, i32 %a3, i32 % ; WIN64: # %bb.0: ; WIN64-NEXT: pushq %rbp ; WIN64-NEXT: pushq %rbx -; WIN64-NEXT: # kill: def $edx killed $edx def $rdx -; WIN64-NEXT: # kill: def $esi killed $esi def $rsi ; WIN64-NEXT: # kill: def $r15d killed $r15d def $r15 ; WIN64-NEXT: # kill: def $r14d killed $r14d def $r14 ; 
WIN64-NEXT: # kill: def $r12d killed $r12d def $r12 @@ -996,7 +994,9 @@ define dso_local x86_regcallcc i32 @testi32_inp(i32 %a1, i32 %a2, i32 %a3, i32 % ; WIN64-NEXT: # kill: def $r10d killed $r10d def $r10 ; WIN64-NEXT: # kill: def $r9d killed $r9d def $r9 ; WIN64-NEXT: # kill: def $r8d killed $r8d def $r8 +; WIN64-NEXT: # kill: def $esi killed $esi def $rsi ; WIN64-NEXT: # kill: def $edi killed $edi def $rdi +; WIN64-NEXT: # kill: def $edx killed $edx def $rdx ; WIN64-NEXT: leal (%rdx,%rdi), %ebx ; WIN64-NEXT: movl %edx, %ebp ; WIN64-NEXT: subl %edi, %ebp @@ -1032,14 +1032,14 @@ define dso_local x86_regcallcc i32 @testi32_inp(i32 %a1, i32 %a2, i32 %a3, i32 % ; ; LINUXOSX64-LABEL: testi32_inp: ; LINUXOSX64: # %bb.0: -; LINUXOSX64-NEXT: # kill: def $edx killed $edx def $rdx -; LINUXOSX64-NEXT: # kill: def $esi killed $esi def $rsi ; LINUXOSX64-NEXT: # kill: def $r14d killed $r14d def $r14 ; LINUXOSX64-NEXT: # kill: def $r13d killed $r13d def $r13 ; LINUXOSX64-NEXT: # kill: def $r12d killed $r12d def $r12 ; LINUXOSX64-NEXT: # kill: def $r9d killed $r9d def $r9 ; LINUXOSX64-NEXT: # kill: def $r8d killed $r8d def $r8 +; LINUXOSX64-NEXT: # kill: def $esi killed $esi def $rsi ; LINUXOSX64-NEXT: # kill: def $edi killed $edi def $rdi +; LINUXOSX64-NEXT: # kill: def $edx killed $edx def $rdx ; LINUXOSX64-NEXT: leal (%rdx,%rdi), %r10d ; LINUXOSX64-NEXT: movl %edx, %r11d ; LINUXOSX64-NEXT: subl %edi, %r11d diff --git a/llvm/test/CodeGen/X86/sse-regcall.ll b/llvm/test/CodeGen/X86/sse-regcall.ll index 6f0293392eef2..e014c9f895383 100644 --- a/llvm/test/CodeGen/X86/sse-regcall.ll +++ b/llvm/test/CodeGen/X86/sse-regcall.ll @@ -244,8 +244,6 @@ define x86_regcallcc i32 @testi32_inp(i32 %a1, i32 %a2, i32 %a3, i32 %a4, i32 %a ; WIN64: # %bb.0: ; WIN64-NEXT: pushq %rbp ; WIN64-NEXT: pushq %rbx -; WIN64-NEXT: # kill: def $edx killed $edx def $rdx -; WIN64-NEXT: # kill: def $esi killed $esi def $rsi ; WIN64-NEXT: # kill: def $r15d killed $r15d def $r15 ; WIN64-NEXT: # kill: def $r14d killed $r14d def $r14 ; WIN64-NEXT: # kill: def $r12d killed $r12d def $r12 @@ -253,7 +251,9 @@ define x86_regcallcc i32 @testi32_inp(i32 %a1, i32 %a2, i32 %a3, i32 %a4, i32 %a ; WIN64-NEXT: # kill: def $r10d killed $r10d def $r10 ; WIN64-NEXT: # kill: def $r9d killed $r9d def $r9 ; WIN64-NEXT: # kill: def $r8d killed $r8d def $r8 +; WIN64-NEXT: # kill: def $esi killed $esi def $rsi ; WIN64-NEXT: # kill: def $edi killed $edi def $rdi +; WIN64-NEXT: # kill: def $edx killed $edx def $rdx ; WIN64-NEXT: leal (%rdx,%rdi), %ebx ; WIN64-NEXT: movl %edx, %ebp ; WIN64-NEXT: subl %edi, %ebp @@ -289,14 +289,14 @@ define x86_regcallcc i32 @testi32_inp(i32 %a1, i32 %a2, i32 %a3, i32 %a4, i32 %a ; ; LINUXOSX-LABEL: testi32_inp: ; LINUXOSX: # %bb.0: -; LINUXOSX-NEXT: # kill: def $edx killed $edx def $rdx -; LINUXOSX-NEXT: # kill: def $esi killed $esi def $rsi ; LINUXOSX-NEXT: # kill: def $r14d killed $r14d def $r14 ; LINUXOSX-NEXT: # kill: def $r13d killed $r13d def $r13 ; LINUXOSX-NEXT: # kill: def $r12d killed $r12d def $r12 ; LINUXOSX-NEXT: # kill: def $r9d killed $r9d def $r9 ; LINUXOSX-NEXT: # kill: def $r8d killed $r8d def $r8 +; LINUXOSX-NEXT: # kill: def $esi killed $esi def $rsi ; LINUXOSX-NEXT: # kill: def $edi killed $edi def $rdi +; LINUXOSX-NEXT: # kill: def $edx killed $edx def $rdx ; LINUXOSX-NEXT: leal (%rdx,%rdi), %r10d ; LINUXOSX-NEXT: movl %edx, %r11d ; LINUXOSX-NEXT: subl %edi, %r11d diff --git a/llvm/test/CodeGen/X86/sse-regcall4.ll b/llvm/test/CodeGen/X86/sse-regcall4.ll index c8df7a233d7e3..f66f9d9d44942 100644 --- 
a/llvm/test/CodeGen/X86/sse-regcall4.ll +++ b/llvm/test/CodeGen/X86/sse-regcall4.ll @@ -244,14 +244,14 @@ define x86_regcallcc i32 @testi32_inp(i32 %a1, i32 %a2, i32 %a3, i32 %a4, i32 %a ; WIN64: # %bb.0: ; WIN64-NEXT: pushq %rbp ; WIN64-NEXT: pushq %rbx -; WIN64-NEXT: # kill: def $edx killed $edx def $rdx -; WIN64-NEXT: # kill: def $esi killed $esi def $rsi ; WIN64-NEXT: # kill: def $r14d killed $r14d def $r14 ; WIN64-NEXT: # kill: def $r12d killed $r12d def $r12 ; WIN64-NEXT: # kill: def $r11d killed $r11d def $r11 ; WIN64-NEXT: # kill: def $r9d killed $r9d def $r9 ; WIN64-NEXT: # kill: def $r8d killed $r8d def $r8 +; WIN64-NEXT: # kill: def $esi killed $esi def $rsi ; WIN64-NEXT: # kill: def $edi killed $edi def $rdi +; WIN64-NEXT: # kill: def $edx killed $edx def $rdx ; WIN64-NEXT: leal (%rdx,%rdi), %ebx ; WIN64-NEXT: movl %edx, %ebp ; WIN64-NEXT: subl %edi, %ebp @@ -288,14 +288,14 @@ define x86_regcallcc i32 @testi32_inp(i32 %a1, i32 %a2, i32 %a3, i32 %a4, i32 %a ; ; LINUXOSX-LABEL: testi32_inp: ; LINUXOSX: # %bb.0: -; LINUXOSX-NEXT: # kill: def $edx killed $edx def $rdx -; LINUXOSX-NEXT: # kill: def $esi killed $esi def $rsi ; LINUXOSX-NEXT: # kill: def $r14d killed $r14d def $r14 ; LINUXOSX-NEXT: # kill: def $r13d killed $r13d def $r13 ; LINUXOSX-NEXT: # kill: def $r12d killed $r12d def $r12 ; LINUXOSX-NEXT: # kill: def $r9d killed $r9d def $r9 ; LINUXOSX-NEXT: # kill: def $r8d killed $r8d def $r8 +; LINUXOSX-NEXT: # kill: def $esi killed $esi def $rsi ; LINUXOSX-NEXT: # kill: def $edi killed $edi def $rdi +; LINUXOSX-NEXT: # kill: def $edx killed $edx def $rdx ; LINUXOSX-NEXT: leal (%rdx,%rdi), %r10d ; LINUXOSX-NEXT: movl %edx, %r11d ; LINUXOSX-NEXT: subl %edi, %r11d diff --git a/llvm/test/CodeGen/X86/subvectorwise-store-of-vector-splat.ll b/llvm/test/CodeGen/X86/subvectorwise-store-of-vector-splat.ll index f1fd05565c47e..20d3f20c1a149 100644 --- a/llvm/test/CodeGen/X86/subvectorwise-store-of-vector-splat.ll +++ b/llvm/test/CodeGen/X86/subvectorwise-store-of-vector-splat.ll @@ -389,7 +389,7 @@ define void @vec128_v2i32(ptr %in.subvec.ptr, ptr %out.subvec.ptr, ptr %out.vec. ; AVX512-LABEL: vec128_v2i32: ; AVX512: # %bb.0: ; AVX512-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero -; AVX512-NEXT: vpternlogq $15, %xmm0, %xmm0, %xmm0 +; AVX512-NEXT: vpternlogq {{.*#+}} xmm0 = ~xmm0 ; AVX512-NEXT: vmovq %xmm0, (%rsi) ; AVX512-NEXT: vpbroadcastq %xmm0, %xmm0 ; AVX512-NEXT: vmovdqa %xmm0, (%rdx) @@ -452,7 +452,7 @@ define void @vec128_v2f32(ptr %in.subvec.ptr, ptr %out.subvec.ptr, ptr %out.vec. ; AVX512-LABEL: vec128_v2f32: ; AVX512: # %bb.0: ; AVX512-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero -; AVX512-NEXT: vpternlogq $15, %xmm0, %xmm0, %xmm0 +; AVX512-NEXT: vpternlogq {{.*#+}} xmm0 = ~xmm0 ; AVX512-NEXT: vmovq %xmm0, (%rsi) ; AVX512-NEXT: vpbroadcastq %xmm0, %xmm0 ; AVX512-NEXT: vmovdqa %xmm0, (%rdx) @@ -599,7 +599,7 @@ define void @vec128_v4i16(ptr %in.subvec.ptr, ptr %out.subvec.ptr, ptr %out.vec. 
; AVX512-LABEL: vec128_v4i16: ; AVX512: # %bb.0: ; AVX512-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero -; AVX512-NEXT: vpternlogq $15, %xmm0, %xmm0, %xmm0 +; AVX512-NEXT: vpternlogq {{.*#+}} xmm0 = ~xmm0 ; AVX512-NEXT: vmovq %xmm0, (%rsi) ; AVX512-NEXT: vpbroadcastq %xmm0, %xmm0 ; AVX512-NEXT: vmovdqa %xmm0, (%rdx) @@ -694,7 +694,7 @@ define void @vec128_v8i8(ptr %in.subvec.ptr, ptr %out.subvec.ptr, ptr %out.vec.p ; AVX512-LABEL: vec128_v8i8: ; AVX512: # %bb.0: ; AVX512-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero -; AVX512-NEXT: vpternlogq $15, %xmm0, %xmm0, %xmm0 +; AVX512-NEXT: vpternlogq {{.*#+}} xmm0 = ~xmm0 ; AVX512-NEXT: vmovq %xmm0, (%rsi) ; AVX512-NEXT: vpbroadcastq %xmm0, %xmm0 ; AVX512-NEXT: vmovdqa %xmm0, (%rdx) @@ -1003,7 +1003,7 @@ define void @vec256_v2i32(ptr %in.subvec.ptr, ptr %out.subvec.ptr, ptr %out.vec. ; AVX512-LABEL: vec256_v2i32: ; AVX512: # %bb.0: ; AVX512-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero -; AVX512-NEXT: vpternlogq $15, %xmm0, %xmm0, %xmm0 +; AVX512-NEXT: vpternlogq {{.*#+}} xmm0 = ~xmm0 ; AVX512-NEXT: vmovq %xmm0, (%rsi) ; AVX512-NEXT: vpbroadcastq %xmm0, %ymm0 ; AVX512-NEXT: vmovdqa %ymm0, (%rdx) @@ -1079,7 +1079,7 @@ define void @vec256_v2f32(ptr %in.subvec.ptr, ptr %out.subvec.ptr, ptr %out.vec. ; AVX512-LABEL: vec256_v2f32: ; AVX512: # %bb.0: ; AVX512-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero -; AVX512-NEXT: vpternlogq $15, %xmm0, %xmm0, %xmm0 +; AVX512-NEXT: vpternlogq {{.*#+}} xmm0 = ~xmm0 ; AVX512-NEXT: vmovq %xmm0, (%rsi) ; AVX512-NEXT: vpbroadcastq %xmm0, %ymm0 ; AVX512-NEXT: vmovdqa %ymm0, (%rdx) @@ -1355,7 +1355,7 @@ define void @vec256_v4i16(ptr %in.subvec.ptr, ptr %out.subvec.ptr, ptr %out.vec. ; AVX512-LABEL: vec256_v4i16: ; AVX512: # %bb.0: ; AVX512-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero -; AVX512-NEXT: vpternlogq $15, %xmm0, %xmm0, %xmm0 +; AVX512-NEXT: vpternlogq {{.*#+}} xmm0 = ~xmm0 ; AVX512-NEXT: vmovq %xmm0, (%rsi) ; AVX512-NEXT: vpbroadcastq %xmm0, %ymm0 ; AVX512-NEXT: vmovdqa %ymm0, (%rdx) @@ -1550,7 +1550,7 @@ define void @vec256_v8i8(ptr %in.subvec.ptr, ptr %out.subvec.ptr, ptr %out.vec.p ; AVX512-LABEL: vec256_v8i8: ; AVX512: # %bb.0: ; AVX512-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero -; AVX512-NEXT: vpternlogq $15, %xmm0, %xmm0, %xmm0 +; AVX512-NEXT: vpternlogq {{.*#+}} xmm0 = ~xmm0 ; AVX512-NEXT: vmovq %xmm0, (%rsi) ; AVX512-NEXT: vpbroadcastq %xmm0, %ymm0 ; AVX512-NEXT: vmovdqa %ymm0, (%rdx) @@ -2170,7 +2170,7 @@ define void @vec384_v2i32(ptr %in.subvec.ptr, ptr %out.subvec.ptr, ptr %out.vec. ; AVX512-LABEL: vec384_v2i32: ; AVX512: # %bb.0: ; AVX512-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero -; AVX512-NEXT: vpternlogq $15, %xmm0, %xmm0, %xmm0 +; AVX512-NEXT: vpternlogq {{.*#+}} xmm0 = ~xmm0 ; AVX512-NEXT: vmovq %xmm0, (%rsi) ; AVX512-NEXT: vpbroadcastq %xmm0, %ymm0 ; AVX512-NEXT: vmovdqa %ymm0, (%rdx) @@ -2258,7 +2258,7 @@ define void @vec384_v2f32(ptr %in.subvec.ptr, ptr %out.subvec.ptr, ptr %out.vec. 
; AVX512-LABEL: vec384_v2f32: ; AVX512: # %bb.0: ; AVX512-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero -; AVX512-NEXT: vpternlogq $15, %xmm0, %xmm0, %xmm0 +; AVX512-NEXT: vpternlogq {{.*#+}} xmm0 = ~xmm0 ; AVX512-NEXT: vmovq %xmm0, (%rsi) ; AVX512-NEXT: vpbroadcastq %xmm0, %ymm0 ; AVX512-NEXT: vmovdqa %ymm0, (%rdx) @@ -2722,7 +2722,7 @@ define void @vec384_v3i8(ptr %in.subvec.ptr, ptr %out.subvec.ptr, ptr %out.vec.p ; AVX512-LABEL: vec384_v3i8: ; AVX512: # %bb.0: ; AVX512-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero -; AVX512-NEXT: vpternlogq $15, %xmm0, %xmm0, %xmm0 +; AVX512-NEXT: vpternlogq {{.*#+}} xmm0 = ~xmm0 ; AVX512-NEXT: vpextrb $2, %xmm0, 2(%rsi) ; AVX512-NEXT: vmovd %xmm0, %eax ; AVX512-NEXT: movw %ax, (%rsi) @@ -3006,7 +3006,7 @@ define void @vec384_v3i16(ptr %in.subvec.ptr, ptr %out.subvec.ptr, ptr %out.vec. ; AVX512-LABEL: vec384_v3i16: ; AVX512: # %bb.0: ; AVX512-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero -; AVX512-NEXT: vpternlogq $15, %xmm0, %xmm0, %xmm0 +; AVX512-NEXT: vpternlogq {{.*#+}} xmm0 = ~xmm0 ; AVX512-NEXT: vpextrw $2, %xmm0, 4(%rsi) ; AVX512-NEXT: vmovd %xmm0, (%rsi) ; AVX512-NEXT: vpextrw $2, %xmm0, 4(%rdx) @@ -3664,7 +3664,7 @@ define void @vec384_v4i16(ptr %in.subvec.ptr, ptr %out.subvec.ptr, ptr %out.vec. ; AVX512-LABEL: vec384_v4i16: ; AVX512: # %bb.0: ; AVX512-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero -; AVX512-NEXT: vpternlogq $15, %xmm0, %xmm0, %xmm0 +; AVX512-NEXT: vpternlogq {{.*#+}} xmm0 = ~xmm0 ; AVX512-NEXT: vmovq %xmm0, (%rsi) ; AVX512-NEXT: vpbroadcastq %xmm0, %ymm0 ; AVX512-NEXT: vmovdqa %ymm0, (%rdx) @@ -3983,7 +3983,7 @@ define void @vec384_v6i8(ptr %in.subvec.ptr, ptr %out.subvec.ptr, ptr %out.vec.p ; AVX512-LABEL: vec384_v6i8: ; AVX512: # %bb.0: ; AVX512-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero -; AVX512-NEXT: vpternlogq $15, %xmm0, %xmm0, %xmm0 +; AVX512-NEXT: vpternlogq {{.*#+}} xmm0 = ~xmm0 ; AVX512-NEXT: vpextrw $2, %xmm0, 4(%rsi) ; AVX512-NEXT: vmovd %xmm0, (%rsi) ; AVX512-NEXT: vpextrw $2, %xmm0, 4(%rdx) @@ -4420,7 +4420,7 @@ define void @vec384_v8i8(ptr %in.subvec.ptr, ptr %out.subvec.ptr, ptr %out.vec.p ; AVX512-LABEL: vec384_v8i8: ; AVX512: # %bb.0: ; AVX512-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero -; AVX512-NEXT: vpternlogq $15, %xmm0, %xmm0, %xmm0 +; AVX512-NEXT: vpternlogq {{.*#+}} xmm0 = ~xmm0 ; AVX512-NEXT: vmovq %xmm0, (%rsi) ; AVX512-NEXT: vpbroadcastq %xmm0, %ymm0 ; AVX512-NEXT: vmovdqa %ymm0, (%rdx) @@ -5444,7 +5444,7 @@ define void @vec512_v2i32(ptr %in.subvec.ptr, ptr %out.subvec.ptr, ptr %out.vec. ; AVX512-LABEL: vec512_v2i32: ; AVX512: # %bb.0: ; AVX512-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero -; AVX512-NEXT: vpternlogq $15, %xmm0, %xmm0, %xmm0 +; AVX512-NEXT: vpternlogq {{.*#+}} xmm0 = ~xmm0 ; AVX512-NEXT: vmovq %xmm0, (%rsi) ; AVX512-NEXT: vpbroadcastq %xmm0, %zmm0 ; AVX512-NEXT: vmovdqa64 %zmm0, (%rdx) @@ -5540,7 +5540,7 @@ define void @vec512_v2f32(ptr %in.subvec.ptr, ptr %out.subvec.ptr, ptr %out.vec. ; AVX512-LABEL: vec512_v2f32: ; AVX512: # %bb.0: ; AVX512-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero -; AVX512-NEXT: vpternlogq $15, %xmm0, %xmm0, %xmm0 +; AVX512-NEXT: vpternlogq {{.*#+}} xmm0 = ~xmm0 ; AVX512-NEXT: vmovq %xmm0, (%rsi) ; AVX512-NEXT: vpbroadcastq %xmm0, %zmm0 ; AVX512-NEXT: vmovdqa64 %zmm0, (%rdx) @@ -5965,7 +5965,7 @@ define void @vec512_v4i16(ptr %in.subvec.ptr, ptr %out.subvec.ptr, ptr %out.vec. 
; AVX512-LABEL: vec512_v4i16: ; AVX512: # %bb.0: ; AVX512-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero -; AVX512-NEXT: vpternlogq $15, %xmm0, %xmm0, %xmm0 +; AVX512-NEXT: vpternlogq {{.*#+}} xmm0 = ~xmm0 ; AVX512-NEXT: vmovq %xmm0, (%rsi) ; AVX512-NEXT: vpbroadcastq %xmm0, %zmm0 ; AVX512-NEXT: vmovdqa64 %zmm0, (%rdx) @@ -6363,7 +6363,7 @@ define void @vec512_v8i8(ptr %in.subvec.ptr, ptr %out.subvec.ptr, ptr %out.vec.p ; AVX512-LABEL: vec512_v8i8: ; AVX512: # %bb.0: ; AVX512-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero -; AVX512-NEXT: vpternlogq $15, %xmm0, %xmm0, %xmm0 +; AVX512-NEXT: vpternlogq {{.*#+}} xmm0 = ~xmm0 ; AVX512-NEXT: vmovq %xmm0, (%rsi) ; AVX512-NEXT: vpbroadcastq %xmm0, %zmm0 ; AVX512-NEXT: vmovdqa64 %zmm0, (%rdx) @@ -7079,6 +7079,14 @@ define void @vec512_v32i8(ptr %in.subvec.ptr, ptr %out.subvec.ptr, ptr %out.vec. ; SCALAR-NEXT: pushq %r13 ; SCALAR-NEXT: pushq %r12 ; SCALAR-NEXT: pushq %rbx +; SCALAR-NEXT: movzbl 20(%rdi), %eax +; SCALAR-NEXT: movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill +; SCALAR-NEXT: movzbl 19(%rdi), %eax +; SCALAR-NEXT: movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill +; SCALAR-NEXT: movzbl 18(%rdi), %eax +; SCALAR-NEXT: movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill +; SCALAR-NEXT: movzbl 17(%rdi), %eax +; SCALAR-NEXT: movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill ; SCALAR-NEXT: movzbl 16(%rdi), %eax ; SCALAR-NEXT: movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill ; SCALAR-NEXT: movzbl 15(%rdi), %eax @@ -7087,9 +7095,9 @@ define void @vec512_v32i8(ptr %in.subvec.ptr, ptr %out.subvec.ptr, ptr %out.vec. ; SCALAR-NEXT: movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill ; SCALAR-NEXT: movzbl 13(%rdi), %eax ; SCALAR-NEXT: movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill -; SCALAR-NEXT: movzbl 12(%rdi), %r13d -; SCALAR-NEXT: movzbl 11(%rdi), %eax +; SCALAR-NEXT: movzbl 12(%rdi), %eax ; SCALAR-NEXT: movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill +; SCALAR-NEXT: movzbl 11(%rdi), %r13d ; SCALAR-NEXT: movzbl 10(%rdi), %r12d ; SCALAR-NEXT: movzbl 9(%rdi), %r15d ; SCALAR-NEXT: movzbl 8(%rdi), %r14d @@ -7123,55 +7131,51 @@ define void @vec512_v32i8(ptr %in.subvec.ptr, ptr %out.subvec.ptr, ptr %out.vec. 
; SCALAR-NEXT: movb %r15b, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill ; SCALAR-NEXT: notb %r12b ; SCALAR-NEXT: movb %r12b, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill -; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %r11d # 1-byte Folded Reload -; SCALAR-NEXT: notb %r11b -; SCALAR-NEXT: movb %r11b, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill ; SCALAR-NEXT: notb %r13b ; SCALAR-NEXT: movb %r13b, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill ; SCALAR-NEXT: notb {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Folded Spill ; SCALAR-NEXT: notb {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Folded Spill +; SCALAR-NEXT: notb {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Folded Spill +; SCALAR-NEXT: notb {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Folded Spill +; SCALAR-NEXT: notb {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Folded Spill +; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %r12d # 1-byte Folded Reload +; SCALAR-NEXT: notb %r12b +; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %r10d # 1-byte Folded Reload +; SCALAR-NEXT: notb %r10b +; SCALAR-NEXT: movb %r10b, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill +; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %r9d # 1-byte Folded Reload +; SCALAR-NEXT: notb %r9b +; SCALAR-NEXT: movb %r9b, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill ; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %r8d # 1-byte Folded Reload ; SCALAR-NEXT: notb %r8b -; SCALAR-NEXT: notb {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Folded Spill -; SCALAR-NEXT: movzbl 17(%rdi), %eax -; SCALAR-NEXT: notb %al -; SCALAR-NEXT: movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill -; SCALAR-NEXT: movzbl 18(%rdi), %eax -; SCALAR-NEXT: notb %al -; SCALAR-NEXT: movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill -; SCALAR-NEXT: movzbl 19(%rdi), %eax +; SCALAR-NEXT: movb %r8b, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill +; SCALAR-NEXT: movzbl 21(%rdi), %eax ; SCALAR-NEXT: notb %al ; SCALAR-NEXT: movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill -; SCALAR-NEXT: movzbl 20(%rdi), %eax +; SCALAR-NEXT: movzbl 22(%rdi), %ebx +; SCALAR-NEXT: notb %bl +; SCALAR-NEXT: movb %bl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill +; SCALAR-NEXT: movzbl 23(%rdi), %eax ; SCALAR-NEXT: notb %al ; SCALAR-NEXT: movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill -; SCALAR-NEXT: movzbl 21(%rdi), %ebp +; SCALAR-NEXT: movzbl 24(%rdi), %ebp ; SCALAR-NEXT: notb %bpl ; SCALAR-NEXT: movb %bpl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill -; SCALAR-NEXT: movzbl 22(%rdi), %ebx -; SCALAR-NEXT: notb %bl -; SCALAR-NEXT: movb %bl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill -; SCALAR-NEXT: movzbl 23(%rdi), %r10d -; SCALAR-NEXT: notb %r10b -; SCALAR-NEXT: movb %r10b, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill -; SCALAR-NEXT: movzbl 24(%rdi), %r9d -; SCALAR-NEXT: notb %r9b -; SCALAR-NEXT: movb %r9b, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill -; SCALAR-NEXT: movzbl 25(%rdi), %ecx -; SCALAR-NEXT: notb %cl -; SCALAR-NEXT: movb %cl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill +; SCALAR-NEXT: movzbl 25(%rdi), %r11d +; SCALAR-NEXT: notb %r11b +; SCALAR-NEXT: movb %r11b, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill ; SCALAR-NEXT: movzbl 26(%rdi), %r14d ; SCALAR-NEXT: notb %r14b ; SCALAR-NEXT: movb %r14b, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill ; SCALAR-NEXT: movzbl 27(%rdi), %r15d ; SCALAR-NEXT: notb %r15b ; SCALAR-NEXT: movb %r15b, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill -; SCALAR-NEXT: movzbl 28(%rdi), %r12d -; SCALAR-NEXT: notb %r12b -; SCALAR-NEXT: movb %r12b, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill -; SCALAR-NEXT: movzbl 29(%rdi), %r13d +; SCALAR-NEXT: movzbl 28(%rdi), %r13d ; SCALAR-NEXT: notb %r13b ; SCALAR-NEXT: movb %r13b, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte 
Spill +; SCALAR-NEXT: movzbl 29(%rdi), %ecx +; SCALAR-NEXT: notb %cl +; SCALAR-NEXT: movb %cl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill ; SCALAR-NEXT: movzbl 30(%rdi), %eax ; SCALAR-NEXT: notb %al ; SCALAR-NEXT: movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill @@ -7180,57 +7184,56 @@ define void @vec512_v32i8(ptr %in.subvec.ptr, ptr %out.subvec.ptr, ptr %out.vec. ; SCALAR-NEXT: movb %dil, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill ; SCALAR-NEXT: movb %dil, 31(%rsi) ; SCALAR-NEXT: movb %al, 30(%rsi) -; SCALAR-NEXT: movb %r13b, 29(%rsi) -; SCALAR-NEXT: movb %r12b, 28(%rsi) +; SCALAR-NEXT: movb %cl, 29(%rsi) +; SCALAR-NEXT: movb %r13b, 28(%rsi) ; SCALAR-NEXT: movb %r15b, 27(%rsi) ; SCALAR-NEXT: movb %r14b, 26(%rsi) -; SCALAR-NEXT: movb %cl, 25(%rsi) -; SCALAR-NEXT: movb %r9b, 24(%rsi) -; SCALAR-NEXT: movb %r10b, 23(%rsi) -; SCALAR-NEXT: movb %bl, 22(%rsi) -; SCALAR-NEXT: movb %bpl, 21(%rsi) +; SCALAR-NEXT: movb %r11b, 25(%rsi) +; SCALAR-NEXT: movb %bpl, 24(%rsi) ; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %ebp # 1-byte Folded Reload -; SCALAR-NEXT: movb %bpl, 20(%rsi) -; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload -; SCALAR-NEXT: movb %al, 19(%rsi) -; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload -; SCALAR-NEXT: movb %al, 18(%rsi) -; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload -; SCALAR-NEXT: movb %al, 17(%rsi) -; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %ecx # 1-byte Folded Reload -; SCALAR-NEXT: movb %cl, 16(%rsi) -; SCALAR-NEXT: movb %r8b, 15(%rsi) -; SCALAR-NEXT: movl %r8d, %r14d -; SCALAR-NEXT: movb %r8b, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill +; SCALAR-NEXT: movb %bpl, 23(%rsi) +; SCALAR-NEXT: movb %bl, 22(%rsi) ; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %ebx # 1-byte Folded Reload -; SCALAR-NEXT: movb %bl, 14(%rsi) -; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload -; SCALAR-NEXT: movb %al, 13(%rsi) +; SCALAR-NEXT: movb %bl, 21(%rsi) +; SCALAR-NEXT: movb %r8b, 20(%rsi) +; SCALAR-NEXT: movb %r9b, 19(%rsi) +; SCALAR-NEXT: movb %r10b, 18(%rsi) +; SCALAR-NEXT: movb %r12b, 17(%rsi) +; SCALAR-NEXT: movl %r12d, %r15d +; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %r12d # 1-byte Folded Reload +; SCALAR-NEXT: movb %r12b, 16(%rsi) +; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %r14d # 1-byte Folded Reload +; SCALAR-NEXT: movb %r14b, 15(%rsi) ; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload -; SCALAR-NEXT: movb %al, 12(%rsi) -; SCALAR-NEXT: movb %r11b, 11(%rsi) -; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %edi # 1-byte Folded Reload -; SCALAR-NEXT: movb %dil, 10(%rsi) -; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %edi # 1-byte Folded Reload -; SCALAR-NEXT: movb %dil, 9(%rsi) -; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %edi # 1-byte Folded Reload -; SCALAR-NEXT: movb %dil, 8(%rsi) -; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %r11d # 1-byte Folded Reload -; SCALAR-NEXT: movb %r11b, 7(%rsi) -; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %r13d # 1-byte Folded Reload -; SCALAR-NEXT: movb %r13b, 6(%rsi) +; SCALAR-NEXT: movb %al, 14(%rsi) +; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %ecx # 1-byte Folded Reload +; SCALAR-NEXT: movb %cl, 13(%rsi) +; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %ecx # 1-byte Folded Reload +; SCALAR-NEXT: movb %cl, 12(%rsi) +; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %ecx # 1-byte Folded Reload +; SCALAR-NEXT: movb %cl, 11(%rsi) +; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %ecx # 1-byte 
Folded Reload +; SCALAR-NEXT: movb %cl, 10(%rsi) ; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %r10d # 1-byte Folded Reload -; SCALAR-NEXT: movb %r10b, 5(%rsi) -; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %r12d # 1-byte Folded Reload -; SCALAR-NEXT: movb %r12b, 4(%rsi) +; SCALAR-NEXT: movb %r10b, 9(%rsi) +; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %ecx # 1-byte Folded Reload +; SCALAR-NEXT: movb %cl, 8(%rsi) ; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %r9d # 1-byte Folded Reload -; SCALAR-NEXT: movb %r9b, 3(%rsi) -; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %r15d # 1-byte Folded Reload -; SCALAR-NEXT: movb %r15b, 2(%rsi) +; SCALAR-NEXT: movb %r9b, 7(%rsi) +; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %r13d # 1-byte Folded Reload +; SCALAR-NEXT: movb %r13b, 6(%rsi) ; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %r8d # 1-byte Folded Reload -; SCALAR-NEXT: movb %r8b, 1(%rsi) +; SCALAR-NEXT: movb %r8b, 5(%rsi) +; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %ecx # 1-byte Folded Reload +; SCALAR-NEXT: movb %cl, 4(%rsi) +; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %edi # 1-byte Folded Reload +; SCALAR-NEXT: movb %dil, 3(%rsi) +; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %edi # 1-byte Folded Reload +; SCALAR-NEXT: movb %dil, 2(%rsi) ; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %edi # 1-byte Folded Reload -; SCALAR-NEXT: movb %dil, (%rsi) +; SCALAR-NEXT: movb %dil, 1(%rsi) +; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %r11d # 1-byte Folded Reload +; SCALAR-NEXT: movb %r11b, (%rsi) ; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %esi # 1-byte Folded Reload ; SCALAR-NEXT: movb %sil, 31(%rdx) ; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %esi # 1-byte Folded Reload @@ -7247,92 +7250,92 @@ define void @vec512_v32i8(ptr %in.subvec.ptr, ptr %out.subvec.ptr, ptr %out.vec. 
; SCALAR-NEXT: movb %sil, 25(%rdx) ; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %esi # 1-byte Folded Reload ; SCALAR-NEXT: movb %sil, 24(%rdx) -; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %esi # 1-byte Folded Reload -; SCALAR-NEXT: movb %sil, 23(%rdx) +; SCALAR-NEXT: movb %bpl, 23(%rdx) ; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %esi # 1-byte Folded Reload ; SCALAR-NEXT: movb %sil, 22(%rdx) -; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %esi # 1-byte Folded Reload -; SCALAR-NEXT: movb %sil, 21(%rdx) -; SCALAR-NEXT: movb %bpl, 20(%rdx) -; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %esi # 1-byte Folded Reload -; SCALAR-NEXT: movb %sil, 19(%rdx) -; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %esi # 1-byte Folded Reload -; SCALAR-NEXT: movb %sil, 18(%rdx) -; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %esi # 1-byte Folded Reload -; SCALAR-NEXT: movb %sil, 17(%rdx) -; SCALAR-NEXT: movb %cl, 16(%rdx) +; SCALAR-NEXT: movb %bl, 21(%rdx) +; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %ebx # 1-byte Folded Reload +; SCALAR-NEXT: movb %bl, 20(%rdx) +; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %ebx # 1-byte Folded Reload +; SCALAR-NEXT: movb %bl, 19(%rdx) +; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %ebx # 1-byte Folded Reload +; SCALAR-NEXT: movb %bl, 18(%rdx) +; SCALAR-NEXT: movb %r15b, 17(%rdx) +; SCALAR-NEXT: movb %r12b, 16(%rdx) ; SCALAR-NEXT: movb %r14b, 15(%rdx) -; SCALAR-NEXT: movb %bl, 14(%rdx) -; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %ecx # 1-byte Folded Reload -; SCALAR-NEXT: movb %cl, 13(%rdx) +; SCALAR-NEXT: movb %al, 14(%rdx) +; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %ebp # 1-byte Folded Reload +; SCALAR-NEXT: movb %bpl, 13(%rdx) +; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload ; SCALAR-NEXT: movb %al, 12(%rdx) -; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %esi # 1-byte Folded Reload -; SCALAR-NEXT: movb %sil, 11(%rdx) +; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %r12d # 1-byte Folded Reload +; SCALAR-NEXT: movb %r12b, 11(%rdx) ; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %ebx # 1-byte Folded Reload ; SCALAR-NEXT: movb %bl, 10(%rdx) -; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %r14d # 1-byte Folded Reload -; SCALAR-NEXT: movb %r14b, 9(%rdx) -; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %ebp # 1-byte Folded Reload -; SCALAR-NEXT: movb %bpl, 8(%rdx) -; SCALAR-NEXT: movb %r11b, 7(%rdx) +; SCALAR-NEXT: movb %r10b, 9(%rdx) +; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload +; SCALAR-NEXT: movb %al, 8(%rdx) +; SCALAR-NEXT: movb %r9b, 7(%rdx) ; SCALAR-NEXT: movb %r13b, 6(%rdx) -; SCALAR-NEXT: movb %r10b, 5(%rdx) -; SCALAR-NEXT: movb %r12b, 4(%rdx) +; SCALAR-NEXT: movb %r8b, 5(%rdx) +; SCALAR-NEXT: movb %cl, 4(%rdx) +; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %r9d # 1-byte Folded Reload ; SCALAR-NEXT: movb %r9b, 3(%rdx) -; SCALAR-NEXT: movb %r15b, 2(%rdx) -; SCALAR-NEXT: movb %r8b, 1(%rdx) -; SCALAR-NEXT: movb %dil, (%rdx) -; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload -; SCALAR-NEXT: movb %al, 63(%rdx) -; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload -; SCALAR-NEXT: movb %al, 62(%rdx) -; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload -; SCALAR-NEXT: movb %al, 61(%rdx) -; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload -; SCALAR-NEXT: movb %al, 60(%rdx) -; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload -; SCALAR-NEXT: movb %al, 59(%rdx) -; 
SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload -; SCALAR-NEXT: movb %al, 58(%rdx) -; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload -; SCALAR-NEXT: movb %al, 57(%rdx) -; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload -; SCALAR-NEXT: movb %al, 56(%rdx) -; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload -; SCALAR-NEXT: movb %al, 55(%rdx) -; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload -; SCALAR-NEXT: movb %al, 54(%rdx) -; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload -; SCALAR-NEXT: movb %al, 53(%rdx) -; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload -; SCALAR-NEXT: movb %al, 52(%rdx) -; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload -; SCALAR-NEXT: movb %al, 51(%rdx) -; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload -; SCALAR-NEXT: movb %al, 50(%rdx) -; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload -; SCALAR-NEXT: movb %al, 49(%rdx) -; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload -; SCALAR-NEXT: movb %al, 48(%rdx) +; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %r8d # 1-byte Folded Reload +; SCALAR-NEXT: movb %r8b, 2(%rdx) +; SCALAR-NEXT: movb %dil, 1(%rdx) +; SCALAR-NEXT: movb %r11b, (%rdx) +; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %ecx # 1-byte Folded Reload +; SCALAR-NEXT: movb %cl, 63(%rdx) +; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %ecx # 1-byte Folded Reload +; SCALAR-NEXT: movb %cl, 62(%rdx) +; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %ecx # 1-byte Folded Reload +; SCALAR-NEXT: movb %cl, 61(%rdx) +; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %ecx # 1-byte Folded Reload +; SCALAR-NEXT: movb %cl, 60(%rdx) +; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %ecx # 1-byte Folded Reload +; SCALAR-NEXT: movb %cl, 59(%rdx) +; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %ecx # 1-byte Folded Reload +; SCALAR-NEXT: movb %cl, 58(%rdx) +; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %ecx # 1-byte Folded Reload +; SCALAR-NEXT: movb %cl, 57(%rdx) +; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %ecx # 1-byte Folded Reload +; SCALAR-NEXT: movb %cl, 56(%rdx) +; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %ecx # 1-byte Folded Reload +; SCALAR-NEXT: movb %cl, 55(%rdx) +; SCALAR-NEXT: movb %sil, 54(%rdx) +; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %ecx # 1-byte Folded Reload +; SCALAR-NEXT: movb %cl, 53(%rdx) +; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %ecx # 1-byte Folded Reload +; SCALAR-NEXT: movb %cl, 52(%rdx) +; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %ecx # 1-byte Folded Reload +; SCALAR-NEXT: movb %cl, 51(%rdx) +; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %ecx # 1-byte Folded Reload +; SCALAR-NEXT: movb %cl, 50(%rdx) +; SCALAR-NEXT: movb %r15b, 49(%rdx) +; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %ecx # 1-byte Folded Reload +; SCALAR-NEXT: movb %cl, 48(%rdx) +; SCALAR-NEXT: movb %r14b, 47(%rdx) +; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %ecx # 1-byte Folded Reload +; SCALAR-NEXT: movb %cl, 46(%rdx) +; SCALAR-NEXT: movb %bpl, 45(%rdx) +; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %ecx # 1-byte Folded Reload +; SCALAR-NEXT: movb %cl, 44(%rdx) +; SCALAR-NEXT: movb %r12b, 43(%rdx) +; SCALAR-NEXT: movb %bl, 42(%rdx) +; SCALAR-NEXT: movb %r10b, 41(%rdx) +; SCALAR-NEXT: movb %al, 40(%rdx) ; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte 
Folded Reload -; SCALAR-NEXT: movb %al, 47(%rdx) +; SCALAR-NEXT: movb %al, 39(%rdx) +; SCALAR-NEXT: movb %r13b, 38(%rdx) ; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload -; SCALAR-NEXT: movb %al, 46(%rdx) -; SCALAR-NEXT: movb %cl, 45(%rdx) +; SCALAR-NEXT: movb %al, 37(%rdx) ; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload -; SCALAR-NEXT: movb %al, 44(%rdx) -; SCALAR-NEXT: movb %sil, 43(%rdx) -; SCALAR-NEXT: movb %bl, 42(%rdx) -; SCALAR-NEXT: movb %r14b, 41(%rdx) -; SCALAR-NEXT: movb %bpl, 40(%rdx) -; SCALAR-NEXT: movb %r11b, 39(%rdx) -; SCALAR-NEXT: movb %r13b, 38(%rdx) -; SCALAR-NEXT: movb %r10b, 37(%rdx) -; SCALAR-NEXT: movb %r12b, 36(%rdx) +; SCALAR-NEXT: movb %al, 36(%rdx) ; SCALAR-NEXT: movb %r9b, 35(%rdx) -; SCALAR-NEXT: movb %r15b, 34(%rdx) -; SCALAR-NEXT: movb %r8b, 33(%rdx) -; SCALAR-NEXT: movb %dil, 32(%rdx) +; SCALAR-NEXT: movb %r8b, 34(%rdx) +; SCALAR-NEXT: movb %dil, 33(%rdx) +; SCALAR-NEXT: movb %r11b, 32(%rdx) ; SCALAR-NEXT: popq %rbx ; SCALAR-NEXT: popq %r12 ; SCALAR-NEXT: popq %r13 diff --git a/llvm/test/CodeGen/X86/unfold-masked-merge-vector-variablemask.ll b/llvm/test/CodeGen/X86/unfold-masked-merge-vector-variablemask.ll index b1194bedc4e1c..c8aa7cf8c8f29 100644 --- a/llvm/test/CodeGen/X86/unfold-masked-merge-vector-variablemask.ll +++ b/llvm/test/CodeGen/X86/unfold-masked-merge-vector-variablemask.ll @@ -1198,6 +1198,14 @@ define <32 x i8> @out_v32i8(ptr%px, ptr%py, ptr%pmask) nounwind { ; CHECK-BASELINE-NEXT: movq %rdx, %r8 ; CHECK-BASELINE-NEXT: movq %rsi, %r9 ; CHECK-BASELINE-NEXT: movq %rdi, %r11 +; CHECK-BASELINE-NEXT: movzbl 19(%rdx), %eax +; CHECK-BASELINE-NEXT: movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill +; CHECK-BASELINE-NEXT: movzbl 18(%rdx), %eax +; CHECK-BASELINE-NEXT: movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill +; CHECK-BASELINE-NEXT: movzbl 17(%rdx), %eax +; CHECK-BASELINE-NEXT: movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill +; CHECK-BASELINE-NEXT: movzbl 16(%rdx), %eax +; CHECK-BASELINE-NEXT: movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill ; CHECK-BASELINE-NEXT: movzbl 15(%rdx), %eax ; CHECK-BASELINE-NEXT: movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill ; CHECK-BASELINE-NEXT: movzbl 14(%rdx), %eax @@ -1306,26 +1314,26 @@ define <32 x i8> @out_v32i8(ptr%px, ptr%py, ptr%pmask) nounwind { ; CHECK-BASELINE-NEXT: andb 15(%r10), %al ; CHECK-BASELINE-NEXT: xorb %cl, %al ; CHECK-BASELINE-NEXT: movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill -; CHECK-BASELINE-NEXT: movzbl 16(%r8), %eax -; CHECK-BASELINE-NEXT: movzbl 16(%r9), %ecx -; CHECK-BASELINE-NEXT: xorb %al, %cl -; CHECK-BASELINE-NEXT: andb 16(%r10), %cl -; CHECK-BASELINE-NEXT: xorb %al, %cl -; CHECK-BASELINE-NEXT: movb %cl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill -; CHECK-BASELINE-NEXT: movzbl 17(%r8), %eax +; CHECK-BASELINE-NEXT: movzbl 16(%r9), %eax +; CHECK-BASELINE-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %ecx # 1-byte Folded Reload +; CHECK-BASELINE-NEXT: xorb %cl, %al +; CHECK-BASELINE-NEXT: andb 16(%r10), %al +; CHECK-BASELINE-NEXT: xorb %cl, %al +; CHECK-BASELINE-NEXT: movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill ; CHECK-BASELINE-NEXT: movzbl 17(%r9), %ecx +; CHECK-BASELINE-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload ; CHECK-BASELINE-NEXT: xorb %al, %cl ; CHECK-BASELINE-NEXT: andb 17(%r10), %cl ; CHECK-BASELINE-NEXT: xorb %al, %cl ; CHECK-BASELINE-NEXT: movb %cl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill -; CHECK-BASELINE-NEXT: movzbl 18(%r8), %eax ; CHECK-BASELINE-NEXT: movzbl 18(%r9), %ecx +; 
CHECK-BASELINE-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload ; CHECK-BASELINE-NEXT: xorb %al, %cl ; CHECK-BASELINE-NEXT: andb 18(%r10), %cl ; CHECK-BASELINE-NEXT: xorb %al, %cl ; CHECK-BASELINE-NEXT: movb %cl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill -; CHECK-BASELINE-NEXT: movzbl 19(%r8), %eax ; CHECK-BASELINE-NEXT: movzbl 19(%r9), %ecx +; CHECK-BASELINE-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload ; CHECK-BASELINE-NEXT: xorb %al, %cl ; CHECK-BASELINE-NEXT: andb 19(%r10), %cl ; CHECK-BASELINE-NEXT: xorb %al, %cl @@ -1465,6 +1473,14 @@ define <32 x i8> @out_v32i8(ptr%px, ptr%py, ptr%pmask) nounwind { ; CHECK-SSE1-NEXT: movq %rdx, %r8 ; CHECK-SSE1-NEXT: movq %rsi, %r9 ; CHECK-SSE1-NEXT: movq %rdi, %r11 +; CHECK-SSE1-NEXT: movzbl 19(%rdx), %eax +; CHECK-SSE1-NEXT: movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill +; CHECK-SSE1-NEXT: movzbl 18(%rdx), %eax +; CHECK-SSE1-NEXT: movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill +; CHECK-SSE1-NEXT: movzbl 17(%rdx), %eax +; CHECK-SSE1-NEXT: movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill +; CHECK-SSE1-NEXT: movzbl 16(%rdx), %eax +; CHECK-SSE1-NEXT: movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill ; CHECK-SSE1-NEXT: movzbl 15(%rdx), %eax ; CHECK-SSE1-NEXT: movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill ; CHECK-SSE1-NEXT: movzbl 14(%rdx), %eax @@ -1573,26 +1589,26 @@ define <32 x i8> @out_v32i8(ptr%px, ptr%py, ptr%pmask) nounwind { ; CHECK-SSE1-NEXT: andb 15(%r10), %al ; CHECK-SSE1-NEXT: xorb %cl, %al ; CHECK-SSE1-NEXT: movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill -; CHECK-SSE1-NEXT: movzbl 16(%r8), %eax -; CHECK-SSE1-NEXT: movzbl 16(%r9), %ecx -; CHECK-SSE1-NEXT: xorb %al, %cl -; CHECK-SSE1-NEXT: andb 16(%r10), %cl -; CHECK-SSE1-NEXT: xorb %al, %cl -; CHECK-SSE1-NEXT: movb %cl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill -; CHECK-SSE1-NEXT: movzbl 17(%r8), %eax +; CHECK-SSE1-NEXT: movzbl 16(%r9), %eax +; CHECK-SSE1-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %ecx # 1-byte Folded Reload +; CHECK-SSE1-NEXT: xorb %cl, %al +; CHECK-SSE1-NEXT: andb 16(%r10), %al +; CHECK-SSE1-NEXT: xorb %cl, %al +; CHECK-SSE1-NEXT: movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill ; CHECK-SSE1-NEXT: movzbl 17(%r9), %ecx +; CHECK-SSE1-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload ; CHECK-SSE1-NEXT: xorb %al, %cl ; CHECK-SSE1-NEXT: andb 17(%r10), %cl ; CHECK-SSE1-NEXT: xorb %al, %cl ; CHECK-SSE1-NEXT: movb %cl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill -; CHECK-SSE1-NEXT: movzbl 18(%r8), %eax ; CHECK-SSE1-NEXT: movzbl 18(%r9), %ecx +; CHECK-SSE1-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload ; CHECK-SSE1-NEXT: xorb %al, %cl ; CHECK-SSE1-NEXT: andb 18(%r10), %cl ; CHECK-SSE1-NEXT: xorb %al, %cl ; CHECK-SSE1-NEXT: movb %cl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill -; CHECK-SSE1-NEXT: movzbl 19(%r8), %eax ; CHECK-SSE1-NEXT: movzbl 19(%r9), %ecx +; CHECK-SSE1-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload ; CHECK-SSE1-NEXT: xorb %al, %cl ; CHECK-SSE1-NEXT: andb 19(%r10), %cl ; CHECK-SSE1-NEXT: xorb %al, %cl @@ -3231,10 +3247,18 @@ define <32 x i8> @in_v32i8(ptr%px, ptr%py, ptr%pmask) nounwind { ; CHECK-BASELINE-NEXT: pushq %r13 ; CHECK-BASELINE-NEXT: pushq %r12 ; CHECK-BASELINE-NEXT: pushq %rbx -; CHECK-BASELINE-NEXT: movq %rcx, %r12 -; CHECK-BASELINE-NEXT: movq %rdx, %r15 +; CHECK-BASELINE-NEXT: movq %rcx, %r15 +; CHECK-BASELINE-NEXT: movq %rdx, %rbx ; CHECK-BASELINE-NEXT: movq %rsi, %r14 -; CHECK-BASELINE-NEXT: movq %rdi, %r13 +; CHECK-BASELINE-NEXT: movq %rdi, %r12 +; 
CHECK-BASELINE-NEXT: movzbl 19(%rdx), %eax +; CHECK-BASELINE-NEXT: movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill +; CHECK-BASELINE-NEXT: movzbl 18(%rdx), %eax +; CHECK-BASELINE-NEXT: movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill +; CHECK-BASELINE-NEXT: movzbl 17(%rdx), %eax +; CHECK-BASELINE-NEXT: movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill +; CHECK-BASELINE-NEXT: movzbl 16(%rdx), %eax +; CHECK-BASELINE-NEXT: movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill ; CHECK-BASELINE-NEXT: movzbl 15(%rdx), %eax ; CHECK-BASELINE-NEXT: movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill ; CHECK-BASELINE-NEXT: movzbl 14(%rdx), %eax @@ -3247,241 +3271,241 @@ define <32 x i8> @in_v32i8(ptr%px, ptr%py, ptr%pmask) nounwind { ; CHECK-BASELINE-NEXT: movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill ; CHECK-BASELINE-NEXT: movzbl 10(%rdx), %eax ; CHECK-BASELINE-NEXT: movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill -; CHECK-BASELINE-NEXT: movzbl 9(%rdx), %r8d -; CHECK-BASELINE-NEXT: movzbl 8(%rdx), %r9d -; CHECK-BASELINE-NEXT: movzbl 7(%rdx), %r10d -; CHECK-BASELINE-NEXT: movzbl 6(%rdx), %ebp -; CHECK-BASELINE-NEXT: movzbl 5(%rdx), %edi +; CHECK-BASELINE-NEXT: movzbl 9(%rdx), %r13d +; CHECK-BASELINE-NEXT: movzbl 8(%rdx), %r10d +; CHECK-BASELINE-NEXT: movzbl 7(%rdx), %r9d +; CHECK-BASELINE-NEXT: movzbl 6(%rdx), %r8d +; CHECK-BASELINE-NEXT: movzbl 5(%rdx), %ebp ; CHECK-BASELINE-NEXT: movzbl 4(%rdx), %esi -; CHECK-BASELINE-NEXT: movzbl 3(%rdx), %eax -; CHECK-BASELINE-NEXT: movzbl 2(%rdx), %ecx -; CHECK-BASELINE-NEXT: movzbl (%rdx), %r11d -; CHECK-BASELINE-NEXT: movzbl 1(%rdx), %edx -; CHECK-BASELINE-NEXT: movzbl (%r14), %ebx -; CHECK-BASELINE-NEXT: xorb %r11b, %bl -; CHECK-BASELINE-NEXT: andb (%r12), %bl -; CHECK-BASELINE-NEXT: xorb %r11b, %bl -; CHECK-BASELINE-NEXT: movb %bl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill -; CHECK-BASELINE-NEXT: movzbl 1(%r14), %r11d -; CHECK-BASELINE-NEXT: xorb %dl, %r11b -; CHECK-BASELINE-NEXT: andb 1(%r12), %r11b -; CHECK-BASELINE-NEXT: xorb %dl, %r11b +; CHECK-BASELINE-NEXT: movzbl 3(%rdx), %edx +; CHECK-BASELINE-NEXT: movzbl 2(%rbx), %eax +; CHECK-BASELINE-NEXT: movzbl (%rbx), %edi +; CHECK-BASELINE-NEXT: movzbl 1(%rbx), %ecx +; CHECK-BASELINE-NEXT: movzbl (%r14), %r11d +; CHECK-BASELINE-NEXT: xorb %dil, %r11b +; CHECK-BASELINE-NEXT: andb (%r15), %r11b +; CHECK-BASELINE-NEXT: xorb %dil, %r11b ; CHECK-BASELINE-NEXT: movb %r11b, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill -; CHECK-BASELINE-NEXT: movzbl 2(%r14), %edx -; CHECK-BASELINE-NEXT: xorb %cl, %dl -; CHECK-BASELINE-NEXT: andb 2(%r12), %dl -; CHECK-BASELINE-NEXT: xorb %cl, %dl -; CHECK-BASELINE-NEXT: movb %dl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill -; CHECK-BASELINE-NEXT: movzbl 3(%r14), %ecx +; CHECK-BASELINE-NEXT: movzbl 1(%r14), %edi +; CHECK-BASELINE-NEXT: xorb %cl, %dil +; CHECK-BASELINE-NEXT: andb 1(%r15), %dil +; CHECK-BASELINE-NEXT: xorb %cl, %dil +; CHECK-BASELINE-NEXT: movb %dil, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill +; CHECK-BASELINE-NEXT: movzbl 2(%r14), %ecx ; CHECK-BASELINE-NEXT: xorb %al, %cl -; CHECK-BASELINE-NEXT: andb 3(%r12), %cl +; CHECK-BASELINE-NEXT: andb 2(%r15), %cl ; CHECK-BASELINE-NEXT: xorb %al, %cl ; CHECK-BASELINE-NEXT: movb %cl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill +; CHECK-BASELINE-NEXT: movzbl 3(%r14), %eax +; CHECK-BASELINE-NEXT: xorb %dl, %al +; CHECK-BASELINE-NEXT: andb 3(%r15), %al +; CHECK-BASELINE-NEXT: xorb %dl, %al +; CHECK-BASELINE-NEXT: movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill ; CHECK-BASELINE-NEXT: movzbl 4(%r14), %eax ; CHECK-BASELINE-NEXT: xorb %sil, %al -; 
CHECK-BASELINE-NEXT: andb 4(%r12), %al +; CHECK-BASELINE-NEXT: andb 4(%r15), %al ; CHECK-BASELINE-NEXT: xorb %sil, %al ; CHECK-BASELINE-NEXT: movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill ; CHECK-BASELINE-NEXT: movzbl 5(%r14), %eax -; CHECK-BASELINE-NEXT: xorb %dil, %al -; CHECK-BASELINE-NEXT: andb 5(%r12), %al -; CHECK-BASELINE-NEXT: xorb %dil, %al -; CHECK-BASELINE-NEXT: movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill -; CHECK-BASELINE-NEXT: movzbl 6(%r14), %eax ; CHECK-BASELINE-NEXT: xorb %bpl, %al -; CHECK-BASELINE-NEXT: andb 6(%r12), %al +; CHECK-BASELINE-NEXT: andb 5(%r15), %al ; CHECK-BASELINE-NEXT: xorb %bpl, %al ; CHECK-BASELINE-NEXT: movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill -; CHECK-BASELINE-NEXT: movzbl 7(%r14), %eax -; CHECK-BASELINE-NEXT: xorb %r10b, %al -; CHECK-BASELINE-NEXT: andb 7(%r12), %al -; CHECK-BASELINE-NEXT: xorb %r10b, %al +; CHECK-BASELINE-NEXT: movzbl 6(%r14), %eax +; CHECK-BASELINE-NEXT: xorb %r8b, %al +; CHECK-BASELINE-NEXT: andb 6(%r15), %al +; CHECK-BASELINE-NEXT: xorb %r8b, %al ; CHECK-BASELINE-NEXT: movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill -; CHECK-BASELINE-NEXT: movzbl 8(%r14), %eax +; CHECK-BASELINE-NEXT: movzbl 7(%r14), %eax ; CHECK-BASELINE-NEXT: xorb %r9b, %al -; CHECK-BASELINE-NEXT: andb 8(%r12), %al +; CHECK-BASELINE-NEXT: andb 7(%r15), %al ; CHECK-BASELINE-NEXT: xorb %r9b, %al ; CHECK-BASELINE-NEXT: movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill +; CHECK-BASELINE-NEXT: movzbl 8(%r14), %eax +; CHECK-BASELINE-NEXT: xorb %r10b, %al +; CHECK-BASELINE-NEXT: andb 8(%r15), %al +; CHECK-BASELINE-NEXT: xorb %r10b, %al +; CHECK-BASELINE-NEXT: movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill ; CHECK-BASELINE-NEXT: movzbl 9(%r14), %eax -; CHECK-BASELINE-NEXT: xorb %r8b, %al -; CHECK-BASELINE-NEXT: andb 9(%r12), %al -; CHECK-BASELINE-NEXT: xorb %r8b, %al +; CHECK-BASELINE-NEXT: xorb %r13b, %al +; CHECK-BASELINE-NEXT: andb 9(%r15), %al +; CHECK-BASELINE-NEXT: xorb %r13b, %al ; CHECK-BASELINE-NEXT: movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill ; CHECK-BASELINE-NEXT: movzbl 10(%r14), %ecx ; CHECK-BASELINE-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload ; CHECK-BASELINE-NEXT: xorb %al, %cl -; CHECK-BASELINE-NEXT: andb 10(%r12), %cl +; CHECK-BASELINE-NEXT: andb 10(%r15), %cl ; CHECK-BASELINE-NEXT: xorb %al, %cl ; CHECK-BASELINE-NEXT: movb %cl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill ; CHECK-BASELINE-NEXT: movzbl 11(%r14), %ecx ; CHECK-BASELINE-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload ; CHECK-BASELINE-NEXT: xorb %al, %cl -; CHECK-BASELINE-NEXT: andb 11(%r12), %cl +; CHECK-BASELINE-NEXT: andb 11(%r15), %cl ; CHECK-BASELINE-NEXT: xorb %al, %cl ; CHECK-BASELINE-NEXT: movb %cl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill ; CHECK-BASELINE-NEXT: movzbl 12(%r14), %ecx ; CHECK-BASELINE-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload ; CHECK-BASELINE-NEXT: xorb %al, %cl -; CHECK-BASELINE-NEXT: andb 12(%r12), %cl +; CHECK-BASELINE-NEXT: andb 12(%r15), %cl ; CHECK-BASELINE-NEXT: xorb %al, %cl ; CHECK-BASELINE-NEXT: movb %cl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill ; CHECK-BASELINE-NEXT: movzbl 13(%r14), %ecx ; CHECK-BASELINE-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload ; CHECK-BASELINE-NEXT: xorb %al, %cl -; CHECK-BASELINE-NEXT: andb 13(%r12), %cl +; CHECK-BASELINE-NEXT: andb 13(%r15), %cl ; CHECK-BASELINE-NEXT: xorb %al, %cl ; CHECK-BASELINE-NEXT: movb %cl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill ; CHECK-BASELINE-NEXT: movzbl 14(%r14), %ecx ; CHECK-BASELINE-NEXT: 
movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload ; CHECK-BASELINE-NEXT: xorb %al, %cl -; CHECK-BASELINE-NEXT: andb 14(%r12), %cl +; CHECK-BASELINE-NEXT: andb 14(%r15), %cl ; CHECK-BASELINE-NEXT: xorb %al, %cl ; CHECK-BASELINE-NEXT: movb %cl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill ; CHECK-BASELINE-NEXT: movzbl 15(%r14), %ecx ; CHECK-BASELINE-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload ; CHECK-BASELINE-NEXT: xorb %al, %cl -; CHECK-BASELINE-NEXT: andb 15(%r12), %cl +; CHECK-BASELINE-NEXT: andb 15(%r15), %cl ; CHECK-BASELINE-NEXT: xorb %al, %cl ; CHECK-BASELINE-NEXT: movb %cl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill -; CHECK-BASELINE-NEXT: movzbl 16(%r15), %eax ; CHECK-BASELINE-NEXT: movzbl 16(%r14), %ecx +; CHECK-BASELINE-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload ; CHECK-BASELINE-NEXT: xorb %al, %cl -; CHECK-BASELINE-NEXT: andb 16(%r12), %cl +; CHECK-BASELINE-NEXT: andb 16(%r15), %cl ; CHECK-BASELINE-NEXT: xorb %al, %cl ; CHECK-BASELINE-NEXT: movb %cl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill -; CHECK-BASELINE-NEXT: movzbl 17(%r15), %eax ; CHECK-BASELINE-NEXT: movzbl 17(%r14), %ecx +; CHECK-BASELINE-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload ; CHECK-BASELINE-NEXT: xorb %al, %cl -; CHECK-BASELINE-NEXT: andb 17(%r12), %cl +; CHECK-BASELINE-NEXT: andb 17(%r15), %cl ; CHECK-BASELINE-NEXT: xorb %al, %cl ; CHECK-BASELINE-NEXT: movb %cl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill -; CHECK-BASELINE-NEXT: movzbl 18(%r15), %eax ; CHECK-BASELINE-NEXT: movzbl 18(%r14), %ecx +; CHECK-BASELINE-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload ; CHECK-BASELINE-NEXT: xorb %al, %cl -; CHECK-BASELINE-NEXT: andb 18(%r12), %cl +; CHECK-BASELINE-NEXT: andb 18(%r15), %cl ; CHECK-BASELINE-NEXT: xorb %al, %cl ; CHECK-BASELINE-NEXT: movb %cl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill -; CHECK-BASELINE-NEXT: movzbl 19(%r15), %eax ; CHECK-BASELINE-NEXT: movzbl 19(%r14), %ecx +; CHECK-BASELINE-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload ; CHECK-BASELINE-NEXT: xorb %al, %cl -; CHECK-BASELINE-NEXT: andb 19(%r12), %cl +; CHECK-BASELINE-NEXT: andb 19(%r15), %cl ; CHECK-BASELINE-NEXT: xorb %al, %cl ; CHECK-BASELINE-NEXT: movb %cl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill -; CHECK-BASELINE-NEXT: movzbl 20(%r15), %eax +; CHECK-BASELINE-NEXT: movzbl 20(%rbx), %eax ; CHECK-BASELINE-NEXT: movzbl 20(%r14), %ecx ; CHECK-BASELINE-NEXT: xorb %al, %cl -; CHECK-BASELINE-NEXT: andb 20(%r12), %cl +; CHECK-BASELINE-NEXT: andb 20(%r15), %cl ; CHECK-BASELINE-NEXT: xorb %al, %cl ; CHECK-BASELINE-NEXT: movb %cl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill -; CHECK-BASELINE-NEXT: movzbl 21(%r15), %eax -; CHECK-BASELINE-NEXT: movzbl 21(%r14), %ebp +; CHECK-BASELINE-NEXT: movzbl 21(%rbx), %eax +; CHECK-BASELINE-NEXT: movzbl 21(%r14), %r13d +; CHECK-BASELINE-NEXT: xorb %al, %r13b +; CHECK-BASELINE-NEXT: andb 21(%r15), %r13b +; CHECK-BASELINE-NEXT: xorb %al, %r13b +; CHECK-BASELINE-NEXT: movzbl 22(%rbx), %eax +; CHECK-BASELINE-NEXT: movzbl 22(%r14), %ebp ; CHECK-BASELINE-NEXT: xorb %al, %bpl -; CHECK-BASELINE-NEXT: andb 21(%r12), %bpl +; CHECK-BASELINE-NEXT: andb 22(%r15), %bpl ; CHECK-BASELINE-NEXT: xorb %al, %bpl -; CHECK-BASELINE-NEXT: movzbl 22(%r15), %eax -; CHECK-BASELINE-NEXT: movzbl 22(%r14), %ebx -; CHECK-BASELINE-NEXT: xorb %al, %bl -; CHECK-BASELINE-NEXT: andb 22(%r12), %bl -; CHECK-BASELINE-NEXT: xorb %al, %bl -; CHECK-BASELINE-NEXT: movzbl 23(%r15), %eax +; CHECK-BASELINE-NEXT: movzbl 23(%rbx), %eax ; CHECK-BASELINE-NEXT: 
movzbl 23(%r14), %r11d ; CHECK-BASELINE-NEXT: xorb %al, %r11b -; CHECK-BASELINE-NEXT: andb 23(%r12), %r11b +; CHECK-BASELINE-NEXT: andb 23(%r15), %r11b ; CHECK-BASELINE-NEXT: xorb %al, %r11b -; CHECK-BASELINE-NEXT: movzbl 24(%r15), %eax +; CHECK-BASELINE-NEXT: movzbl 24(%rbx), %eax ; CHECK-BASELINE-NEXT: movzbl 24(%r14), %r9d ; CHECK-BASELINE-NEXT: xorb %al, %r9b -; CHECK-BASELINE-NEXT: andb 24(%r12), %r9b +; CHECK-BASELINE-NEXT: andb 24(%r15), %r9b ; CHECK-BASELINE-NEXT: xorb %al, %r9b -; CHECK-BASELINE-NEXT: movzbl 25(%r15), %eax +; CHECK-BASELINE-NEXT: movzbl 25(%rbx), %eax ; CHECK-BASELINE-NEXT: movzbl 25(%r14), %r8d ; CHECK-BASELINE-NEXT: xorb %al, %r8b -; CHECK-BASELINE-NEXT: andb 25(%r12), %r8b +; CHECK-BASELINE-NEXT: andb 25(%r15), %r8b ; CHECK-BASELINE-NEXT: xorb %al, %r8b -; CHECK-BASELINE-NEXT: movzbl 26(%r15), %eax +; CHECK-BASELINE-NEXT: movzbl 26(%rbx), %eax ; CHECK-BASELINE-NEXT: movzbl 26(%r14), %edi ; CHECK-BASELINE-NEXT: xorb %al, %dil -; CHECK-BASELINE-NEXT: andb 26(%r12), %dil +; CHECK-BASELINE-NEXT: andb 26(%r15), %dil ; CHECK-BASELINE-NEXT: xorb %al, %dil -; CHECK-BASELINE-NEXT: movzbl 27(%r15), %eax +; CHECK-BASELINE-NEXT: movzbl 27(%rbx), %eax ; CHECK-BASELINE-NEXT: movzbl 27(%r14), %esi ; CHECK-BASELINE-NEXT: xorb %al, %sil -; CHECK-BASELINE-NEXT: andb 27(%r12), %sil +; CHECK-BASELINE-NEXT: andb 27(%r15), %sil ; CHECK-BASELINE-NEXT: xorb %al, %sil -; CHECK-BASELINE-NEXT: movzbl 28(%r15), %eax +; CHECK-BASELINE-NEXT: movzbl 28(%rbx), %eax ; CHECK-BASELINE-NEXT: movzbl 28(%r14), %edx ; CHECK-BASELINE-NEXT: xorb %al, %dl -; CHECK-BASELINE-NEXT: andb 28(%r12), %dl +; CHECK-BASELINE-NEXT: andb 28(%r15), %dl ; CHECK-BASELINE-NEXT: xorb %al, %dl -; CHECK-BASELINE-NEXT: movzbl 29(%r15), %eax +; CHECK-BASELINE-NEXT: movzbl 29(%rbx), %eax ; CHECK-BASELINE-NEXT: movzbl 29(%r14), %ecx ; CHECK-BASELINE-NEXT: xorb %al, %cl -; CHECK-BASELINE-NEXT: andb 29(%r12), %cl +; CHECK-BASELINE-NEXT: andb 29(%r15), %cl ; CHECK-BASELINE-NEXT: xorb %al, %cl -; CHECK-BASELINE-NEXT: movzbl 30(%r15), %r10d +; CHECK-BASELINE-NEXT: movzbl 30(%rbx), %r10d ; CHECK-BASELINE-NEXT: movzbl 30(%r14), %eax ; CHECK-BASELINE-NEXT: xorb %r10b, %al -; CHECK-BASELINE-NEXT: andb 30(%r12), %al +; CHECK-BASELINE-NEXT: andb 30(%r15), %al ; CHECK-BASELINE-NEXT: xorb %r10b, %al -; CHECK-BASELINE-NEXT: movzbl 31(%r15), %r10d -; CHECK-BASELINE-NEXT: movzbl 31(%r14), %r14d -; CHECK-BASELINE-NEXT: xorb %r10b, %r14b -; CHECK-BASELINE-NEXT: andb 31(%r12), %r14b -; CHECK-BASELINE-NEXT: xorb %r10b, %r14b -; CHECK-BASELINE-NEXT: movb %r14b, 31(%r13) -; CHECK-BASELINE-NEXT: movb %al, 30(%r13) -; CHECK-BASELINE-NEXT: movb %cl, 29(%r13) -; CHECK-BASELINE-NEXT: movb %dl, 28(%r13) -; CHECK-BASELINE-NEXT: movb %sil, 27(%r13) -; CHECK-BASELINE-NEXT: movb %dil, 26(%r13) -; CHECK-BASELINE-NEXT: movb %r8b, 25(%r13) -; CHECK-BASELINE-NEXT: movb %r9b, 24(%r13) -; CHECK-BASELINE-NEXT: movb %r11b, 23(%r13) -; CHECK-BASELINE-NEXT: movb %bl, 22(%r13) -; CHECK-BASELINE-NEXT: movb %bpl, 21(%r13) +; CHECK-BASELINE-NEXT: movzbl 31(%rbx), %r10d +; CHECK-BASELINE-NEXT: movzbl 31(%r14), %ebx +; CHECK-BASELINE-NEXT: xorb %r10b, %bl +; CHECK-BASELINE-NEXT: andb 31(%r15), %bl +; CHECK-BASELINE-NEXT: xorb %r10b, %bl +; CHECK-BASELINE-NEXT: movb %bl, 31(%r12) +; CHECK-BASELINE-NEXT: movb %al, 30(%r12) +; CHECK-BASELINE-NEXT: movb %cl, 29(%r12) +; CHECK-BASELINE-NEXT: movb %dl, 28(%r12) +; CHECK-BASELINE-NEXT: movb %sil, 27(%r12) +; CHECK-BASELINE-NEXT: movb %dil, 26(%r12) +; CHECK-BASELINE-NEXT: movb %r8b, 25(%r12) +; CHECK-BASELINE-NEXT: movb %r9b, 
24(%r12) +; CHECK-BASELINE-NEXT: movb %r11b, 23(%r12) +; CHECK-BASELINE-NEXT: movb %bpl, 22(%r12) +; CHECK-BASELINE-NEXT: movb %r13b, 21(%r12) ; CHECK-BASELINE-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload -; CHECK-BASELINE-NEXT: movb %al, 20(%r13) +; CHECK-BASELINE-NEXT: movb %al, 20(%r12) ; CHECK-BASELINE-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload -; CHECK-BASELINE-NEXT: movb %al, 19(%r13) +; CHECK-BASELINE-NEXT: movb %al, 19(%r12) ; CHECK-BASELINE-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload -; CHECK-BASELINE-NEXT: movb %al, 18(%r13) +; CHECK-BASELINE-NEXT: movb %al, 18(%r12) ; CHECK-BASELINE-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload -; CHECK-BASELINE-NEXT: movb %al, 17(%r13) +; CHECK-BASELINE-NEXT: movb %al, 17(%r12) ; CHECK-BASELINE-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload -; CHECK-BASELINE-NEXT: movb %al, 16(%r13) +; CHECK-BASELINE-NEXT: movb %al, 16(%r12) ; CHECK-BASELINE-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload -; CHECK-BASELINE-NEXT: movb %al, 15(%r13) +; CHECK-BASELINE-NEXT: movb %al, 15(%r12) ; CHECK-BASELINE-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload -; CHECK-BASELINE-NEXT: movb %al, 14(%r13) +; CHECK-BASELINE-NEXT: movb %al, 14(%r12) ; CHECK-BASELINE-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload -; CHECK-BASELINE-NEXT: movb %al, 13(%r13) +; CHECK-BASELINE-NEXT: movb %al, 13(%r12) ; CHECK-BASELINE-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload -; CHECK-BASELINE-NEXT: movb %al, 12(%r13) +; CHECK-BASELINE-NEXT: movb %al, 12(%r12) ; CHECK-BASELINE-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload -; CHECK-BASELINE-NEXT: movb %al, 11(%r13) +; CHECK-BASELINE-NEXT: movb %al, 11(%r12) ; CHECK-BASELINE-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload -; CHECK-BASELINE-NEXT: movb %al, 10(%r13) +; CHECK-BASELINE-NEXT: movb %al, 10(%r12) ; CHECK-BASELINE-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload -; CHECK-BASELINE-NEXT: movb %al, 9(%r13) +; CHECK-BASELINE-NEXT: movb %al, 9(%r12) ; CHECK-BASELINE-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload -; CHECK-BASELINE-NEXT: movb %al, 8(%r13) +; CHECK-BASELINE-NEXT: movb %al, 8(%r12) ; CHECK-BASELINE-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload -; CHECK-BASELINE-NEXT: movb %al, 7(%r13) +; CHECK-BASELINE-NEXT: movb %al, 7(%r12) ; CHECK-BASELINE-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload -; CHECK-BASELINE-NEXT: movb %al, 6(%r13) +; CHECK-BASELINE-NEXT: movb %al, 6(%r12) ; CHECK-BASELINE-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload -; CHECK-BASELINE-NEXT: movb %al, 5(%r13) +; CHECK-BASELINE-NEXT: movb %al, 5(%r12) ; CHECK-BASELINE-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload -; CHECK-BASELINE-NEXT: movb %al, 4(%r13) +; CHECK-BASELINE-NEXT: movb %al, 4(%r12) ; CHECK-BASELINE-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload -; CHECK-BASELINE-NEXT: movb %al, 3(%r13) +; CHECK-BASELINE-NEXT: movb %al, 3(%r12) ; CHECK-BASELINE-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload -; CHECK-BASELINE-NEXT: movb %al, 2(%r13) +; CHECK-BASELINE-NEXT: movb %al, 2(%r12) ; CHECK-BASELINE-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload -; CHECK-BASELINE-NEXT: movb %al, 1(%r13) +; CHECK-BASELINE-NEXT: movb %al, 1(%r12) ; CHECK-BASELINE-NEXT: movzbl 
{{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload -; CHECK-BASELINE-NEXT: movb %al, (%r13) -; CHECK-BASELINE-NEXT: movq %r13, %rax +; CHECK-BASELINE-NEXT: movb %al, (%r12) +; CHECK-BASELINE-NEXT: movq %r12, %rax ; CHECK-BASELINE-NEXT: popq %rbx ; CHECK-BASELINE-NEXT: popq %r12 ; CHECK-BASELINE-NEXT: popq %r13 @@ -3498,10 +3522,18 @@ define <32 x i8> @in_v32i8(ptr%px, ptr%py, ptr%pmask) nounwind { ; CHECK-SSE1-NEXT: pushq %r13 ; CHECK-SSE1-NEXT: pushq %r12 ; CHECK-SSE1-NEXT: pushq %rbx -; CHECK-SSE1-NEXT: movq %rcx, %r12 -; CHECK-SSE1-NEXT: movq %rdx, %r15 +; CHECK-SSE1-NEXT: movq %rcx, %r15 +; CHECK-SSE1-NEXT: movq %rdx, %rbx ; CHECK-SSE1-NEXT: movq %rsi, %r14 -; CHECK-SSE1-NEXT: movq %rdi, %r13 +; CHECK-SSE1-NEXT: movq %rdi, %r12 +; CHECK-SSE1-NEXT: movzbl 19(%rdx), %eax +; CHECK-SSE1-NEXT: movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill +; CHECK-SSE1-NEXT: movzbl 18(%rdx), %eax +; CHECK-SSE1-NEXT: movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill +; CHECK-SSE1-NEXT: movzbl 17(%rdx), %eax +; CHECK-SSE1-NEXT: movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill +; CHECK-SSE1-NEXT: movzbl 16(%rdx), %eax +; CHECK-SSE1-NEXT: movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill ; CHECK-SSE1-NEXT: movzbl 15(%rdx), %eax ; CHECK-SSE1-NEXT: movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill ; CHECK-SSE1-NEXT: movzbl 14(%rdx), %eax @@ -3514,241 +3546,241 @@ define <32 x i8> @in_v32i8(ptr%px, ptr%py, ptr%pmask) nounwind { ; CHECK-SSE1-NEXT: movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill ; CHECK-SSE1-NEXT: movzbl 10(%rdx), %eax ; CHECK-SSE1-NEXT: movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill -; CHECK-SSE1-NEXT: movzbl 9(%rdx), %r8d -; CHECK-SSE1-NEXT: movzbl 8(%rdx), %r9d -; CHECK-SSE1-NEXT: movzbl 7(%rdx), %r10d -; CHECK-SSE1-NEXT: movzbl 6(%rdx), %ebp -; CHECK-SSE1-NEXT: movzbl 5(%rdx), %edi +; CHECK-SSE1-NEXT: movzbl 9(%rdx), %r13d +; CHECK-SSE1-NEXT: movzbl 8(%rdx), %r10d +; CHECK-SSE1-NEXT: movzbl 7(%rdx), %r9d +; CHECK-SSE1-NEXT: movzbl 6(%rdx), %r8d +; CHECK-SSE1-NEXT: movzbl 5(%rdx), %ebp ; CHECK-SSE1-NEXT: movzbl 4(%rdx), %esi -; CHECK-SSE1-NEXT: movzbl 3(%rdx), %eax -; CHECK-SSE1-NEXT: movzbl 2(%rdx), %ecx -; CHECK-SSE1-NEXT: movzbl (%rdx), %r11d -; CHECK-SSE1-NEXT: movzbl 1(%rdx), %edx -; CHECK-SSE1-NEXT: movzbl (%r14), %ebx -; CHECK-SSE1-NEXT: xorb %r11b, %bl -; CHECK-SSE1-NEXT: andb (%r12), %bl -; CHECK-SSE1-NEXT: xorb %r11b, %bl -; CHECK-SSE1-NEXT: movb %bl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill -; CHECK-SSE1-NEXT: movzbl 1(%r14), %r11d -; CHECK-SSE1-NEXT: xorb %dl, %r11b -; CHECK-SSE1-NEXT: andb 1(%r12), %r11b -; CHECK-SSE1-NEXT: xorb %dl, %r11b +; CHECK-SSE1-NEXT: movzbl 3(%rdx), %edx +; CHECK-SSE1-NEXT: movzbl 2(%rbx), %eax +; CHECK-SSE1-NEXT: movzbl (%rbx), %edi +; CHECK-SSE1-NEXT: movzbl 1(%rbx), %ecx +; CHECK-SSE1-NEXT: movzbl (%r14), %r11d +; CHECK-SSE1-NEXT: xorb %dil, %r11b +; CHECK-SSE1-NEXT: andb (%r15), %r11b +; CHECK-SSE1-NEXT: xorb %dil, %r11b ; CHECK-SSE1-NEXT: movb %r11b, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill -; CHECK-SSE1-NEXT: movzbl 2(%r14), %edx -; CHECK-SSE1-NEXT: xorb %cl, %dl -; CHECK-SSE1-NEXT: andb 2(%r12), %dl -; CHECK-SSE1-NEXT: xorb %cl, %dl -; CHECK-SSE1-NEXT: movb %dl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill -; CHECK-SSE1-NEXT: movzbl 3(%r14), %ecx +; CHECK-SSE1-NEXT: movzbl 1(%r14), %edi +; CHECK-SSE1-NEXT: xorb %cl, %dil +; CHECK-SSE1-NEXT: andb 1(%r15), %dil +; CHECK-SSE1-NEXT: xorb %cl, %dil +; CHECK-SSE1-NEXT: movb %dil, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill +; CHECK-SSE1-NEXT: movzbl 2(%r14), %ecx ; CHECK-SSE1-NEXT: xorb %al, %cl -; 
CHECK-SSE1-NEXT: andb 3(%r12), %cl +; CHECK-SSE1-NEXT: andb 2(%r15), %cl ; CHECK-SSE1-NEXT: xorb %al, %cl ; CHECK-SSE1-NEXT: movb %cl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill +; CHECK-SSE1-NEXT: movzbl 3(%r14), %eax +; CHECK-SSE1-NEXT: xorb %dl, %al +; CHECK-SSE1-NEXT: andb 3(%r15), %al +; CHECK-SSE1-NEXT: xorb %dl, %al +; CHECK-SSE1-NEXT: movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill ; CHECK-SSE1-NEXT: movzbl 4(%r14), %eax ; CHECK-SSE1-NEXT: xorb %sil, %al -; CHECK-SSE1-NEXT: andb 4(%r12), %al +; CHECK-SSE1-NEXT: andb 4(%r15), %al ; CHECK-SSE1-NEXT: xorb %sil, %al ; CHECK-SSE1-NEXT: movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill ; CHECK-SSE1-NEXT: movzbl 5(%r14), %eax -; CHECK-SSE1-NEXT: xorb %dil, %al -; CHECK-SSE1-NEXT: andb 5(%r12), %al -; CHECK-SSE1-NEXT: xorb %dil, %al -; CHECK-SSE1-NEXT: movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill -; CHECK-SSE1-NEXT: movzbl 6(%r14), %eax ; CHECK-SSE1-NEXT: xorb %bpl, %al -; CHECK-SSE1-NEXT: andb 6(%r12), %al +; CHECK-SSE1-NEXT: andb 5(%r15), %al ; CHECK-SSE1-NEXT: xorb %bpl, %al ; CHECK-SSE1-NEXT: movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill -; CHECK-SSE1-NEXT: movzbl 7(%r14), %eax -; CHECK-SSE1-NEXT: xorb %r10b, %al -; CHECK-SSE1-NEXT: andb 7(%r12), %al -; CHECK-SSE1-NEXT: xorb %r10b, %al +; CHECK-SSE1-NEXT: movzbl 6(%r14), %eax +; CHECK-SSE1-NEXT: xorb %r8b, %al +; CHECK-SSE1-NEXT: andb 6(%r15), %al +; CHECK-SSE1-NEXT: xorb %r8b, %al ; CHECK-SSE1-NEXT: movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill -; CHECK-SSE1-NEXT: movzbl 8(%r14), %eax +; CHECK-SSE1-NEXT: movzbl 7(%r14), %eax ; CHECK-SSE1-NEXT: xorb %r9b, %al -; CHECK-SSE1-NEXT: andb 8(%r12), %al +; CHECK-SSE1-NEXT: andb 7(%r15), %al ; CHECK-SSE1-NEXT: xorb %r9b, %al ; CHECK-SSE1-NEXT: movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill +; CHECK-SSE1-NEXT: movzbl 8(%r14), %eax +; CHECK-SSE1-NEXT: xorb %r10b, %al +; CHECK-SSE1-NEXT: andb 8(%r15), %al +; CHECK-SSE1-NEXT: xorb %r10b, %al +; CHECK-SSE1-NEXT: movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill ; CHECK-SSE1-NEXT: movzbl 9(%r14), %eax -; CHECK-SSE1-NEXT: xorb %r8b, %al -; CHECK-SSE1-NEXT: andb 9(%r12), %al -; CHECK-SSE1-NEXT: xorb %r8b, %al +; CHECK-SSE1-NEXT: xorb %r13b, %al +; CHECK-SSE1-NEXT: andb 9(%r15), %al +; CHECK-SSE1-NEXT: xorb %r13b, %al ; CHECK-SSE1-NEXT: movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill ; CHECK-SSE1-NEXT: movzbl 10(%r14), %ecx ; CHECK-SSE1-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload ; CHECK-SSE1-NEXT: xorb %al, %cl -; CHECK-SSE1-NEXT: andb 10(%r12), %cl +; CHECK-SSE1-NEXT: andb 10(%r15), %cl ; CHECK-SSE1-NEXT: xorb %al, %cl ; CHECK-SSE1-NEXT: movb %cl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill ; CHECK-SSE1-NEXT: movzbl 11(%r14), %ecx ; CHECK-SSE1-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload ; CHECK-SSE1-NEXT: xorb %al, %cl -; CHECK-SSE1-NEXT: andb 11(%r12), %cl +; CHECK-SSE1-NEXT: andb 11(%r15), %cl ; CHECK-SSE1-NEXT: xorb %al, %cl ; CHECK-SSE1-NEXT: movb %cl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill ; CHECK-SSE1-NEXT: movzbl 12(%r14), %ecx ; CHECK-SSE1-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload ; CHECK-SSE1-NEXT: xorb %al, %cl -; CHECK-SSE1-NEXT: andb 12(%r12), %cl +; CHECK-SSE1-NEXT: andb 12(%r15), %cl ; CHECK-SSE1-NEXT: xorb %al, %cl ; CHECK-SSE1-NEXT: movb %cl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill ; CHECK-SSE1-NEXT: movzbl 13(%r14), %ecx ; CHECK-SSE1-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload ; CHECK-SSE1-NEXT: xorb %al, %cl -; CHECK-SSE1-NEXT: andb 13(%r12), %cl +; CHECK-SSE1-NEXT: andb 
13(%r15), %cl ; CHECK-SSE1-NEXT: xorb %al, %cl ; CHECK-SSE1-NEXT: movb %cl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill ; CHECK-SSE1-NEXT: movzbl 14(%r14), %ecx ; CHECK-SSE1-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload ; CHECK-SSE1-NEXT: xorb %al, %cl -; CHECK-SSE1-NEXT: andb 14(%r12), %cl +; CHECK-SSE1-NEXT: andb 14(%r15), %cl ; CHECK-SSE1-NEXT: xorb %al, %cl ; CHECK-SSE1-NEXT: movb %cl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill ; CHECK-SSE1-NEXT: movzbl 15(%r14), %ecx ; CHECK-SSE1-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload ; CHECK-SSE1-NEXT: xorb %al, %cl -; CHECK-SSE1-NEXT: andb 15(%r12), %cl +; CHECK-SSE1-NEXT: andb 15(%r15), %cl ; CHECK-SSE1-NEXT: xorb %al, %cl ; CHECK-SSE1-NEXT: movb %cl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill -; CHECK-SSE1-NEXT: movzbl 16(%r15), %eax ; CHECK-SSE1-NEXT: movzbl 16(%r14), %ecx +; CHECK-SSE1-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload ; CHECK-SSE1-NEXT: xorb %al, %cl -; CHECK-SSE1-NEXT: andb 16(%r12), %cl +; CHECK-SSE1-NEXT: andb 16(%r15), %cl ; CHECK-SSE1-NEXT: xorb %al, %cl ; CHECK-SSE1-NEXT: movb %cl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill -; CHECK-SSE1-NEXT: movzbl 17(%r15), %eax ; CHECK-SSE1-NEXT: movzbl 17(%r14), %ecx +; CHECK-SSE1-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload ; CHECK-SSE1-NEXT: xorb %al, %cl -; CHECK-SSE1-NEXT: andb 17(%r12), %cl +; CHECK-SSE1-NEXT: andb 17(%r15), %cl ; CHECK-SSE1-NEXT: xorb %al, %cl ; CHECK-SSE1-NEXT: movb %cl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill -; CHECK-SSE1-NEXT: movzbl 18(%r15), %eax ; CHECK-SSE1-NEXT: movzbl 18(%r14), %ecx +; CHECK-SSE1-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload ; CHECK-SSE1-NEXT: xorb %al, %cl -; CHECK-SSE1-NEXT: andb 18(%r12), %cl +; CHECK-SSE1-NEXT: andb 18(%r15), %cl ; CHECK-SSE1-NEXT: xorb %al, %cl ; CHECK-SSE1-NEXT: movb %cl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill -; CHECK-SSE1-NEXT: movzbl 19(%r15), %eax ; CHECK-SSE1-NEXT: movzbl 19(%r14), %ecx +; CHECK-SSE1-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload ; CHECK-SSE1-NEXT: xorb %al, %cl -; CHECK-SSE1-NEXT: andb 19(%r12), %cl +; CHECK-SSE1-NEXT: andb 19(%r15), %cl ; CHECK-SSE1-NEXT: xorb %al, %cl ; CHECK-SSE1-NEXT: movb %cl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill -; CHECK-SSE1-NEXT: movzbl 20(%r15), %eax +; CHECK-SSE1-NEXT: movzbl 20(%rbx), %eax ; CHECK-SSE1-NEXT: movzbl 20(%r14), %ecx ; CHECK-SSE1-NEXT: xorb %al, %cl -; CHECK-SSE1-NEXT: andb 20(%r12), %cl +; CHECK-SSE1-NEXT: andb 20(%r15), %cl ; CHECK-SSE1-NEXT: xorb %al, %cl ; CHECK-SSE1-NEXT: movb %cl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill -; CHECK-SSE1-NEXT: movzbl 21(%r15), %eax -; CHECK-SSE1-NEXT: movzbl 21(%r14), %ebp +; CHECK-SSE1-NEXT: movzbl 21(%rbx), %eax +; CHECK-SSE1-NEXT: movzbl 21(%r14), %r13d +; CHECK-SSE1-NEXT: xorb %al, %r13b +; CHECK-SSE1-NEXT: andb 21(%r15), %r13b +; CHECK-SSE1-NEXT: xorb %al, %r13b +; CHECK-SSE1-NEXT: movzbl 22(%rbx), %eax +; CHECK-SSE1-NEXT: movzbl 22(%r14), %ebp ; CHECK-SSE1-NEXT: xorb %al, %bpl -; CHECK-SSE1-NEXT: andb 21(%r12), %bpl +; CHECK-SSE1-NEXT: andb 22(%r15), %bpl ; CHECK-SSE1-NEXT: xorb %al, %bpl -; CHECK-SSE1-NEXT: movzbl 22(%r15), %eax -; CHECK-SSE1-NEXT: movzbl 22(%r14), %ebx -; CHECK-SSE1-NEXT: xorb %al, %bl -; CHECK-SSE1-NEXT: andb 22(%r12), %bl -; CHECK-SSE1-NEXT: xorb %al, %bl -; CHECK-SSE1-NEXT: movzbl 23(%r15), %eax +; CHECK-SSE1-NEXT: movzbl 23(%rbx), %eax ; CHECK-SSE1-NEXT: movzbl 23(%r14), %r11d ; CHECK-SSE1-NEXT: xorb %al, %r11b -; CHECK-SSE1-NEXT: andb 23(%r12), %r11b +; 
CHECK-SSE1-NEXT: andb 23(%r15), %r11b ; CHECK-SSE1-NEXT: xorb %al, %r11b -; CHECK-SSE1-NEXT: movzbl 24(%r15), %eax +; CHECK-SSE1-NEXT: movzbl 24(%rbx), %eax ; CHECK-SSE1-NEXT: movzbl 24(%r14), %r9d ; CHECK-SSE1-NEXT: xorb %al, %r9b -; CHECK-SSE1-NEXT: andb 24(%r12), %r9b +; CHECK-SSE1-NEXT: andb 24(%r15), %r9b ; CHECK-SSE1-NEXT: xorb %al, %r9b -; CHECK-SSE1-NEXT: movzbl 25(%r15), %eax +; CHECK-SSE1-NEXT: movzbl 25(%rbx), %eax ; CHECK-SSE1-NEXT: movzbl 25(%r14), %r8d ; CHECK-SSE1-NEXT: xorb %al, %r8b -; CHECK-SSE1-NEXT: andb 25(%r12), %r8b +; CHECK-SSE1-NEXT: andb 25(%r15), %r8b ; CHECK-SSE1-NEXT: xorb %al, %r8b -; CHECK-SSE1-NEXT: movzbl 26(%r15), %eax +; CHECK-SSE1-NEXT: movzbl 26(%rbx), %eax ; CHECK-SSE1-NEXT: movzbl 26(%r14), %edi ; CHECK-SSE1-NEXT: xorb %al, %dil -; CHECK-SSE1-NEXT: andb 26(%r12), %dil +; CHECK-SSE1-NEXT: andb 26(%r15), %dil ; CHECK-SSE1-NEXT: xorb %al, %dil -; CHECK-SSE1-NEXT: movzbl 27(%r15), %eax +; CHECK-SSE1-NEXT: movzbl 27(%rbx), %eax ; CHECK-SSE1-NEXT: movzbl 27(%r14), %esi ; CHECK-SSE1-NEXT: xorb %al, %sil -; CHECK-SSE1-NEXT: andb 27(%r12), %sil +; CHECK-SSE1-NEXT: andb 27(%r15), %sil ; CHECK-SSE1-NEXT: xorb %al, %sil -; CHECK-SSE1-NEXT: movzbl 28(%r15), %eax +; CHECK-SSE1-NEXT: movzbl 28(%rbx), %eax ; CHECK-SSE1-NEXT: movzbl 28(%r14), %edx ; CHECK-SSE1-NEXT: xorb %al, %dl -; CHECK-SSE1-NEXT: andb 28(%r12), %dl +; CHECK-SSE1-NEXT: andb 28(%r15), %dl ; CHECK-SSE1-NEXT: xorb %al, %dl -; CHECK-SSE1-NEXT: movzbl 29(%r15), %eax +; CHECK-SSE1-NEXT: movzbl 29(%rbx), %eax ; CHECK-SSE1-NEXT: movzbl 29(%r14), %ecx ; CHECK-SSE1-NEXT: xorb %al, %cl -; CHECK-SSE1-NEXT: andb 29(%r12), %cl +; CHECK-SSE1-NEXT: andb 29(%r15), %cl ; CHECK-SSE1-NEXT: xorb %al, %cl -; CHECK-SSE1-NEXT: movzbl 30(%r15), %r10d +; CHECK-SSE1-NEXT: movzbl 30(%rbx), %r10d ; CHECK-SSE1-NEXT: movzbl 30(%r14), %eax ; CHECK-SSE1-NEXT: xorb %r10b, %al -; CHECK-SSE1-NEXT: andb 30(%r12), %al +; CHECK-SSE1-NEXT: andb 30(%r15), %al ; CHECK-SSE1-NEXT: xorb %r10b, %al -; CHECK-SSE1-NEXT: movzbl 31(%r15), %r10d -; CHECK-SSE1-NEXT: movzbl 31(%r14), %r14d -; CHECK-SSE1-NEXT: xorb %r10b, %r14b -; CHECK-SSE1-NEXT: andb 31(%r12), %r14b -; CHECK-SSE1-NEXT: xorb %r10b, %r14b -; CHECK-SSE1-NEXT: movb %r14b, 31(%r13) -; CHECK-SSE1-NEXT: movb %al, 30(%r13) -; CHECK-SSE1-NEXT: movb %cl, 29(%r13) -; CHECK-SSE1-NEXT: movb %dl, 28(%r13) -; CHECK-SSE1-NEXT: movb %sil, 27(%r13) -; CHECK-SSE1-NEXT: movb %dil, 26(%r13) -; CHECK-SSE1-NEXT: movb %r8b, 25(%r13) -; CHECK-SSE1-NEXT: movb %r9b, 24(%r13) -; CHECK-SSE1-NEXT: movb %r11b, 23(%r13) -; CHECK-SSE1-NEXT: movb %bl, 22(%r13) -; CHECK-SSE1-NEXT: movb %bpl, 21(%r13) +; CHECK-SSE1-NEXT: movzbl 31(%rbx), %r10d +; CHECK-SSE1-NEXT: movzbl 31(%r14), %ebx +; CHECK-SSE1-NEXT: xorb %r10b, %bl +; CHECK-SSE1-NEXT: andb 31(%r15), %bl +; CHECK-SSE1-NEXT: xorb %r10b, %bl +; CHECK-SSE1-NEXT: movb %bl, 31(%r12) +; CHECK-SSE1-NEXT: movb %al, 30(%r12) +; CHECK-SSE1-NEXT: movb %cl, 29(%r12) +; CHECK-SSE1-NEXT: movb %dl, 28(%r12) +; CHECK-SSE1-NEXT: movb %sil, 27(%r12) +; CHECK-SSE1-NEXT: movb %dil, 26(%r12) +; CHECK-SSE1-NEXT: movb %r8b, 25(%r12) +; CHECK-SSE1-NEXT: movb %r9b, 24(%r12) +; CHECK-SSE1-NEXT: movb %r11b, 23(%r12) +; CHECK-SSE1-NEXT: movb %bpl, 22(%r12) +; CHECK-SSE1-NEXT: movb %r13b, 21(%r12) ; CHECK-SSE1-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload -; CHECK-SSE1-NEXT: movb %al, 20(%r13) +; CHECK-SSE1-NEXT: movb %al, 20(%r12) ; CHECK-SSE1-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload -; CHECK-SSE1-NEXT: movb %al, 19(%r13) +; CHECK-SSE1-NEXT: movb 
%al, 19(%r12) ; CHECK-SSE1-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload -; CHECK-SSE1-NEXT: movb %al, 18(%r13) +; CHECK-SSE1-NEXT: movb %al, 18(%r12) ; CHECK-SSE1-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload -; CHECK-SSE1-NEXT: movb %al, 17(%r13) +; CHECK-SSE1-NEXT: movb %al, 17(%r12) ; CHECK-SSE1-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload -; CHECK-SSE1-NEXT: movb %al, 16(%r13) +; CHECK-SSE1-NEXT: movb %al, 16(%r12) ; CHECK-SSE1-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload -; CHECK-SSE1-NEXT: movb %al, 15(%r13) +; CHECK-SSE1-NEXT: movb %al, 15(%r12) ; CHECK-SSE1-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload -; CHECK-SSE1-NEXT: movb %al, 14(%r13) +; CHECK-SSE1-NEXT: movb %al, 14(%r12) ; CHECK-SSE1-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload -; CHECK-SSE1-NEXT: movb %al, 13(%r13) +; CHECK-SSE1-NEXT: movb %al, 13(%r12) ; CHECK-SSE1-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload -; CHECK-SSE1-NEXT: movb %al, 12(%r13) +; CHECK-SSE1-NEXT: movb %al, 12(%r12) ; CHECK-SSE1-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload -; CHECK-SSE1-NEXT: movb %al, 11(%r13) +; CHECK-SSE1-NEXT: movb %al, 11(%r12) ; CHECK-SSE1-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload -; CHECK-SSE1-NEXT: movb %al, 10(%r13) +; CHECK-SSE1-NEXT: movb %al, 10(%r12) ; CHECK-SSE1-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload -; CHECK-SSE1-NEXT: movb %al, 9(%r13) +; CHECK-SSE1-NEXT: movb %al, 9(%r12) ; CHECK-SSE1-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload -; CHECK-SSE1-NEXT: movb %al, 8(%r13) +; CHECK-SSE1-NEXT: movb %al, 8(%r12) ; CHECK-SSE1-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload -; CHECK-SSE1-NEXT: movb %al, 7(%r13) +; CHECK-SSE1-NEXT: movb %al, 7(%r12) ; CHECK-SSE1-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload -; CHECK-SSE1-NEXT: movb %al, 6(%r13) +; CHECK-SSE1-NEXT: movb %al, 6(%r12) ; CHECK-SSE1-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload -; CHECK-SSE1-NEXT: movb %al, 5(%r13) +; CHECK-SSE1-NEXT: movb %al, 5(%r12) ; CHECK-SSE1-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload -; CHECK-SSE1-NEXT: movb %al, 4(%r13) +; CHECK-SSE1-NEXT: movb %al, 4(%r12) ; CHECK-SSE1-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload -; CHECK-SSE1-NEXT: movb %al, 3(%r13) +; CHECK-SSE1-NEXT: movb %al, 3(%r12) ; CHECK-SSE1-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload -; CHECK-SSE1-NEXT: movb %al, 2(%r13) +; CHECK-SSE1-NEXT: movb %al, 2(%r12) ; CHECK-SSE1-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload -; CHECK-SSE1-NEXT: movb %al, 1(%r13) +; CHECK-SSE1-NEXT: movb %al, 1(%r12) ; CHECK-SSE1-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload -; CHECK-SSE1-NEXT: movb %al, (%r13) -; CHECK-SSE1-NEXT: movq %r13, %rax +; CHECK-SSE1-NEXT: movb %al, (%r12) +; CHECK-SSE1-NEXT: movq %r12, %rax ; CHECK-SSE1-NEXT: popq %rbx ; CHECK-SSE1-NEXT: popq %r12 ; CHECK-SSE1-NEXT: popq %r13 diff --git a/llvm/test/CodeGen/X86/x86-64-flags-intrinsics.ll b/llvm/test/CodeGen/X86/x86-64-flags-intrinsics.ll index e3bc77d4d5fa2..d9dc117397b4a 100644 --- a/llvm/test/CodeGen/X86/x86-64-flags-intrinsics.ll +++ b/llvm/test/CodeGen/X86/x86-64-flags-intrinsics.ll @@ -91,11 +91,11 @@ define i64 @read_flags_reg_pressure() nounwind { ; WIN64-NEXT: leaq {{[0-9]+}}(%rsp), %rbp ; WIN64-NEXT: #APP ; 
WIN64-NEXT: #NO_APP -; WIN64-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; WIN64-NEXT: movq %rcx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill ; WIN64-NEXT: pushfq -; WIN64-NEXT: popq %rdx -; WIN64-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; WIN64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Reload +; WIN64-NEXT: popq %rcx +; WIN64-NEXT: movq %rcx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; WIN64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload ; WIN64-NEXT: #APP ; WIN64-NEXT: #NO_APP ; WIN64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload @@ -168,11 +168,11 @@ define void @write_flags_reg_pressure(i64 noundef %0) nounwind { ; WIN64-NEXT: movq %rcx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill ; WIN64-NEXT: #APP ; WIN64-NEXT: #NO_APP -; WIN64-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; WIN64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Reload -; WIN64-NEXT: pushq %rdx +; WIN64-NEXT: movq %rcx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; WIN64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload +; WIN64-NEXT: pushq %rcx ; WIN64-NEXT: popfq -; WIN64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Reload +; WIN64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload ; WIN64-NEXT: #APP ; WIN64-NEXT: #NO_APP ; WIN64-NEXT: addq $16, %rsp diff --git a/llvm/test/TableGen/bare-minimum-psets.td b/llvm/test/TableGen/bare-minimum-psets.td index 25e0bd2a83d1d..170838dd5f01c 100644 --- a/llvm/test/TableGen/bare-minimum-psets.td +++ b/llvm/test/TableGen/bare-minimum-psets.td @@ -55,7 +55,7 @@ def MyTarget : Target; // CHECK-NEXT: } // CHECK: unsigned MyTargetGenRegisterInfo:: -// CHECK-NEXT: getRegPressureSetLimit(const MachineFunction &MF, unsigned Idx) const { +// CHECK-NEXT: getRawRegPressureSetLimit(const MachineFunction &MF, unsigned Idx) const { // CHECK-NEXT: static const uint8_t PressureLimitTable[] = { // CHECK-NEXT: {{[0-9]+}}, // 0: D_32 // CHECK-NEXT: }; diff --git a/llvm/test/TableGen/inhibit-pset.td b/llvm/test/TableGen/inhibit-pset.td index 1f4f8a176c62c..b3443f1938e8c 100644 --- a/llvm/test/TableGen/inhibit-pset.td +++ b/llvm/test/TableGen/inhibit-pset.td @@ -15,7 +15,7 @@ def X0 : Register <"x0">; // CHECK-NEXT: } // CHECK: unsigned TestTargetGenRegisterInfo:: -// CHECK-NEXT: getRegPressureSetLimit(const MachineFunction &MF, unsigned Idx) const { +// CHECK-NEXT: getRawRegPressureSetLimit(const MachineFunction &MF, unsigned Idx) const { // CHECK-NEXT: static const uint16_t PressureLimitTable[] = { // CHECK-NEXT: {{[0-9]+}}, // 0: GPR32 // CHECK-NEXT: }; diff --git a/llvm/unittests/CodeGen/MFCommon.inc b/llvm/unittests/CodeGen/MFCommon.inc index 67759bd5c4632..08bb859dee40c 100644 --- a/llvm/unittests/CodeGen/MFCommon.inc +++ b/llvm/unittests/CodeGen/MFCommon.inc @@ -50,8 +50,8 @@ public: const char *getRegPressureSetName(unsigned Idx) const override { return "bogus"; } - unsigned getRegPressureSetLimit(const MachineFunction &MF, - unsigned Idx) const override { + unsigned getRawRegPressureSetLimit(const MachineFunction &MF, + unsigned Idx) const override { return 0; } const int * diff --git a/llvm/utils/TableGen/RegisterInfoEmitter.cpp b/llvm/utils/TableGen/RegisterInfoEmitter.cpp index a6f87119aca5b..79c2eb2acc564 100644 --- a/llvm/utils/TableGen/RegisterInfoEmitter.cpp +++ b/llvm/utils/TableGen/RegisterInfoEmitter.cpp @@ -275,8 +275,8 @@ void RegisterInfoEmitter::EmitRegUnitPressure(raw_ostream &OS, OS << "// Get the register unit pressure limit for this dimension.\n" << "// This limit must be adjusted 
dynamically for reserved registers.\n" << "unsigned " << ClassName << "::\n" - << "getRegPressureSetLimit(const MachineFunction &MF, unsigned Idx) const " - "{\n" + << "getRawRegPressureSetLimit(const MachineFunction &MF, unsigned Idx) " + "const {\n" << " static const " << getMinimalTypeForRange(MaxRegUnitWeight, 32) << " PressureLimitTable[] = {\n"; for (unsigned i = 0; i < NumSets; ++i) { @@ -1129,7 +1129,8 @@ void RegisterInfoEmitter::runTargetHeader(raw_ostream &OS) { << " unsigned getRegUnitWeight(unsigned RegUnit) const override;\n" << " unsigned getNumRegPressureSets() const override;\n" << " const char *getRegPressureSetName(unsigned Idx) const override;\n" - << " unsigned getRegPressureSetLimit(const MachineFunction &MF, unsigned " + << " unsigned getRawRegPressureSetLimit(const MachineFunction &MF, " + "unsigned " "Idx) const override;\n" << " const int *getRegClassPressureSets(" << "const TargetRegisterClass *RC) const override;\n" From fae615d7832d99458c98ed6d170f22c9b46d872c Mon Sep 17 00:00:00 2001 From: Wang Pengcheng Date: Thu, 5 Dec 2024 22:00:19 +0800 Subject: [PATCH 2/6] Use MCPhysReg Created using spr 1.3.6-beta.1 --- llvm/lib/CodeGen/TargetRegisterInfo.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/llvm/lib/CodeGen/TargetRegisterInfo.cpp b/llvm/lib/CodeGen/TargetRegisterInfo.cpp index 4cede283a7232..a5bb90d511d6a 100644 --- a/llvm/lib/CodeGen/TargetRegisterInfo.cpp +++ b/llvm/lib/CodeGen/TargetRegisterInfo.cpp @@ -699,7 +699,7 @@ unsigned TargetRegisterInfo::getRegPressureSetLimit(const MachineFunction &MF, unsigned NReserved = 0; const BitVector Reserved = MF.getRegInfo().getReservedRegs(); - for (unsigned PhysReg : RC->getRawAllocationOrder(MF)) + for (MCPhysReg PhysReg : RC->getRawAllocationOrder(MF)) if (Reserved.test(PhysReg)) NReserved++; From 723597a756c005734c72cb98a543def099456c47 Mon Sep 17 00:00:00 2001 From: Wang Pengcheng Date: Fri, 6 Dec 2024 11:15:37 +0800 Subject: [PATCH 3/6] Add new line Created using spr 1.3.6-beta.1 --- llvm/lib/CodeGen/TargetRegisterInfo.cpp | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/llvm/lib/CodeGen/TargetRegisterInfo.cpp b/llvm/lib/CodeGen/TargetRegisterInfo.cpp index a5bb90d511d6a..a7c1b29277c7f 100644 --- a/llvm/lib/CodeGen/TargetRegisterInfo.cpp +++ b/llvm/lib/CodeGen/TargetRegisterInfo.cpp @@ -710,9 +710,8 @@ unsigned TargetRegisterInfo::getRegPressureSetLimit(const MachineFunction &MF, // Avoid returning zero, RegisterClassInfo::getRegPressureSetLimit(Idx) // assumes this returns non-zero value. 
 if (NAllocatableRegs == 0) {
-    LLVM_DEBUG({
-      dbgs() << "All registers of " << getRegClassName(RC) << " are reserved!";
-    });
+    LLVM_DEBUG(dbgs() << "All registers of " << getRegClassName(RC)
+                      << " are reserved!\n";);
     return RegPressureSetLimit;
   }
   return RegPressureSetLimit - getRegClassWeight(RC).RegWeight * NReserved;

From 5adec359eeb888d35dd836b79f43c710f6aa98ee Mon Sep 17 00:00:00 2001
From: Wang Pengcheng
Date: Fri, 6 Dec 2024 15:14:40 +0800
Subject: [PATCH 4/6] Add a comment suggesting that users not call
 TargetRegisterInfo::getRegPressureSetLimit directly

Created using spr 1.3.6-beta.1
---
 llvm/include/llvm/CodeGen/TargetRegisterInfo.h | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/llvm/include/llvm/CodeGen/TargetRegisterInfo.h b/llvm/include/llvm/CodeGen/TargetRegisterInfo.h
index f7cd7cfe1aa15..c9b61f9207a39 100644
--- a/llvm/include/llvm/CodeGen/TargetRegisterInfo.h
+++ b/llvm/include/llvm/CodeGen/TargetRegisterInfo.h
@@ -914,6 +914,8 @@ class TargetRegisterInfo : public MCRegisterInfo {
 
   /// Get the register unit pressure limit for this dimension.
   /// TargetRegisterInfo adjusts this limit for reserved registers.
+  /// Avoid using this method directly as it is costly to compute. Use the
+  /// cached version `RegisterClassInfo::getRegPressureSetLimit` instead.
   virtual unsigned getRegPressureSetLimit(const MachineFunction &MF,
                                           unsigned Idx) const;
 
From 9737ec845e1c79af1e7afbb58c62b318dd9c3cac Mon Sep 17 00:00:00 2001
From: Wang Pengcheng
Date: Tue, 10 Dec 2024 11:59:16 +0800
Subject: [PATCH 5/6] Use cached version of getRegPressureSetLimit

Created using spr 1.3.6-beta.1
---
 llvm/lib/CodeGen/MachineLICM.cpp         | 4 +++-
 llvm/lib/CodeGen/MachinePipeliner.cpp    | 2 +-
 llvm/lib/CodeGen/MachineSink.cpp         | 2 +-
 llvm/lib/Target/ARM/ARMBaseInstrInfo.cpp | 3 +--
 llvm/lib/Target/PowerPC/PPCInstrInfo.cpp | 4 ++--
 5 files changed, 8 insertions(+), 7 deletions(-)

diff --git a/llvm/lib/CodeGen/MachineLICM.cpp b/llvm/lib/CodeGen/MachineLICM.cpp
index d21059189b184..8aaa5605f28b7 100644
--- a/llvm/lib/CodeGen/MachineLICM.cpp
+++ b/llvm/lib/CodeGen/MachineLICM.cpp
@@ -123,6 +123,7 @@ namespace {
   const TargetRegisterInfo *TRI = nullptr;
   const MachineFrameInfo *MFI = nullptr;
   MachineRegisterInfo *MRI = nullptr;
+  RegisterClassInfo RegClassInfo;
   TargetSchedModel SchedModel;
   bool PreRegAlloc = false;
   bool HasProfileData = false;
@@ -389,6 +390,7 @@ bool MachineLICMImpl::run(MachineFunction &MF) {
   MFI = &MF.getFrameInfo();
   MRI = &MF.getRegInfo();
   SchedModel.init(&ST);
+  RegClassInfo.runOnMachineFunction(MF);
 
   HasProfileData = MF.getFunction().hasProfileData();
 
@@ -405,7 +407,7 @@ bool MachineLICMImpl::run(MachineFunction &MF) {
     std::fill(RegPressure.begin(), RegPressure.end(), 0);
     RegLimit.resize(NumRPS);
     for (unsigned i = 0, e = NumRPS; i != e; ++i)
-      RegLimit[i] = TRI->getRegPressureSetLimit(MF, i);
+      RegLimit[i] = RegClassInfo.getRegPressureSetLimit(i);
   }
 
   if (HoistConstLoads)
diff --git a/llvm/lib/CodeGen/MachinePipeliner.cpp b/llvm/lib/CodeGen/MachinePipeliner.cpp
index 3ee0ba1fea507..e2bbebfc5f546 100644
--- a/llvm/lib/CodeGen/MachinePipeliner.cpp
+++ b/llvm/lib/CodeGen/MachinePipeliner.cpp
@@ -1326,7 +1326,7 @@ class HighRegisterPressureDetector {
   // Calculate the upper limit of each pressure set
   void computePressureSetLimit(const RegisterClassInfo &RCI) {
     for (unsigned PSet = 0; PSet < PSetNum; PSet++)
-      PressureSetLimit[PSet] = TRI->getRegPressureSetLimit(MF, PSet);
+      PressureSetLimit[PSet] = RCI.getRegPressureSetLimit(PSet);
   }
 
   // There are two patterns of last-use.
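
The callers converted in this patch all follow the same shape: run RegisterClassInfo over the machine function once, then serve every limit query from its cache. A minimal sketch of that pattern, assuming a hypothetical wrapper (PressureLimits and its method names are illustrative and not part of the patch; runOnMachineFunction and getRegPressureSetLimit are the real RegisterClassInfo APIs used in the hunks above):

#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/RegisterClassInfo.h"

namespace {
// Illustrative wrapper, not part of this patch series.
struct PressureLimits {
  llvm::RegisterClassInfo RCI;

  // The costly step: run once per function, mirroring the
  // RegClassInfo.runOnMachineFunction(MF) call added to MachineLICM above.
  void init(const llvm::MachineFunction &MF) { RCI.runOnMachineFunction(MF); }

  // Cheap query: RegisterClassInfo computes the per-index limit lazily on
  // first use and memoizes it, so repeated queries do not redo the
  // reserved-register adjustment the way a direct call to
  // TargetRegisterInfo::getRegPressureSetLimit does on every invocation.
  bool exceeds(unsigned PSet, unsigned Pressure) const {
    return Pressure > RCI.getRegPressureSetLimit(PSet);
  }
};
} // namespace

This is the point of the comment added in PATCH 4: after the first lookup of a pressure set through the cached entry point, every subsequent query is a table read.
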
diff --git a/llvm/lib/CodeGen/MachineSink.cpp b/llvm/lib/CodeGen/MachineSink.cpp index 7d0bedab7cdab..d407d8a965ea6 100644 --- a/llvm/lib/CodeGen/MachineSink.cpp +++ b/llvm/lib/CodeGen/MachineSink.cpp @@ -1094,7 +1094,7 @@ bool MachineSinking::registerPressureSetExceedsLimit( std::vector BBRegisterPressure = getBBRegisterPressure(MBB); for (; *PS != -1; PS++) if (Weight + BBRegisterPressure[*PS] >= - TRI->getRegPressureSetLimit(*MBB.getParent(), *PS)) + RegClassInfo.getRegPressureSetLimit(*PS)) return true; return false; } diff --git a/llvm/lib/Target/ARM/ARMBaseInstrInfo.cpp b/llvm/lib/Target/ARM/ARMBaseInstrInfo.cpp index e6b37dd916168..8673deddb7057 100644 --- a/llvm/lib/Target/ARM/ARMBaseInstrInfo.cpp +++ b/llvm/lib/Target/ARM/ARMBaseInstrInfo.cpp @@ -6936,7 +6936,6 @@ bool ARMPipelinerLoopInfo::tooMuchRegisterPressure(SwingSchedulerDAG &SSD, RegClassInfo.runOnMachineFunction(*MF); RPTracker.init(MF, &RegClassInfo, nullptr, EndLoop->getParent(), EndLoop->getParent()->end(), false, false); - const TargetRegisterInfo *TRI = MF->getSubtarget().getRegisterInfo(); bumpCrossIterationPressure(RPTracker, CrossIterationNeeds); @@ -6979,7 +6978,7 @@ bool ARMPipelinerLoopInfo::tooMuchRegisterPressure(SwingSchedulerDAG &SSD, auto &P = RPTracker.getPressure().MaxSetPressure; for (unsigned I = 0, E = P.size(); I < E; ++I) - if (P[I] > TRI->getRegPressureSetLimit(*MF, I)) { + if (P[I] > RegClassInfo.getRegPressureSetLimit(I)) { return true; } return false; diff --git a/llvm/lib/Target/PowerPC/PPCInstrInfo.cpp b/llvm/lib/Target/PowerPC/PPCInstrInfo.cpp index 44f6db5061e21..fa45a7fb7fabe 100644 --- a/llvm/lib/Target/PowerPC/PPCInstrInfo.cpp +++ b/llvm/lib/Target/PowerPC/PPCInstrInfo.cpp @@ -643,8 +643,8 @@ bool PPCInstrInfo::shouldReduceRegisterPressure( }; // For now we only care about float and double type fma. - unsigned VSSRCLimit = TRI->getRegPressureSetLimit( - *MBB->getParent(), PPC::RegisterPressureSets::VSSRC); + unsigned VSSRCLimit = + RegClassInfo->getRegPressureSetLimit(PPC::RegisterPressureSets::VSSRC); // Only reduce register pressure when pressure is high. 
 return GetMBBPressure(MBB)[PPC::RegisterPressureSets::VSSRC] >

From 47c327525234273de372c3294c7aa74173eba5b6 Mon Sep 17 00:00:00 2001
From: Wang Pengcheng
Date: Fri, 10 Jan 2025 16:59:55 +0800
Subject: [PATCH 6/6] Fix X86 regressions

Created using spr 1.3.6-beta.1
---
 llvm/lib/CodeGen/TargetRegisterInfo.cpp       |  10 +-
 llvm/test/CodeGen/PowerPC/aix64-csr-alloc.mir |   1 +
 llvm/test/CodeGen/X86/avx512-regcall-Mask.ll  |   4 +-
 .../test/CodeGen/X86/avx512-regcall-NoMask.ll |   8 +-
 llvm/test/CodeGen/X86/sse-regcall.ll          |   8 +-
 llvm/test/CodeGen/X86/sse-regcall4.ll         |   8 +-
 .../subvectorwise-store-of-vector-splat.ll    | 335 ++++++-----
 ...unfold-masked-merge-vector-variablemask.ll | 556 +++++++++---------
 .../CodeGen/X86/x86-64-flags-intrinsics.ll    |  16 +-
 9 files changed, 456 insertions(+), 490 deletions(-)

diff --git a/llvm/lib/CodeGen/TargetRegisterInfo.cpp b/llvm/lib/CodeGen/TargetRegisterInfo.cpp
index 73cf1c4d9a5ba..4da1d5920894b 100644
--- a/llvm/lib/CodeGen/TargetRegisterInfo.cpp
+++ b/llvm/lib/CodeGen/TargetRegisterInfo.cpp
@@ -738,13 +738,12 @@ unsigned TargetRegisterInfo::getRegPressureSetLimit(const MachineFunction &MF,
   }
   assert(RC && "Failed to find register class");
 
-  unsigned NReserved = 0;
-  const BitVector Reserved = MF.getRegInfo().getReservedRegs();
+  unsigned NAllocatableRegs = 0;
+  const BitVector &Reserved = MF.getRegInfo().getReservedRegs();
   for (MCPhysReg PhysReg : RC->getRawAllocationOrder(MF))
-    if (Reserved.test(PhysReg))
-      NReserved++;
+    if (!Reserved.test(PhysReg))
+      NAllocatableRegs++;
 
-  unsigned NAllocatableRegs = RC->getNumRegs() - NReserved;
   unsigned RegPressureSetLimit = getRawRegPressureSetLimit(MF, Idx);
   // If all the regs are reserved, return raw RegPressureSetLimit.
   // One example is VRSAVERC in PowerPC.
@@ -755,6 +754,7 @@ unsigned TargetRegisterInfo::getRegPressureSetLimit(const MachineFunction &MF,
                       << " are reserved!\n";);
     return RegPressureSetLimit;
   }
+  unsigned NReserved = RC->getNumRegs() - NAllocatableRegs;
   return RegPressureSetLimit - getRegClassWeight(RC).RegWeight * NReserved;
 }
 
diff --git a/llvm/test/CodeGen/PowerPC/aix64-csr-alloc.mir b/llvm/test/CodeGen/PowerPC/aix64-csr-alloc.mir
index 3617b95b2a6af..2f93a0ddcfb67 100644
--- a/llvm/test/CodeGen/PowerPC/aix64-csr-alloc.mir
+++ b/llvm/test/CodeGen/PowerPC/aix64-csr-alloc.mir
@@ -17,4 +17,5 @@ body: |
     BLR8 implicit $lr8, implicit undef $rm, implicit $x3, implicit $f1
 ...
# CHECK-DAG: AllocationOrder(G8RC_and_G8RC_NOX0) = [ $x3 $x4 $x5 $x6 $x7 $x8 $x9 $x10 $x11 $x12 $x2 $x31 $x30 $x29 $x28 $x27 $x26 $x25 $x24 $x23 $x22 $x21 $x20 $x19 $x18 $x17 $x16 $x15 $x14 ] +# CHECK-DAG: AllocationOrder(G8RC) = [ $x3 $x4 $x5 $x6 $x7 $x8 $x9 $x10 $x11 $x12 $x0 $x2 $x31 $x30 $x29 $x28 $x27 $x26 $x25 $x24 $x23 $x22 $x21 $x20 $x19 $x18 $x17 $x16 $x15 $x14 ] # CHECK-DAG: AllocationOrder(F8RC) = [ $f0 $f1 $f2 $f3 $f4 $f5 $f6 $f7 $f8 $f9 $f10 $f11 $f12 $f13 $f31 $f30 $f29 $f28 $f27 $f26 $f25 $f24 $f23 $f22 $f21 $f20 $f19 $f18 $f17 $f16 $f15 $f14 ] diff --git a/llvm/test/CodeGen/X86/avx512-regcall-Mask.ll b/llvm/test/CodeGen/X86/avx512-regcall-Mask.ll index 85d302abfd1ae..b3a0c7dffae11 100644 --- a/llvm/test/CodeGen/X86/avx512-regcall-Mask.ll +++ b/llvm/test/CodeGen/X86/avx512-regcall-Mask.ll @@ -138,7 +138,6 @@ define dso_local i64 @caller_argv64i1() #0 { ; WIN64-NEXT: movq %rax, %rcx ; WIN64-NEXT: movq %rax, %rdx ; WIN64-NEXT: movq %rax, %rdi -; WIN64-NEXT: movq %rax, %rsi ; WIN64-NEXT: movq %rax, %r8 ; WIN64-NEXT: movq %rax, %r9 ; WIN64-NEXT: movq %rax, %r10 @@ -146,6 +145,7 @@ define dso_local i64 @caller_argv64i1() #0 { ; WIN64-NEXT: movq %rax, %r12 ; WIN64-NEXT: movq %rax, %r14 ; WIN64-NEXT: movq %rax, %r15 +; WIN64-NEXT: movq %rax, %rsi ; WIN64-NEXT: callq test_argv64i1 ; WIN64-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload ; WIN64-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload @@ -178,13 +178,13 @@ define dso_local i64 @caller_argv64i1() #0 { ; LINUXOSX64-NEXT: movq %rax, %rcx ; LINUXOSX64-NEXT: movq %rax, %rdx ; LINUXOSX64-NEXT: movq %rax, %rdi -; LINUXOSX64-NEXT: movq %rax, %rsi ; LINUXOSX64-NEXT: movq %rax, %r8 ; LINUXOSX64-NEXT: movq %rax, %r9 ; LINUXOSX64-NEXT: movq %rax, %r12 ; LINUXOSX64-NEXT: movq %rax, %r13 ; LINUXOSX64-NEXT: movq %rax, %r14 ; LINUXOSX64-NEXT: movq %rax, %r15 +; LINUXOSX64-NEXT: movq %rax, %rsi ; LINUXOSX64-NEXT: pushq %rax ; LINUXOSX64-NEXT: .cfi_adjust_cfa_offset 8 ; LINUXOSX64-NEXT: pushq %rax diff --git a/llvm/test/CodeGen/X86/avx512-regcall-NoMask.ll b/llvm/test/CodeGen/X86/avx512-regcall-NoMask.ll index 3aed15caa4ea7..2081d201704f3 100644 --- a/llvm/test/CodeGen/X86/avx512-regcall-NoMask.ll +++ b/llvm/test/CodeGen/X86/avx512-regcall-NoMask.ll @@ -987,6 +987,8 @@ define dso_local x86_regcallcc i32 @testi32_inp(i32 %a1, i32 %a2, i32 %a3, i32 % ; WIN64: # %bb.0: ; WIN64-NEXT: pushq %rbp ; WIN64-NEXT: pushq %rbx +; WIN64-NEXT: # kill: def $edx killed $edx def $rdx +; WIN64-NEXT: # kill: def $esi killed $esi def $rsi ; WIN64-NEXT: # kill: def $r15d killed $r15d def $r15 ; WIN64-NEXT: # kill: def $r14d killed $r14d def $r14 ; WIN64-NEXT: # kill: def $r12d killed $r12d def $r12 @@ -994,9 +996,7 @@ define dso_local x86_regcallcc i32 @testi32_inp(i32 %a1, i32 %a2, i32 %a3, i32 % ; WIN64-NEXT: # kill: def $r10d killed $r10d def $r10 ; WIN64-NEXT: # kill: def $r9d killed $r9d def $r9 ; WIN64-NEXT: # kill: def $r8d killed $r8d def $r8 -; WIN64-NEXT: # kill: def $esi killed $esi def $rsi ; WIN64-NEXT: # kill: def $edi killed $edi def $rdi -; WIN64-NEXT: # kill: def $edx killed $edx def $rdx ; WIN64-NEXT: leal (%rdx,%rdi), %ebx ; WIN64-NEXT: movl %edx, %ebp ; WIN64-NEXT: subl %edi, %ebp @@ -1032,14 +1032,14 @@ define dso_local x86_regcallcc i32 @testi32_inp(i32 %a1, i32 %a2, i32 %a3, i32 % ; ; LINUXOSX64-LABEL: testi32_inp: ; LINUXOSX64: # %bb.0: +; LINUXOSX64-NEXT: # kill: def $edx killed $edx def $rdx +; LINUXOSX64-NEXT: # kill: def $esi killed $esi def $rsi ; LINUXOSX64-NEXT: # kill: def $r14d killed $r14d def $r14 ; 
LINUXOSX64-NEXT: # kill: def $r13d killed $r13d def $r13 ; LINUXOSX64-NEXT: # kill: def $r12d killed $r12d def $r12 ; LINUXOSX64-NEXT: # kill: def $r9d killed $r9d def $r9 ; LINUXOSX64-NEXT: # kill: def $r8d killed $r8d def $r8 -; LINUXOSX64-NEXT: # kill: def $esi killed $esi def $rsi ; LINUXOSX64-NEXT: # kill: def $edi killed $edi def $rdi -; LINUXOSX64-NEXT: # kill: def $edx killed $edx def $rdx ; LINUXOSX64-NEXT: leal (%rdx,%rdi), %r10d ; LINUXOSX64-NEXT: movl %edx, %r11d ; LINUXOSX64-NEXT: subl %edi, %r11d diff --git a/llvm/test/CodeGen/X86/sse-regcall.ll b/llvm/test/CodeGen/X86/sse-regcall.ll index e014c9f895383..6f0293392eef2 100644 --- a/llvm/test/CodeGen/X86/sse-regcall.ll +++ b/llvm/test/CodeGen/X86/sse-regcall.ll @@ -244,6 +244,8 @@ define x86_regcallcc i32 @testi32_inp(i32 %a1, i32 %a2, i32 %a3, i32 %a4, i32 %a ; WIN64: # %bb.0: ; WIN64-NEXT: pushq %rbp ; WIN64-NEXT: pushq %rbx +; WIN64-NEXT: # kill: def $edx killed $edx def $rdx +; WIN64-NEXT: # kill: def $esi killed $esi def $rsi ; WIN64-NEXT: # kill: def $r15d killed $r15d def $r15 ; WIN64-NEXT: # kill: def $r14d killed $r14d def $r14 ; WIN64-NEXT: # kill: def $r12d killed $r12d def $r12 @@ -251,9 +253,7 @@ define x86_regcallcc i32 @testi32_inp(i32 %a1, i32 %a2, i32 %a3, i32 %a4, i32 %a ; WIN64-NEXT: # kill: def $r10d killed $r10d def $r10 ; WIN64-NEXT: # kill: def $r9d killed $r9d def $r9 ; WIN64-NEXT: # kill: def $r8d killed $r8d def $r8 -; WIN64-NEXT: # kill: def $esi killed $esi def $rsi ; WIN64-NEXT: # kill: def $edi killed $edi def $rdi -; WIN64-NEXT: # kill: def $edx killed $edx def $rdx ; WIN64-NEXT: leal (%rdx,%rdi), %ebx ; WIN64-NEXT: movl %edx, %ebp ; WIN64-NEXT: subl %edi, %ebp @@ -289,14 +289,14 @@ define x86_regcallcc i32 @testi32_inp(i32 %a1, i32 %a2, i32 %a3, i32 %a4, i32 %a ; ; LINUXOSX-LABEL: testi32_inp: ; LINUXOSX: # %bb.0: +; LINUXOSX-NEXT: # kill: def $edx killed $edx def $rdx +; LINUXOSX-NEXT: # kill: def $esi killed $esi def $rsi ; LINUXOSX-NEXT: # kill: def $r14d killed $r14d def $r14 ; LINUXOSX-NEXT: # kill: def $r13d killed $r13d def $r13 ; LINUXOSX-NEXT: # kill: def $r12d killed $r12d def $r12 ; LINUXOSX-NEXT: # kill: def $r9d killed $r9d def $r9 ; LINUXOSX-NEXT: # kill: def $r8d killed $r8d def $r8 -; LINUXOSX-NEXT: # kill: def $esi killed $esi def $rsi ; LINUXOSX-NEXT: # kill: def $edi killed $edi def $rdi -; LINUXOSX-NEXT: # kill: def $edx killed $edx def $rdx ; LINUXOSX-NEXT: leal (%rdx,%rdi), %r10d ; LINUXOSX-NEXT: movl %edx, %r11d ; LINUXOSX-NEXT: subl %edi, %r11d diff --git a/llvm/test/CodeGen/X86/sse-regcall4.ll b/llvm/test/CodeGen/X86/sse-regcall4.ll index f66f9d9d44942..c8df7a233d7e3 100644 --- a/llvm/test/CodeGen/X86/sse-regcall4.ll +++ b/llvm/test/CodeGen/X86/sse-regcall4.ll @@ -244,14 +244,14 @@ define x86_regcallcc i32 @testi32_inp(i32 %a1, i32 %a2, i32 %a3, i32 %a4, i32 %a ; WIN64: # %bb.0: ; WIN64-NEXT: pushq %rbp ; WIN64-NEXT: pushq %rbx +; WIN64-NEXT: # kill: def $edx killed $edx def $rdx +; WIN64-NEXT: # kill: def $esi killed $esi def $rsi ; WIN64-NEXT: # kill: def $r14d killed $r14d def $r14 ; WIN64-NEXT: # kill: def $r12d killed $r12d def $r12 ; WIN64-NEXT: # kill: def $r11d killed $r11d def $r11 ; WIN64-NEXT: # kill: def $r9d killed $r9d def $r9 ; WIN64-NEXT: # kill: def $r8d killed $r8d def $r8 -; WIN64-NEXT: # kill: def $esi killed $esi def $rsi ; WIN64-NEXT: # kill: def $edi killed $edi def $rdi -; WIN64-NEXT: # kill: def $edx killed $edx def $rdx ; WIN64-NEXT: leal (%rdx,%rdi), %ebx ; WIN64-NEXT: movl %edx, %ebp ; WIN64-NEXT: subl %edi, %ebp @@ -288,14 +288,14 @@ define 
x86_regcallcc i32 @testi32_inp(i32 %a1, i32 %a2, i32 %a3, i32 %a4, i32 %a ; ; LINUXOSX-LABEL: testi32_inp: ; LINUXOSX: # %bb.0: +; LINUXOSX-NEXT: # kill: def $edx killed $edx def $rdx +; LINUXOSX-NEXT: # kill: def $esi killed $esi def $rsi ; LINUXOSX-NEXT: # kill: def $r14d killed $r14d def $r14 ; LINUXOSX-NEXT: # kill: def $r13d killed $r13d def $r13 ; LINUXOSX-NEXT: # kill: def $r12d killed $r12d def $r12 ; LINUXOSX-NEXT: # kill: def $r9d killed $r9d def $r9 ; LINUXOSX-NEXT: # kill: def $r8d killed $r8d def $r8 -; LINUXOSX-NEXT: # kill: def $esi killed $esi def $rsi ; LINUXOSX-NEXT: # kill: def $edi killed $edi def $rdi -; LINUXOSX-NEXT: # kill: def $edx killed $edx def $rdx ; LINUXOSX-NEXT: leal (%rdx,%rdi), %r10d ; LINUXOSX-NEXT: movl %edx, %r11d ; LINUXOSX-NEXT: subl %edi, %r11d diff --git a/llvm/test/CodeGen/X86/subvectorwise-store-of-vector-splat.ll b/llvm/test/CodeGen/X86/subvectorwise-store-of-vector-splat.ll index 20d3f20c1a149..f1fd05565c47e 100644 --- a/llvm/test/CodeGen/X86/subvectorwise-store-of-vector-splat.ll +++ b/llvm/test/CodeGen/X86/subvectorwise-store-of-vector-splat.ll @@ -389,7 +389,7 @@ define void @vec128_v2i32(ptr %in.subvec.ptr, ptr %out.subvec.ptr, ptr %out.vec. ; AVX512-LABEL: vec128_v2i32: ; AVX512: # %bb.0: ; AVX512-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero -; AVX512-NEXT: vpternlogq {{.*#+}} xmm0 = ~xmm0 +; AVX512-NEXT: vpternlogq $15, %xmm0, %xmm0, %xmm0 ; AVX512-NEXT: vmovq %xmm0, (%rsi) ; AVX512-NEXT: vpbroadcastq %xmm0, %xmm0 ; AVX512-NEXT: vmovdqa %xmm0, (%rdx) @@ -452,7 +452,7 @@ define void @vec128_v2f32(ptr %in.subvec.ptr, ptr %out.subvec.ptr, ptr %out.vec. ; AVX512-LABEL: vec128_v2f32: ; AVX512: # %bb.0: ; AVX512-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero -; AVX512-NEXT: vpternlogq {{.*#+}} xmm0 = ~xmm0 +; AVX512-NEXT: vpternlogq $15, %xmm0, %xmm0, %xmm0 ; AVX512-NEXT: vmovq %xmm0, (%rsi) ; AVX512-NEXT: vpbroadcastq %xmm0, %xmm0 ; AVX512-NEXT: vmovdqa %xmm0, (%rdx) @@ -599,7 +599,7 @@ define void @vec128_v4i16(ptr %in.subvec.ptr, ptr %out.subvec.ptr, ptr %out.vec. ; AVX512-LABEL: vec128_v4i16: ; AVX512: # %bb.0: ; AVX512-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero -; AVX512-NEXT: vpternlogq {{.*#+}} xmm0 = ~xmm0 +; AVX512-NEXT: vpternlogq $15, %xmm0, %xmm0, %xmm0 ; AVX512-NEXT: vmovq %xmm0, (%rsi) ; AVX512-NEXT: vpbroadcastq %xmm0, %xmm0 ; AVX512-NEXT: vmovdqa %xmm0, (%rdx) @@ -694,7 +694,7 @@ define void @vec128_v8i8(ptr %in.subvec.ptr, ptr %out.subvec.ptr, ptr %out.vec.p ; AVX512-LABEL: vec128_v8i8: ; AVX512: # %bb.0: ; AVX512-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero -; AVX512-NEXT: vpternlogq {{.*#+}} xmm0 = ~xmm0 +; AVX512-NEXT: vpternlogq $15, %xmm0, %xmm0, %xmm0 ; AVX512-NEXT: vmovq %xmm0, (%rsi) ; AVX512-NEXT: vpbroadcastq %xmm0, %xmm0 ; AVX512-NEXT: vmovdqa %xmm0, (%rdx) @@ -1003,7 +1003,7 @@ define void @vec256_v2i32(ptr %in.subvec.ptr, ptr %out.subvec.ptr, ptr %out.vec. ; AVX512-LABEL: vec256_v2i32: ; AVX512: # %bb.0: ; AVX512-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero -; AVX512-NEXT: vpternlogq {{.*#+}} xmm0 = ~xmm0 +; AVX512-NEXT: vpternlogq $15, %xmm0, %xmm0, %xmm0 ; AVX512-NEXT: vmovq %xmm0, (%rsi) ; AVX512-NEXT: vpbroadcastq %xmm0, %ymm0 ; AVX512-NEXT: vmovdqa %ymm0, (%rdx) @@ -1079,7 +1079,7 @@ define void @vec256_v2f32(ptr %in.subvec.ptr, ptr %out.subvec.ptr, ptr %out.vec. 
; AVX512-LABEL: vec256_v2f32: ; AVX512: # %bb.0: ; AVX512-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero -; AVX512-NEXT: vpternlogq {{.*#+}} xmm0 = ~xmm0 +; AVX512-NEXT: vpternlogq $15, %xmm0, %xmm0, %xmm0 ; AVX512-NEXT: vmovq %xmm0, (%rsi) ; AVX512-NEXT: vpbroadcastq %xmm0, %ymm0 ; AVX512-NEXT: vmovdqa %ymm0, (%rdx) @@ -1355,7 +1355,7 @@ define void @vec256_v4i16(ptr %in.subvec.ptr, ptr %out.subvec.ptr, ptr %out.vec. ; AVX512-LABEL: vec256_v4i16: ; AVX512: # %bb.0: ; AVX512-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero -; AVX512-NEXT: vpternlogq {{.*#+}} xmm0 = ~xmm0 +; AVX512-NEXT: vpternlogq $15, %xmm0, %xmm0, %xmm0 ; AVX512-NEXT: vmovq %xmm0, (%rsi) ; AVX512-NEXT: vpbroadcastq %xmm0, %ymm0 ; AVX512-NEXT: vmovdqa %ymm0, (%rdx) @@ -1550,7 +1550,7 @@ define void @vec256_v8i8(ptr %in.subvec.ptr, ptr %out.subvec.ptr, ptr %out.vec.p ; AVX512-LABEL: vec256_v8i8: ; AVX512: # %bb.0: ; AVX512-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero -; AVX512-NEXT: vpternlogq {{.*#+}} xmm0 = ~xmm0 +; AVX512-NEXT: vpternlogq $15, %xmm0, %xmm0, %xmm0 ; AVX512-NEXT: vmovq %xmm0, (%rsi) ; AVX512-NEXT: vpbroadcastq %xmm0, %ymm0 ; AVX512-NEXT: vmovdqa %ymm0, (%rdx) @@ -2170,7 +2170,7 @@ define void @vec384_v2i32(ptr %in.subvec.ptr, ptr %out.subvec.ptr, ptr %out.vec. ; AVX512-LABEL: vec384_v2i32: ; AVX512: # %bb.0: ; AVX512-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero -; AVX512-NEXT: vpternlogq {{.*#+}} xmm0 = ~xmm0 +; AVX512-NEXT: vpternlogq $15, %xmm0, %xmm0, %xmm0 ; AVX512-NEXT: vmovq %xmm0, (%rsi) ; AVX512-NEXT: vpbroadcastq %xmm0, %ymm0 ; AVX512-NEXT: vmovdqa %ymm0, (%rdx) @@ -2258,7 +2258,7 @@ define void @vec384_v2f32(ptr %in.subvec.ptr, ptr %out.subvec.ptr, ptr %out.vec. ; AVX512-LABEL: vec384_v2f32: ; AVX512: # %bb.0: ; AVX512-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero -; AVX512-NEXT: vpternlogq {{.*#+}} xmm0 = ~xmm0 +; AVX512-NEXT: vpternlogq $15, %xmm0, %xmm0, %xmm0 ; AVX512-NEXT: vmovq %xmm0, (%rsi) ; AVX512-NEXT: vpbroadcastq %xmm0, %ymm0 ; AVX512-NEXT: vmovdqa %ymm0, (%rdx) @@ -2722,7 +2722,7 @@ define void @vec384_v3i8(ptr %in.subvec.ptr, ptr %out.subvec.ptr, ptr %out.vec.p ; AVX512-LABEL: vec384_v3i8: ; AVX512: # %bb.0: ; AVX512-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero -; AVX512-NEXT: vpternlogq {{.*#+}} xmm0 = ~xmm0 +; AVX512-NEXT: vpternlogq $15, %xmm0, %xmm0, %xmm0 ; AVX512-NEXT: vpextrb $2, %xmm0, 2(%rsi) ; AVX512-NEXT: vmovd %xmm0, %eax ; AVX512-NEXT: movw %ax, (%rsi) @@ -3006,7 +3006,7 @@ define void @vec384_v3i16(ptr %in.subvec.ptr, ptr %out.subvec.ptr, ptr %out.vec. ; AVX512-LABEL: vec384_v3i16: ; AVX512: # %bb.0: ; AVX512-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero -; AVX512-NEXT: vpternlogq {{.*#+}} xmm0 = ~xmm0 +; AVX512-NEXT: vpternlogq $15, %xmm0, %xmm0, %xmm0 ; AVX512-NEXT: vpextrw $2, %xmm0, 4(%rsi) ; AVX512-NEXT: vmovd %xmm0, (%rsi) ; AVX512-NEXT: vpextrw $2, %xmm0, 4(%rdx) @@ -3664,7 +3664,7 @@ define void @vec384_v4i16(ptr %in.subvec.ptr, ptr %out.subvec.ptr, ptr %out.vec. 
; AVX512-LABEL: vec384_v4i16: ; AVX512: # %bb.0: ; AVX512-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero -; AVX512-NEXT: vpternlogq {{.*#+}} xmm0 = ~xmm0 +; AVX512-NEXT: vpternlogq $15, %xmm0, %xmm0, %xmm0 ; AVX512-NEXT: vmovq %xmm0, (%rsi) ; AVX512-NEXT: vpbroadcastq %xmm0, %ymm0 ; AVX512-NEXT: vmovdqa %ymm0, (%rdx) @@ -3983,7 +3983,7 @@ define void @vec384_v6i8(ptr %in.subvec.ptr, ptr %out.subvec.ptr, ptr %out.vec.p ; AVX512-LABEL: vec384_v6i8: ; AVX512: # %bb.0: ; AVX512-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero -; AVX512-NEXT: vpternlogq {{.*#+}} xmm0 = ~xmm0 +; AVX512-NEXT: vpternlogq $15, %xmm0, %xmm0, %xmm0 ; AVX512-NEXT: vpextrw $2, %xmm0, 4(%rsi) ; AVX512-NEXT: vmovd %xmm0, (%rsi) ; AVX512-NEXT: vpextrw $2, %xmm0, 4(%rdx) @@ -4420,7 +4420,7 @@ define void @vec384_v8i8(ptr %in.subvec.ptr, ptr %out.subvec.ptr, ptr %out.vec.p ; AVX512-LABEL: vec384_v8i8: ; AVX512: # %bb.0: ; AVX512-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero -; AVX512-NEXT: vpternlogq {{.*#+}} xmm0 = ~xmm0 +; AVX512-NEXT: vpternlogq $15, %xmm0, %xmm0, %xmm0 ; AVX512-NEXT: vmovq %xmm0, (%rsi) ; AVX512-NEXT: vpbroadcastq %xmm0, %ymm0 ; AVX512-NEXT: vmovdqa %ymm0, (%rdx) @@ -5444,7 +5444,7 @@ define void @vec512_v2i32(ptr %in.subvec.ptr, ptr %out.subvec.ptr, ptr %out.vec. ; AVX512-LABEL: vec512_v2i32: ; AVX512: # %bb.0: ; AVX512-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero -; AVX512-NEXT: vpternlogq {{.*#+}} xmm0 = ~xmm0 +; AVX512-NEXT: vpternlogq $15, %xmm0, %xmm0, %xmm0 ; AVX512-NEXT: vmovq %xmm0, (%rsi) ; AVX512-NEXT: vpbroadcastq %xmm0, %zmm0 ; AVX512-NEXT: vmovdqa64 %zmm0, (%rdx) @@ -5540,7 +5540,7 @@ define void @vec512_v2f32(ptr %in.subvec.ptr, ptr %out.subvec.ptr, ptr %out.vec. ; AVX512-LABEL: vec512_v2f32: ; AVX512: # %bb.0: ; AVX512-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero -; AVX512-NEXT: vpternlogq {{.*#+}} xmm0 = ~xmm0 +; AVX512-NEXT: vpternlogq $15, %xmm0, %xmm0, %xmm0 ; AVX512-NEXT: vmovq %xmm0, (%rsi) ; AVX512-NEXT: vpbroadcastq %xmm0, %zmm0 ; AVX512-NEXT: vmovdqa64 %zmm0, (%rdx) @@ -5965,7 +5965,7 @@ define void @vec512_v4i16(ptr %in.subvec.ptr, ptr %out.subvec.ptr, ptr %out.vec. ; AVX512-LABEL: vec512_v4i16: ; AVX512: # %bb.0: ; AVX512-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero -; AVX512-NEXT: vpternlogq {{.*#+}} xmm0 = ~xmm0 +; AVX512-NEXT: vpternlogq $15, %xmm0, %xmm0, %xmm0 ; AVX512-NEXT: vmovq %xmm0, (%rsi) ; AVX512-NEXT: vpbroadcastq %xmm0, %zmm0 ; AVX512-NEXT: vmovdqa64 %zmm0, (%rdx) @@ -6363,7 +6363,7 @@ define void @vec512_v8i8(ptr %in.subvec.ptr, ptr %out.subvec.ptr, ptr %out.vec.p ; AVX512-LABEL: vec512_v8i8: ; AVX512: # %bb.0: ; AVX512-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero -; AVX512-NEXT: vpternlogq {{.*#+}} xmm0 = ~xmm0 +; AVX512-NEXT: vpternlogq $15, %xmm0, %xmm0, %xmm0 ; AVX512-NEXT: vmovq %xmm0, (%rsi) ; AVX512-NEXT: vpbroadcastq %xmm0, %zmm0 ; AVX512-NEXT: vmovdqa64 %zmm0, (%rdx) @@ -7079,14 +7079,6 @@ define void @vec512_v32i8(ptr %in.subvec.ptr, ptr %out.subvec.ptr, ptr %out.vec. 
; SCALAR-NEXT: pushq %r13 ; SCALAR-NEXT: pushq %r12 ; SCALAR-NEXT: pushq %rbx -; SCALAR-NEXT: movzbl 20(%rdi), %eax -; SCALAR-NEXT: movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill -; SCALAR-NEXT: movzbl 19(%rdi), %eax -; SCALAR-NEXT: movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill -; SCALAR-NEXT: movzbl 18(%rdi), %eax -; SCALAR-NEXT: movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill -; SCALAR-NEXT: movzbl 17(%rdi), %eax -; SCALAR-NEXT: movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill ; SCALAR-NEXT: movzbl 16(%rdi), %eax ; SCALAR-NEXT: movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill ; SCALAR-NEXT: movzbl 15(%rdi), %eax @@ -7095,9 +7087,9 @@ define void @vec512_v32i8(ptr %in.subvec.ptr, ptr %out.subvec.ptr, ptr %out.vec. ; SCALAR-NEXT: movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill ; SCALAR-NEXT: movzbl 13(%rdi), %eax ; SCALAR-NEXT: movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill -; SCALAR-NEXT: movzbl 12(%rdi), %eax +; SCALAR-NEXT: movzbl 12(%rdi), %r13d +; SCALAR-NEXT: movzbl 11(%rdi), %eax ; SCALAR-NEXT: movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill -; SCALAR-NEXT: movzbl 11(%rdi), %r13d ; SCALAR-NEXT: movzbl 10(%rdi), %r12d ; SCALAR-NEXT: movzbl 9(%rdi), %r15d ; SCALAR-NEXT: movzbl 8(%rdi), %r14d @@ -7131,51 +7123,55 @@ define void @vec512_v32i8(ptr %in.subvec.ptr, ptr %out.subvec.ptr, ptr %out.vec. ; SCALAR-NEXT: movb %r15b, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill ; SCALAR-NEXT: notb %r12b ; SCALAR-NEXT: movb %r12b, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill +; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %r11d # 1-byte Folded Reload +; SCALAR-NEXT: notb %r11b +; SCALAR-NEXT: movb %r11b, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill ; SCALAR-NEXT: notb %r13b ; SCALAR-NEXT: movb %r13b, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill ; SCALAR-NEXT: notb {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Folded Spill ; SCALAR-NEXT: notb {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Folded Spill -; SCALAR-NEXT: notb {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Folded Spill -; SCALAR-NEXT: notb {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Folded Spill -; SCALAR-NEXT: notb {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Folded Spill -; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %r12d # 1-byte Folded Reload -; SCALAR-NEXT: notb %r12b -; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %r10d # 1-byte Folded Reload -; SCALAR-NEXT: notb %r10b -; SCALAR-NEXT: movb %r10b, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill -; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %r9d # 1-byte Folded Reload -; SCALAR-NEXT: notb %r9b -; SCALAR-NEXT: movb %r9b, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill ; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %r8d # 1-byte Folded Reload ; SCALAR-NEXT: notb %r8b -; SCALAR-NEXT: movb %r8b, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill -; SCALAR-NEXT: movzbl 21(%rdi), %eax +; SCALAR-NEXT: notb {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Folded Spill +; SCALAR-NEXT: movzbl 17(%rdi), %eax ; SCALAR-NEXT: notb %al ; SCALAR-NEXT: movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill -; SCALAR-NEXT: movzbl 22(%rdi), %ebx -; SCALAR-NEXT: notb %bl -; SCALAR-NEXT: movb %bl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill -; SCALAR-NEXT: movzbl 23(%rdi), %eax +; SCALAR-NEXT: movzbl 18(%rdi), %eax +; SCALAR-NEXT: notb %al +; SCALAR-NEXT: movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill +; SCALAR-NEXT: movzbl 19(%rdi), %eax ; SCALAR-NEXT: notb %al ; SCALAR-NEXT: movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill -; SCALAR-NEXT: movzbl 24(%rdi), %ebp +; SCALAR-NEXT: movzbl 20(%rdi), %eax +; SCALAR-NEXT: notb %al +; SCALAR-NEXT: movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill +; 
SCALAR-NEXT: movzbl 21(%rdi), %ebp ; SCALAR-NEXT: notb %bpl ; SCALAR-NEXT: movb %bpl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill -; SCALAR-NEXT: movzbl 25(%rdi), %r11d -; SCALAR-NEXT: notb %r11b -; SCALAR-NEXT: movb %r11b, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill +; SCALAR-NEXT: movzbl 22(%rdi), %ebx +; SCALAR-NEXT: notb %bl +; SCALAR-NEXT: movb %bl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill +; SCALAR-NEXT: movzbl 23(%rdi), %r10d +; SCALAR-NEXT: notb %r10b +; SCALAR-NEXT: movb %r10b, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill +; SCALAR-NEXT: movzbl 24(%rdi), %r9d +; SCALAR-NEXT: notb %r9b +; SCALAR-NEXT: movb %r9b, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill +; SCALAR-NEXT: movzbl 25(%rdi), %ecx +; SCALAR-NEXT: notb %cl +; SCALAR-NEXT: movb %cl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill ; SCALAR-NEXT: movzbl 26(%rdi), %r14d ; SCALAR-NEXT: notb %r14b ; SCALAR-NEXT: movb %r14b, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill ; SCALAR-NEXT: movzbl 27(%rdi), %r15d ; SCALAR-NEXT: notb %r15b ; SCALAR-NEXT: movb %r15b, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill -; SCALAR-NEXT: movzbl 28(%rdi), %r13d +; SCALAR-NEXT: movzbl 28(%rdi), %r12d +; SCALAR-NEXT: notb %r12b +; SCALAR-NEXT: movb %r12b, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill +; SCALAR-NEXT: movzbl 29(%rdi), %r13d ; SCALAR-NEXT: notb %r13b ; SCALAR-NEXT: movb %r13b, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill -; SCALAR-NEXT: movzbl 29(%rdi), %ecx -; SCALAR-NEXT: notb %cl -; SCALAR-NEXT: movb %cl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill ; SCALAR-NEXT: movzbl 30(%rdi), %eax ; SCALAR-NEXT: notb %al ; SCALAR-NEXT: movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill @@ -7184,56 +7180,57 @@ define void @vec512_v32i8(ptr %in.subvec.ptr, ptr %out.subvec.ptr, ptr %out.vec. ; SCALAR-NEXT: movb %dil, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill ; SCALAR-NEXT: movb %dil, 31(%rsi) ; SCALAR-NEXT: movb %al, 30(%rsi) -; SCALAR-NEXT: movb %cl, 29(%rsi) -; SCALAR-NEXT: movb %r13b, 28(%rsi) +; SCALAR-NEXT: movb %r13b, 29(%rsi) +; SCALAR-NEXT: movb %r12b, 28(%rsi) ; SCALAR-NEXT: movb %r15b, 27(%rsi) ; SCALAR-NEXT: movb %r14b, 26(%rsi) -; SCALAR-NEXT: movb %r11b, 25(%rsi) -; SCALAR-NEXT: movb %bpl, 24(%rsi) -; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %ebp # 1-byte Folded Reload -; SCALAR-NEXT: movb %bpl, 23(%rsi) +; SCALAR-NEXT: movb %cl, 25(%rsi) +; SCALAR-NEXT: movb %r9b, 24(%rsi) +; SCALAR-NEXT: movb %r10b, 23(%rsi) ; SCALAR-NEXT: movb %bl, 22(%rsi) -; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %ebx # 1-byte Folded Reload -; SCALAR-NEXT: movb %bl, 21(%rsi) -; SCALAR-NEXT: movb %r8b, 20(%rsi) -; SCALAR-NEXT: movb %r9b, 19(%rsi) -; SCALAR-NEXT: movb %r10b, 18(%rsi) -; SCALAR-NEXT: movb %r12b, 17(%rsi) -; SCALAR-NEXT: movl %r12d, %r15d -; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %r12d # 1-byte Folded Reload -; SCALAR-NEXT: movb %r12b, 16(%rsi) -; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %r14d # 1-byte Folded Reload -; SCALAR-NEXT: movb %r14b, 15(%rsi) +; SCALAR-NEXT: movb %bpl, 21(%rsi) +; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %ebp # 1-byte Folded Reload +; SCALAR-NEXT: movb %bpl, 20(%rsi) ; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload -; SCALAR-NEXT: movb %al, 14(%rsi) -; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %ecx # 1-byte Folded Reload -; SCALAR-NEXT: movb %cl, 13(%rsi) -; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %ecx # 1-byte Folded Reload -; SCALAR-NEXT: movb %cl, 12(%rsi) -; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %ecx # 1-byte Folded Reload -; SCALAR-NEXT: movb %cl, 11(%rsi) -; SCALAR-NEXT: movzbl 
{{[-0-9]+}}(%r{{[sb]}}p), %ecx # 1-byte Folded Reload -; SCALAR-NEXT: movb %cl, 10(%rsi) -; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %r10d # 1-byte Folded Reload -; SCALAR-NEXT: movb %r10b, 9(%rsi) -; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %ecx # 1-byte Folded Reload -; SCALAR-NEXT: movb %cl, 8(%rsi) -; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %r9d # 1-byte Folded Reload -; SCALAR-NEXT: movb %r9b, 7(%rsi) -; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %r13d # 1-byte Folded Reload -; SCALAR-NEXT: movb %r13b, 6(%rsi) -; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %r8d # 1-byte Folded Reload -; SCALAR-NEXT: movb %r8b, 5(%rsi) +; SCALAR-NEXT: movb %al, 19(%rsi) +; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload +; SCALAR-NEXT: movb %al, 18(%rsi) +; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload +; SCALAR-NEXT: movb %al, 17(%rsi) ; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %ecx # 1-byte Folded Reload -; SCALAR-NEXT: movb %cl, 4(%rsi) +; SCALAR-NEXT: movb %cl, 16(%rsi) +; SCALAR-NEXT: movb %r8b, 15(%rsi) +; SCALAR-NEXT: movl %r8d, %r14d +; SCALAR-NEXT: movb %r8b, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill +; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %ebx # 1-byte Folded Reload +; SCALAR-NEXT: movb %bl, 14(%rsi) +; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload +; SCALAR-NEXT: movb %al, 13(%rsi) +; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload +; SCALAR-NEXT: movb %al, 12(%rsi) +; SCALAR-NEXT: movb %r11b, 11(%rsi) ; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %edi # 1-byte Folded Reload -; SCALAR-NEXT: movb %dil, 3(%rsi) +; SCALAR-NEXT: movb %dil, 10(%rsi) ; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %edi # 1-byte Folded Reload -; SCALAR-NEXT: movb %dil, 2(%rsi) +; SCALAR-NEXT: movb %dil, 9(%rsi) ; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %edi # 1-byte Folded Reload -; SCALAR-NEXT: movb %dil, 1(%rsi) +; SCALAR-NEXT: movb %dil, 8(%rsi) ; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %r11d # 1-byte Folded Reload -; SCALAR-NEXT: movb %r11b, (%rsi) +; SCALAR-NEXT: movb %r11b, 7(%rsi) +; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %r13d # 1-byte Folded Reload +; SCALAR-NEXT: movb %r13b, 6(%rsi) +; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %r10d # 1-byte Folded Reload +; SCALAR-NEXT: movb %r10b, 5(%rsi) +; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %r12d # 1-byte Folded Reload +; SCALAR-NEXT: movb %r12b, 4(%rsi) +; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %r9d # 1-byte Folded Reload +; SCALAR-NEXT: movb %r9b, 3(%rsi) +; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %r15d # 1-byte Folded Reload +; SCALAR-NEXT: movb %r15b, 2(%rsi) +; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %r8d # 1-byte Folded Reload +; SCALAR-NEXT: movb %r8b, 1(%rsi) +; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %edi # 1-byte Folded Reload +; SCALAR-NEXT: movb %dil, (%rsi) ; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %esi # 1-byte Folded Reload ; SCALAR-NEXT: movb %sil, 31(%rdx) ; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %esi # 1-byte Folded Reload @@ -7250,92 +7247,92 @@ define void @vec512_v32i8(ptr %in.subvec.ptr, ptr %out.subvec.ptr, ptr %out.vec. 
; SCALAR-NEXT: movb %sil, 25(%rdx) ; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %esi # 1-byte Folded Reload ; SCALAR-NEXT: movb %sil, 24(%rdx) -; SCALAR-NEXT: movb %bpl, 23(%rdx) +; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %esi # 1-byte Folded Reload +; SCALAR-NEXT: movb %sil, 23(%rdx) ; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %esi # 1-byte Folded Reload ; SCALAR-NEXT: movb %sil, 22(%rdx) -; SCALAR-NEXT: movb %bl, 21(%rdx) -; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %ebx # 1-byte Folded Reload -; SCALAR-NEXT: movb %bl, 20(%rdx) -; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %ebx # 1-byte Folded Reload -; SCALAR-NEXT: movb %bl, 19(%rdx) -; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %ebx # 1-byte Folded Reload -; SCALAR-NEXT: movb %bl, 18(%rdx) -; SCALAR-NEXT: movb %r15b, 17(%rdx) -; SCALAR-NEXT: movb %r12b, 16(%rdx) +; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %esi # 1-byte Folded Reload +; SCALAR-NEXT: movb %sil, 21(%rdx) +; SCALAR-NEXT: movb %bpl, 20(%rdx) +; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %esi # 1-byte Folded Reload +; SCALAR-NEXT: movb %sil, 19(%rdx) +; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %esi # 1-byte Folded Reload +; SCALAR-NEXT: movb %sil, 18(%rdx) +; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %esi # 1-byte Folded Reload +; SCALAR-NEXT: movb %sil, 17(%rdx) +; SCALAR-NEXT: movb %cl, 16(%rdx) ; SCALAR-NEXT: movb %r14b, 15(%rdx) -; SCALAR-NEXT: movb %al, 14(%rdx) -; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %ebp # 1-byte Folded Reload -; SCALAR-NEXT: movb %bpl, 13(%rdx) -; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload +; SCALAR-NEXT: movb %bl, 14(%rdx) +; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %ecx # 1-byte Folded Reload +; SCALAR-NEXT: movb %cl, 13(%rdx) ; SCALAR-NEXT: movb %al, 12(%rdx) -; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %r12d # 1-byte Folded Reload -; SCALAR-NEXT: movb %r12b, 11(%rdx) +; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %esi # 1-byte Folded Reload +; SCALAR-NEXT: movb %sil, 11(%rdx) ; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %ebx # 1-byte Folded Reload ; SCALAR-NEXT: movb %bl, 10(%rdx) -; SCALAR-NEXT: movb %r10b, 9(%rdx) -; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload -; SCALAR-NEXT: movb %al, 8(%rdx) -; SCALAR-NEXT: movb %r9b, 7(%rdx) +; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %r14d # 1-byte Folded Reload +; SCALAR-NEXT: movb %r14b, 9(%rdx) +; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %ebp # 1-byte Folded Reload +; SCALAR-NEXT: movb %bpl, 8(%rdx) +; SCALAR-NEXT: movb %r11b, 7(%rdx) ; SCALAR-NEXT: movb %r13b, 6(%rdx) -; SCALAR-NEXT: movb %r8b, 5(%rdx) -; SCALAR-NEXT: movb %cl, 4(%rdx) -; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %r9d # 1-byte Folded Reload +; SCALAR-NEXT: movb %r10b, 5(%rdx) +; SCALAR-NEXT: movb %r12b, 4(%rdx) ; SCALAR-NEXT: movb %r9b, 3(%rdx) -; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %r8d # 1-byte Folded Reload -; SCALAR-NEXT: movb %r8b, 2(%rdx) -; SCALAR-NEXT: movb %dil, 1(%rdx) -; SCALAR-NEXT: movb %r11b, (%rdx) -; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %ecx # 1-byte Folded Reload -; SCALAR-NEXT: movb %cl, 63(%rdx) -; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %ecx # 1-byte Folded Reload -; SCALAR-NEXT: movb %cl, 62(%rdx) -; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %ecx # 1-byte Folded Reload -; SCALAR-NEXT: movb %cl, 61(%rdx) -; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %ecx # 1-byte Folded Reload -; SCALAR-NEXT: movb %cl, 60(%rdx) -; SCALAR-NEXT: movzbl 
{{[-0-9]+}}(%r{{[sb]}}p), %ecx # 1-byte Folded Reload -; SCALAR-NEXT: movb %cl, 59(%rdx) -; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %ecx # 1-byte Folded Reload -; SCALAR-NEXT: movb %cl, 58(%rdx) -; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %ecx # 1-byte Folded Reload -; SCALAR-NEXT: movb %cl, 57(%rdx) -; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %ecx # 1-byte Folded Reload -; SCALAR-NEXT: movb %cl, 56(%rdx) -; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %ecx # 1-byte Folded Reload -; SCALAR-NEXT: movb %cl, 55(%rdx) -; SCALAR-NEXT: movb %sil, 54(%rdx) -; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %ecx # 1-byte Folded Reload -; SCALAR-NEXT: movb %cl, 53(%rdx) -; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %ecx # 1-byte Folded Reload -; SCALAR-NEXT: movb %cl, 52(%rdx) -; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %ecx # 1-byte Folded Reload -; SCALAR-NEXT: movb %cl, 51(%rdx) -; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %ecx # 1-byte Folded Reload -; SCALAR-NEXT: movb %cl, 50(%rdx) -; SCALAR-NEXT: movb %r15b, 49(%rdx) -; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %ecx # 1-byte Folded Reload -; SCALAR-NEXT: movb %cl, 48(%rdx) -; SCALAR-NEXT: movb %r14b, 47(%rdx) -; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %ecx # 1-byte Folded Reload -; SCALAR-NEXT: movb %cl, 46(%rdx) -; SCALAR-NEXT: movb %bpl, 45(%rdx) -; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %ecx # 1-byte Folded Reload -; SCALAR-NEXT: movb %cl, 44(%rdx) -; SCALAR-NEXT: movb %r12b, 43(%rdx) -; SCALAR-NEXT: movb %bl, 42(%rdx) -; SCALAR-NEXT: movb %r10b, 41(%rdx) -; SCALAR-NEXT: movb %al, 40(%rdx) +; SCALAR-NEXT: movb %r15b, 2(%rdx) +; SCALAR-NEXT: movb %r8b, 1(%rdx) +; SCALAR-NEXT: movb %dil, (%rdx) ; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload -; SCALAR-NEXT: movb %al, 39(%rdx) -; SCALAR-NEXT: movb %r13b, 38(%rdx) +; SCALAR-NEXT: movb %al, 63(%rdx) ; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload -; SCALAR-NEXT: movb %al, 37(%rdx) +; SCALAR-NEXT: movb %al, 62(%rdx) ; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload -; SCALAR-NEXT: movb %al, 36(%rdx) +; SCALAR-NEXT: movb %al, 61(%rdx) +; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload +; SCALAR-NEXT: movb %al, 60(%rdx) +; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload +; SCALAR-NEXT: movb %al, 59(%rdx) +; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload +; SCALAR-NEXT: movb %al, 58(%rdx) +; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload +; SCALAR-NEXT: movb %al, 57(%rdx) +; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload +; SCALAR-NEXT: movb %al, 56(%rdx) +; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload +; SCALAR-NEXT: movb %al, 55(%rdx) +; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload +; SCALAR-NEXT: movb %al, 54(%rdx) +; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload +; SCALAR-NEXT: movb %al, 53(%rdx) +; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload +; SCALAR-NEXT: movb %al, 52(%rdx) +; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload +; SCALAR-NEXT: movb %al, 51(%rdx) +; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload +; SCALAR-NEXT: movb %al, 50(%rdx) +; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload +; SCALAR-NEXT: movb %al, 49(%rdx) +; SCALAR-NEXT: 
movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload +; SCALAR-NEXT: movb %al, 48(%rdx) +; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload +; SCALAR-NEXT: movb %al, 47(%rdx) +; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload +; SCALAR-NEXT: movb %al, 46(%rdx) +; SCALAR-NEXT: movb %cl, 45(%rdx) +; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload +; SCALAR-NEXT: movb %al, 44(%rdx) +; SCALAR-NEXT: movb %sil, 43(%rdx) +; SCALAR-NEXT: movb %bl, 42(%rdx) +; SCALAR-NEXT: movb %r14b, 41(%rdx) +; SCALAR-NEXT: movb %bpl, 40(%rdx) +; SCALAR-NEXT: movb %r11b, 39(%rdx) +; SCALAR-NEXT: movb %r13b, 38(%rdx) +; SCALAR-NEXT: movb %r10b, 37(%rdx) +; SCALAR-NEXT: movb %r12b, 36(%rdx) ; SCALAR-NEXT: movb %r9b, 35(%rdx) -; SCALAR-NEXT: movb %r8b, 34(%rdx) -; SCALAR-NEXT: movb %dil, 33(%rdx) -; SCALAR-NEXT: movb %r11b, 32(%rdx) +; SCALAR-NEXT: movb %r15b, 34(%rdx) +; SCALAR-NEXT: movb %r8b, 33(%rdx) +; SCALAR-NEXT: movb %dil, 32(%rdx) ; SCALAR-NEXT: popq %rbx ; SCALAR-NEXT: popq %r12 ; SCALAR-NEXT: popq %r13 diff --git a/llvm/test/CodeGen/X86/unfold-masked-merge-vector-variablemask.ll b/llvm/test/CodeGen/X86/unfold-masked-merge-vector-variablemask.ll index c8aa7cf8c8f29..b1194bedc4e1c 100644 --- a/llvm/test/CodeGen/X86/unfold-masked-merge-vector-variablemask.ll +++ b/llvm/test/CodeGen/X86/unfold-masked-merge-vector-variablemask.ll @@ -1198,14 +1198,6 @@ define <32 x i8> @out_v32i8(ptr%px, ptr%py, ptr%pmask) nounwind { ; CHECK-BASELINE-NEXT: movq %rdx, %r8 ; CHECK-BASELINE-NEXT: movq %rsi, %r9 ; CHECK-BASELINE-NEXT: movq %rdi, %r11 -; CHECK-BASELINE-NEXT: movzbl 19(%rdx), %eax -; CHECK-BASELINE-NEXT: movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill -; CHECK-BASELINE-NEXT: movzbl 18(%rdx), %eax -; CHECK-BASELINE-NEXT: movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill -; CHECK-BASELINE-NEXT: movzbl 17(%rdx), %eax -; CHECK-BASELINE-NEXT: movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill -; CHECK-BASELINE-NEXT: movzbl 16(%rdx), %eax -; CHECK-BASELINE-NEXT: movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill ; CHECK-BASELINE-NEXT: movzbl 15(%rdx), %eax ; CHECK-BASELINE-NEXT: movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill ; CHECK-BASELINE-NEXT: movzbl 14(%rdx), %eax @@ -1314,26 +1306,26 @@ define <32 x i8> @out_v32i8(ptr%px, ptr%py, ptr%pmask) nounwind { ; CHECK-BASELINE-NEXT: andb 15(%r10), %al ; CHECK-BASELINE-NEXT: xorb %cl, %al ; CHECK-BASELINE-NEXT: movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill -; CHECK-BASELINE-NEXT: movzbl 16(%r9), %eax -; CHECK-BASELINE-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %ecx # 1-byte Folded Reload -; CHECK-BASELINE-NEXT: xorb %cl, %al -; CHECK-BASELINE-NEXT: andb 16(%r10), %al -; CHECK-BASELINE-NEXT: xorb %cl, %al -; CHECK-BASELINE-NEXT: movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill +; CHECK-BASELINE-NEXT: movzbl 16(%r8), %eax +; CHECK-BASELINE-NEXT: movzbl 16(%r9), %ecx +; CHECK-BASELINE-NEXT: xorb %al, %cl +; CHECK-BASELINE-NEXT: andb 16(%r10), %cl +; CHECK-BASELINE-NEXT: xorb %al, %cl +; CHECK-BASELINE-NEXT: movb %cl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill +; CHECK-BASELINE-NEXT: movzbl 17(%r8), %eax ; CHECK-BASELINE-NEXT: movzbl 17(%r9), %ecx -; CHECK-BASELINE-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload ; CHECK-BASELINE-NEXT: xorb %al, %cl ; CHECK-BASELINE-NEXT: andb 17(%r10), %cl ; CHECK-BASELINE-NEXT: xorb %al, %cl ; CHECK-BASELINE-NEXT: movb %cl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill +; CHECK-BASELINE-NEXT: movzbl 18(%r8), %eax ; CHECK-BASELINE-NEXT: movzbl 
18(%r9), %ecx -; CHECK-BASELINE-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload ; CHECK-BASELINE-NEXT: xorb %al, %cl ; CHECK-BASELINE-NEXT: andb 18(%r10), %cl ; CHECK-BASELINE-NEXT: xorb %al, %cl ; CHECK-BASELINE-NEXT: movb %cl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill +; CHECK-BASELINE-NEXT: movzbl 19(%r8), %eax ; CHECK-BASELINE-NEXT: movzbl 19(%r9), %ecx -; CHECK-BASELINE-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload ; CHECK-BASELINE-NEXT: xorb %al, %cl ; CHECK-BASELINE-NEXT: andb 19(%r10), %cl ; CHECK-BASELINE-NEXT: xorb %al, %cl @@ -1473,14 +1465,6 @@ define <32 x i8> @out_v32i8(ptr%px, ptr%py, ptr%pmask) nounwind { ; CHECK-SSE1-NEXT: movq %rdx, %r8 ; CHECK-SSE1-NEXT: movq %rsi, %r9 ; CHECK-SSE1-NEXT: movq %rdi, %r11 -; CHECK-SSE1-NEXT: movzbl 19(%rdx), %eax -; CHECK-SSE1-NEXT: movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill -; CHECK-SSE1-NEXT: movzbl 18(%rdx), %eax -; CHECK-SSE1-NEXT: movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill -; CHECK-SSE1-NEXT: movzbl 17(%rdx), %eax -; CHECK-SSE1-NEXT: movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill -; CHECK-SSE1-NEXT: movzbl 16(%rdx), %eax -; CHECK-SSE1-NEXT: movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill ; CHECK-SSE1-NEXT: movzbl 15(%rdx), %eax ; CHECK-SSE1-NEXT: movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill ; CHECK-SSE1-NEXT: movzbl 14(%rdx), %eax @@ -1589,26 +1573,26 @@ define <32 x i8> @out_v32i8(ptr%px, ptr%py, ptr%pmask) nounwind { ; CHECK-SSE1-NEXT: andb 15(%r10), %al ; CHECK-SSE1-NEXT: xorb %cl, %al ; CHECK-SSE1-NEXT: movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill -; CHECK-SSE1-NEXT: movzbl 16(%r9), %eax -; CHECK-SSE1-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %ecx # 1-byte Folded Reload -; CHECK-SSE1-NEXT: xorb %cl, %al -; CHECK-SSE1-NEXT: andb 16(%r10), %al -; CHECK-SSE1-NEXT: xorb %cl, %al -; CHECK-SSE1-NEXT: movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill +; CHECK-SSE1-NEXT: movzbl 16(%r8), %eax +; CHECK-SSE1-NEXT: movzbl 16(%r9), %ecx +; CHECK-SSE1-NEXT: xorb %al, %cl +; CHECK-SSE1-NEXT: andb 16(%r10), %cl +; CHECK-SSE1-NEXT: xorb %al, %cl +; CHECK-SSE1-NEXT: movb %cl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill +; CHECK-SSE1-NEXT: movzbl 17(%r8), %eax ; CHECK-SSE1-NEXT: movzbl 17(%r9), %ecx -; CHECK-SSE1-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload ; CHECK-SSE1-NEXT: xorb %al, %cl ; CHECK-SSE1-NEXT: andb 17(%r10), %cl ; CHECK-SSE1-NEXT: xorb %al, %cl ; CHECK-SSE1-NEXT: movb %cl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill +; CHECK-SSE1-NEXT: movzbl 18(%r8), %eax ; CHECK-SSE1-NEXT: movzbl 18(%r9), %ecx -; CHECK-SSE1-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload ; CHECK-SSE1-NEXT: xorb %al, %cl ; CHECK-SSE1-NEXT: andb 18(%r10), %cl ; CHECK-SSE1-NEXT: xorb %al, %cl ; CHECK-SSE1-NEXT: movb %cl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill +; CHECK-SSE1-NEXT: movzbl 19(%r8), %eax ; CHECK-SSE1-NEXT: movzbl 19(%r9), %ecx -; CHECK-SSE1-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload ; CHECK-SSE1-NEXT: xorb %al, %cl ; CHECK-SSE1-NEXT: andb 19(%r10), %cl ; CHECK-SSE1-NEXT: xorb %al, %cl @@ -3247,18 +3231,10 @@ define <32 x i8> @in_v32i8(ptr%px, ptr%py, ptr%pmask) nounwind { ; CHECK-BASELINE-NEXT: pushq %r13 ; CHECK-BASELINE-NEXT: pushq %r12 ; CHECK-BASELINE-NEXT: pushq %rbx -; CHECK-BASELINE-NEXT: movq %rcx, %r15 -; CHECK-BASELINE-NEXT: movq %rdx, %rbx +; CHECK-BASELINE-NEXT: movq %rcx, %r12 +; CHECK-BASELINE-NEXT: movq %rdx, %r15 ; CHECK-BASELINE-NEXT: movq %rsi, %r14 -; CHECK-BASELINE-NEXT: movq %rdi, %r12 -; CHECK-BASELINE-NEXT: movzbl 
19(%rdx), %eax -; CHECK-BASELINE-NEXT: movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill -; CHECK-BASELINE-NEXT: movzbl 18(%rdx), %eax -; CHECK-BASELINE-NEXT: movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill -; CHECK-BASELINE-NEXT: movzbl 17(%rdx), %eax -; CHECK-BASELINE-NEXT: movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill -; CHECK-BASELINE-NEXT: movzbl 16(%rdx), %eax -; CHECK-BASELINE-NEXT: movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill +; CHECK-BASELINE-NEXT: movq %rdi, %r13 ; CHECK-BASELINE-NEXT: movzbl 15(%rdx), %eax ; CHECK-BASELINE-NEXT: movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill ; CHECK-BASELINE-NEXT: movzbl 14(%rdx), %eax @@ -3271,241 +3247,241 @@ define <32 x i8> @in_v32i8(ptr%px, ptr%py, ptr%pmask) nounwind { ; CHECK-BASELINE-NEXT: movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill ; CHECK-BASELINE-NEXT: movzbl 10(%rdx), %eax ; CHECK-BASELINE-NEXT: movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill -; CHECK-BASELINE-NEXT: movzbl 9(%rdx), %r13d -; CHECK-BASELINE-NEXT: movzbl 8(%rdx), %r10d -; CHECK-BASELINE-NEXT: movzbl 7(%rdx), %r9d -; CHECK-BASELINE-NEXT: movzbl 6(%rdx), %r8d -; CHECK-BASELINE-NEXT: movzbl 5(%rdx), %ebp +; CHECK-BASELINE-NEXT: movzbl 9(%rdx), %r8d +; CHECK-BASELINE-NEXT: movzbl 8(%rdx), %r9d +; CHECK-BASELINE-NEXT: movzbl 7(%rdx), %r10d +; CHECK-BASELINE-NEXT: movzbl 6(%rdx), %ebp +; CHECK-BASELINE-NEXT: movzbl 5(%rdx), %edi ; CHECK-BASELINE-NEXT: movzbl 4(%rdx), %esi -; CHECK-BASELINE-NEXT: movzbl 3(%rdx), %edx -; CHECK-BASELINE-NEXT: movzbl 2(%rbx), %eax -; CHECK-BASELINE-NEXT: movzbl (%rbx), %edi -; CHECK-BASELINE-NEXT: movzbl 1(%rbx), %ecx -; CHECK-BASELINE-NEXT: movzbl (%r14), %r11d -; CHECK-BASELINE-NEXT: xorb %dil, %r11b -; CHECK-BASELINE-NEXT: andb (%r15), %r11b -; CHECK-BASELINE-NEXT: xorb %dil, %r11b +; CHECK-BASELINE-NEXT: movzbl 3(%rdx), %eax +; CHECK-BASELINE-NEXT: movzbl 2(%rdx), %ecx +; CHECK-BASELINE-NEXT: movzbl (%rdx), %r11d +; CHECK-BASELINE-NEXT: movzbl 1(%rdx), %edx +; CHECK-BASELINE-NEXT: movzbl (%r14), %ebx +; CHECK-BASELINE-NEXT: xorb %r11b, %bl +; CHECK-BASELINE-NEXT: andb (%r12), %bl +; CHECK-BASELINE-NEXT: xorb %r11b, %bl +; CHECK-BASELINE-NEXT: movb %bl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill +; CHECK-BASELINE-NEXT: movzbl 1(%r14), %r11d +; CHECK-BASELINE-NEXT: xorb %dl, %r11b +; CHECK-BASELINE-NEXT: andb 1(%r12), %r11b +; CHECK-BASELINE-NEXT: xorb %dl, %r11b ; CHECK-BASELINE-NEXT: movb %r11b, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill -; CHECK-BASELINE-NEXT: movzbl 1(%r14), %edi -; CHECK-BASELINE-NEXT: xorb %cl, %dil -; CHECK-BASELINE-NEXT: andb 1(%r15), %dil -; CHECK-BASELINE-NEXT: xorb %cl, %dil -; CHECK-BASELINE-NEXT: movb %dil, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill -; CHECK-BASELINE-NEXT: movzbl 2(%r14), %ecx +; CHECK-BASELINE-NEXT: movzbl 2(%r14), %edx +; CHECK-BASELINE-NEXT: xorb %cl, %dl +; CHECK-BASELINE-NEXT: andb 2(%r12), %dl +; CHECK-BASELINE-NEXT: xorb %cl, %dl +; CHECK-BASELINE-NEXT: movb %dl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill +; CHECK-BASELINE-NEXT: movzbl 3(%r14), %ecx ; CHECK-BASELINE-NEXT: xorb %al, %cl -; CHECK-BASELINE-NEXT: andb 2(%r15), %cl +; CHECK-BASELINE-NEXT: andb 3(%r12), %cl ; CHECK-BASELINE-NEXT: xorb %al, %cl ; CHECK-BASELINE-NEXT: movb %cl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill -; CHECK-BASELINE-NEXT: movzbl 3(%r14), %eax -; CHECK-BASELINE-NEXT: xorb %dl, %al -; CHECK-BASELINE-NEXT: andb 3(%r15), %al -; CHECK-BASELINE-NEXT: xorb %dl, %al -; CHECK-BASELINE-NEXT: movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill ; CHECK-BASELINE-NEXT: movzbl 4(%r14), %eax ; CHECK-BASELINE-NEXT: xorb 
%sil, %al -; CHECK-BASELINE-NEXT: andb 4(%r15), %al +; CHECK-BASELINE-NEXT: andb 4(%r12), %al ; CHECK-BASELINE-NEXT: xorb %sil, %al ; CHECK-BASELINE-NEXT: movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill ; CHECK-BASELINE-NEXT: movzbl 5(%r14), %eax -; CHECK-BASELINE-NEXT: xorb %bpl, %al -; CHECK-BASELINE-NEXT: andb 5(%r15), %al -; CHECK-BASELINE-NEXT: xorb %bpl, %al +; CHECK-BASELINE-NEXT: xorb %dil, %al +; CHECK-BASELINE-NEXT: andb 5(%r12), %al +; CHECK-BASELINE-NEXT: xorb %dil, %al ; CHECK-BASELINE-NEXT: movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill ; CHECK-BASELINE-NEXT: movzbl 6(%r14), %eax -; CHECK-BASELINE-NEXT: xorb %r8b, %al -; CHECK-BASELINE-NEXT: andb 6(%r15), %al -; CHECK-BASELINE-NEXT: xorb %r8b, %al +; CHECK-BASELINE-NEXT: xorb %bpl, %al +; CHECK-BASELINE-NEXT: andb 6(%r12), %al +; CHECK-BASELINE-NEXT: xorb %bpl, %al ; CHECK-BASELINE-NEXT: movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill ; CHECK-BASELINE-NEXT: movzbl 7(%r14), %eax -; CHECK-BASELINE-NEXT: xorb %r9b, %al -; CHECK-BASELINE-NEXT: andb 7(%r15), %al -; CHECK-BASELINE-NEXT: xorb %r9b, %al -; CHECK-BASELINE-NEXT: movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill -; CHECK-BASELINE-NEXT: movzbl 8(%r14), %eax ; CHECK-BASELINE-NEXT: xorb %r10b, %al -; CHECK-BASELINE-NEXT: andb 8(%r15), %al +; CHECK-BASELINE-NEXT: andb 7(%r12), %al ; CHECK-BASELINE-NEXT: xorb %r10b, %al ; CHECK-BASELINE-NEXT: movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill +; CHECK-BASELINE-NEXT: movzbl 8(%r14), %eax +; CHECK-BASELINE-NEXT: xorb %r9b, %al +; CHECK-BASELINE-NEXT: andb 8(%r12), %al +; CHECK-BASELINE-NEXT: xorb %r9b, %al +; CHECK-BASELINE-NEXT: movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill ; CHECK-BASELINE-NEXT: movzbl 9(%r14), %eax -; CHECK-BASELINE-NEXT: xorb %r13b, %al -; CHECK-BASELINE-NEXT: andb 9(%r15), %al -; CHECK-BASELINE-NEXT: xorb %r13b, %al +; CHECK-BASELINE-NEXT: xorb %r8b, %al +; CHECK-BASELINE-NEXT: andb 9(%r12), %al +; CHECK-BASELINE-NEXT: xorb %r8b, %al ; CHECK-BASELINE-NEXT: movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill ; CHECK-BASELINE-NEXT: movzbl 10(%r14), %ecx ; CHECK-BASELINE-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload ; CHECK-BASELINE-NEXT: xorb %al, %cl -; CHECK-BASELINE-NEXT: andb 10(%r15), %cl +; CHECK-BASELINE-NEXT: andb 10(%r12), %cl ; CHECK-BASELINE-NEXT: xorb %al, %cl ; CHECK-BASELINE-NEXT: movb %cl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill ; CHECK-BASELINE-NEXT: movzbl 11(%r14), %ecx ; CHECK-BASELINE-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload ; CHECK-BASELINE-NEXT: xorb %al, %cl -; CHECK-BASELINE-NEXT: andb 11(%r15), %cl +; CHECK-BASELINE-NEXT: andb 11(%r12), %cl ; CHECK-BASELINE-NEXT: xorb %al, %cl ; CHECK-BASELINE-NEXT: movb %cl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill ; CHECK-BASELINE-NEXT: movzbl 12(%r14), %ecx ; CHECK-BASELINE-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload ; CHECK-BASELINE-NEXT: xorb %al, %cl -; CHECK-BASELINE-NEXT: andb 12(%r15), %cl +; CHECK-BASELINE-NEXT: andb 12(%r12), %cl ; CHECK-BASELINE-NEXT: xorb %al, %cl ; CHECK-BASELINE-NEXT: movb %cl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill ; CHECK-BASELINE-NEXT: movzbl 13(%r14), %ecx ; CHECK-BASELINE-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload ; CHECK-BASELINE-NEXT: xorb %al, %cl -; CHECK-BASELINE-NEXT: andb 13(%r15), %cl +; CHECK-BASELINE-NEXT: andb 13(%r12), %cl ; CHECK-BASELINE-NEXT: xorb %al, %cl ; CHECK-BASELINE-NEXT: movb %cl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill ; CHECK-BASELINE-NEXT: movzbl 14(%r14), %ecx ; CHECK-BASELINE-NEXT: 
movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload ; CHECK-BASELINE-NEXT: xorb %al, %cl -; CHECK-BASELINE-NEXT: andb 14(%r15), %cl +; CHECK-BASELINE-NEXT: andb 14(%r12), %cl ; CHECK-BASELINE-NEXT: xorb %al, %cl ; CHECK-BASELINE-NEXT: movb %cl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill ; CHECK-BASELINE-NEXT: movzbl 15(%r14), %ecx ; CHECK-BASELINE-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload ; CHECK-BASELINE-NEXT: xorb %al, %cl -; CHECK-BASELINE-NEXT: andb 15(%r15), %cl +; CHECK-BASELINE-NEXT: andb 15(%r12), %cl ; CHECK-BASELINE-NEXT: xorb %al, %cl ; CHECK-BASELINE-NEXT: movb %cl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill +; CHECK-BASELINE-NEXT: movzbl 16(%r15), %eax ; CHECK-BASELINE-NEXT: movzbl 16(%r14), %ecx -; CHECK-BASELINE-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload ; CHECK-BASELINE-NEXT: xorb %al, %cl -; CHECK-BASELINE-NEXT: andb 16(%r15), %cl +; CHECK-BASELINE-NEXT: andb 16(%r12), %cl ; CHECK-BASELINE-NEXT: xorb %al, %cl ; CHECK-BASELINE-NEXT: movb %cl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill +; CHECK-BASELINE-NEXT: movzbl 17(%r15), %eax ; CHECK-BASELINE-NEXT: movzbl 17(%r14), %ecx -; CHECK-BASELINE-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload ; CHECK-BASELINE-NEXT: xorb %al, %cl -; CHECK-BASELINE-NEXT: andb 17(%r15), %cl +; CHECK-BASELINE-NEXT: andb 17(%r12), %cl ; CHECK-BASELINE-NEXT: xorb %al, %cl ; CHECK-BASELINE-NEXT: movb %cl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill +; CHECK-BASELINE-NEXT: movzbl 18(%r15), %eax ; CHECK-BASELINE-NEXT: movzbl 18(%r14), %ecx -; CHECK-BASELINE-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload ; CHECK-BASELINE-NEXT: xorb %al, %cl -; CHECK-BASELINE-NEXT: andb 18(%r15), %cl +; CHECK-BASELINE-NEXT: andb 18(%r12), %cl ; CHECK-BASELINE-NEXT: xorb %al, %cl ; CHECK-BASELINE-NEXT: movb %cl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill +; CHECK-BASELINE-NEXT: movzbl 19(%r15), %eax ; CHECK-BASELINE-NEXT: movzbl 19(%r14), %ecx -; CHECK-BASELINE-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload ; CHECK-BASELINE-NEXT: xorb %al, %cl -; CHECK-BASELINE-NEXT: andb 19(%r15), %cl +; CHECK-BASELINE-NEXT: andb 19(%r12), %cl ; CHECK-BASELINE-NEXT: xorb %al, %cl ; CHECK-BASELINE-NEXT: movb %cl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill -; CHECK-BASELINE-NEXT: movzbl 20(%rbx), %eax +; CHECK-BASELINE-NEXT: movzbl 20(%r15), %eax ; CHECK-BASELINE-NEXT: movzbl 20(%r14), %ecx ; CHECK-BASELINE-NEXT: xorb %al, %cl -; CHECK-BASELINE-NEXT: andb 20(%r15), %cl +; CHECK-BASELINE-NEXT: andb 20(%r12), %cl ; CHECK-BASELINE-NEXT: xorb %al, %cl ; CHECK-BASELINE-NEXT: movb %cl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill -; CHECK-BASELINE-NEXT: movzbl 21(%rbx), %eax -; CHECK-BASELINE-NEXT: movzbl 21(%r14), %r13d -; CHECK-BASELINE-NEXT: xorb %al, %r13b -; CHECK-BASELINE-NEXT: andb 21(%r15), %r13b -; CHECK-BASELINE-NEXT: xorb %al, %r13b -; CHECK-BASELINE-NEXT: movzbl 22(%rbx), %eax -; CHECK-BASELINE-NEXT: movzbl 22(%r14), %ebp +; CHECK-BASELINE-NEXT: movzbl 21(%r15), %eax +; CHECK-BASELINE-NEXT: movzbl 21(%r14), %ebp ; CHECK-BASELINE-NEXT: xorb %al, %bpl -; CHECK-BASELINE-NEXT: andb 22(%r15), %bpl +; CHECK-BASELINE-NEXT: andb 21(%r12), %bpl ; CHECK-BASELINE-NEXT: xorb %al, %bpl -; CHECK-BASELINE-NEXT: movzbl 23(%rbx), %eax +; CHECK-BASELINE-NEXT: movzbl 22(%r15), %eax +; CHECK-BASELINE-NEXT: movzbl 22(%r14), %ebx +; CHECK-BASELINE-NEXT: xorb %al, %bl +; CHECK-BASELINE-NEXT: andb 22(%r12), %bl +; CHECK-BASELINE-NEXT: xorb %al, %bl +; CHECK-BASELINE-NEXT: movzbl 23(%r15), %eax ; CHECK-BASELINE-NEXT: 
movzbl 23(%r14), %r11d ; CHECK-BASELINE-NEXT: xorb %al, %r11b -; CHECK-BASELINE-NEXT: andb 23(%r15), %r11b +; CHECK-BASELINE-NEXT: andb 23(%r12), %r11b ; CHECK-BASELINE-NEXT: xorb %al, %r11b -; CHECK-BASELINE-NEXT: movzbl 24(%rbx), %eax +; CHECK-BASELINE-NEXT: movzbl 24(%r15), %eax ; CHECK-BASELINE-NEXT: movzbl 24(%r14), %r9d ; CHECK-BASELINE-NEXT: xorb %al, %r9b -; CHECK-BASELINE-NEXT: andb 24(%r15), %r9b +; CHECK-BASELINE-NEXT: andb 24(%r12), %r9b ; CHECK-BASELINE-NEXT: xorb %al, %r9b -; CHECK-BASELINE-NEXT: movzbl 25(%rbx), %eax +; CHECK-BASELINE-NEXT: movzbl 25(%r15), %eax ; CHECK-BASELINE-NEXT: movzbl 25(%r14), %r8d ; CHECK-BASELINE-NEXT: xorb %al, %r8b -; CHECK-BASELINE-NEXT: andb 25(%r15), %r8b +; CHECK-BASELINE-NEXT: andb 25(%r12), %r8b ; CHECK-BASELINE-NEXT: xorb %al, %r8b -; CHECK-BASELINE-NEXT: movzbl 26(%rbx), %eax +; CHECK-BASELINE-NEXT: movzbl 26(%r15), %eax ; CHECK-BASELINE-NEXT: movzbl 26(%r14), %edi ; CHECK-BASELINE-NEXT: xorb %al, %dil -; CHECK-BASELINE-NEXT: andb 26(%r15), %dil +; CHECK-BASELINE-NEXT: andb 26(%r12), %dil ; CHECK-BASELINE-NEXT: xorb %al, %dil -; CHECK-BASELINE-NEXT: movzbl 27(%rbx), %eax +; CHECK-BASELINE-NEXT: movzbl 27(%r15), %eax ; CHECK-BASELINE-NEXT: movzbl 27(%r14), %esi ; CHECK-BASELINE-NEXT: xorb %al, %sil -; CHECK-BASELINE-NEXT: andb 27(%r15), %sil +; CHECK-BASELINE-NEXT: andb 27(%r12), %sil ; CHECK-BASELINE-NEXT: xorb %al, %sil -; CHECK-BASELINE-NEXT: movzbl 28(%rbx), %eax +; CHECK-BASELINE-NEXT: movzbl 28(%r15), %eax ; CHECK-BASELINE-NEXT: movzbl 28(%r14), %edx ; CHECK-BASELINE-NEXT: xorb %al, %dl -; CHECK-BASELINE-NEXT: andb 28(%r15), %dl +; CHECK-BASELINE-NEXT: andb 28(%r12), %dl ; CHECK-BASELINE-NEXT: xorb %al, %dl -; CHECK-BASELINE-NEXT: movzbl 29(%rbx), %eax +; CHECK-BASELINE-NEXT: movzbl 29(%r15), %eax ; CHECK-BASELINE-NEXT: movzbl 29(%r14), %ecx ; CHECK-BASELINE-NEXT: xorb %al, %cl -; CHECK-BASELINE-NEXT: andb 29(%r15), %cl +; CHECK-BASELINE-NEXT: andb 29(%r12), %cl ; CHECK-BASELINE-NEXT: xorb %al, %cl -; CHECK-BASELINE-NEXT: movzbl 30(%rbx), %r10d +; CHECK-BASELINE-NEXT: movzbl 30(%r15), %r10d ; CHECK-BASELINE-NEXT: movzbl 30(%r14), %eax ; CHECK-BASELINE-NEXT: xorb %r10b, %al -; CHECK-BASELINE-NEXT: andb 30(%r15), %al +; CHECK-BASELINE-NEXT: andb 30(%r12), %al ; CHECK-BASELINE-NEXT: xorb %r10b, %al -; CHECK-BASELINE-NEXT: movzbl 31(%rbx), %r10d -; CHECK-BASELINE-NEXT: movzbl 31(%r14), %ebx -; CHECK-BASELINE-NEXT: xorb %r10b, %bl -; CHECK-BASELINE-NEXT: andb 31(%r15), %bl -; CHECK-BASELINE-NEXT: xorb %r10b, %bl -; CHECK-BASELINE-NEXT: movb %bl, 31(%r12) -; CHECK-BASELINE-NEXT: movb %al, 30(%r12) -; CHECK-BASELINE-NEXT: movb %cl, 29(%r12) -; CHECK-BASELINE-NEXT: movb %dl, 28(%r12) -; CHECK-BASELINE-NEXT: movb %sil, 27(%r12) -; CHECK-BASELINE-NEXT: movb %dil, 26(%r12) -; CHECK-BASELINE-NEXT: movb %r8b, 25(%r12) -; CHECK-BASELINE-NEXT: movb %r9b, 24(%r12) -; CHECK-BASELINE-NEXT: movb %r11b, 23(%r12) -; CHECK-BASELINE-NEXT: movb %bpl, 22(%r12) -; CHECK-BASELINE-NEXT: movb %r13b, 21(%r12) +; CHECK-BASELINE-NEXT: movzbl 31(%r15), %r10d +; CHECK-BASELINE-NEXT: movzbl 31(%r14), %r14d +; CHECK-BASELINE-NEXT: xorb %r10b, %r14b +; CHECK-BASELINE-NEXT: andb 31(%r12), %r14b +; CHECK-BASELINE-NEXT: xorb %r10b, %r14b +; CHECK-BASELINE-NEXT: movb %r14b, 31(%r13) +; CHECK-BASELINE-NEXT: movb %al, 30(%r13) +; CHECK-BASELINE-NEXT: movb %cl, 29(%r13) +; CHECK-BASELINE-NEXT: movb %dl, 28(%r13) +; CHECK-BASELINE-NEXT: movb %sil, 27(%r13) +; CHECK-BASELINE-NEXT: movb %dil, 26(%r13) +; CHECK-BASELINE-NEXT: movb %r8b, 25(%r13) +; CHECK-BASELINE-NEXT: movb %r9b, 
24(%r13) +; CHECK-BASELINE-NEXT: movb %r11b, 23(%r13) +; CHECK-BASELINE-NEXT: movb %bl, 22(%r13) +; CHECK-BASELINE-NEXT: movb %bpl, 21(%r13) ; CHECK-BASELINE-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload -; CHECK-BASELINE-NEXT: movb %al, 20(%r12) +; CHECK-BASELINE-NEXT: movb %al, 20(%r13) ; CHECK-BASELINE-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload -; CHECK-BASELINE-NEXT: movb %al, 19(%r12) +; CHECK-BASELINE-NEXT: movb %al, 19(%r13) ; CHECK-BASELINE-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload -; CHECK-BASELINE-NEXT: movb %al, 18(%r12) +; CHECK-BASELINE-NEXT: movb %al, 18(%r13) ; CHECK-BASELINE-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload -; CHECK-BASELINE-NEXT: movb %al, 17(%r12) +; CHECK-BASELINE-NEXT: movb %al, 17(%r13) ; CHECK-BASELINE-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload -; CHECK-BASELINE-NEXT: movb %al, 16(%r12) +; CHECK-BASELINE-NEXT: movb %al, 16(%r13) ; CHECK-BASELINE-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload -; CHECK-BASELINE-NEXT: movb %al, 15(%r12) +; CHECK-BASELINE-NEXT: movb %al, 15(%r13) ; CHECK-BASELINE-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload -; CHECK-BASELINE-NEXT: movb %al, 14(%r12) +; CHECK-BASELINE-NEXT: movb %al, 14(%r13) ; CHECK-BASELINE-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload -; CHECK-BASELINE-NEXT: movb %al, 13(%r12) +; CHECK-BASELINE-NEXT: movb %al, 13(%r13) ; CHECK-BASELINE-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload -; CHECK-BASELINE-NEXT: movb %al, 12(%r12) +; CHECK-BASELINE-NEXT: movb %al, 12(%r13) ; CHECK-BASELINE-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload -; CHECK-BASELINE-NEXT: movb %al, 11(%r12) +; CHECK-BASELINE-NEXT: movb %al, 11(%r13) ; CHECK-BASELINE-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload -; CHECK-BASELINE-NEXT: movb %al, 10(%r12) +; CHECK-BASELINE-NEXT: movb %al, 10(%r13) ; CHECK-BASELINE-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload -; CHECK-BASELINE-NEXT: movb %al, 9(%r12) +; CHECK-BASELINE-NEXT: movb %al, 9(%r13) ; CHECK-BASELINE-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload -; CHECK-BASELINE-NEXT: movb %al, 8(%r12) +; CHECK-BASELINE-NEXT: movb %al, 8(%r13) ; CHECK-BASELINE-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload -; CHECK-BASELINE-NEXT: movb %al, 7(%r12) +; CHECK-BASELINE-NEXT: movb %al, 7(%r13) ; CHECK-BASELINE-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload -; CHECK-BASELINE-NEXT: movb %al, 6(%r12) +; CHECK-BASELINE-NEXT: movb %al, 6(%r13) ; CHECK-BASELINE-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload -; CHECK-BASELINE-NEXT: movb %al, 5(%r12) +; CHECK-BASELINE-NEXT: movb %al, 5(%r13) ; CHECK-BASELINE-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload -; CHECK-BASELINE-NEXT: movb %al, 4(%r12) +; CHECK-BASELINE-NEXT: movb %al, 4(%r13) ; CHECK-BASELINE-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload -; CHECK-BASELINE-NEXT: movb %al, 3(%r12) +; CHECK-BASELINE-NEXT: movb %al, 3(%r13) ; CHECK-BASELINE-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload -; CHECK-BASELINE-NEXT: movb %al, 2(%r12) +; CHECK-BASELINE-NEXT: movb %al, 2(%r13) ; CHECK-BASELINE-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload -; CHECK-BASELINE-NEXT: movb %al, 1(%r12) +; CHECK-BASELINE-NEXT: movb %al, 1(%r13) ; CHECK-BASELINE-NEXT: movzbl 
{{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload -; CHECK-BASELINE-NEXT: movb %al, (%r12) -; CHECK-BASELINE-NEXT: movq %r12, %rax +; CHECK-BASELINE-NEXT: movb %al, (%r13) +; CHECK-BASELINE-NEXT: movq %r13, %rax ; CHECK-BASELINE-NEXT: popq %rbx ; CHECK-BASELINE-NEXT: popq %r12 ; CHECK-BASELINE-NEXT: popq %r13 @@ -3522,18 +3498,10 @@ define <32 x i8> @in_v32i8(ptr%px, ptr%py, ptr%pmask) nounwind { ; CHECK-SSE1-NEXT: pushq %r13 ; CHECK-SSE1-NEXT: pushq %r12 ; CHECK-SSE1-NEXT: pushq %rbx -; CHECK-SSE1-NEXT: movq %rcx, %r15 -; CHECK-SSE1-NEXT: movq %rdx, %rbx +; CHECK-SSE1-NEXT: movq %rcx, %r12 +; CHECK-SSE1-NEXT: movq %rdx, %r15 ; CHECK-SSE1-NEXT: movq %rsi, %r14 -; CHECK-SSE1-NEXT: movq %rdi, %r12 -; CHECK-SSE1-NEXT: movzbl 19(%rdx), %eax -; CHECK-SSE1-NEXT: movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill -; CHECK-SSE1-NEXT: movzbl 18(%rdx), %eax -; CHECK-SSE1-NEXT: movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill -; CHECK-SSE1-NEXT: movzbl 17(%rdx), %eax -; CHECK-SSE1-NEXT: movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill -; CHECK-SSE1-NEXT: movzbl 16(%rdx), %eax -; CHECK-SSE1-NEXT: movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill +; CHECK-SSE1-NEXT: movq %rdi, %r13 ; CHECK-SSE1-NEXT: movzbl 15(%rdx), %eax ; CHECK-SSE1-NEXT: movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill ; CHECK-SSE1-NEXT: movzbl 14(%rdx), %eax @@ -3546,241 +3514,241 @@ define <32 x i8> @in_v32i8(ptr%px, ptr%py, ptr%pmask) nounwind { ; CHECK-SSE1-NEXT: movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill ; CHECK-SSE1-NEXT: movzbl 10(%rdx), %eax ; CHECK-SSE1-NEXT: movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill -; CHECK-SSE1-NEXT: movzbl 9(%rdx), %r13d -; CHECK-SSE1-NEXT: movzbl 8(%rdx), %r10d -; CHECK-SSE1-NEXT: movzbl 7(%rdx), %r9d -; CHECK-SSE1-NEXT: movzbl 6(%rdx), %r8d -; CHECK-SSE1-NEXT: movzbl 5(%rdx), %ebp +; CHECK-SSE1-NEXT: movzbl 9(%rdx), %r8d +; CHECK-SSE1-NEXT: movzbl 8(%rdx), %r9d +; CHECK-SSE1-NEXT: movzbl 7(%rdx), %r10d +; CHECK-SSE1-NEXT: movzbl 6(%rdx), %ebp +; CHECK-SSE1-NEXT: movzbl 5(%rdx), %edi ; CHECK-SSE1-NEXT: movzbl 4(%rdx), %esi -; CHECK-SSE1-NEXT: movzbl 3(%rdx), %edx -; CHECK-SSE1-NEXT: movzbl 2(%rbx), %eax -; CHECK-SSE1-NEXT: movzbl (%rbx), %edi -; CHECK-SSE1-NEXT: movzbl 1(%rbx), %ecx -; CHECK-SSE1-NEXT: movzbl (%r14), %r11d -; CHECK-SSE1-NEXT: xorb %dil, %r11b -; CHECK-SSE1-NEXT: andb (%r15), %r11b -; CHECK-SSE1-NEXT: xorb %dil, %r11b +; CHECK-SSE1-NEXT: movzbl 3(%rdx), %eax +; CHECK-SSE1-NEXT: movzbl 2(%rdx), %ecx +; CHECK-SSE1-NEXT: movzbl (%rdx), %r11d +; CHECK-SSE1-NEXT: movzbl 1(%rdx), %edx +; CHECK-SSE1-NEXT: movzbl (%r14), %ebx +; CHECK-SSE1-NEXT: xorb %r11b, %bl +; CHECK-SSE1-NEXT: andb (%r12), %bl +; CHECK-SSE1-NEXT: xorb %r11b, %bl +; CHECK-SSE1-NEXT: movb %bl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill +; CHECK-SSE1-NEXT: movzbl 1(%r14), %r11d +; CHECK-SSE1-NEXT: xorb %dl, %r11b +; CHECK-SSE1-NEXT: andb 1(%r12), %r11b +; CHECK-SSE1-NEXT: xorb %dl, %r11b ; CHECK-SSE1-NEXT: movb %r11b, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill -; CHECK-SSE1-NEXT: movzbl 1(%r14), %edi -; CHECK-SSE1-NEXT: xorb %cl, %dil -; CHECK-SSE1-NEXT: andb 1(%r15), %dil -; CHECK-SSE1-NEXT: xorb %cl, %dil -; CHECK-SSE1-NEXT: movb %dil, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill -; CHECK-SSE1-NEXT: movzbl 2(%r14), %ecx +; CHECK-SSE1-NEXT: movzbl 2(%r14), %edx +; CHECK-SSE1-NEXT: xorb %cl, %dl +; CHECK-SSE1-NEXT: andb 2(%r12), %dl +; CHECK-SSE1-NEXT: xorb %cl, %dl +; CHECK-SSE1-NEXT: movb %dl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill +; CHECK-SSE1-NEXT: movzbl 3(%r14), %ecx ; CHECK-SSE1-NEXT: xorb %al, %cl -; 
CHECK-SSE1-NEXT: andb 2(%r15), %cl
+; CHECK-SSE1-NEXT: andb 3(%r12), %cl
 ; CHECK-SSE1-NEXT: xorb %al, %cl
 ; CHECK-SSE1-NEXT: movb %cl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
-; CHECK-SSE1-NEXT: movzbl 3(%r14), %eax
-; CHECK-SSE1-NEXT: xorb %dl, %al
-; CHECK-SSE1-NEXT: andb 3(%r15), %al
-; CHECK-SSE1-NEXT: xorb %dl, %al
-; CHECK-SSE1-NEXT: movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
 ; CHECK-SSE1-NEXT: movzbl 4(%r14), %eax
 ; CHECK-SSE1-NEXT: xorb %sil, %al
-; CHECK-SSE1-NEXT: andb 4(%r15), %al
+; CHECK-SSE1-NEXT: andb 4(%r12), %al
 ; CHECK-SSE1-NEXT: xorb %sil, %al
 ; CHECK-SSE1-NEXT: movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
 ; CHECK-SSE1-NEXT: movzbl 5(%r14), %eax
-; CHECK-SSE1-NEXT: xorb %bpl, %al
-; CHECK-SSE1-NEXT: andb 5(%r15), %al
-; CHECK-SSE1-NEXT: xorb %bpl, %al
+; CHECK-SSE1-NEXT: xorb %dil, %al
+; CHECK-SSE1-NEXT: andb 5(%r12), %al
+; CHECK-SSE1-NEXT: xorb %dil, %al
 ; CHECK-SSE1-NEXT: movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
 ; CHECK-SSE1-NEXT: movzbl 6(%r14), %eax
-; CHECK-SSE1-NEXT: xorb %r8b, %al
-; CHECK-SSE1-NEXT: andb 6(%r15), %al
-; CHECK-SSE1-NEXT: xorb %r8b, %al
+; CHECK-SSE1-NEXT: xorb %bpl, %al
+; CHECK-SSE1-NEXT: andb 6(%r12), %al
+; CHECK-SSE1-NEXT: xorb %bpl, %al
 ; CHECK-SSE1-NEXT: movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
 ; CHECK-SSE1-NEXT: movzbl 7(%r14), %eax
-; CHECK-SSE1-NEXT: xorb %r9b, %al
-; CHECK-SSE1-NEXT: andb 7(%r15), %al
-; CHECK-SSE1-NEXT: xorb %r9b, %al
-; CHECK-SSE1-NEXT: movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
-; CHECK-SSE1-NEXT: movzbl 8(%r14), %eax
 ; CHECK-SSE1-NEXT: xorb %r10b, %al
-; CHECK-SSE1-NEXT: andb 8(%r15), %al
+; CHECK-SSE1-NEXT: andb 7(%r12), %al
 ; CHECK-SSE1-NEXT: xorb %r10b, %al
 ; CHECK-SSE1-NEXT: movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
+; CHECK-SSE1-NEXT: movzbl 8(%r14), %eax
+; CHECK-SSE1-NEXT: xorb %r9b, %al
+; CHECK-SSE1-NEXT: andb 8(%r12), %al
+; CHECK-SSE1-NEXT: xorb %r9b, %al
+; CHECK-SSE1-NEXT: movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
 ; CHECK-SSE1-NEXT: movzbl 9(%r14), %eax
-; CHECK-SSE1-NEXT: xorb %r13b, %al
-; CHECK-SSE1-NEXT: andb 9(%r15), %al
-; CHECK-SSE1-NEXT: xorb %r13b, %al
+; CHECK-SSE1-NEXT: xorb %r8b, %al
+; CHECK-SSE1-NEXT: andb 9(%r12), %al
+; CHECK-SSE1-NEXT: xorb %r8b, %al
 ; CHECK-SSE1-NEXT: movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
 ; CHECK-SSE1-NEXT: movzbl 10(%r14), %ecx
 ; CHECK-SSE1-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload
 ; CHECK-SSE1-NEXT: xorb %al, %cl
-; CHECK-SSE1-NEXT: andb 10(%r15), %cl
+; CHECK-SSE1-NEXT: andb 10(%r12), %cl
 ; CHECK-SSE1-NEXT: xorb %al, %cl
 ; CHECK-SSE1-NEXT: movb %cl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
 ; CHECK-SSE1-NEXT: movzbl 11(%r14), %ecx
 ; CHECK-SSE1-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload
 ; CHECK-SSE1-NEXT: xorb %al, %cl
-; CHECK-SSE1-NEXT: andb 11(%r15), %cl
+; CHECK-SSE1-NEXT: andb 11(%r12), %cl
 ; CHECK-SSE1-NEXT: xorb %al, %cl
 ; CHECK-SSE1-NEXT: movb %cl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
 ; CHECK-SSE1-NEXT: movzbl 12(%r14), %ecx
 ; CHECK-SSE1-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload
 ; CHECK-SSE1-NEXT: xorb %al, %cl
-; CHECK-SSE1-NEXT: andb 12(%r15), %cl
+; CHECK-SSE1-NEXT: andb 12(%r12), %cl
 ; CHECK-SSE1-NEXT: xorb %al, %cl
 ; CHECK-SSE1-NEXT: movb %cl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
 ; CHECK-SSE1-NEXT: movzbl 13(%r14), %ecx
 ; CHECK-SSE1-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload
 ; CHECK-SSE1-NEXT: xorb %al, %cl
-; CHECK-SSE1-NEXT: andb 13(%r15), %cl
+; CHECK-SSE1-NEXT: andb 13(%r12), %cl
 ; CHECK-SSE1-NEXT: xorb %al, %cl
 ; CHECK-SSE1-NEXT: movb %cl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
 ; CHECK-SSE1-NEXT: movzbl 14(%r14), %ecx
 ; CHECK-SSE1-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload
 ; CHECK-SSE1-NEXT: xorb %al, %cl
-; CHECK-SSE1-NEXT: andb 14(%r15), %cl
+; CHECK-SSE1-NEXT: andb 14(%r12), %cl
 ; CHECK-SSE1-NEXT: xorb %al, %cl
 ; CHECK-SSE1-NEXT: movb %cl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
 ; CHECK-SSE1-NEXT: movzbl 15(%r14), %ecx
 ; CHECK-SSE1-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload
 ; CHECK-SSE1-NEXT: xorb %al, %cl
-; CHECK-SSE1-NEXT: andb 15(%r15), %cl
+; CHECK-SSE1-NEXT: andb 15(%r12), %cl
 ; CHECK-SSE1-NEXT: xorb %al, %cl
 ; CHECK-SSE1-NEXT: movb %cl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
+; CHECK-SSE1-NEXT: movzbl 16(%r15), %eax
 ; CHECK-SSE1-NEXT: movzbl 16(%r14), %ecx
-; CHECK-SSE1-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload
 ; CHECK-SSE1-NEXT: xorb %al, %cl
-; CHECK-SSE1-NEXT: andb 16(%r15), %cl
+; CHECK-SSE1-NEXT: andb 16(%r12), %cl
 ; CHECK-SSE1-NEXT: xorb %al, %cl
 ; CHECK-SSE1-NEXT: movb %cl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
+; CHECK-SSE1-NEXT: movzbl 17(%r15), %eax
 ; CHECK-SSE1-NEXT: movzbl 17(%r14), %ecx
-; CHECK-SSE1-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload
 ; CHECK-SSE1-NEXT: xorb %al, %cl
-; CHECK-SSE1-NEXT: andb 17(%r15), %cl
+; CHECK-SSE1-NEXT: andb 17(%r12), %cl
 ; CHECK-SSE1-NEXT: xorb %al, %cl
 ; CHECK-SSE1-NEXT: movb %cl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
+; CHECK-SSE1-NEXT: movzbl 18(%r15), %eax
 ; CHECK-SSE1-NEXT: movzbl 18(%r14), %ecx
-; CHECK-SSE1-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload
 ; CHECK-SSE1-NEXT: xorb %al, %cl
-; CHECK-SSE1-NEXT: andb 18(%r15), %cl
+; CHECK-SSE1-NEXT: andb 18(%r12), %cl
 ; CHECK-SSE1-NEXT: xorb %al, %cl
 ; CHECK-SSE1-NEXT: movb %cl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
+; CHECK-SSE1-NEXT: movzbl 19(%r15), %eax
 ; CHECK-SSE1-NEXT: movzbl 19(%r14), %ecx
-; CHECK-SSE1-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload
 ; CHECK-SSE1-NEXT: xorb %al, %cl
-; CHECK-SSE1-NEXT: andb 19(%r15), %cl
+; CHECK-SSE1-NEXT: andb 19(%r12), %cl
 ; CHECK-SSE1-NEXT: xorb %al, %cl
 ; CHECK-SSE1-NEXT: movb %cl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
-; CHECK-SSE1-NEXT: movzbl 20(%rbx), %eax
+; CHECK-SSE1-NEXT: movzbl 20(%r15), %eax
 ; CHECK-SSE1-NEXT: movzbl 20(%r14), %ecx
 ; CHECK-SSE1-NEXT: xorb %al, %cl
-; CHECK-SSE1-NEXT: andb 20(%r15), %cl
+; CHECK-SSE1-NEXT: andb 20(%r12), %cl
 ; CHECK-SSE1-NEXT: xorb %al, %cl
 ; CHECK-SSE1-NEXT: movb %cl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
-; CHECK-SSE1-NEXT: movzbl 21(%rbx), %eax
-; CHECK-SSE1-NEXT: movzbl 21(%r14), %r13d
-; CHECK-SSE1-NEXT: xorb %al, %r13b
-; CHECK-SSE1-NEXT: andb 21(%r15), %r13b
-; CHECK-SSE1-NEXT: xorb %al, %r13b
-; CHECK-SSE1-NEXT: movzbl 22(%rbx), %eax
-; CHECK-SSE1-NEXT: movzbl 22(%r14), %ebp
+; CHECK-SSE1-NEXT: movzbl 21(%r15), %eax
+; CHECK-SSE1-NEXT: movzbl 21(%r14), %ebp
 ; CHECK-SSE1-NEXT: xorb %al, %bpl
-; CHECK-SSE1-NEXT: andb 22(%r15), %bpl
+; CHECK-SSE1-NEXT: andb 21(%r12), %bpl
 ; CHECK-SSE1-NEXT: xorb %al, %bpl
-; CHECK-SSE1-NEXT: movzbl 23(%rbx), %eax
+; CHECK-SSE1-NEXT: movzbl 22(%r15), %eax
+; CHECK-SSE1-NEXT: movzbl 22(%r14), %ebx
+; CHECK-SSE1-NEXT: xorb %al, %bl
+; CHECK-SSE1-NEXT: andb 22(%r12), %bl
+; CHECK-SSE1-NEXT: xorb %al, %bl
+; CHECK-SSE1-NEXT: movzbl 23(%r15), %eax
 ; CHECK-SSE1-NEXT: movzbl 23(%r14), %r11d
 ; CHECK-SSE1-NEXT: xorb %al, %r11b
-; CHECK-SSE1-NEXT: andb 23(%r15), %r11b
+; CHECK-SSE1-NEXT: andb 23(%r12), %r11b
 ; CHECK-SSE1-NEXT: xorb %al, %r11b
-; CHECK-SSE1-NEXT: movzbl 24(%rbx), %eax
+; CHECK-SSE1-NEXT: movzbl 24(%r15), %eax
 ; CHECK-SSE1-NEXT: movzbl 24(%r14), %r9d
 ; CHECK-SSE1-NEXT: xorb %al, %r9b
-; CHECK-SSE1-NEXT: andb 24(%r15), %r9b
+; CHECK-SSE1-NEXT: andb 24(%r12), %r9b
 ; CHECK-SSE1-NEXT: xorb %al, %r9b
-; CHECK-SSE1-NEXT: movzbl 25(%rbx), %eax
+; CHECK-SSE1-NEXT: movzbl 25(%r15), %eax
 ; CHECK-SSE1-NEXT: movzbl 25(%r14), %r8d
 ; CHECK-SSE1-NEXT: xorb %al, %r8b
-; CHECK-SSE1-NEXT: andb 25(%r15), %r8b
+; CHECK-SSE1-NEXT: andb 25(%r12), %r8b
 ; CHECK-SSE1-NEXT: xorb %al, %r8b
-; CHECK-SSE1-NEXT: movzbl 26(%rbx), %eax
+; CHECK-SSE1-NEXT: movzbl 26(%r15), %eax
 ; CHECK-SSE1-NEXT: movzbl 26(%r14), %edi
 ; CHECK-SSE1-NEXT: xorb %al, %dil
-; CHECK-SSE1-NEXT: andb 26(%r15), %dil
+; CHECK-SSE1-NEXT: andb 26(%r12), %dil
 ; CHECK-SSE1-NEXT: xorb %al, %dil
-; CHECK-SSE1-NEXT: movzbl 27(%rbx), %eax
+; CHECK-SSE1-NEXT: movzbl 27(%r15), %eax
 ; CHECK-SSE1-NEXT: movzbl 27(%r14), %esi
 ; CHECK-SSE1-NEXT: xorb %al, %sil
-; CHECK-SSE1-NEXT: andb 27(%r15), %sil
+; CHECK-SSE1-NEXT: andb 27(%r12), %sil
 ; CHECK-SSE1-NEXT: xorb %al, %sil
-; CHECK-SSE1-NEXT: movzbl 28(%rbx), %eax
+; CHECK-SSE1-NEXT: movzbl 28(%r15), %eax
 ; CHECK-SSE1-NEXT: movzbl 28(%r14), %edx
 ; CHECK-SSE1-NEXT: xorb %al, %dl
-; CHECK-SSE1-NEXT: andb 28(%r15), %dl
+; CHECK-SSE1-NEXT: andb 28(%r12), %dl
 ; CHECK-SSE1-NEXT: xorb %al, %dl
-; CHECK-SSE1-NEXT: movzbl 29(%rbx), %eax
+; CHECK-SSE1-NEXT: movzbl 29(%r15), %eax
 ; CHECK-SSE1-NEXT: movzbl 29(%r14), %ecx
 ; CHECK-SSE1-NEXT: xorb %al, %cl
-; CHECK-SSE1-NEXT: andb 29(%r15), %cl
+; CHECK-SSE1-NEXT: andb 29(%r12), %cl
 ; CHECK-SSE1-NEXT: xorb %al, %cl
-; CHECK-SSE1-NEXT: movzbl 30(%rbx), %r10d
+; CHECK-SSE1-NEXT: movzbl 30(%r15), %r10d
 ; CHECK-SSE1-NEXT: movzbl 30(%r14), %eax
 ; CHECK-SSE1-NEXT: xorb %r10b, %al
-; CHECK-SSE1-NEXT: andb 30(%r15), %al
+; CHECK-SSE1-NEXT: andb 30(%r12), %al
 ; CHECK-SSE1-NEXT: xorb %r10b, %al
-; CHECK-SSE1-NEXT: movzbl 31(%rbx), %r10d
-; CHECK-SSE1-NEXT: movzbl 31(%r14), %ebx
-; CHECK-SSE1-NEXT: xorb %r10b, %bl
-; CHECK-SSE1-NEXT: andb 31(%r15), %bl
-; CHECK-SSE1-NEXT: xorb %r10b, %bl
-; CHECK-SSE1-NEXT: movb %bl, 31(%r12)
-; CHECK-SSE1-NEXT: movb %al, 30(%r12)
-; CHECK-SSE1-NEXT: movb %cl, 29(%r12)
-; CHECK-SSE1-NEXT: movb %dl, 28(%r12)
-; CHECK-SSE1-NEXT: movb %sil, 27(%r12)
-; CHECK-SSE1-NEXT: movb %dil, 26(%r12)
-; CHECK-SSE1-NEXT: movb %r8b, 25(%r12)
-; CHECK-SSE1-NEXT: movb %r9b, 24(%r12)
-; CHECK-SSE1-NEXT: movb %r11b, 23(%r12)
-; CHECK-SSE1-NEXT: movb %bpl, 22(%r12)
-; CHECK-SSE1-NEXT: movb %r13b, 21(%r12)
+; CHECK-SSE1-NEXT: movzbl 31(%r15), %r10d
+; CHECK-SSE1-NEXT: movzbl 31(%r14), %r14d
+; CHECK-SSE1-NEXT: xorb %r10b, %r14b
+; CHECK-SSE1-NEXT: andb 31(%r12), %r14b
+; CHECK-SSE1-NEXT: xorb %r10b, %r14b
+; CHECK-SSE1-NEXT: movb %r14b, 31(%r13)
+; CHECK-SSE1-NEXT: movb %al, 30(%r13)
+; CHECK-SSE1-NEXT: movb %cl, 29(%r13)
+; CHECK-SSE1-NEXT: movb %dl, 28(%r13)
+; CHECK-SSE1-NEXT: movb %sil, 27(%r13)
+; CHECK-SSE1-NEXT: movb %dil, 26(%r13)
+; CHECK-SSE1-NEXT: movb %r8b, 25(%r13)
+; CHECK-SSE1-NEXT: movb %r9b, 24(%r13)
+; CHECK-SSE1-NEXT: movb %r11b, 23(%r13)
+; CHECK-SSE1-NEXT: movb %bl, 22(%r13)
+; CHECK-SSE1-NEXT: movb %bpl, 21(%r13)
 ; CHECK-SSE1-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload
-; CHECK-SSE1-NEXT: movb %al, 20(%r12)
+; CHECK-SSE1-NEXT: movb %al, 20(%r13)
 ; CHECK-SSE1-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload
-; CHECK-SSE1-NEXT: movb %al, 19(%r12)
+; CHECK-SSE1-NEXT: movb %al, 19(%r13)
 ; CHECK-SSE1-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload
-; CHECK-SSE1-NEXT: movb %al, 18(%r12)
+; CHECK-SSE1-NEXT: movb %al, 18(%r13)
 ; CHECK-SSE1-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload
-; CHECK-SSE1-NEXT: movb %al, 17(%r12)
+; CHECK-SSE1-NEXT: movb %al, 17(%r13)
 ; CHECK-SSE1-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload
-; CHECK-SSE1-NEXT: movb %al, 16(%r12)
+; CHECK-SSE1-NEXT: movb %al, 16(%r13)
 ; CHECK-SSE1-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload
-; CHECK-SSE1-NEXT: movb %al, 15(%r12)
+; CHECK-SSE1-NEXT: movb %al, 15(%r13)
 ; CHECK-SSE1-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload
-; CHECK-SSE1-NEXT: movb %al, 14(%r12)
+; CHECK-SSE1-NEXT: movb %al, 14(%r13)
 ; CHECK-SSE1-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload
-; CHECK-SSE1-NEXT: movb %al, 13(%r12)
+; CHECK-SSE1-NEXT: movb %al, 13(%r13)
 ; CHECK-SSE1-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload
-; CHECK-SSE1-NEXT: movb %al, 12(%r12)
+; CHECK-SSE1-NEXT: movb %al, 12(%r13)
 ; CHECK-SSE1-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload
-; CHECK-SSE1-NEXT: movb %al, 11(%r12)
+; CHECK-SSE1-NEXT: movb %al, 11(%r13)
 ; CHECK-SSE1-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload
-; CHECK-SSE1-NEXT: movb %al, 10(%r12)
+; CHECK-SSE1-NEXT: movb %al, 10(%r13)
 ; CHECK-SSE1-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload
-; CHECK-SSE1-NEXT: movb %al, 9(%r12)
+; CHECK-SSE1-NEXT: movb %al, 9(%r13)
 ; CHECK-SSE1-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload
-; CHECK-SSE1-NEXT: movb %al, 8(%r12)
+; CHECK-SSE1-NEXT: movb %al, 8(%r13)
 ; CHECK-SSE1-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload
-; CHECK-SSE1-NEXT: movb %al, 7(%r12)
+; CHECK-SSE1-NEXT: movb %al, 7(%r13)
 ; CHECK-SSE1-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload
-; CHECK-SSE1-NEXT: movb %al, 6(%r12)
+; CHECK-SSE1-NEXT: movb %al, 6(%r13)
 ; CHECK-SSE1-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload
-; CHECK-SSE1-NEXT: movb %al, 5(%r12)
+; CHECK-SSE1-NEXT: movb %al, 5(%r13)
 ; CHECK-SSE1-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload
-; CHECK-SSE1-NEXT: movb %al, 4(%r12)
+; CHECK-SSE1-NEXT: movb %al, 4(%r13)
 ; CHECK-SSE1-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload
-; CHECK-SSE1-NEXT: movb %al, 3(%r12)
+; CHECK-SSE1-NEXT: movb %al, 3(%r13)
 ; CHECK-SSE1-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload
-; CHECK-SSE1-NEXT: movb %al, 2(%r12)
+; CHECK-SSE1-NEXT: movb %al, 2(%r13)
 ; CHECK-SSE1-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload
-; CHECK-SSE1-NEXT: movb %al, 1(%r12)
+; CHECK-SSE1-NEXT: movb %al, 1(%r13)
 ; CHECK-SSE1-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload
-; CHECK-SSE1-NEXT: movb %al, (%r12)
-; CHECK-SSE1-NEXT: movq %r12, %rax
+; CHECK-SSE1-NEXT: movb %al, (%r13)
+; CHECK-SSE1-NEXT: movq %r13, %rax
 ; CHECK-SSE1-NEXT: popq %rbx
 ; CHECK-SSE1-NEXT: popq %r12
 ; CHECK-SSE1-NEXT: popq %r13
diff --git a/llvm/test/CodeGen/X86/x86-64-flags-intrinsics.ll b/llvm/test/CodeGen/X86/x86-64-flags-intrinsics.ll
index d9dc117397b4a..e3bc77d4d5fa2 100644
--- a/llvm/test/CodeGen/X86/x86-64-flags-intrinsics.ll
+++ b/llvm/test/CodeGen/X86/x86-64-flags-intrinsics.ll
@@ -91,11 +91,11 @@ define i64 @read_flags_reg_pressure() nounwind {
 ; WIN64-NEXT: leaq {{[0-9]+}}(%rsp), %rbp
 ; WIN64-NEXT: #APP
 ; WIN64-NEXT: #NO_APP
-; WIN64-NEXT: movq %rcx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; WIN64-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
 ; WIN64-NEXT: pushfq
-; WIN64-NEXT: popq %rcx
-; WIN64-NEXT: movq %rcx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; WIN64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload
+; WIN64-NEXT: popq %rdx
+; WIN64-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; WIN64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Reload
 ; WIN64-NEXT: #APP
 ; WIN64-NEXT: #NO_APP
 ; WIN64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
@@ -168,11 +168,11 @@ define void @write_flags_reg_pressure(i64 noundef %0) nounwind {
 ; WIN64-NEXT: movq %rcx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
 ; WIN64-NEXT: #APP
 ; WIN64-NEXT: #NO_APP
-; WIN64-NEXT: movq %rcx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; WIN64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload
-; WIN64-NEXT: pushq %rcx
+; WIN64-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; WIN64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Reload
+; WIN64-NEXT: pushq %rdx
 ; WIN64-NEXT: popfq
-; WIN64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload
+; WIN64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Reload
 ; WIN64-NEXT: #APP
 ; WIN64-NEXT: #NO_APP
 ; WIN64-NEXT: addq $16, %rsp