diff --git a/llvm/lib/Target/AIE/AIEBaseInstrInfo.cpp b/llvm/lib/Target/AIE/AIEBaseInstrInfo.cpp index db8e2ea610d9..7871529bfd15 100644 --- a/llvm/lib/Target/AIE/AIEBaseInstrInfo.cpp +++ b/llvm/lib/Target/AIE/AIEBaseInstrInfo.cpp @@ -1046,20 +1046,57 @@ const PacketFormats &AIEBaseInstrInfo::getPacketFormats() const { return FormatInterface->getPacketFormats(); } +unsigned +AIEBaseInstrInfo::getZOLBundlesCount(const MachineBasicBlock &MBB) const { + auto First = MBB.getFirstNonDebugInstr(false); + auto Last = MBB.getLastNonDebugInstr(false); + + // If MBB is empty or has no non-debug instructions, return 0. + if (First == MBB.end() || Last == MBB.end()) + return 0; + + if (isHardwareLoopEnd(Last->getOpcode())) + return std::distance(First, Last); + + return 0; +} + +bool AIEBaseInstrInfo::isZOLBody(const MachineBasicBlock &MBB) const { + auto Last = MBB.getLastNonDebugInstr(false); + + // If MBB is empty or has no non-debug instructions, return false. + if (Last == MBB.end()) + return false; + + return isHardwareLoopEnd(Last->getOpcode()); +} + std::vector AIEBaseInstrInfo::getAlignmentBoundaries(MachineBasicBlock &MBB) const { std::vector AlgnCandidates; - unsigned DelaySlot = 0; + unsigned DelaySlot = 0; // LoopSetupDistance will be set to number of instructions (7). In // PostRAScheduler, this is enforced by setting the exit latency in the - // schduler dag mutator + // scheduler dag mutator. unsigned LoopSetupDistance = 0; + unsigned ZOLBundlesCount = 0; + unsigned ZOLBodyRegionsCount = 0; bool IsCall = false; auto ZOLSupport = getZOLSupport(); + bool IsZOLBody = isZOLBody(MBB); + if (IsZOLBody) { + // Exclude the LoopEnd bundle since it must reside in its own + // standalone region to ensure it points to a 128-bit aligned instruction. 
+ ZOLBundlesCount = std::max(getZOLBundlesCount(MBB), 1u) - 1; + if (ZOLBundlesCount < ZOLSupport->LoopSetupDistance) + ZOLBodyRegionsCount = ZOLBundlesCount; + else + ZOLBodyRegionsCount = ZOLSupport->LoopSetupDistance; + } for (auto MI = MBB.begin(), End = MBB.end(); MI != End; ++MI) { if (MI->isBundle()) { - // Return Address Candidate + // Return Address Candidate. IsCall = isCallBundle(MI); if (IsCall && DelaySlot > 0) llvm_unreachable("Cannot have branch in branch delay slot!\n"); @@ -1079,14 +1116,26 @@ AIEBaseInstrInfo::getAlignmentBoundaries(MachineBasicBlock &MBB) const { LoopSetupDistance--; } + if (IsZOLBody && ZOLBodyRegionsCount > 0) { + AlgnCandidates.emplace_back(MI); + ZOLBodyRegionsCount--; + } + if (IsCall) DelaySlot = getNumDelaySlots(*MI); // Distance in terms of fully-expanded 128-bit bundles that // loop setup should maintain. We force each of these bundles to an // alignment boundary, so that they will occupy 16 bytes. - if (ZOLSupport && isZOLSetupBundle(MI) && isLastZOLSetupBundleInMBB(MI)) - LoopSetupDistance = ZOLSupport->LoopSetupDistance; + if (ZOLSupport && isZOLSetupBundle(MI) && isLastZOLSetupBundleInMBB(MI)) { + // if we have only one MBB, it must be the loop. + if (MBB.succ_size() == 1) { + const MachineBasicBlock *LoopSucc = *MBB.successors().begin(); + ZOLBundlesCount = std::max(getZOLBundlesCount(*LoopSucc), 1u) - 1; + } + if (ZOLBundlesCount < ZOLSupport->LoopSetupDistance) + LoopSetupDistance = ZOLSupport->LoopSetupDistance - ZOLBundlesCount; + } } else if (isHardwareLoopEnd(MI->getOpcode())) { if (DelaySlot > 0) llvm_unreachable("Cannot have HWLoopEnd in branch delay slot!\n"); @@ -1095,7 +1144,7 @@ AIEBaseInstrInfo::getAlignmentBoundaries(MachineBasicBlock &MBB) const { AlgnCandidates.emplace_back(std::prev(MI)); } else if (!MI->isMetaInstruction()) { // single instruction, there should not be any - // after Bundle Finalization Pass + // after Bundle Finalization Pass. 
llvm_unreachable("Found an un-expected standalone instruction !"); } } diff --git a/llvm/lib/Target/AIE/AIEBaseInstrInfo.h b/llvm/lib/Target/AIE/AIEBaseInstrInfo.h index 9f832f02b489..34e0de74e7f6 100644 --- a/llvm/lib/Target/AIE/AIEBaseInstrInfo.h +++ b/llvm/lib/Target/AIE/AIEBaseInstrInfo.h @@ -31,7 +31,6 @@ namespace llvm { struct AIEBaseInstrInfo : public TargetInstrInfo { using TargetInstrInfo::TargetInstrInfo; - // This codifies the model of ZeroOverheadLoops class ZOLSupport { public: @@ -313,6 +312,10 @@ struct AIEBaseInstrInfo : public TargetInstrInfo { // registers(lc, le, ls, etc.) and the end of the loop, virtual unsigned getLoopSetupDistance() const; + virtual unsigned getZOLBundlesCount(const MachineBasicBlock &MBB) const; + + bool isZOLBody(const MachineBasicBlock &MBB) const; + // Return the vector of Alignment Region Boundaries. virtual std::vector getAlignmentBoundaries(MachineBasicBlock &MBB) const; diff --git a/llvm/lib/Target/AIE/AIEBaseSubtarget.cpp b/llvm/lib/Target/AIE/AIEBaseSubtarget.cpp index eea659f02faa..b723bc62e8ed 100644 --- a/llvm/lib/Target/AIE/AIEBaseSubtarget.cpp +++ b/llvm/lib/Target/AIE/AIEBaseSubtarget.cpp @@ -260,6 +260,8 @@ class RegionEndEdges : public ScheduleDAGMutation { } void apply(ScheduleDAGInstrs *DAG) override { AIE::MaxLatencyFinder MaxLatency(DAG); + MachineBasicBlock *PrologueMBB = DAG->getBB(); + unsigned int ZOLBundlesCount = 0; // Default edges to ExitSU are conservative, and can't be shrunk. // We really should know what we're doing here, so just remove and @@ -296,9 +298,18 @@ class RegionEndEdges : public ScheduleDAGMutation { if (TII->isZeroOverheadLoopSetupInstr(MI)) { auto ZOLSupport = TII->getZOLSupport(); assert(ZOLSupport); - EdgeLatency = std::max(EdgeLatency, ZOLSupport->LoopSetupDistance + 1); + if (PrologueMBB && PrologueMBB->succ_size() == 1) { + // if we have only one MBB, it must be the loop. 
+ MachineBasicBlock *LoopSucc = *PrologueMBB->successors().begin(); + // Exclude the LoopEnd bundle since it must reside in its own + // standalone region to ensure it points to a 128-bit aligned + // instruction. + ZOLBundlesCount = std::max(TII->getZOLBundlesCount(*LoopSucc), 1u) - 1; + } + if (ZOLBundlesCount < ZOLSupport->LoopSetupDistance) + EdgeLatency = std::max(EdgeLatency, ZOLSupport->LoopSetupDistance + + 1 - ZOLBundlesCount); } - ExitDep.setLatency(EdgeLatency); DAG->ExitSU.addPred(ExitDep, /*Required=*/true); } diff --git a/llvm/test/CodeGen/AIE/aie2/elongate/zol_112bytes_elongate2.mir b/llvm/test/CodeGen/AIE/aie2/elongate/zol_112bytes_elongate2.mir index f40cb7bcb140..c0844bb17dc7 100644 --- a/llvm/test/CodeGen/AIE/aie2/elongate/zol_112bytes_elongate2.mir +++ b/llvm/test/CodeGen/AIE/aie2/elongate/zol_112bytes_elongate2.mir @@ -26,6 +26,7 @@ body: | ; CHECK-NEXT: NOPV ; CHECK-NEXT: } ; CHECK-NEXT: BUNDLE implicit-def $r3 { + ; CHECK-NEXT: NOPA ; CHECK-NEXT: NOPX ; CHECK-NEXT: renamable $r3 = MOV_mv_cg 2 ; CHECK-NEXT: } @@ -36,53 +37,25 @@ body: | ; CHECK-NEXT: $ls = MOVXM_lng_cg %bb.1 ; CHECK-NEXT: } ; CHECK-NEXT: BUNDLE { - ; CHECK-NEXT: NOPB - ; CHECK-NEXT: NOPA - ; CHECK-NEXT: NOPS - ; CHECK-NEXT: NOPXM - ; CHECK-NEXT: NOPV + ; CHECK-NEXT: NOP ; CHECK-NEXT: } ; CHECK-NEXT: BUNDLE { - ; CHECK-NEXT: NOPB - ; CHECK-NEXT: NOPA - ; CHECK-NEXT: NOPS - ; CHECK-NEXT: NOPXM - ; CHECK-NEXT: NOPV + ; CHECK-NEXT: NOP ; CHECK-NEXT: } ; CHECK-NEXT: BUNDLE { - ; CHECK-NEXT: NOPB - ; CHECK-NEXT: NOPA - ; CHECK-NEXT: NOPS - ; CHECK-NEXT: NOPXM - ; CHECK-NEXT: NOPV + ; CHECK-NEXT: NOP ; CHECK-NEXT: } ; CHECK-NEXT: BUNDLE { - ; CHECK-NEXT: NOPB - ; CHECK-NEXT: NOPA - ; CHECK-NEXT: NOPS - ; CHECK-NEXT: NOPXM - ; CHECK-NEXT: NOPV + ; CHECK-NEXT: NOP ; CHECK-NEXT: } ; CHECK-NEXT: BUNDLE { - ; CHECK-NEXT: NOPB - ; CHECK-NEXT: NOPA - ; CHECK-NEXT: NOPS - ; CHECK-NEXT: NOPXM - ; CHECK-NEXT: NOPV + ; CHECK-NEXT: NOP ; CHECK-NEXT: } ; CHECK-NEXT: BUNDLE { - ; CHECK-NEXT: NOPB - ; 
CHECK-NEXT: NOPA - ; CHECK-NEXT: NOPS - ; CHECK-NEXT: NOPXM - ; CHECK-NEXT: NOPV + ; CHECK-NEXT: NOP ; CHECK-NEXT: } ; CHECK-NEXT: BUNDLE { - ; CHECK-NEXT: NOPB - ; CHECK-NEXT: NOPA - ; CHECK-NEXT: NOPS - ; CHECK-NEXT: NOPXM - ; CHECK-NEXT: NOPV + ; CHECK-NEXT: NOP ; CHECK-NEXT: } ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: bb.1 (align 16): @@ -90,12 +63,19 @@ body: | ; CHECK-NEXT: liveins: $p0, $p1, $r0, $r1, $r2, $r3 ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: BUNDLE implicit-def $r4, implicit $r2, implicit $r3 { + ; CHECK-NEXT: NOPB ; CHECK-NEXT: NOPA + ; CHECK-NEXT: NOPS ; CHECK-NEXT: renamable $r4 = LSHL renamable $r2, renamable $r3 ; CHECK-NEXT: NOPM + ; CHECK-NEXT: NOPV ; CHECK-NEXT: } ; CHECK-NEXT: BUNDLE implicit-def $le { + ; CHECK-NEXT: NOPB + ; CHECK-NEXT: NOPA + ; CHECK-NEXT: NOPS ; CHECK-NEXT: $le = MOVXM_lng_cg + ; CHECK-NEXT: NOPV ; CHECK-NEXT: } ; CHECK-NEXT: BUNDLE implicit-def $dj0, implicit killed $r4 { ; CHECK-NEXT: NOPB @@ -127,18 +107,13 @@ body: | ; CHECK-NEXT: NOPV ; CHECK-NEXT: } ; CHECK-NEXT: BUNDLE { - ; CHECK-NEXT: NOPB ; CHECK-NEXT: NOPA - ; CHECK-NEXT: NOPS + ; CHECK-NEXT: NOPB ; CHECK-NEXT: NOPXM - ; CHECK-NEXT: NOPV + ; CHECK-NEXT: NOPS ; CHECK-NEXT: } ; CHECK-NEXT: BUNDLE { - ; CHECK-NEXT: NOPB - ; CHECK-NEXT: NOPA - ; CHECK-NEXT: NOPS - ; CHECK-NEXT: NOPXM - ; CHECK-NEXT: NOPV + ; CHECK-NEXT: NOP ; CHECK-NEXT: } ; CHECK-NEXT: BUNDLE { ; CHECK-NEXT: NOPB diff --git a/llvm/test/CodeGen/AIE/aie2/elongate/zol_elongate1.mir b/llvm/test/CodeGen/AIE/aie2/elongate/zol_elongate1.mir index 84aa0553580e..a96a1c9a9dfb 100644 --- a/llvm/test/CodeGen/AIE/aie2/elongate/zol_elongate1.mir +++ b/llvm/test/CodeGen/AIE/aie2/elongate/zol_elongate1.mir @@ -69,22 +69,47 @@ body: | ; CHECK-NEXT: NOPV ; CHECK-NEXT: } ; CHECK-NEXT: BUNDLE implicit-def $dj0, implicit killed $r4 { + ; CHECK-NEXT: NOPB + ; CHECK-NEXT: NOPA + ; CHECK-NEXT: NOPS + ; CHECK-NEXT: NOPX ; CHECK-NEXT: $dj0 = MOV_mv_scl killed $r4 + ; CHECK-NEXT: NOPV ; CHECK-NEXT: } ; CHECK-NEXT: BUNDLE implicit-def 
$r4, implicit $p1, implicit killed $dj0 { + ; CHECK-NEXT: NOPB ; CHECK-NEXT: renamable $r4 = LDA_dms_lda_idx renamable $p1, killed renamable $dj0 :: (load (s32) from %ir.arrayidx) + ; CHECK-NEXT: NOPS + ; CHECK-NEXT: NOPXM + ; CHECK-NEXT: NOPV ; CHECK-NEXT: } ; CHECK-NEXT: BUNDLE { - ; CHECK-NEXT: NOP + ; CHECK-NEXT: NOPB + ; CHECK-NEXT: NOPA + ; CHECK-NEXT: NOPS + ; CHECK-NEXT: NOPXM + ; CHECK-NEXT: NOPV ; CHECK-NEXT: } ; CHECK-NEXT: BUNDLE { - ; CHECK-NEXT: NOP + ; CHECK-NEXT: NOPB + ; CHECK-NEXT: NOPA + ; CHECK-NEXT: NOPS + ; CHECK-NEXT: NOPXM + ; CHECK-NEXT: NOPV ; CHECK-NEXT: } ; CHECK-NEXT: BUNDLE { - ; CHECK-NEXT: NOP + ; CHECK-NEXT: NOPB + ; CHECK-NEXT: NOPA + ; CHECK-NEXT: NOPS + ; CHECK-NEXT: NOPXM + ; CHECK-NEXT: NOPV ; CHECK-NEXT: } ; CHECK-NEXT: BUNDLE { - ; CHECK-NEXT: NOP + ; CHECK-NEXT: NOPB + ; CHECK-NEXT: NOPA + ; CHECK-NEXT: NOPS + ; CHECK-NEXT: NOPXM + ; CHECK-NEXT: NOPV ; CHECK-NEXT: } ; CHECK-NEXT: BUNDLE { ; CHECK-NEXT: NOP diff --git a/llvm/test/CodeGen/AIE/aie2/elongate/zol_elongate2.mir b/llvm/test/CodeGen/AIE/aie2/elongate/zol_elongate2.mir index 0a264a16cecc..20526f5c8df9 100644 --- a/llvm/test/CodeGen/AIE/aie2/elongate/zol_elongate2.mir +++ b/llvm/test/CodeGen/AIE/aie2/elongate/zol_elongate2.mir @@ -252,6 +252,7 @@ body: | ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: BUNDLE implicit-def $r6 { ; CHECK-NEXT: renamable $r6 = MOVA_lda_cg -8 + ; CHECK-NEXT: NOPB ; CHECK-NEXT: NOPXM ; CHECK-NEXT: } ; CHECK-NEXT: BUNDLE implicit-def dead $r6, implicit-def $r0, implicit killed $r0 { @@ -268,53 +269,25 @@ body: | ; CHECK-NEXT: $le = MOVXM_lng_cg ; CHECK-NEXT: } ; CHECK-NEXT: BUNDLE { - ; CHECK-NEXT: NOPB - ; CHECK-NEXT: NOPA - ; CHECK-NEXT: NOPS - ; CHECK-NEXT: NOPXM - ; CHECK-NEXT: NOPV + ; CHECK-NEXT: NOP ; CHECK-NEXT: } ; CHECK-NEXT: BUNDLE { - ; CHECK-NEXT: NOPB - ; CHECK-NEXT: NOPA - ; CHECK-NEXT: NOPS - ; CHECK-NEXT: NOPXM - ; CHECK-NEXT: NOPV + ; CHECK-NEXT: NOP ; CHECK-NEXT: } ; CHECK-NEXT: BUNDLE { - ; CHECK-NEXT: NOPB - ; CHECK-NEXT: NOPA - ; 
CHECK-NEXT: NOPS - ; CHECK-NEXT: NOPXM - ; CHECK-NEXT: NOPV + ; CHECK-NEXT: NOP ; CHECK-NEXT: } ; CHECK-NEXT: BUNDLE { - ; CHECK-NEXT: NOPB - ; CHECK-NEXT: NOPA - ; CHECK-NEXT: NOPS - ; CHECK-NEXT: NOPXM - ; CHECK-NEXT: NOPV + ; CHECK-NEXT: NOP ; CHECK-NEXT: } ; CHECK-NEXT: BUNDLE { - ; CHECK-NEXT: NOPB - ; CHECK-NEXT: NOPA - ; CHECK-NEXT: NOPS - ; CHECK-NEXT: NOPXM - ; CHECK-NEXT: NOPV + ; CHECK-NEXT: NOP ; CHECK-NEXT: } ; CHECK-NEXT: BUNDLE { - ; CHECK-NEXT: NOPB - ; CHECK-NEXT: NOPA - ; CHECK-NEXT: NOPS - ; CHECK-NEXT: NOPXM - ; CHECK-NEXT: NOPV + ; CHECK-NEXT: NOP ; CHECK-NEXT: } ; CHECK-NEXT: BUNDLE { - ; CHECK-NEXT: NOPB - ; CHECK-NEXT: NOPA - ; CHECK-NEXT: NOPS - ; CHECK-NEXT: NOPXM - ; CHECK-NEXT: NOPV + ; CHECK-NEXT: NOP ; CHECK-NEXT: } ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: bb.4.for.body (align 16): @@ -330,25 +303,46 @@ body: | ; CHECK-NEXT: NOPV ; CHECK-NEXT: } ; CHECK-NEXT: BUNDLE implicit-def $r8, implicit $r5, implicit $r6 { - ; CHECK-NEXT: NOPA ; CHECK-NEXT: NOPB + ; CHECK-NEXT: NOPA + ; CHECK-NEXT: NOPS ; CHECK-NEXT: renamable $r8 = LSHL renamable $r5, renamable $r6 + ; CHECK-NEXT: NOPM + ; CHECK-NEXT: NOPV ; CHECK-NEXT: } ; CHECK-NEXT: BUNDLE implicit-def dead $r8, implicit-def $dj0, implicit $r4, implicit $r1, implicit $r27 { + ; CHECK-NEXT: NOPB + ; CHECK-NEXT: NOPA + ; CHECK-NEXT: NOPS ; CHECK-NEXT: renamable $r8 = SELNEZ renamable $r4, renamable $r1, renamable $r27 ; CHECK-NEXT: $dj0 = MOV_mv_scl internal killed $r8 + ; CHECK-NEXT: NOPV ; CHECK-NEXT: } ; CHECK-NEXT: BUNDLE { - ; CHECK-NEXT: NOP + ; CHECK-NEXT: NOPB + ; CHECK-NEXT: NOPA + ; CHECK-NEXT: NOPS + ; CHECK-NEXT: NOPXM + ; CHECK-NEXT: NOPV ; CHECK-NEXT: } ; CHECK-NEXT: BUNDLE { - ; CHECK-NEXT: NOP + ; CHECK-NEXT: NOPB + ; CHECK-NEXT: NOPA + ; CHECK-NEXT: NOPS + ; CHECK-NEXT: NOPXM + ; CHECK-NEXT: NOPV ; CHECK-NEXT: } ; CHECK-NEXT: BUNDLE { - ; CHECK-NEXT: NOP + ; CHECK-NEXT: NOPB + ; CHECK-NEXT: NOPA + ; CHECK-NEXT: NOPS + ; CHECK-NEXT: NOPXM + ; CHECK-NEXT: NOPV ; CHECK-NEXT: } ; 
CHECK-NEXT: BUNDLE { - ; CHECK-NEXT: NOP + ; CHECK-NEXT: NOPA + ; CHECK-NEXT: NOPB + ; CHECK-NEXT: NOPX ; CHECK-NEXT: } ; CHECK-NEXT: BUNDLE implicit-def $r7, implicit-def $srcarry, implicit killed $r7, implicit killed $r8 { ; CHECK-NEXT: renamable $r7 = nsw ADD killed renamable $r7, killed renamable $r8, implicit-def $srcarry @@ -617,6 +611,7 @@ body: | ; CHECK-NEXT: NOPV ; CHECK-NEXT: } ; CHECK-NEXT: BUNDLE { + ; CHECK-NEXT: NOPA ; CHECK-NEXT: $lc = MOV_mv_scl $r5 ; CHECK-NEXT: } ; CHECK-NEXT: BUNDLE { @@ -626,53 +621,25 @@ body: | ; CHECK-NEXT: $le = MOVXM_lng_cg ; CHECK-NEXT: } ; CHECK-NEXT: BUNDLE { - ; CHECK-NEXT: NOPB - ; CHECK-NEXT: NOPA - ; CHECK-NEXT: NOPS - ; CHECK-NEXT: NOPXM - ; CHECK-NEXT: NOPV + ; CHECK-NEXT: NOP ; CHECK-NEXT: } ; CHECK-NEXT: BUNDLE { - ; CHECK-NEXT: NOPB - ; CHECK-NEXT: NOPA - ; CHECK-NEXT: NOPS - ; CHECK-NEXT: NOPXM - ; CHECK-NEXT: NOPV + ; CHECK-NEXT: NOP ; CHECK-NEXT: } ; CHECK-NEXT: BUNDLE { - ; CHECK-NEXT: NOPB - ; CHECK-NEXT: NOPA - ; CHECK-NEXT: NOPS - ; CHECK-NEXT: NOPXM - ; CHECK-NEXT: NOPV + ; CHECK-NEXT: NOP ; CHECK-NEXT: } ; CHECK-NEXT: BUNDLE { - ; CHECK-NEXT: NOPB - ; CHECK-NEXT: NOPA - ; CHECK-NEXT: NOPS - ; CHECK-NEXT: NOPXM - ; CHECK-NEXT: NOPV + ; CHECK-NEXT: NOP ; CHECK-NEXT: } ; CHECK-NEXT: BUNDLE { - ; CHECK-NEXT: NOPB - ; CHECK-NEXT: NOPA - ; CHECK-NEXT: NOPS - ; CHECK-NEXT: NOPXM - ; CHECK-NEXT: NOPV + ; CHECK-NEXT: NOP ; CHECK-NEXT: } ; CHECK-NEXT: BUNDLE { - ; CHECK-NEXT: NOPB - ; CHECK-NEXT: NOPA - ; CHECK-NEXT: NOPS - ; CHECK-NEXT: NOPXM - ; CHECK-NEXT: NOPV + ; CHECK-NEXT: NOP ; CHECK-NEXT: } ; CHECK-NEXT: BUNDLE { - ; CHECK-NEXT: NOPB - ; CHECK-NEXT: NOPA - ; CHECK-NEXT: NOPS - ; CHECK-NEXT: NOPXM - ; CHECK-NEXT: NOPV + ; CHECK-NEXT: NOP ; CHECK-NEXT: } ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: bb.7.for.body.epil (align 16): @@ -688,23 +655,44 @@ body: | ; CHECK-NEXT: NOPV ; CHECK-NEXT: } ; CHECK-NEXT: BUNDLE implicit-def $r6, implicit $r7, implicit $r0 { + ; CHECK-NEXT: NOPB + ; CHECK-NEXT: NOPA + ; 
CHECK-NEXT: NOPS ; CHECK-NEXT: renamable $r6 = LSHL renamable $r7, renamable $r0 + ; CHECK-NEXT: NOPM + ; CHECK-NEXT: NOPV ; CHECK-NEXT: } ; CHECK-NEXT: BUNDLE implicit-def dead $r6, implicit-def $dj0, implicit $r4, implicit $r1, implicit $r27 { + ; CHECK-NEXT: NOPB + ; CHECK-NEXT: NOPA + ; CHECK-NEXT: NOPS ; CHECK-NEXT: renamable $r6 = SELNEZ renamable $r4, renamable $r1, renamable $r27 ; CHECK-NEXT: $dj0 = MOV_mv_scl internal killed $r6 + ; CHECK-NEXT: NOPV ; CHECK-NEXT: } ; CHECK-NEXT: BUNDLE { - ; CHECK-NEXT: NOP + ; CHECK-NEXT: NOPB + ; CHECK-NEXT: NOPA + ; CHECK-NEXT: NOPS + ; CHECK-NEXT: NOPXM + ; CHECK-NEXT: NOPV ; CHECK-NEXT: } ; CHECK-NEXT: BUNDLE { - ; CHECK-NEXT: NOP + ; CHECK-NEXT: NOPB + ; CHECK-NEXT: NOPA + ; CHECK-NEXT: NOPS + ; CHECK-NEXT: NOPXM + ; CHECK-NEXT: NOPV ; CHECK-NEXT: } ; CHECK-NEXT: BUNDLE { - ; CHECK-NEXT: NOP + ; CHECK-NEXT: NOPB + ; CHECK-NEXT: NOPA + ; CHECK-NEXT: NOPS + ; CHECK-NEXT: NOPXM + ; CHECK-NEXT: NOPV ; CHECK-NEXT: } ; CHECK-NEXT: BUNDLE { - ; CHECK-NEXT: NOP + ; CHECK-NEXT: NOPX ; CHECK-NEXT: } ; CHECK-NEXT: BUNDLE implicit-def $r5, implicit-def $srcarry, implicit killed $r5, implicit killed $r6 { ; CHECK-NEXT: renamable $r5 = nsw ADD killed renamable $r5, killed renamable $r6, implicit-def $srcarry diff --git a/llvm/test/CodeGen/AIE/aie2/end-to-end/Add2D-red.ll b/llvm/test/CodeGen/AIE/aie2/end-to-end/Add2D-red.ll index f9eb1b2b494d..41850b316624 100644 --- a/llvm/test/CodeGen/AIE/aie2/end-to-end/Add2D-red.ll +++ b/llvm/test/CodeGen/AIE/aie2/end-to-end/Add2D-red.ll @@ -98,32 +98,31 @@ define void @add2d(ptr noalias %params, ptr noalias %ifm1_data, ptr noalias %ifm ; ASM-NEXT: mova r0, #0 // Delay Slot 1 ; ASM-NEXT: .p2align 4 ; ASM-NEXT: .LBB0_2: // %entry.new -; ASM-NEXT: nopa ; nopb ; movxm ls, #.LBB0_3 -; ASM-NEXT: vlda.ups.s32.d8 cm0, s1, [p1], m1; mov dc0, #0 -; ASM-NEXT: vlda.ups.s32.d8 cm1, s1, [p1], m1; movx r6, #-4; mov dc4, dc0 -; ASM-NEXT: vlda.3d.ups.s32.d8 cm2, s1, [p2], d0; movxm le, #.L_LEnd0 -; ASM-NEXT: 
vlda.ups.s32.d8 cm3, s1, [p1], m1; and r2, r2, r6; mov crUPSSign, r3 -; ASM-NEXT: vlda.3d.ups.s32.d8 cm6, s1, [p2], d0; add r2, r2, #-4; mov r6, #-2 -; ASM-NEXT: vlda.ups.s32.d8 cm4, s1, [p1], m1; lshl r2, r2, r6; mov s1, r1 -; ASM-NEXT: vlda.3d.ups.s32.d8 cm7, s1, [p2], d0; add r2, r2, #1 -; ASM-NEXT: vlda.3d.ups.s32.d8 cm5, s1, [p2], d0; add.nc lc, r2, #-1 -; ASM-NEXT: nopb ; nopa ; nops ; nopxm ; nopv -; ASM-NEXT: nopb ; nopa ; nops ; nopxm ; nopv -; ASM-NEXT: nopb ; nopa ; nops ; nopxm ; nopv -; ASM-NEXT: nopb ; nopa ; nops ; nopxm ; nopv -; ASM-NEXT: nopb ; nopa ; nops ; nopxm ; nopv -; ASM-NEXT: nopb ; nopa ; nops ; nopx ; mov crSRSSign, r4; nopv -; ASM-NEXT: nopb ; nopa ; nops ; nopx ; mov s0, r5; nopv +; ASM-NEXT: nopb ; vlda.ups.s32.d8 cm0, s1, [p1], m1; nops ; nopx ; mov dc0, #0; nopv +; ASM-NEXT: vlda.ups.s32.d8 cm1, s1, [p1], m1; mov dc4, dc0 +; ASM-NEXT: vlda.3d.ups.s32.d8 cm2, s1, [p2], d0 +; ASM-NEXT: vlda.ups.s32.d8 cm3, s1, [p1], m1 +; ASM-NEXT: vlda.3d.ups.s32.d8 cm6, s1, [p2], d0; mov crUPSSign, r3 +; ASM-NEXT: vlda.ups.s32.d8 cm4, s1, [p1], m1; mov s1, r1 +; ASM-NEXT: vlda.3d.ups.s32.d8 cm7, s1, [p2], d0 +; ASM-NEXT: vlda.3d.ups.s32.d8 cm5, s1, [p2], d0 +; ASM-NEXT: movxm ls, #.LBB0_3 +; ASM-NEXT: mova r6, #-4; movxm le, #.L_LEnd0 +; ASM-NEXT: and r2, r2, r6 +; ASM-NEXT: mova r6, #-2; add r2, r2, #-4 +; ASM-NEXT: lshl r2, r2, r6; mov crSRSSign, r4 +; ASM-NEXT: add r2, r2, #1; mov s0, r5 +; ASM-NEXT: add.nc lc, r2, #-1 ; ASM-NEXT: .p2align 4 ; ASM-NEXT: .LBB0_3: // %for.body ; ASM-NEXT: // =>This Inner Loop Header: Depth=1 -; ASM-NEXT: nopx ; vadd cm8, cm2, cm0, r0 -; ASM-NEXT: vlda.ups.s32.d8 cm0, s1, [p1], m1; vadd cm2, cm6, cm1, r0 -; ASM-NEXT: vlda.ups.s32.d8 cm1, s1, [p1], m1; vadd cm6, cm7, cm3, r0 -; ASM-NEXT: vlda.ups.s32.d8 cm3, s1, [p1], m1; vadd cm4, cm5, cm4, r0 -; ASM-NEXT: vlda.3d.ups.s32.d8 cm2, s1, [p2], d0 -; ASM-NEXT: vlda.3d.ups.s32.d8 cm6, s1, [p2], d0 -; ASM-NEXT: vlda.3d.ups.s32.d8 cm7, s1, [p2], d0; vst.srs.d8.s32 cm8, s0, 
[p3], #32 +; ASM-NEXT: nopb ; nopa ; nops ; nopxm ; vadd cm8, cm2, cm0, r0 +; ASM-NEXT: nopb ; vlda.ups.s32.d8 cm0, s1, [p1], m1; nops ; nopxm ; vadd cm2, cm6, cm1, r0 +; ASM-NEXT: nopb ; vlda.ups.s32.d8 cm1, s1, [p1], m1; nops ; nopxm ; vadd cm6, cm7, cm3, r0 +; ASM-NEXT: nopb ; vlda.ups.s32.d8 cm3, s1, [p1], m1; nops ; nopxm ; vadd cm4, cm5, cm4, r0 +; ASM-NEXT: nopb ; vlda.3d.ups.s32.d8 cm2, s1, [p2], d0; nops ; nopxm ; nopv +; ASM-NEXT: nopb ; vlda.3d.ups.s32.d8 cm6, s1, [p2], d0; nops ; nopxm ; nopv +; ASM-NEXT: vlda.3d.ups.s32.d8 cm7, s1, [p2], d0; nopb ; nopxm ; vst.srs.d8.s32 cm8, s0, [p3], #32 ; ASM-NEXT: vlda.ups.s32.d8 cm4, s1, [p1], m1; vst.srs.d8.s32 cm2, s0, [p3], #32 ; ASM-NEXT: vlda.3d.ups.s32.d8 cm5, s1, [p2], d0; vst.srs.d8.s32 cm6, s0, [p3], #32 ; ASM-NEXT: vst.srs.d8.s32 cm4, s0, [p3], #32 diff --git a/llvm/test/CodeGen/AIE/aie2/end-to-end/Conv2D-red-swp.ll b/llvm/test/CodeGen/AIE/aie2/end-to-end/Conv2D-red-swp.ll index 20363bcb458f..3fdd8183e818 100644 --- a/llvm/test/CodeGen/AIE/aie2/end-to-end/Conv2D-red-swp.ll +++ b/llvm/test/CodeGen/AIE/aie2/end-to-end/Conv2D-red-swp.ll @@ -469,26 +469,26 @@ define dso_local void @conv2d.loop.nest(ptr %add.ptr6.i51, ptr %add.ptr5, ptr %c ; ZOL-NEXT: vlda.ups.s32.s16 bml7, s0, [p2], m5; movxm ls, #.LBB0_2 ; ZOL-NEXT: vldb wl5, [p0], m6; mov r1, p0 ; ZOL-NEXT: vldb wh5, [p0], m6; movxm le, #.L_LEnd0 -; ZOL-NEXT: vlda.ups.s32.s16 bmh6, s0, [p2, #32]; and r0, r0, r9; add.nc lc, r5, #-2 -; ZOL-NEXT: vldb wl3, [p0], m6; nopa ; nops ; add r0, r0, #33; nopm ; nopv -; ZOL-NEXT: vldb.3d wh3, [p0], d0; nopa ; nops ; nopx ; vshift.align x4, x4, s1, x3, r0; nopv -; ZOL-NEXT: nopb ; vlda.ups.s32.s16 bml6, s0, [p2, #0]; nops ; and r1, r1, r9; vshift.align x2, x2, s1, x7, r0; nopv -; ZOL-NEXT: vldb wh1, [p1], #32; nopa ; nops ; add r0, r1, #33; mov r1, p0; nopv -; ZOL-NEXT: vldb wl10, [p1], #32; nopa ; nops ; nopx ; vshuffle x7, x4, x2, r2; nopv -; ZOL-NEXT: vldb wh10, [p1], #32; nopa ; nops ; nopx ; vshuffle x9, x7, x0, 
r8; nopv -; ZOL-NEXT: nopb ; nopa ; nops ; and r1, r1, r9; nopm ; nopv +; ZOL-NEXT: vlda.ups.s32.s16 bmh6, s0, [p2, #32]; and r0, r0, r9 +; ZOL-NEXT: vldb wl3, [p0], m6; add r0, r0, #33 +; ZOL-NEXT: vldb.3d wh3, [p0], d0; vshift.align x4, x4, s1, x3, r0 +; ZOL-NEXT: vlda.ups.s32.s16 bml6, s0, [p2, #0]; and r1, r1, r9; vshift.align x2, x2, s1, x7, r0 +; ZOL-NEXT: vldb wh1, [p1], #32; add r0, r1, #33; mov r1, p0 +; ZOL-NEXT: vldb wl10, [p1], #32; vshuffle x7, x4, x2, r2 +; ZOL-NEXT: vldb wh10, [p1], #32; vshuffle x9, x7, x0, r8 +; ZOL-NEXT: and r1, r1, r9; add.nc lc, r5, #-2 ; ZOL-NEXT: .p2align 4 ; ZOL-NEXT: .LBB0_2: // %inner.loop ; ZOL-NEXT: // Parent Loop BB0_1 Depth=1 ; ZOL-NEXT: // => This Inner Loop Header: Depth=2 -; ZOL-NEXT: nopa ; nopx ; vshuffle x9, x4, x2, r3; vmac cm1, cm1, x9, x6, r4 -; ZOL-NEXT: vldb wl5, [p0], m6; vshift.align x4, x4, s1, x5, r0; vmac cm4, cm4, x9, x8, r4 -; ZOL-NEXT: vldb wh5, [p0], m6; vshift.align x2, x2, s1, x3, r0 -; ZOL-NEXT: vldb wl3, [p0], m6; vshuffle x11, x9, x0, r8; vmac cm0, cm0, x7, x6, r4 -; ZOL-NEXT: vldb.3d wh3, [p0], d0; vshuffle x7, x4, x2, r2; vmac cm5, cm5, x7, x8, r4 -; ZOL-NEXT: vldb wl1, [p1], #32; vshuffle x9, x7, x0, r8; vmac cm2, cm2, x9, x6, r4 -; ZOL-NEXT: vldb wh1, [p1], #32; vmov x6, x1; vmac cm7, cm7, x9, x8, r4 -; ZOL-NEXT: vldb wl10, [p1], #32; add r0, r1, #33; mov r1, p0; vmac cm3, cm3, x11, x6, r4 +; ZOL-NEXT: nopb ; nopa ; nops ; nopx ; vshuffle x9, x4, x2, r3; vmac cm1, cm1, x9, x6, r4 +; ZOL-NEXT: vldb wl5, [p0], m6; nopa ; nops ; nopx ; vshift.align x4, x4, s1, x5, r0; vmac cm4, cm4, x9, x8, r4 +; ZOL-NEXT: vldb wh5, [p0], m6; nopa ; nops ; nopx ; vshift.align x2, x2, s1, x3, r0; nopv +; ZOL-NEXT: vldb wl3, [p0], m6; nopa ; nops ; nopx ; vshuffle x11, x9, x0, r8; vmac cm0, cm0, x7, x6, r4 +; ZOL-NEXT: vldb.3d wh3, [p0], d0; nopa ; nops ; nopx ; vshuffle x7, x4, x2, r2; vmac cm5, cm5, x7, x8, r4 +; ZOL-NEXT: vldb wl1, [p1], #32; nopa ; nops ; nopx ; vshuffle x9, x7, x0, r8; vmac cm2, cm2, x9, x6, 
r4 +; ZOL-NEXT: vldb wh1, [p1], #32; nopa ; nops ; nopx ; vmov x6, x1; vmac cm7, cm7, x9, x8, r4 +; ZOL-NEXT: vldb wl10, [p1], #32; nopa ; nops ; add r0, r1, #33; mov r1, p0; vmac cm3, cm3, x11, x6, r4 ; ZOL-NEXT: .L_LEnd0: ; ZOL-NEXT: vldb wh10, [p1], #32; nopa ; nops ; and r1, r1, r9; vmov x8, x10; vmac cm6, cm6, x11, x8, r4 ; ZOL-NEXT: // %bb.3: // in Loop: Header=BB0_1 Depth=1 diff --git a/llvm/test/CodeGen/AIE/aie2/end-to-end/TanhTemplated-swp.ll b/llvm/test/CodeGen/AIE/aie2/end-to-end/TanhTemplated-swp.ll index 4a58268d3241..3c386b11e2d8 100644 --- a/llvm/test/CodeGen/AIE/aie2/end-to-end/TanhTemplated-swp.ll +++ b/llvm/test/CodeGen/AIE/aie2/end-to-end/TanhTemplated-swp.ll @@ -62,8 +62,8 @@ define dso_local void @TanhTemplated(ptr noalias %ifm, ptr noalias %ofm, ptr non ; CHECK-LABEL: TanhTemplated: ; CHECK: .p2align 4 ; CHECK-NEXT: // %bb.0: // %for.body.lr.ph -; CHECK-NEXT: nop ; movxm r3, #16512 -; CHECK-NEXT: movxm r4, #-16256 +; CHECK-NEXT: nopb ; nopa ; nops ; movxm r3, #16512; nopv +; CHECK-NEXT: nopa ; movxm r4, #-16256 ; CHECK-NEXT: movxm r5, #32767 ; CHECK-NEXT: movxm r0, #16256 ; CHECK-NEXT: movxm r1, #16384 @@ -97,26 +97,26 @@ define dso_local void @TanhTemplated(ptr noalias %ifm, ptr noalias %ofm, ptr non ; CHECK-NEXT: nop ; CHECK-NEXT: vmov wh5, wl2; vmac.f bmh3, bmh0, x3, x4, r1 ; CHECK-NEXT: vmul.f bmh5, x0, x7, r1 -; CHECK-NEXT: movxm ls, #.LBB0_1; vmac.f bmh6, bmh0, x5, x4, r1 -; CHECK-NEXT: vconv.bf16.fp32 wl7, bmh2; movxm le, #.L_LEnd0; vmul.f bmh7, x0, x7, r1 -; CHECK-NEXT: vconv.bf16.fp32 wl3, bmh4; add.nc lc, r2, #-2 -; CHECK-NEXT: nopb ; nopa ; nops ; nopxm ; vmsc.f bmh3, bmh3, x7, x3, r1 -; CHECK-NEXT: nopb ; nopa ; nops ; nopxm ; vmsc.f bml4, bmh6, x3, x5, r1 -; CHECK-NEXT: nopb ; nopa ; vconv.bf16.fp32 wl3, bmh5; nopxm ; nopv -; CHECK-NEXT: nopb ; nopa ; nops ; nopxm ; nopv -; CHECK-NEXT: nopb ; nopa ; vconv.bf16.fp32 wl5, bmh7; nopx ; vmin_ge.bf16 x3, r16, x3, x1; nopv -; CHECK-NEXT: nopb ; nopa ; nops ; nopx ; vmax_lt.bf16 x3, r16, 
x3, x10; nopv -; CHECK-NEXT: nopb ; mova r0, #28; vconv.bf16.fp32 wl7, bmh3; nopx ; vmin_ge.bf16 x11, r16, x5, x1; nopv +; CHECK-NEXT: vmac.f bmh6, bmh0, x5, x4, r1 +; CHECK-NEXT: vconv.bf16.fp32 wl7, bmh2; vmul.f bmh7, x0, x7, r1 +; CHECK-NEXT: vconv.bf16.fp32 wl3, bmh4 +; CHECK-NEXT: vmsc.f bmh3, bmh3, x7, x3, r1 +; CHECK-NEXT: movxm ls, #.LBB0_1; vmsc.f bml4, bmh6, x3, x5, r1 +; CHECK-NEXT: vconv.bf16.fp32 wl3, bmh5; movxm le, #.L_LEnd0 +; CHECK-NEXT: add.nc lc, r2, #-2 +; CHECK-NEXT: vconv.bf16.fp32 wl5, bmh7; vmin_ge.bf16 x3, r16, x3, x1 +; CHECK-NEXT: vmax_lt.bf16 x3, r16, x3, x10 +; CHECK-NEXT: mova r0, #28; vconv.bf16.fp32 wl7, bmh3; vmin_ge.bf16 x11, r16, x5, x1 ; CHECK-NEXT: .p2align 4 ; CHECK-NEXT: .LBB0_1: // %for.body ; CHECK-NEXT: // =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: nopb ; nopa ; nops ; nopx ; vband x9, x8, x3; nopv -; CHECK-NEXT: vldb wl7, [p0], #32; vmov wh3, wl2 -; CHECK-NEXT: nopx ; vmov wh9, wl2; vmul.f bmh5, x7, x0, r1 -; CHECK-NEXT: vconv.bf16.fp32 wl7, bml4; vldb wl7, [p0], #32; vmax_lt.bf16 x5, r16, x11, x10; vmac.f bmh4, bmh0, x3, x4, r1 -; CHECK-NEXT: vband x9, x8, x5; vmul.f bmh2, x6, x9, r1 -; CHECK-NEXT: vmov wh9, wl2; vmul.f bmh6, x7, x0, r1 -; CHECK-NEXT: vsub.f bml0, bmh5, bmh1, r0 +; CHECK-NEXT: vldb wl7, [p0], #32; nopa ; nops ; nopx ; vmov wh3, wl2; nopv +; CHECK-NEXT: nopb ; nopa ; nops ; nopx ; vmov wh9, wl2; vmul.f bmh5, x7, x0, r1 +; CHECK-NEXT: vldb wl7, [p0], #32; nopa ; vconv.bf16.fp32 wl7, bml4; nopx ; vmax_lt.bf16 x5, r16, x11, x10; vmac.f bmh4, bmh0, x3, x4, r1 +; CHECK-NEXT: nopb ; nopa ; nops ; nopx ; vband x9, x8, x5; vmul.f bmh2, x6, x9, r1 +; CHECK-NEXT: nopb ; nopa ; nops ; nopx ; vmov wh9, wl2; vmul.f bmh6, x7, x0, r1 +; CHECK-NEXT: nopb ; nopa ; nops ; nopxm ; vsub.f bml0, bmh5, bmh1, r0 ; CHECK-NEXT: vmul.f bmh3, x6, x9, r1 ; CHECK-NEXT: vmul.f bmh7, x0, x7, r1 ; CHECK-NEXT: vmov wh5, wl2; vsub.f bml1, bmh6, bmh1, r0 diff --git a/llvm/test/CodeGen/AIE/aie2/hardware-loops/zol-loop.ll 
b/llvm/test/CodeGen/AIE/aie2/hardware-loops/zol-loop.ll index e25c7995dc8e..44076822c592 100644 --- a/llvm/test/CodeGen/AIE/aie2/hardware-loops/zol-loop.ll +++ b/llvm/test/CodeGen/AIE/aie2/hardware-loops/zol-loop.ll @@ -22,26 +22,19 @@ define void @simple_loop(i32 noundef %n, ptr nocapture readonly %in, ptr nocaptu ; CHECK-NEXT: nop // Delay Slot 2 ; CHECK-NEXT: nop // Delay Slot 1 ; CHECK-NEXT: // %bb.1: // %for.body.preheader -; CHECK-NEXT: add.nc lc, r0, #0 -; CHECK-NEXT: movxm ls, #.LBB0_2 -; CHECK-NEXT: movxm le, #.L_LEnd0 -; CHECK-NEXT: nopb ; nopa ; nops ; nopxm ; nopv -; CHECK-NEXT: nopb ; nopa ; nops ; nopxm ; nopv +; CHECK-NEXT: nopa ; nopb ; nopx ; add.nc lc, r0, #0 +; CHECK-NEXT: mova r2, #1; movxm ls, #.LBB0_2 +; CHECK-NEXT: mova r0, #2; movxm le, #.L_LEnd0 +; CHECK-NEXT: .p2align 4 +; CHECK-NEXT: .LBB0_2: // %for.body +; CHECK-NEXT: // =>This Inner Loop Header: Depth=1 +; CHECK-NEXT: nopb ; lda r3, [p0, #0]; nops ; nopxm ; nopv ; CHECK-NEXT: nopb ; nopa ; nops ; nopxm ; nopv ; CHECK-NEXT: nopb ; nopa ; nops ; nopxm ; nopv ; CHECK-NEXT: nopb ; nopa ; nops ; nopxm ; nopv ; CHECK-NEXT: nopb ; nopa ; nops ; nopxm ; nopv -; CHECK-NEXT: nopb ; mova r0, #2; nops ; movx r2, #1; nopm ; nopv -; CHECK-NEXT: .p2align 4 -; CHECK-NEXT: .LBB0_2: // %for.body -; CHECK-NEXT: // =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: lda r3, [p0, #0]; nopb ; nopx -; CHECK-NEXT: nop -; CHECK-NEXT: nop -; CHECK-NEXT: nop -; CHECK-NEXT: nop -; CHECK-NEXT: lshl r4, r1, r0 -; CHECK-NEXT: add r1, r1, #1 +; CHECK-NEXT: nopb ; nopa ; nops ; lshl r4, r1, r0; nopm ; nopv +; CHECK-NEXT: nopa ; nopb ; add r1, r1, #1 ; CHECK-NEXT: add r3, r2, r3; mov dj0, r4 ; CHECK-NEXT: .L_LEnd0: ; CHECK-NEXT: nopb ; nopa ; st r3, [p1, dj0]; add r2, r2, #-1; nopm ; nopv @@ -85,7 +78,6 @@ define i32 @static_bounded_loop(i32 %num) { ; CHECK-NEXT: nopb ; nopa ; nops ; nopxm ; nopv ; CHECK-NEXT: nopb ; nopa ; nops ; nopxm ; nopv ; CHECK-NEXT: nopb ; nopa ; nops ; nopxm ; nopv -; CHECK-NEXT: nopb ; nopa ; 
nops ; nopxm ; nopv ; CHECK-NEXT: nopb ; nopa ; nops ; nopx ; mov r0, r1; nopv ; CHECK-NEXT: .p2align 4 ; CHECK-NEXT: .LBB1_1: // %for.body diff --git a/llvm/test/CodeGen/AIE/aie2/schedule/postpipeliner/add-store-notc.mir b/llvm/test/CodeGen/AIE/aie2/schedule/postpipeliner/add-store-notc.mir index cbf87c7d32fd..f86c35ccfe04 100644 --- a/llvm/test/CodeGen/AIE/aie2/schedule/postpipeliner/add-store-notc.mir +++ b/llvm/test/CodeGen/AIE/aie2/schedule/postpipeliner/add-store-notc.mir @@ -32,7 +32,6 @@ ; CHECK-NEXT: nopb ; nopa ; nops ; nopxm ; nopv ; CHECK-NEXT: nopb ; nopa ; nops ; nopxm ; nopv ; CHECK-NEXT: nopb ; nopa ; nops ; nopxm ; nopv - ; CHECK-NEXT: nopb ; nopa ; nops ; nopxm ; nopv ; CHECK-NEXT: nopb ; nopa ; nops ; movx r1, #0; nopm ; nopv ; CHECK-NEXT: .p2align 4 ; CHECK-NEXT: .LBB0_2: // %for.body diff --git a/llvm/test/CodeGen/AIE/aie2/schedule/postpipeliner/add-store-tc1.mir b/llvm/test/CodeGen/AIE/aie2/schedule/postpipeliner/add-store-tc1.mir index 0048488b2ce9..a7017c3d4c50 100644 --- a/llvm/test/CodeGen/AIE/aie2/schedule/postpipeliner/add-store-tc1.mir +++ b/llvm/test/CodeGen/AIE/aie2/schedule/postpipeliner/add-store-tc1.mir @@ -32,7 +32,6 @@ ; CHECK-NEXT: nopb ; nopa ; nops ; nopxm ; nopv ; CHECK-NEXT: nopb ; nopa ; nops ; nopxm ; nopv ; CHECK-NEXT: nopb ; nopa ; nops ; nopxm ; nopv - ; CHECK-NEXT: nopb ; nopa ; nops ; nopxm ; nopv ; CHECK-NEXT: nopb ; nopa ; nops ; movx r1, #0; nopm ; nopv ; CHECK-NEXT: .p2align 4 ; CHECK-NEXT: .LBB0_2: // %for.body diff --git a/llvm/test/CodeGen/AIE/aie2/schedule/postpipeliner/add-store-updated.mir b/llvm/test/CodeGen/AIE/aie2/schedule/postpipeliner/add-store-updated.mir index 018d7072cd97..22df9a1bc844 100644 --- a/llvm/test/CodeGen/AIE/aie2/schedule/postpipeliner/add-store-updated.mir +++ b/llvm/test/CodeGen/AIE/aie2/schedule/postpipeliner/add-store-updated.mir @@ -33,7 +33,6 @@ ; CHECK-NEXT: nopb ; nopa ; nops ; nopxm ; nopv ; CHECK-NEXT: nopb ; nopa ; nops ; nopxm ; nopv ; CHECK-NEXT: nopb ; nopa ; nops ; nopxm ; 
nopv - ; CHECK-NEXT: nopb ; nopa ; nops ; nopxm ; nopv ; CHECK-NEXT: nopb ; nopa ; nops ; movx r1, #0; nopm ; nopv ; CHECK-NEXT: .p2align 4 ; CHECK-NEXT: .LBB0_2: // %for.body diff --git a/llvm/test/CodeGen/AIE/aie2/schedule/postpipeliner/add-store.mir b/llvm/test/CodeGen/AIE/aie2/schedule/postpipeliner/add-store.mir index cf0e7cadb10f..f0c7699a1ddc 100644 --- a/llvm/test/CodeGen/AIE/aie2/schedule/postpipeliner/add-store.mir +++ b/llvm/test/CodeGen/AIE/aie2/schedule/postpipeliner/add-store.mir @@ -12,50 +12,6 @@ --- | define dso_local void @addStore(ptr addrspace(5) noalias nocapture writeonly %d, i32 noundef %n) local_unnamed_addr #0 { - ; CHECK-LABEL: addStore: - ; CHECK: .p2align 4 - ; CHECK-NEXT: // %bb.0: // %entry - ; CHECK-NEXT: mova r1, #0; nopb ; nopxm - ; CHECK-NEXT: ge r1, r1, r0 - ; CHECK-NEXT: jnz r1, #.LBB0_4 - ; CHECK-NEXT: nop // Delay Slot 5 - ; CHECK-NEXT: nop // Delay Slot 4 - ; CHECK-NEXT: nop // Delay Slot 3 - ; CHECK-NEXT: nop // Delay Slot 2 - ; CHECK-NEXT: nop // Delay Slot 1 - ; CHECK-NEXT: // %bb.1: // %for.body.preheader - ; CHECK-NEXT: add.nc lc, r0, #-1 - ; CHECK-NEXT: movxm ls, #.LBB0_2 - ; CHECK-NEXT: movxm le, #.L_LEnd0 - ; CHECK-NEXT: nopb ; nopa ; nops ; nopxm ; nopv - ; CHECK-NEXT: nopb ; nopa ; nops ; nopxm ; nopv - ; CHECK-NEXT: nopb ; nopa ; nops ; nopxm ; nopv - ; CHECK-NEXT: nopb ; nopa ; nops ; nopxm ; nopv - ; CHECK-NEXT: nopb ; nopa ; nops ; nopxm ; nopv - ; CHECK-NEXT: nopb ; nopa ; nops ; movx r1, #0; nopm ; nopv - ; CHECK-NEXT: nopb ; nopa ; nops ; add r1, r1, #1; nopm ; nopv - ; CHECK-NEXT: .p2align 4 - ; CHECK-NEXT: .LBB0_2: // %for.body - ; CHECK-NEXT: // =>This Inner Loop Header: Depth=1 - ; CHECK-NEXT: .L_LEnd0: - ; CHECK-NEXT: nopb ; nopa ; st r1, [p0], #4; add r1, r1, #1; nopm ; nopv - ; CHECK-NEXT: // %bb.3: // %for.cond.cleanup - ; CHECK-NEXT: nopb ; nopa ; st r1, [p0], #4; nopxm ; nopv - ; CHECK-NEXT: nopx - ; CHECK-NEXT: nop - ; CHECK-NEXT: nop - ; CHECK-NEXT: nop - ; CHECK-NEXT: nop - ; CHECK-NEXT: nop - ; 
CHECK-NEXT: nop - ; CHECK-NEXT: .p2align 4 - ; CHECK-NEXT: .LBB0_4: // %for.cond.cleanup - ; CHECK-NEXT: nopa ; ret lr - ; CHECK-NEXT: nop // Delay Slot 5 - ; CHECK-NEXT: nop // Delay Slot 4 - ; CHECK-NEXT: nop // Delay Slot 3 - ; CHECK-NEXT: nop // Delay Slot 2 - ; CHECK-NEXT: nop // Delay Slot 1 entry: %cmp5 = icmp sgt i32 %n, 0 br i1 %cmp5, label %for.body.preheader, label %for.cond.cleanup @@ -136,3 +92,5 @@ body: | DelayedSchedBarrier ... +## NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line: +# CHECK: {{.*}} diff --git a/llvm/test/CodeGen/AIE/aie2/schedule/postpipeliner/bitwisenot.mir b/llvm/test/CodeGen/AIE/aie2/schedule/postpipeliner/bitwisenot.mir index a0b6a1c2738f..b24ad21d6e7c 100644 --- a/llvm/test/CodeGen/AIE/aie2/schedule/postpipeliner/bitwisenot.mir +++ b/llvm/test/CodeGen/AIE/aie2/schedule/postpipeliner/bitwisenot.mir @@ -13,10 +13,10 @@ define dso_local void @bitNot(ptr addrspace(5) noalias nocapture writeonly %d, ptr addrspace(6) noalias nocapture readonly %s, i32 noundef %n) local_unnamed_addr #0 { ; CHECK-LABEL: bitNot: ; CHECK: // %bb.0: - ; CHECK-NEXT: nopa ; vldb wh0, [p0, #32]; nopx ; add.nc lc, r0, #-5 - ; CHECK-NEXT: vldb wl0, [p0], #64; movxm ls, #.LBB0_2 - ; CHECK-NEXT: vldb wh0, [p0, #32]; movxm le, #.L_LEnd0 - ; CHECK-NEXT: vldb wl0, [p0], #64; nopa ; nops ; nopxm ; nopv + ; CHECK-NEXT: vldb wh0, [p0, #32]; nopx + ; CHECK-NEXT: vldb wl0, [p0], #64; add.nc lc, r0, #-5 + ; CHECK-NEXT: vldb wh0, [p0, #32]; movxm ls, #.LBB0_2 + ; CHECK-NEXT: vldb wl0, [p0], #64; movxm le, #.L_LEnd0 ; CHECK-NEXT: vldb wh0, [p0, #32]; nopa ; nops ; nopxm ; nopv ; CHECK-NEXT: vldb wl0, [p0], #64; nopa ; nops ; nopxm ; nopv ; CHECK-NEXT: vldb wh0, [p0, #32]; nopa ; nops ; nopxm ; nopv diff --git a/llvm/test/CodeGen/AIE/aie2/schedule/postpipeliner/bitwisexor.mir b/llvm/test/CodeGen/AIE/aie2/schedule/postpipeliner/bitwisexor.mir index 3c8f3fda736f..300a4a7ae621 100644 --- 
a/llvm/test/CodeGen/AIE/aie2/schedule/postpipeliner/bitwisexor.mir +++ b/llvm/test/CodeGen/AIE/aie2/schedule/postpipeliner/bitwisexor.mir @@ -38,16 +38,16 @@ ; CHECK-NEXT: nopb ; nopa ; nops ; nopx ; vbneg_ltz.s8 x3, r25:r24, x1; nopv ; CHECK-NEXT: vldb wh0, [p1, #32]; vlda wh1, [p0, #32]; nops ; nopx ; vband x4, x0, x3; nopv ; CHECK-NEXT: vldb wl0, [p1], #64; vlda wl1, [p0], #64; nops ; nopx ; vband x5, x1, x2; nopv - ; CHECK-NEXT: nopb ; nopa ; nops ; nopx ; vbor x6, x4, x5; nopv - ; CHECK-NEXT: nopb ; nopa ; nops ; nopx ; vbneg_ltz.s8 x2, r25:r24, x0; nopv - ; CHECK-NEXT: nopb ; nopa ; vst wh6, [p2, #32]; nopx ; vbneg_ltz.s8 x3, r25:r24, x1; nopv + ; CHECK-NEXT: vbor x6, x4, x5 + ; CHECK-NEXT: vbneg_ltz.s8 x2, r25:r24, x0 + ; CHECK-NEXT: vst wh6, [p2, #32]; vbneg_ltz.s8 x3, r25:r24, x1 ; CHECK-NEXT: .p2align 4 ; CHECK-NEXT: .LBB0_2: // %for.body ; CHECK-NEXT: // =>This Inner Loop Header: Depth=1 - ; CHECK-NEXT: vlda wh1, [p0, #32]; vldb wh0, [p1, #32]; nopx ; vband x4, x0, x3; vst wl6, [p2], #64 - ; CHECK-NEXT: vlda wl1, [p0], #64; vldb wl0, [p1], #64; vband x5, x1, x2 - ; CHECK-NEXT: vbor x6, x4, x5 - ; CHECK-NEXT: vbneg_ltz.s8 x2, r25:r24, x0 + ; CHECK-NEXT: vldb wh0, [p1, #32]; vlda wh1, [p0, #32]; vst wl6, [p2], #64; nopx ; vband x4, x0, x3; nopv + ; CHECK-NEXT: vldb wl0, [p1], #64; vlda wl1, [p0], #64; nops ; nopx ; vband x5, x1, x2; nopv + ; CHECK-NEXT: nopb ; nopa ; nops ; nopx ; vbor x6, x4, x5; nopv + ; CHECK-NEXT: nopb ; nopa ; nops ; nopx ; vbneg_ltz.s8 x2, r25:r24, x0; nopv ; CHECK-NEXT: .L_LEnd0: ; CHECK-NEXT: nopb ; nopa ; vst wh6, [p2, #32]; nopx ; vbneg_ltz.s8 x3, r25:r24, x1; nopv ; CHECK-NEXT: // %bb.3: // %for.cond.cleanup diff --git a/llvm/test/CodeGen/AIE/aie2/schedule/postpipeliner/conv2d.mir b/llvm/test/CodeGen/AIE/aie2/schedule/postpipeliner/conv2d.mir index 36c37ad7e30e..1e46e2337dd3 100644 --- a/llvm/test/CodeGen/AIE/aie2/schedule/postpipeliner/conv2d.mir +++ b/llvm/test/CodeGen/AIE/aie2/schedule/postpipeliner/conv2d.mir @@ -12,155 
+12,6 @@ --- | define dso_local void @conv2d.loop.nest(ptr %add.ptr6.i51, ptr %add.ptr5, ptr %cond, ptr %cond.i50, <16 x i32> %0, i32 %cond67.i79, i20 %idx.ext.i.i81, i20 %idx.ext.i404.i, i20 %idx.ext.i410.i, i20 %idx.ext.i434.i85, i32 %1, i20 %2, i20 %3, i20 %4, i20 %5, i20 %6, i32 %7, i32 %8, i32 %or9.i.i.i.i.i96, i32 %9, i20 %idx.ext.i422.i82, i20 %10, i20 %11, i20 %12, i20 %13, i20 %14, i20 %15, i20 %16, i20 %17, i20 %18, i20 %19, i20 %20, i20 %21, i20 %22, i20 %23, i32 %conv192.i107, i20 %24, i20 %idx.ext.i428.i, i20 %25, i20 %26, i20 %27, i32 %28) #0 { - ; CHECK-LABEL: conv2d.loop.nest: - ; CHECK: .p2align 4 - ; CHECK-NEXT: // %bb.0: // %newFuncRoot - ; CHECK-NEXT: nopa ; paddb [sp], #32; nopx - ; CHECK-NEXT: st p6, [sp, #-28] // 4-byte Folded Spill - ; CHECK-NEXT: mov p6, sp - ; CHECK-NEXT: paddb [p6], #-132 - ; CHECK-NEXT: lda m5, [p6, #0]; mov p6, sp - ; CHECK-NEXT: paddb [p6], #-136 - ; CHECK-NEXT: lda r28, [p6, #0]; mov p6, sp - ; CHECK-NEXT: paddb [p6], #-140 - ; CHECK-NEXT: lda r27, [p6, #0]; mov p6, sp - ; CHECK-NEXT: paddb [p6], #-44 - ; CHECK-NEXT: lda m0, [p6, #0]; mov p6, sp - ; CHECK-NEXT: paddb [p6], #-48 - ; CHECK-NEXT: lda dj0, [p6, #0]; mov p6, sp - ; CHECK-NEXT: paddb [p6], #-52 - ; CHECK-NEXT: lda dj4, [p6, #0]; mov p6, sp - ; CHECK-NEXT: paddb [p6], #-56 - ; CHECK-NEXT: lda dn0, [p6, #0]; mov p6, sp - ; CHECK-NEXT: paddb [p6], #-60 - ; CHECK-NEXT: lda dn4, [p6, #0]; mov p6, sp - ; CHECK-NEXT: paddb [p6], #-68 - ; CHECK-NEXT: lda r10, [p6, #0]; mov p6, sp - ; CHECK-NEXT: paddb [p6], #-72; mov s0, r0 - ; CHECK-NEXT: lda dj1, [p6, #0]; mov p6, sp - ; CHECK-NEXT: paddb [p6], #-76; mov s1, r1 - ; CHECK-NEXT: lda r11, [p6, #0]; mov p6, sp - ; CHECK-NEXT: mova dj3, #0; paddb [p6], #-80; mov s2, r6 - ; CHECK-NEXT: lda dn1, [p6, #0]; mov p6, sp - ; CHECK-NEXT: paddb [p6], #-84; mov dc0, dj3 - ; CHECK-NEXT: lda r12, [p6, #0]; mov p6, sp - ; CHECK-NEXT: paddb [p6], #-88; mov dc4, dj3 - ; CHECK-NEXT: lda r13, [p6, #0]; mov p6, sp - ; CHECK-NEXT: paddb 
[p6], #-92; mov dc1, dj3 - ; CHECK-NEXT: lda dj2, [p6, #0]; mov p6, sp - ; CHECK-NEXT: paddb [p6], #-96; mov r25, dj3 - ; CHECK-NEXT: lda dj6, [p6, #0]; mov p6, sp - ; CHECK-NEXT: paddb [p6], #-100; mov dc2, dj3 - ; CHECK-NEXT: lda dn2, [p6, #0]; mov p6, sp - ; CHECK-NEXT: paddb [p6], #-104; st p7, [sp, #-32] // 4-byte Folded Spill - ; CHECK-NEXT: lda dn6, [p6, #0]; mov p6, sp - ; CHECK-NEXT: paddb [p6], #-108; mov p7, sp - ; CHECK-NEXT: lda r14, [p6, #0]; paddb [p7], #-112; mov p6, sp - ; CHECK-NEXT: lda dj7, [p7, #0]; paddb [p6], #-116; mov p7, sp - ; CHECK-NEXT: lda dn3, [p6, #0]; paddb [p7], #-40; mov p6, sp - ; CHECK-NEXT: lda m6, [p7, #0]; paddb [p6], #-120; mov dc6, dj3 - ; CHECK-NEXT: lda dn7, [p6, #0]; mov p6, sp - ; CHECK-NEXT: paddb [p6], #-36; mov dc3, dj3 - ; CHECK-NEXT: lda r15, [p6, #0]; mov p6, sp - ; CHECK-NEXT: paddb [p6], #-64; mov p7, sp - ; CHECK-NEXT: lda r24, [p6, #0]; paddb [p7], #-128; mov p6, sp - ; CHECK-NEXT: lda m7, [p7, #0]; paddb [p6], #-124; movx r8, #11; mov dc7, dj3 - ; CHECK-NEXT: lda m4, [p6, #0]; movx r9, #31; mov r26, dj3 - ; CHECK-NEXT: // implicit-def: $x4 - ; CHECK-NEXT: // implicit-def: $x2 - ; CHECK-NEXT: .p2align 4 - ; CHECK-NEXT: .LBB0_1: // %outer.loop.header - ; CHECK-NEXT: // =>This Loop Header: Depth=1 - ; CHECK-NEXT: // Child Loop BB0_2 Depth 2 - ; CHECK-NEXT: nopb ; vlda.ups.s32.s16 bmh1, s0, [p2, #32]; nops ; nopx ; mov m1, p4; nopv - ; CHECK-NEXT: vlda.ups.s32.s16 bml1, s0, [p2], m1 - ; CHECK-NEXT: vlda.ups.s32.s16 bmh2, s0, [p2, #32]; mov m2, p5 - ; CHECK-NEXT: vlda.ups.s32.s16 bml2, s0, [p2], m2 - ; CHECK-NEXT: vlda.ups.s32.s16 bmh3, s0, [p2, #32] - ; CHECK-NEXT: vlda.ups.s32.s16 bml3, s0, [p2], m1 - ; CHECK-NEXT: vlda.ups.s32.s16 bmh4, s0, [p2, #32]; mov m3, r15 - ; CHECK-NEXT: vlda.ups.s32.s16 bml4, s0, [p2], m3 - ; CHECK-NEXT: vlda.ups.s32.s16 bmh5, s0, [p2, #32] - ; CHECK-NEXT: vlda.ups.s32.s16 bml5, s0, [p2], m1 - ; CHECK-NEXT: vlda.ups.s32.s16 bmh6, s0, [p2, #32] - ; CHECK-NEXT: vlda.ups.s32.s16 bml6, s0, 
[p2], m2 - ; CHECK-NEXT: vlda.ups.s32.s16 bmh7, s0, [p2, #32] - ; CHECK-NEXT: vlda.ups.s32.s16 bml7, s0, [p2], m1 - ; CHECK-NEXT: vlda.ups.s32.s16 bmh0, s0, [p2, #32] - ; CHECK-NEXT: vlda.ups.s32.s16 bml0, s0, [p2, #0]; mov r0, p0 - ; CHECK-NEXT: vldb wl6, [p0], m6 - ; CHECK-NEXT: vldb wh6, [p0], m6 - ; CHECK-NEXT: vldb wl8, [p0], m6 - ; CHECK-NEXT: vldb.3d wh8, [p0], d0 - ; CHECK-NEXT: vldb wl10, [p1], #32; movxm ls, #.LBB0_2 - ; CHECK-NEXT: vldb wh10, [p1], #32; movxm le, #.L_LEnd0 - ; CHECK-NEXT: vldb wl7, [p1], #32; and r0, r0, r9; add.nc lc, r5, #-2 - ; CHECK-NEXT: vldb wh7, [p1], #32; nopa ; nops ; add r0, r0, #33; mov r6, p0; nopv - ; CHECK-NEXT: vldb wl6, [p0], m6; nopa ; nops ; and r5, r6, r9; vshift.align x4, x4, s1, x6, r0; nopv - ; CHECK-NEXT: vldb wh6, [p0], m6; nopa ; nops ; nopxm ; nopv - ; CHECK-NEXT: vldb wl8, [p0], m6; nopa ; nops ; add r0, r5, #33; vshift.align x2, x2, s1, x8, r0; nopv - ; CHECK-NEXT: vldb.3d wh8, [p0], d0; nopa ; nops ; nopx ; vshuffle x9, x4, x2, r2; nopv - ; CHECK-NEXT: vldb wl10, [p1], #32; nopa ; nops ; nopx ; vshuffle x1, x9, x0, r8; nopv - ; CHECK-NEXT: vldb wh10, [p1], #32; nopx ; vshuffle x3, x4, x2, r3; vmac cm1, cm1, x9, x10, r4 - ; CHECK-NEXT: vldb wl7, [p1], #32; vshuffle x5, x3, x0, r8; vmac cm2, cm2, x1, x10, r4 - ; CHECK-NEXT: vldb wh7, [p1], #32; mov r6, p0; vmac cm3, cm3, x3, x10, r4 - ; CHECK-NEXT: .p2align 4 - ; CHECK-NEXT: .LBB0_2: // %inner.loop - ; CHECK-NEXT: // Parent Loop BB0_1 Depth=1 - ; CHECK-NEXT: // => This Inner Loop Header: Depth=2 - ; CHECK-NEXT: vldb wl6, [p0], m6; nopa ; nops ; and r5, r6, r9; vshift.align x4, x4, s1, x6, r0; vmac cm4, cm4, x5, x10, r4 - ; CHECK-NEXT: nopa ; vldb wh6, [p0], m6; nopx ; vmac cm5, cm5, x9, x7, r4 - ; CHECK-NEXT: vldb wl8, [p0], m6; add r0, r5, #33; vshift.align x2, x2, s1, x8, r0; vmac cm6, cm6, x1, x7, r4 - ; CHECK-NEXT: vldb.3d wh8, [p0], d0; vshuffle x9, x4, x2, r2; vmac cm7, cm7, x3, x7, r4 - ; CHECK-NEXT: vldb wl10, [p1], #32; vshuffle x1, x9, x0, r8; vmac 
cm0, cm0, x5, x7, r4 - ; CHECK-NEXT: vldb wh10, [p1], #32; vshuffle x3, x4, x2, r3; vmac cm1, cm1, x9, x10, r4 - ; CHECK-NEXT: vldb wl7, [p1], #32; vshuffle x5, x3, x0, r8; vmac cm2, cm2, x1, x10, r4 - ; CHECK-NEXT: .L_LEnd0: - ; CHECK-NEXT: vldb wh7, [p1], #32; nopa ; nops ; nopx ; mov r6, p0; vmac cm3, cm3, x3, x10, r4 - ; CHECK-NEXT: // %bb.3: // %outer.loop.latch - ; CHECK-NEXT: // in Loop: Header=BB0_1 Depth=1 - ; CHECK-NEXT: nopb ; nopa ; nops ; and r5, r6, r9; vshift.align x4, x4, s1, x6, r0; vmac cm4, cm4, x5, x10, r4 - ; CHECK-NEXT: nopb ; nopa ; nops ; nopx ; mov s3, r6; vmac cm5, cm5, x9, x7, r4 - ; CHECK-NEXT: add r0, r5, #33; vshift.align x2, x2, s1, x8, r0; vmac cm6, cm6, x1, x7, r4 - ; CHECK-NEXT: add r7, r7, #-1; vshuffle x9, x4, x2, r2; vmac cm7, cm7, x3, x7, r4 - ; CHECK-NEXT: vshuffle x1, x9, x0, r8; vmac cm0, cm0, x5, x7, r4 - ; CHECK-NEXT: vshuffle x3, x4, x2, r3; vmac cm1, cm1, x9, x10, r4 - ; CHECK-NEXT: vshuffle x5, x3, x0, r8; vmac cm2, cm2, x1, x10, r4 - ; CHECK-NEXT: mov dc5, r26; vmac cm3, cm3, x3, x10, r4 - ; CHECK-NEXT: mov dn5, r27; vmac cm4, cm4, x5, x10, r4 - ; CHECK-NEXT: mov dj5, r28; vmac cm5, cm5, x9, x7, r4 - ; CHECK-NEXT: vst.srs.s16.s32 bmh1, s2, [p3, #32]; mov m1, r10; vmac cm6, cm6, x1, x7, r4 - ; CHECK-NEXT: vst.srs.s16.s32 bml1, s3, [p3], #64; mov m2, r13; vmac cm7, cm7, x3, x7, r4 - ; CHECK-NEXT: padda.3d [p1], d2; vst.srs.s16.s32 bmh2, s3, [p3, #32]; mov m3, r14; vmac cm0, cm0, x5, x7, r4 - ; CHECK-NEXT: vst.srs.s16.s32 bml2, s3, [p3], m4 - ; CHECK-NEXT: vst.srs.s16.s32 bmh3, s3, [p3, #32] - ; CHECK-NEXT: vst.srs.s16.s32 bml3, s3, [p3], #64 - ; CHECK-NEXT: vst.srs.s16.s32 bmh4, s3, [p3, #32] - ; CHECK-NEXT: vst.srs.s16.s32 bml4, s3, [p3], m7 - ; CHECK-NEXT: vst.srs.s16.s32 bmh5, s3, [p3, #32] - ; CHECK-NEXT: vst.srs.s16.s32 bml5, s3, [p3], #64 - ; CHECK-NEXT: vst.srs.s16.s32 bmh6, s3, [p3, #32] - ; CHECK-NEXT: vst.srs.s16.s32 bml6, s3, [p3], m4 - ; CHECK-NEXT: vst.srs.s16.s32 bmh7, s3, [p3, #32] - ; CHECK-NEXT: 
vst.srs.s16.s32 bml7, s3, [p3], #64 - ; CHECK-NEXT: vst.srs.s16.s32 bmh0, s3, [p3, #32] - ; CHECK-NEXT: vst.2d.srs.s16.s32 bml0, s3, [p3], d5; mov dj5, r11 - ; CHECK-NEXT: mov dn5, r12 - ; CHECK-NEXT: jnz r7, #.LBB0_1 - ; CHECK-NEXT: mov r26, dc5 // Delay Slot 5 - ; CHECK-NEXT: mov dc5, r25 // Delay Slot 4 - ; CHECK-NEXT: padda.3d [p0], d1; mov m1, r24 // Delay Slot 3 - ; CHECK-NEXT: paddb [p2], m1 // Delay Slot 2 - ; CHECK-NEXT: padda.3d [p2], d3; mov r25, dc5 // Delay Slot 1 - ; CHECK-NEXT: // %bb.4: // %exitStub - ; CHECK-NEXT: lda p7, [sp, #-32]; nopb ; nopxm // 4-byte Folded Reload - ; CHECK-NEXT: lda p6, [sp, #-28] // 4-byte Folded Reload - ; CHECK-NEXT: ret lr - ; CHECK-NEXT: nop // Delay Slot 5 - ; CHECK-NEXT: nop // Delay Slot 4 - ; CHECK-NEXT: nop // Delay Slot 3 - ; CHECK-NEXT: nop // Delay Slot 2 - ; CHECK-NEXT: paddb [sp], #-32 // Delay Slot 1 newFuncRoot: br label %outer.loop.header @@ -773,3 +624,5 @@ body: | DelayedSchedBarrier ... +## NOTE: These prefixes are unused and the list is autogenerated. 
Do not add tests below this line: +# CHECK: {{.*}} diff --git a/llvm/test/CodeGen/AIE/aie2/schedule/postpipeliner/conv2d_bf16-1.mir b/llvm/test/CodeGen/AIE/aie2/schedule/postpipeliner/conv2d_bf16-1.mir index ba1c58b94b69..f09f6e5fe49e 100644 --- a/llvm/test/CodeGen/AIE/aie2/schedule/postpipeliner/conv2d_bf16-1.mir +++ b/llvm/test/CodeGen/AIE/aie2/schedule/postpipeliner/conv2d_bf16-1.mir @@ -36,28 +36,28 @@ ; CHECK-NEXT: vldb wh10, [p0, #32] ; CHECK-NEXT: vldb wl10, [p0], m4 ; CHECK-NEXT: vldb wh1, [p0, #32] - ; CHECK-NEXT: vldb wl1, [p0], m4; add.nc lc, r0, #-1 - ; CHECK-NEXT: vldb wh3, [p0, #32]; movxm ls, #.LBB0_2 - ; CHECK-NEXT: vldb.3d wl3, [p0], d1; movxm le, #.L_LEnd0 - ; CHECK-NEXT: nopb ; nopa ; nops ; nopx ; vshift.align x0, x0, s0, x8, r3; nopv - ; CHECK-NEXT: nopb ; nopa ; nops ; nopxm ; nopv - ; CHECK-NEXT: nopb ; nopa ; nops ; nopx ; vshift.align x2, x2, s0, x10, r3; nopv - ; CHECK-NEXT: nopb ; nopa ; nops ; nopx ; vshuffle x5, x0, x2, r25; nopv - ; CHECK-NEXT: vldb wh5, [p5, #32]; nopa ; nops ; nopx ; vshuffle x8, x0, x2, r9; nopv - ; CHECK-NEXT: nopb ; vlda wl5, [p5], #256; nops ; nopx ; vshift.align x4, x4, s0, x1, r3; nopv - ; CHECK-NEXT: nopb ; nopa ; nops ; nopx ; vshift.align x6, x6, s0, x3, r3; vmac.f bmh1, bmh1, x8, x9, r29 + ; CHECK-NEXT: vldb wl1, [p0], m4 + ; CHECK-NEXT: vldb wh3, [p0, #32]; add.nc lc, r0, #-1 + ; CHECK-NEXT: vldb.3d wl3, [p0], d1; movxm ls, #.LBB0_2 + ; CHECK-NEXT: vshift.align x0, x0, s0, x8, r3 + ; CHECK-NEXT: movxm le, #.L_LEnd0 + ; CHECK-NEXT: vshift.align x2, x2, s0, x10, r3 + ; CHECK-NEXT: vshuffle x5, x0, x2, r25 + ; CHECK-NEXT: vldb wh5, [p5, #32]; vshuffle x8, x0, x2, r9 + ; CHECK-NEXT: vlda wl5, [p5], #256; vshift.align x4, x4, s0, x1, r3 + ; CHECK-NEXT: vshift.align x6, x6, s0, x3, r3; vmac.f bmh1, bmh1, x8, x9, r29 ; CHECK-NEXT: .p2align 4 ; CHECK-NEXT: .LBB0_2: // %for.body ; CHECK-NEXT: // =>This Inner Loop Header: Depth=1 - ; CHECK-NEXT: vlda wh9, [p4, #416]; vshuffle x10, x4, x6, r25; vmac.f bml4, bml4, x8, 
x7, r29 - ; CHECK-NEXT: vlda wh7, [p4, #352]; vshuffle x3, x4, x6, r9; vmac.f bmh6, bmh6, x8, x11, r29 - ; CHECK-NEXT: vlda wl7, [p4, #320]; vshuffle x1, x3, x5, r13; vmac.f bmh2, bmh2, x10, x9, r29 - ; CHECK-NEXT: vlda wl9, [p4, #384]; vshuffle x3, x3, x5, r24; vmac.f bml5, bml5, x10, x7, r29 - ; CHECK-NEXT: vlda wh11, [p4, #480]; mov r3, p0; vmac.f bmh0, bmh0, x1, x9, r29 - ; CHECK-NEXT: vlda wl11, [p4, #448]; and r3, r3, r0; mov p7, p5; vmac.f bmh3, bmh3, x3, x9, r29 - ; CHECK-NEXT: vldb wh8, [p0, #32]; add r3, r3, #34; mov p4, p7; vmac.f bml3, bml3, x1, x7, r29 - ; CHECK-NEXT: vldb wl8, [p0], m4; vmac.f bml6, bml6, x3, x7, r29 - ; CHECK-NEXT: vldb wh10, [p0, #32]; vmac.f bmh4, bmh4, x1, x11, r29 + ; CHECK-NEXT: nopb ; vlda wh9, [p4, #416]; nops ; nopx ; vshuffle x10, x4, x6, r25; vmac.f bml4, bml4, x8, x7, r29 + ; CHECK-NEXT: nopb ; vlda wh7, [p4, #352]; nops ; nopx ; vshuffle x3, x4, x6, r9; vmac.f bmh6, bmh6, x8, x11, r29 + ; CHECK-NEXT: nopb ; vlda wl7, [p4, #320]; nops ; nopx ; vshuffle x1, x3, x5, r13; vmac.f bmh2, bmh2, x10, x9, r29 + ; CHECK-NEXT: nopb ; vlda wl9, [p4, #384]; nops ; nopx ; vshuffle x3, x3, x5, r24; vmac.f bml5, bml5, x10, x7, r29 + ; CHECK-NEXT: nopb ; vlda wh11, [p4, #480]; nops ; nopx ; mov r3, p0; vmac.f bmh0, bmh0, x1, x9, r29 + ; CHECK-NEXT: nopb ; vlda wl11, [p4, #448]; nops ; and r3, r3, r0; mov p7, p5; vmac.f bmh3, bmh3, x3, x9, r29 + ; CHECK-NEXT: vldb wh8, [p0, #32]; nopa ; nops ; add r3, r3, #34; mov p4, p7; vmac.f bml3, bml3, x1, x7, r29 + ; CHECK-NEXT: vldb wl8, [p0], m4; nopa ; nops ; nopxm ; vmac.f bml6, bml6, x3, x7, r29 + ; CHECK-NEXT: vldb wh10, [p0, #32]; nopx ; vmac.f bmh4, bmh4, x1, x11, r29 ; CHECK-NEXT: vldb wl10, [p0], m4; vmac.f bml1, bml1, x3, x11, r29 ; CHECK-NEXT: vldb wh1, [p0, #32]; vmac.f bmh8, bmh8, x10, x11, r29 ; CHECK-NEXT: vldb wl1, [p0], m4; vmac.f bmh7, bmh7, x8, x5, r29 diff --git a/llvm/test/CodeGen/AIE/aie2/schedule/postpipeliner/conv2d_bf16-2.mir 
b/llvm/test/CodeGen/AIE/aie2/schedule/postpipeliner/conv2d_bf16-2.mir index b00c4711aa3f..f806d9419d61 100644 --- a/llvm/test/CodeGen/AIE/aie2/schedule/postpipeliner/conv2d_bf16-2.mir +++ b/llvm/test/CodeGen/AIE/aie2/schedule/postpipeliner/conv2d_bf16-2.mir @@ -26,7 +26,7 @@ ; CHECK-NEXT: nop // Delay Slot 2 ; CHECK-NEXT: nop // Delay Slot 1 ; CHECK-NEXT: // %bb.1: // %for.body.preheader - ; CHECK-NEXT: vldb wh7, [p5, #32]; nopa ; nops ; nopxm ; nopv + ; CHECK-NEXT: nopa ; vldb wh7, [p5, #32]; nopxm ; CHECK-NEXT: vldb wl7, [p5], #64 ; CHECK-NEXT: vldb wh8, [p4, #32] ; CHECK-NEXT: vldb wl8, [p4], m4 @@ -39,23 +39,23 @@ ; CHECK-NEXT: vldb.3d wl3, [p4], d0; vshift.align x0, x0, s0, x8, r21 ; CHECK-NEXT: mov r3, p4 ; CHECK-NEXT: and r3, r3, r6; add.nc lc, r0, #-1 - ; CHECK-NEXT: nopb ; nopa ; nops ; nopx ; vshift.align x2, x2, s0, x10, r21; nopv - ; CHECK-NEXT: nopb ; nopa ; nops ; nopx ; vshuffle x5, x0, x2, r25; nopv - ; CHECK-NEXT: vldb wh5, [p5, #32]; nopa ; nops ; nopx ; vshift.align x4, x4, s0, x1, r21; nopv - ; CHECK-NEXT: vldb wl5, [p5], #64; nopa ; nops ; nopx ; vshuffle x8, x0, x2, r10; nopv - ; CHECK-NEXT: vldb wh9, [p5, #32]; nopa ; nops ; add r21, r3, #34; vshift.align x6, x6, s0, x3, r21; nopv - ; CHECK-NEXT: vldb wl9, [p5], #64; nopa ; nops ; nopx ; vshuffle x3, x4, x6, r10; vmac.f bmh4, bmh4, x8, x7, r14 - ; CHECK-NEXT: vldb wh11, [p5, #32]; nopx ; vshuffle x10, x4, x6, r25 + ; CHECK-NEXT: vshift.align x2, x2, s0, x10, r21 + ; CHECK-NEXT: vshuffle x5, x0, x2, r25 + ; CHECK-NEXT: vldb wh5, [p5, #32]; vshift.align x4, x4, s0, x1, r21 + ; CHECK-NEXT: vldb wl5, [p5], #64; vshuffle x8, x0, x2, r10 + ; CHECK-NEXT: vldb wh9, [p5, #32]; add r21, r3, #34; vshift.align x6, x6, s0, x3, r21 + ; CHECK-NEXT: vldb wl9, [p5], #64; vshuffle x3, x4, x6, r10; vmac.f bmh4, bmh4, x8, x7, r14 + ; CHECK-NEXT: vldb wh11, [p5, #32]; vshuffle x10, x4, x6, r25 ; CHECK-NEXT: vldb wl11, [p5], #64; vshuffle x1, x3, x5, r15 ; CHECK-NEXT: .p2align 4 ; CHECK-NEXT: .LBB0_2: // %for.body 
; CHECK-NEXT: // =>This Inner Loop Header: Depth=1 - ; CHECK-NEXT: vldb wh7, [p5, #32]; nopx ; vshuffle x3, x3, x5, r24; vmac.f bml0, bml0, x10, x7, r14 - ; CHECK-NEXT: vldb wl7, [p5], #64; vmac.f bmh3, bmh3, x1, x7, r14 - ; CHECK-NEXT: vldb wh8, [p4, #32]; vmac.f bml5, bml5, x10, x5, r14 - ; CHECK-NEXT: vldb wl8, [p4], m4; vmac.f bml2, bml2, x8, x5, r14 - ; CHECK-NEXT: vmac.f bml1, bml1, x3, x7, r14 - ; CHECK-NEXT: vldb wh10, [p4, #32]; vmac.f bmh2, bmh2, x8, x9, r14 + ; CHECK-NEXT: vldb wh7, [p5, #32]; nopa ; nops ; nopx ; vshuffle x3, x3, x5, r24; vmac.f bml0, bml0, x10, x7, r14 + ; CHECK-NEXT: vldb wl7, [p5], #64; nopa ; nops ; nopxm ; vmac.f bmh3, bmh3, x1, x7, r14 + ; CHECK-NEXT: vldb wh8, [p4, #32]; nopa ; nops ; nopxm ; vmac.f bml5, bml5, x10, x5, r14 + ; CHECK-NEXT: vldb wl8, [p4], m4; nopa ; nops ; nopxm ; vmac.f bml2, bml2, x8, x5, r14 + ; CHECK-NEXT: nopb ; nopa ; nops ; nopxm ; vmac.f bml1, bml1, x3, x7, r14 + ; CHECK-NEXT: vldb wh10, [p4, #32]; nopa ; nops ; nopxm ; vmac.f bmh2, bmh2, x8, x9, r14 ; CHECK-NEXT: vldb wl10, [p4], m4; vmac.f bml3, bml3, x10, x11, r14 ; CHECK-NEXT: vldb wh1, [p4, #32]; vmac.f bmh7, bmh7, x8, x11, r14 ; CHECK-NEXT: vldb wl1, [p4], m4; vmac.f bmh1, bmh1, x1, x9, r14 diff --git a/llvm/test/CodeGen/AIE/aie2/schedule/postpipeliner/conv2d_bf16-feasibleRA.mir b/llvm/test/CodeGen/AIE/aie2/schedule/postpipeliner/conv2d_bf16-feasibleRA.mir index 412ebddb8e52..0a04a00c613b 100644 --- a/llvm/test/CodeGen/AIE/aie2/schedule/postpipeliner/conv2d_bf16-feasibleRA.mir +++ b/llvm/test/CodeGen/AIE/aie2/schedule/postpipeliner/conv2d_bf16-feasibleRA.mir @@ -26,33 +26,33 @@ ; CHECK-NEXT: nop // Delay Slot 2 ; CHECK-NEXT: nop // Delay Slot 1 ; CHECK-NEXT: // %bb.1: // %for.body.preheader - ; CHECK-NEXT: nopa ; vldb wh10, [p4, #32]; nopx + ; CHECK-NEXT: nopa ; vldb wh10, [p4, #32]; nopxm ; CHECK-NEXT: vldb wl10, [p4], m4 ; CHECK-NEXT: vldb wh10, [p5, #32] ; CHECK-NEXT: vldb wh7, [p4, #32] ; CHECK-NEXT: vldb wl7, [p4], m4 ; CHECK-NEXT: vldb wl10, 
[p5], #64 ; CHECK-NEXT: vldb wh7, [p5, #32] - ; CHECK-NEXT: vldb wh8, [p4, #32]; add.nc lc, r0, #-1 + ; CHECK-NEXT: vldb wh8, [p4, #32] ; CHECK-NEXT: vldb wl8, [p4], m4; vshift.align x3, x3, s0, x10, r21 ; CHECK-NEXT: vldb wh11, [p4, #32]; movxm ls, #.LBB0_2 ; CHECK-NEXT: vldb.3d wl11, [p4], d0; movxm le, #.L_LEnd0 - ; CHECK-NEXT: vldb wl7, [p5], #64; nopa ; nops ; nopx ; vshift.align x2, x2, s0, x7, r21; nopv - ; CHECK-NEXT: vldb wh9, [p5, #32]; nopa ; nops ; nopx ; mov r3, p4; nopv - ; CHECK-NEXT: vldb wl9, [p5], #64; nopa ; nops ; and r3, r3, r6; nopm ; nopv - ; CHECK-NEXT: nopb ; nopa ; nops ; nopx ; vshuffle x6, x3, x2, r10; nopv - ; CHECK-NEXT: vldb wh4, [p5, #32]; nopa ; nops ; nopx ; vshift.align x1, x1, s0, x8, r21; nopv - ; CHECK-NEXT: vldb wl4, [p5], #64; nopa ; nops ; nopx ; vshuffle x8, x3, x2, r25; nopv - ; CHECK-NEXT: nopb ; nopa ; nops ; add r21, r3, #34; vshift.align x0, x0, s0, x11, r21; nopv + ; CHECK-NEXT: vldb wl7, [p5], #64; vshift.align x2, x2, s0, x7, r21 + ; CHECK-NEXT: vldb wh9, [p5, #32]; mov r3, p4 + ; CHECK-NEXT: vldb wl9, [p5], #64; and r3, r3, r6; add.nc lc, r0, #-1 + ; CHECK-NEXT: vshuffle x6, x3, x2, r10 + ; CHECK-NEXT: vldb wh4, [p5, #32]; vshift.align x1, x1, s0, x8, r21 + ; CHECK-NEXT: vldb wl4, [p5], #64; vshuffle x8, x3, x2, r25 + ; CHECK-NEXT: add r21, r3, #34; vshift.align x0, x0, s0, x11, r21 ; CHECK-NEXT: .p2align 4 ; CHECK-NEXT: .LBB0_2: // %for.body ; CHECK-NEXT: // =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: vldb wh10, [p4, #32]; nopa ; nops ; nopx ; vshuffle x4, x1, x0, r10; vmac.f bmh4, bmh4, x8, x10, r14 - ; CHECK-NEXT: vldb wl10, [p4], m4; nopx ; vshuffle x11, x1, x0, r25; vmac.f bmh2, bmh2, x8, x7, r14 - ; CHECK-NEXT: vldb wh10, [p5, #32]; vshuffle x5, x4, x6, r15; vmac.f bml2, bml2, x8, x9, r14 - ; CHECK-NEXT: vldb wh7, [p4, #32]; vshuffle x6, x4, x6, r24; vmac.f bml1, bml1, x11, x10, r14 - ; CHECK-NEXT: vldb wl7, [p4], m4; vmac.f bmh3, bmh3, x5, x10, r14 - ; CHECK-NEXT: vldb wl10, [p5], #64; vmac.f bml0, bml0, 
x6, x10, r14 + ; CHECK-NEXT: vldb wl10, [p4], m4; nopa ; nops ; nopx ; vshuffle x11, x1, x0, r25; vmac.f bmh2, bmh2, x8, x7, r14 + ; CHECK-NEXT: vldb wh10, [p5, #32]; nopa ; nops ; nopx ; vshuffle x5, x4, x6, r15; vmac.f bml2, bml2, x8, x9, r14 + ; CHECK-NEXT: vldb wh7, [p4, #32]; nopa ; nops ; nopx ; vshuffle x6, x4, x6, r24; vmac.f bml1, bml1, x11, x10, r14 + ; CHECK-NEXT: vldb wl7, [p4], m4; nopa ; nops ; nopxm ; vmac.f bmh3, bmh3, x5, x10, r14 + ; CHECK-NEXT: vldb wl10, [p5], #64; nopa ; nops ; nopxm ; vmac.f bml0, bml0, x6, x10, r14 ; CHECK-NEXT: vldb wh7, [p5, #32]; vmac.f bmh1, bmh1, x5, x7, r14 ; CHECK-NEXT: vldb wh8, [p4, #32]; vmac.f bmh6, bmh6, x11, x7, r14 ; CHECK-NEXT: vldb wl8, [p4], m4; vshift.align x3, x3, s0, x10, r21; vmac.f bmh5, bmh5, x6, x7, r14 diff --git a/llvm/test/CodeGen/AIE/aie2/schedule/postpipeliner/conv2d_bf16.mir b/llvm/test/CodeGen/AIE/aie2/schedule/postpipeliner/conv2d_bf16.mir index 27ef11c95b80..de5f9dbdd3be 100644 --- a/llvm/test/CodeGen/AIE/aie2/schedule/postpipeliner/conv2d_bf16.mir +++ b/llvm/test/CodeGen/AIE/aie2/schedule/postpipeliner/conv2d_bf16.mir @@ -25,7 +25,7 @@ ; CHECK-NEXT: nop // Delay Slot 2 ; CHECK-NEXT: nop // Delay Slot 1 ; CHECK-NEXT: // %bb.1: // %for.body.preheader - ; CHECK-NEXT: nopa ; vldb wh7, [p7, #32]; nopx ; mov p4, p2; nops + ; CHECK-NEXT: vldb wh7, [p7, #32]; nopa ; nops ; nopx ; mov p4, p2; nopv ; CHECK-NEXT: vldb wh8, [p0, #32] ; CHECK-NEXT: vldb wl8, [p0], m4; mov p5, p7 ; CHECK-NEXT: vldb wh10, [p0, #32] @@ -34,13 +34,13 @@ ; CHECK-NEXT: vldb wh1, [p0, #32]; add.nc lc, r0, #-1 ; CHECK-NEXT: vldb wl1, [p0], m4; movxm ls, #.LBB0_2 ; CHECK-NEXT: vldb wh3, [p0, #32]; movxm le, #.L_LEnd0 - ; CHECK-NEXT: vldb.3d wl3, [p0], d1; nopa ; nops ; nopx ; vshift.align x0, x0, s0, x8, r3; nopv - ; CHECK-NEXT: nopb ; nopa ; nops ; nopx ; mov r1, p0; nopv - ; CHECK-NEXT: nopb ; nopa ; nops ; and r2, r1, r0; vshift.align x2, x2, s0, x10, r3; nopv - ; CHECK-NEXT: nopb ; nopa ; nops ; nopx ; vshuffle x8, x0, x2, r9; 
nopv - ; CHECK-NEXT: nopb ; nopa ; nops ; nopx ; vshuffle x5, x0, x2, r25; nopv - ; CHECK-NEXT: nopb ; vlda wh5, [p2, #352]; nops ; nopx ; vshift.align x4, x4, s0, x1, r3; vmac.f bmh7, bmh7, x8, x7, r29 - ; CHECK-NEXT: nopa ; vldb wl5, [p4], #64; nopx ; mov p2, p5 + ; CHECK-NEXT: vldb.3d wl3, [p0], d1; vshift.align x0, x0, s0, x8, r3 + ; CHECK-NEXT: mov r1, p0 + ; CHECK-NEXT: and r2, r1, r0; vshift.align x2, x2, s0, x10, r3 + ; CHECK-NEXT: vshuffle x8, x0, x2, r9 + ; CHECK-NEXT: vshuffle x5, x0, x2, r25 + ; CHECK-NEXT: vlda wh5, [p2, #352]; vshift.align x4, x4, s0, x1, r3; vmac.f bmh7, bmh7, x8, x7, r29 + ; CHECK-NEXT: vldb wl5, [p4], #64; mov p2, p5 ; CHECK-NEXT: vldb wh9, [p4, #32]; add r3, r2, #34; vshift.align x6, x6, s0, x3, r3 ; CHECK-NEXT: vldb wl9, [p4], #64; vshuffle x3, x4, x6, r9 ; CHECK-NEXT: vldb wl11, [p4, #0]; vshuffle x10, x4, x6, r25 @@ -50,11 +50,11 @@ ; CHECK-NEXT: .LBB0_2: // %for.body ; CHECK-NEXT: // =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: vldb wh7, [p7, #32]; nopa ; nops ; nopx ; mov p4, p2; vmac.f bmh5, bmh5, x1, x7, r29 - ; CHECK-NEXT: nopa ; vldb wh8, [p0, #32]; nopx ; vmac.f bml2, bml2, x3, x7, r29 - ; CHECK-NEXT: vldb wl8, [p0], m4; mov p5, p7; vmac.f bml4, bml4, x8, x5, r29 - ; CHECK-NEXT: vldb wh10, [p0, #32]; vmac.f bml5, bml5, x10, x5, r29 - ; CHECK-NEXT: vldb wl10, [p0], m4; vmac.f bmh1, bmh1, x8, x9, r29 - ; CHECK-NEXT: vlda wl7, [p7], #256; paddb [p4], #320; vmac.f bmh6, bmh6, x8, x11, r29 + ; CHECK-NEXT: vldb wh8, [p0, #32]; nopa ; nops ; nopxm ; vmac.f bml2, bml2, x3, x7, r29 + ; CHECK-NEXT: vldb wl8, [p0], m4; nopa ; nops ; nopx ; mov p5, p7; vmac.f bml4, bml4, x8, x5, r29 + ; CHECK-NEXT: vldb wh10, [p0, #32]; nopa ; nops ; nopxm ; vmac.f bml5, bml5, x10, x5, r29 + ; CHECK-NEXT: vldb wl10, [p0], m4; nopa ; nops ; nopxm ; vmac.f bmh1, bmh1, x8, x9, r29 + ; CHECK-NEXT: paddb [p4], #320; vlda wl7, [p7], #256; nops ; nopxm ; vmac.f bmh6, bmh6, x8, x11, r29 ; CHECK-NEXT: vldb wh1, [p0, #32]; vmac.f bmh8, bmh8, x10, x11, r29 
; CHECK-NEXT: vldb wl1, [p0], m4; vmac.f bmh0, bmh0, x1, x9, r29 ; CHECK-NEXT: vldb wh3, [p0, #32]; vmac.f bmh3, bmh3, x3, x9, r29 diff --git a/llvm/test/CodeGen/AIE/aie2/schedule/postpipeliner/crash.mir b/llvm/test/CodeGen/AIE/aie2/schedule/postpipeliner/crash.mir index a5dae2d34a2a..202087b62d97 100644 --- a/llvm/test/CodeGen/AIE/aie2/schedule/postpipeliner/crash.mir +++ b/llvm/test/CodeGen/AIE/aie2/schedule/postpipeliner/crash.mir @@ -17,25 +17,19 @@ ; CHECK: .p2align 4 ; CHECK-NEXT: // %bb.0: // %entry ; CHECK-NEXT: nopb ; nopa ; nops ; movxm ls, #.LBB0_1; nopv - ; CHECK-NEXT: mova r0, #8; nopb ; movxm le, #.L_LEnd0 + ; CHECK-NEXT: movxm le, #.L_LEnd0 + ; CHECK-NEXT: mova r0, #8; mov p2, p0 ; CHECK-NEXT: add.nc lc, r0, #0 + ; CHECK-NEXT: .p2align 4 + ; CHECK-NEXT: .LBB0_1: // %for.body + ; CHECK-NEXT: // =>This Inner Loop Header: Depth=1 + ; CHECK-NEXT: nopb ; lda r0, [p2, #0]; nops ; nopx ; mov p2, p1; nopv ; CHECK-NEXT: nopb ; nopa ; nops ; nopxm ; nopv ; CHECK-NEXT: nopb ; nopa ; nops ; nopxm ; nopv ; CHECK-NEXT: nopb ; nopa ; nops ; nopxm ; nopv ; CHECK-NEXT: nopb ; nopa ; nops ; nopxm ; nopv ; CHECK-NEXT: nopb ; nopa ; nops ; nopxm ; nopv ; CHECK-NEXT: nopb ; nopa ; nops ; nopxm ; nopv - ; CHECK-NEXT: nopb ; nopa ; nops ; nopx ; mov p2, p0; nopv - ; CHECK-NEXT: .p2align 4 - ; CHECK-NEXT: .LBB0_1: // %for.body - ; CHECK-NEXT: // =>This Inner Loop Header: Depth=1 - ; CHECK-NEXT: nopb ; lda r0, [p2, #0]; nops ; nopx ; mov p2, p1; nopv - ; CHECK-NEXT: nopa ; nopx - ; CHECK-NEXT: nop - ; CHECK-NEXT: nop - ; CHECK-NEXT: nop - ; CHECK-NEXT: nop - ; CHECK-NEXT: nop ; CHECK-NEXT: .L_LEnd0: ; CHECK-NEXT: nopb ; nopa ; st r0, [p0, #0]; nopxm ; nopv ; CHECK-NEXT: // %bb.2: // %for.cond.cleanup diff --git a/llvm/test/CodeGen/AIE/aie2/schedule/postpipeliner/gemm-1.mir b/llvm/test/CodeGen/AIE/aie2/schedule/postpipeliner/gemm-1.mir index 34fb40df9b3a..6503f29e931d 100644 --- a/llvm/test/CodeGen/AIE/aie2/schedule/postpipeliner/gemm-1.mir +++ 
b/llvm/test/CodeGen/AIE/aie2/schedule/postpipeliner/gemm-1.mir @@ -24,7 +24,7 @@ ; CHECK-NEXT: nop // Delay Slot 2 ; CHECK-NEXT: nop // Delay Slot 1 ; CHECK-NEXT: // %bb.1: // %for.body.preheader - ; CHECK-NEXT: vldb wl9, [p1], m5; nopa ; nops ; nopxm ; nopv + ; CHECK-NEXT: vldb wl9, [p1], m5; nopx ; CHECK-NEXT: vldb wh9, [p1], m6 ; CHECK-NEXT: vldb wl5, [p1], m5 ; CHECK-NEXT: vldb wh5, [p1], m6 @@ -35,28 +35,28 @@ ; CHECK-NEXT: vldb wl6, [p0, #64]; vshuffle x11, x9, x9, r2 ; CHECK-NEXT: vldb wh6, [p0, #96]; padds [p0], m4 ; CHECK-NEXT: padds [p0], #128; vshuffle x5, x5, x5, r2 - ; CHECK-NEXT: vldb wl10, [p0], #32; add.nc lc, r0, #-1 + ; CHECK-NEXT: vldb wl10, [p0], #32 ; CHECK-NEXT: vldb wh10, [p0], #32; vshuffle x7, x7, x7, r2 - ; CHECK-NEXT: vldb wl3, [p0], #32; movxm ls, #.LBB0_2 - ; CHECK-NEXT: vldb.3d wh3, [p0], d0; movxm le, #.L_LEnd0 - ; CHECK-NEXT: nopb ; nopa ; nops ; nopxm ; nopv - ; CHECK-NEXT: vldb wl3, [p1], m5; nopa ; nops ; nopxm ; nopv - ; CHECK-NEXT: vldb.3d wh3, [p1], d1; nopa ; nops ; nopxm ; nopv - ; CHECK-NEXT: nopb ; nopa ; nops ; nopxm ; nopv - ; CHECK-NEXT: nopb ; nopa ; nops ; nopx ; vshuffle x1, x8, x10, r4; nopv - ; CHECK-NEXT: nopb ; nopa ; nops ; nopx ; vshuffle x8, x8, x10, r16; nopv - ; CHECK-NEXT: nopb ; nopa ; nops ; nopx ; vshuffle x0, x6, x3, r4; vmac.f bmh0, bmh0, x1, x11, r3 + ; CHECK-NEXT: vldb wl3, [p0], #32 + ; CHECK-NEXT: vldb.3d wh3, [p0], d0 + ; CHECK-NEXT: nop + ; CHECK-NEXT: vldb wl3, [p1], m5; add.nc lc, r0, #-1 + ; CHECK-NEXT: vldb.3d wh3, [p1], d1; movxm ls, #.LBB0_2 + ; CHECK-NEXT: movxm le, #.L_LEnd0 + ; CHECK-NEXT: vshuffle x1, x8, x10, r4 + ; CHECK-NEXT: vshuffle x8, x8, x10, r16 + ; CHECK-NEXT: vshuffle x0, x6, x3, r4; vmac.f bmh0, bmh0, x1, x11, r3 ; CHECK-NEXT: .p2align 4 ; CHECK-NEXT: .LBB0_2: // %for.body ; CHECK-NEXT: // =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: vldb wl9, [p1], m5; nopa ; nops ; nopx ; vshuffle x6, x6, x3, r16; vmac.f bmh1, bmh1, x8, x11, r3 - ; CHECK-NEXT: vldb wh9, [p1], m6; nopx ; 
vmac.f bmh2, bmh2, x0, x11, r3 - ; CHECK-NEXT: vldb wl5, [p1], m5; vshuffle x3, x3, x3, r2; vmac.f bmh3, bmh3, x6, x11, r3 - ; CHECK-NEXT: vldb wh5, [p1], m6; vmac.f bmh4, bmh4, x1, x5, r3 - ; CHECK-NEXT: vldb wl7, [p1], m5; vmac.f bmh5, bmh5, x8, x5, r3 - ; CHECK-NEXT: vldb wh7, [p1], m6; vmac.f bmh6, bmh6, x0, x5, r3 - ; CHECK-NEXT: vldb wl8, [p0, #0]; vmac.f bmh7, bmh7, x6, x5, r3 - ; CHECK-NEXT: vldb wh8, [p0, #32]; vmac.f bmh8, bmh8, x1, x7, r3 + ; CHECK-NEXT: vldb wh9, [p1], m6; nopa ; nops ; nopxm ; vmac.f bmh2, bmh2, x0, x11, r3 + ; CHECK-NEXT: vldb wl5, [p1], m5; nopa ; nops ; nopx ; vshuffle x3, x3, x3, r2; vmac.f bmh3, bmh3, x6, x11, r3 + ; CHECK-NEXT: vldb wh5, [p1], m6; nopa ; nops ; nopxm ; vmac.f bmh4, bmh4, x1, x5, r3 + ; CHECK-NEXT: vldb wl7, [p1], m5; nopa ; nops ; nopxm ; vmac.f bmh5, bmh5, x8, x5, r3 + ; CHECK-NEXT: vldb wh7, [p1], m6; nopa ; nops ; nopxm ; vmac.f bmh6, bmh6, x0, x5, r3 + ; CHECK-NEXT: vldb wl8, [p0, #0]; nopa ; nops ; nopxm ; vmac.f bmh7, bmh7, x6, x5, r3 + ; CHECK-NEXT: nopa ; vldb wh8, [p0, #32]; nopx ; vmac.f bmh8, bmh8, x1, x7, r3 ; CHECK-NEXT: vldb wl6, [p0, #64]; vshuffle x11, x9, x9, r2; vmac.f bml0, bml0, x8, x7, r3 ; CHECK-NEXT: padds [p0], m4; vldb wh6, [p0, #96]; vmac.f bml1, bml1, x0, x7, r3 ; CHECK-NEXT: padds [p0], #128; vshuffle x5, x5, x5, r2; vmac.f bml2, bml2, x6, x7, r3 diff --git a/llvm/test/CodeGen/AIE/aie2/schedule/postpipeliner/gemm-2.mir b/llvm/test/CodeGen/AIE/aie2/schedule/postpipeliner/gemm-2.mir index 1e752ef0d90a..10ecee5ea852 100644 --- a/llvm/test/CodeGen/AIE/aie2/schedule/postpipeliner/gemm-2.mir +++ b/llvm/test/CodeGen/AIE/aie2/schedule/postpipeliner/gemm-2.mir @@ -24,7 +24,7 @@ ; CHECK-NEXT: nop // Delay Slot 2 ; CHECK-NEXT: nop // Delay Slot 1 ; CHECK-NEXT: // %bb.1: // %for.body.preheader - ; CHECK-NEXT: vldb wl9, [p1], m5; nopxm + ; CHECK-NEXT: vldb wl9, [p1], m5 ; CHECK-NEXT: vldb wh9, [p1], m6 ; CHECK-NEXT: vldb wh6, [p0, #96] ; CHECK-NEXT: vldb wh8, [p0, #32] @@ -34,28 +34,28 @@ ; 
CHECK-NEXT: vldb wl6, [p0, #64]; padds [p0], m4 ; CHECK-NEXT: padds [p0], #128; vldb wl7, [p1], m5; vshuffle x11, x9, x9, r2 ; CHECK-NEXT: vldb wl10, [p0], #32 - ; CHECK-NEXT: vldb wh10, [p0], #32; add.nc lc, r0, #-1 - ; CHECK-NEXT: vldb wl3, [p0], #32; movxm ls, #.LBB0_2 - ; CHECK-NEXT: vldb.3d wh3, [p0], d0; movxm le, #.L_LEnd0 - ; CHECK-NEXT: vldb wh7, [p1], m6; nopa ; nops ; nopx ; vshuffle x5, x5, x5, r2; nopv - ; CHECK-NEXT: vldb wl3, [p1], m5; nopa ; nops ; nopxm ; nopv - ; CHECK-NEXT: vldb.3d wh3, [p1], d1; nopa ; nops ; nopxm ; nopv - ; CHECK-NEXT: nopb ; nopa ; nops ; nopxm ; nopv - ; CHECK-NEXT: nopb ; nopa ; nops ; nopx ; vshuffle x0, x8, x10, r16; nopv - ; CHECK-NEXT: nopb ; nopa ; nops ; nopx ; vshuffle x1, x8, x10, r4; nopv + ; CHECK-NEXT: vldb wh10, [p0], #32 + ; CHECK-NEXT: vldb wl3, [p0], #32 + ; CHECK-NEXT: vldb.3d wh3, [p0], d0 + ; CHECK-NEXT: vldb wh7, [p1], m6; vshuffle x5, x5, x5, r2 + ; CHECK-NEXT: vldb wl3, [p1], m5; add.nc lc, r0, #-1 + ; CHECK-NEXT: vldb.3d wh3, [p1], d1; movxm ls, #.LBB0_2 + ; CHECK-NEXT: movxm le, #.L_LEnd0 + ; CHECK-NEXT: vshuffle x0, x8, x10, r16 + ; CHECK-NEXT: vshuffle x1, x8, x10, r4 ; CHECK-NEXT: vshuffle x10, x6, x3, r4; vmac.f bmh1, bmh1, x0, x11, r3 ; CHECK-NEXT: vshuffle x6, x6, x3, r16; vmac.f bmh0, bmh0, x1, x11, r3 ; CHECK-NEXT: .p2align 4 ; CHECK-NEXT: .LBB0_2: // %for.body ; CHECK-NEXT: // =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: vldb wl9, [p1], m5; nopa ; nops ; nopx ; vshuffle x7, x7, x7, r2; vmac.f bmh5, bmh5, x0, x5, r3 - ; CHECK-NEXT: nopa ; vldb wh9, [p1], m6; nopx ; vshuffle x3, x3, x3, r2; vmac.f bmh7, bmh7, x6, x5, r3 - ; CHECK-NEXT: vldb wh6, [p0, #96]; vmac.f bml2, bml2, x6, x7, r3 - ; CHECK-NEXT: vldb wh8, [p0, #32]; vmac.f bml4, bml4, x6, x3, r3 - ; CHECK-NEXT: vldb wl5, [p1], m5; vmac.f bml0, bml0, x0, x7, r3 - ; CHECK-NEXT: vldb wl8, [p0, #0]; vmac.f bml1, bml1, x10, x7, r3 - ; CHECK-NEXT: vldb wh5, [p1], m6; vmac.f bmh6, bmh6, x10, x5, r3 - ; CHECK-NEXT: padds [p0], m4; vldb wl6, 
[p0, #64]; vmac.f bmh2, bmh2, x10, x11, r3 + ; CHECK-NEXT: vldb wh9, [p1], m6; nopa ; nops ; nopx ; vshuffle x3, x3, x3, r2; vmac.f bmh7, bmh7, x6, x5, r3 + ; CHECK-NEXT: vldb wh6, [p0, #96]; nopa ; nops ; nopxm ; vmac.f bml2, bml2, x6, x7, r3 + ; CHECK-NEXT: vldb wh8, [p0, #32]; nopa ; nops ; nopxm ; vmac.f bml4, bml4, x6, x3, r3 + ; CHECK-NEXT: vldb wl5, [p1], m5; nopa ; nops ; nopxm ; vmac.f bml0, bml0, x0, x7, r3 + ; CHECK-NEXT: vldb wl8, [p0, #0]; nopa ; nops ; nopxm ; vmac.f bml1, bml1, x10, x7, r3 + ; CHECK-NEXT: vldb wh5, [p1], m6; nopa ; nops ; nopxm ; vmac.f bmh6, bmh6, x10, x5, r3 + ; CHECK-NEXT: vldb wl6, [p0, #64]; nopa ; padds [p0], m4; nopxm ; vmac.f bmh2, bmh2, x10, x11, r3 ; CHECK-NEXT: padds [p0], #128; vldb wl7, [p1], m5; vshuffle x11, x9, x9, r2; vmac.f bmh3, bmh3, x6, x11, r3 ; CHECK-NEXT: vldb wl10, [p0], #32; vmac.f bmh4, bmh4, x1, x5, r3 ; CHECK-NEXT: vldb wh10, [p0], #32; vmac.f bmh8, bmh8, x1, x7, r3 diff --git a/llvm/test/CodeGen/AIE/aie2/schedule/postpipeliner/gemm-3.mir b/llvm/test/CodeGen/AIE/aie2/schedule/postpipeliner/gemm-3.mir index 39e4376b0c9c..91419de3ea43 100644 --- a/llvm/test/CodeGen/AIE/aie2/schedule/postpipeliner/gemm-3.mir +++ b/llvm/test/CodeGen/AIE/aie2/schedule/postpipeliner/gemm-3.mir @@ -34,28 +34,28 @@ ; CHECK-NEXT: vldb wl6, [p0, #64]; padds [p0], m4 ; CHECK-NEXT: padds [p0], #128; vldb wl7, [p1], m5; vshuffle x11, x9, x9, r2 ; CHECK-NEXT: vldb wl10, [p0], #32 - ; CHECK-NEXT: vldb wh10, [p0], #32; add.nc lc, r0, #-1 - ; CHECK-NEXT: vldb wl3, [p0], #32; movxm ls, #.LBB0_2 + ; CHECK-NEXT: vldb wh10, [p0], #32 + ; CHECK-NEXT: vldb wl3, [p0], #32 ; CHECK-NEXT: vldb.3d wh3, [p0], d0; vshuffle x5, x5, x5, r2 - ; CHECK-NEXT: vldb wh7, [p1], m6; movxm le, #.L_LEnd0 - ; CHECK-NEXT: vldb wl3, [p1], m5; nopa ; nops ; nopxm ; nopv - ; CHECK-NEXT: vldb.3d wh3, [p1], d1; nopa ; nops ; nopxm ; nopv - ; CHECK-NEXT: nopb ; nopa ; nops ; nopxm ; nopv - ; CHECK-NEXT: nopb ; nopa ; nops ; nopx ; vshuffle x2, x2, x10, r16; nopv - ; 
CHECK-NEXT: nopb ; nopa ; nops ; nopx ; vshuffle x1, x8, x10, r4; nopv - ; CHECK-NEXT: nopb ; nopa ; nops ; nopx ; vshuffle x0, x6, x3, r4; vmac.f bmh5, bmh5, x2, x5, r3 - ; CHECK-NEXT: nopb ; nopa ; nops ; nopx ; vshuffle x6, x6, x3, r16; vmac.f bmh0, bmh0, x1, x11, r3 + ; CHECK-NEXT: vldb wh7, [p1], m6 + ; CHECK-NEXT: vldb wl3, [p1], m5; add.nc lc, r0, #-1 + ; CHECK-NEXT: vldb.3d wh3, [p1], d1; movxm ls, #.LBB0_2 + ; CHECK-NEXT: movxm le, #.L_LEnd0 + ; CHECK-NEXT: vshuffle x2, x2, x10, r16 + ; CHECK-NEXT: vshuffle x1, x8, x10, r4 + ; CHECK-NEXT: vshuffle x0, x6, x3, r4; vmac.f bmh5, bmh5, x2, x5, r3 + ; CHECK-NEXT: vshuffle x6, x6, x3, r16; vmac.f bmh0, bmh0, x1, x11, r3 ; CHECK-NEXT: .p2align 4 ; CHECK-NEXT: .LBB0_2: // %for.body ; CHECK-NEXT: // =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: vldb wl9, [p1], m5; nopa ; nops ; nopx ; vshuffle x7, x7, x7, r2; vmac.f bmh6, bmh6, x0, x5, r3 - ; CHECK-NEXT: nopa ; vldb wh9, [p1], m6; nopx ; vshuffle x3, x3, x3, r2; vmac.f bmh7, bmh7, x6, x5, r3 - ; CHECK-NEXT: vldb wh6, [p0, #96]; vmac.f bml2, bml2, x6, x7, r3 - ; CHECK-NEXT: vldb wh8, [p0, #32]; vmac.f bml4, bml4, x6, x3, r3 - ; CHECK-NEXT: vldb wl5, [p1], m5; vmac.f bml0, bml0, x2, x7, r3 - ; CHECK-NEXT: vldb wh5, [p1], m6; vmac.f bml1, bml1, x0, x7, r3 - ; CHECK-NEXT: vldb wl8, [p0, #0]; vmac.f bmh1, bmh1, x2, x11, r3 - ; CHECK-NEXT: padds [p0], m4; vldb wl6, [p0, #64]; vmac.f bmh2, bmh2, x0, x11, r3 + ; CHECK-NEXT: vldb wh9, [p1], m6; nopa ; nops ; nopx ; vshuffle x3, x3, x3, r2; vmac.f bmh7, bmh7, x6, x5, r3 + ; CHECK-NEXT: vldb wh6, [p0, #96]; nopa ; nops ; nopxm ; vmac.f bml2, bml2, x6, x7, r3 + ; CHECK-NEXT: vldb wh8, [p0, #32]; nopa ; nops ; nopxm ; vmac.f bml4, bml4, x6, x3, r3 + ; CHECK-NEXT: vldb wl5, [p1], m5; nopa ; nops ; nopxm ; vmac.f bml0, bml0, x2, x7, r3 + ; CHECK-NEXT: vldb wh5, [p1], m6; nopa ; nops ; nopxm ; vmac.f bml1, bml1, x0, x7, r3 + ; CHECK-NEXT: vldb wl8, [p0, #0]; nopa ; nops ; nopxm ; vmac.f bmh1, bmh1, x2, x11, r3 + ; CHECK-NEXT: 
vldb wl6, [p0, #64]; nopa ; padds [p0], m4; nopxm ; vmac.f bmh2, bmh2, x0, x11, r3 ; CHECK-NEXT: padds [p0], #128; vldb wl7, [p1], m5; vshuffle x11, x9, x9, r2; vmac.f bmh3, bmh3, x6, x11, r3 ; CHECK-NEXT: vldb wl10, [p0], #32; vmac.f bmh4, bmh4, x1, x5, r3 ; CHECK-NEXT: vldb wh10, [p0], #32; vmac.f bmh8, bmh8, x1, x7, r3 diff --git a/llvm/test/CodeGen/AIE/aie2/schedule/postpipeliner/gemm-feasibleRA-nopstinc.mir b/llvm/test/CodeGen/AIE/aie2/schedule/postpipeliner/gemm-feasibleRA-nopstinc.mir index eda9aa247572..40c6c5244d17 100644 --- a/llvm/test/CodeGen/AIE/aie2/schedule/postpipeliner/gemm-feasibleRA-nopstinc.mir +++ b/llvm/test/CodeGen/AIE/aie2/schedule/postpipeliner/gemm-feasibleRA-nopstinc.mir @@ -26,7 +26,7 @@ ; CHECK-NEXT: nop // Delay Slot 2 ; CHECK-NEXT: nop // Delay Slot 1 ; CHECK-NEXT: // %bb.1: // %for.body.preheader - ; CHECK-NEXT: vldb wl8, [p1], m5; nopxm + ; CHECK-NEXT: vldb wl8, [p1], m5; nopx ; CHECK-NEXT: vldb wh8, [p1], m6 ; CHECK-NEXT: vldb wl0, [p0, #0] ; CHECK-NEXT: vldb wl9, [p1], m5 @@ -41,31 +41,12 @@ ; CHECK-NEXT: vldb wl10, [p1], m5 ; CHECK-NEXT: vldb wh10, [p1], m6 ; CHECK-NEXT: vldb wl11, [p1], m5; vshuffle x9, x9, x9, r6 - ; CHECK-NEXT: vldb.3d wh11, [p1], d1; add.nc lc, r0, #-2 + ; CHECK-NEXT: vldb.3d wh11, [p1], d1 ; CHECK-NEXT: vldb wl8, [p1], m5; vshuffle x4, x0, x2, r3 ; CHECK-NEXT: vldb wh8, [p1], m6; vshuffle x5, x0, x2, r16 ; CHECK-NEXT: vldb wl0, [p0, #0]; vshuffle x6, x1, x3, r3; vmac.f bmh0, bmh0, x4, x8, r2 ; CHECK-NEXT: vldb wl9, [p1], m5; vshuffle x7, x1, x3, r16; vmac.f bmh2, bmh2, x5, x8, r2 ; CHECK-NEXT: vldb wh0, [p0, #32]; vshuffle x10, x10, x10, r6; vmac.f bmh1, bmh1, x6, x8, r2 - ; CHECK-NEXT: vldb wl1, [p0, #64]; movxm ls, #.LBB0_2; vmac.f bmh3, bmh3, x7, x8, r2 - ; CHECK-NEXT: padds [p0], m4; vldb wh1, [p0, #96]; vshuffle x11, x11, x11, r6; vmac.f bmh4, bmh0, x4, x9, r2 - ; CHECK-NEXT: padds [p0], #128; vldb wh9, [p1], m6; movxm le, #.L_LEnd0; vmac.f bmh6, bmh2, x5, x9, r2 - ; CHECK-NEXT: vldb wl2, [p0], #32; 
nopa ; nops ; nopx ; vshuffle x8, x8, x8, r6; vmac.f bmh5, bmh1, x6, x9, r2 - ; CHECK-NEXT: vldb wh2, [p0], #32; nopa ; nops ; nopxm ; vmac.f bmh7, bmh3, x7, x9, r2 - ; CHECK-NEXT: vldb wl3, [p0], #32; nopa ; nops ; nopxm ; vmac.f bmh8, bmh0, x4, x10, r2 - ; CHECK-NEXT: vldb.3d wh3, [p0], d0; nopa ; nops ; nopxm ; vmac.f bml0, bmh1, x6, x10, r2 - ; CHECK-NEXT: vldb wl10, [p1], m5; nopa ; nops ; nopxm ; vmac.f bml1, bmh2, x5, x10, r2 - ; CHECK-NEXT: vldb wh10, [p1], m6; nopa ; nops ; nopxm ; vmac.f bml2, bmh3, x7, x10, r2 - ; CHECK-NEXT: vldb wl11, [p1], m5; nopa ; nops ; nopx ; vshuffle x9, x9, x9, r6; vmac.f bml3, bmh0, x4, x11, r2 - ; CHECK-NEXT: vldb.3d wh11, [p1], d1; nopa ; nops ; nopxm ; vmac.f bml4, bmh1, x6, x11, r2 - ; CHECK-NEXT: .p2align 4 - ; CHECK-NEXT: .LBB0_2: // %for.body - ; CHECK-NEXT: // =>This Inner Loop Header: Depth=1 - ; CHECK-NEXT: nopa ; vldb wl8, [p1], m5; nopx ; vshuffle x4, x0, x2, r3; vmac.f bml5, bmh2, x5, x11, r2 - ; CHECK-NEXT: vldb wh8, [p1], m6; vshuffle x5, x0, x2, r16; vmac.f bml6, bmh3, x7, x11, r2 - ; CHECK-NEXT: vldb wl0, [p0, #0]; vshuffle x6, x1, x3, r3; vmac.f bmh0, bmh0, x4, x8, r2 - ; CHECK-NEXT: vldb wl9, [p1], m5; vshuffle x7, x1, x3, r16; vmac.f bmh2, bmh2, x5, x8, r2 - ; CHECK-NEXT: vldb wh0, [p0, #32]; vshuffle x10, x10, x10, r6; vmac.f bmh1, bmh1, x6, x8, r2 ; CHECK-NEXT: vldb wl1, [p0, #64]; vmac.f bmh3, bmh3, x7, x8, r2 ; CHECK-NEXT: padds [p0], m4; vldb wh1, [p0, #96]; vshuffle x11, x11, x11, r6; vmac.f bmh4, bmh0, x4, x9, r2 ; CHECK-NEXT: padds [p0], #128; vldb wh9, [p1], m6; vmac.f bmh6, bmh2, x5, x9, r2 @@ -73,6 +54,25 @@ ; CHECK-NEXT: vldb wh2, [p0], #32; vmac.f bmh7, bmh3, x7, x9, r2 ; CHECK-NEXT: vldb wl3, [p0], #32; vmac.f bmh8, bmh0, x4, x10, r2 ; CHECK-NEXT: vldb.3d wh3, [p0], d0; vmac.f bml0, bmh1, x6, x10, r2 + ; CHECK-NEXT: vldb wl10, [p1], m5; add.nc lc, r0, #-2; vmac.f bml1, bmh2, x5, x10, r2 + ; CHECK-NEXT: vldb wh10, [p1], m6; movxm ls, #.LBB0_2; vmac.f bml2, bmh3, x7, x10, r2 + ; CHECK-NEXT: vldb 
wl11, [p1], m5; vshuffle x9, x9, x9, r6; vmac.f bml3, bmh0, x4, x11, r2 + ; CHECK-NEXT: vldb.3d wh11, [p1], d1; movxm le, #.L_LEnd0; vmac.f bml4, bmh1, x6, x11, r2 + ; CHECK-NEXT: .p2align 4 + ; CHECK-NEXT: .LBB0_2: // %for.body + ; CHECK-NEXT: // =>This Inner Loop Header: Depth=1 + ; CHECK-NEXT: vldb wl8, [p1], m5; nopa ; nops ; nopx ; vshuffle x4, x0, x2, r3; vmac.f bml5, bmh2, x5, x11, r2 + ; CHECK-NEXT: vldb wh8, [p1], m6; nopa ; nops ; nopx ; vshuffle x5, x0, x2, r16; vmac.f bml6, bmh3, x7, x11, r2 + ; CHECK-NEXT: vldb wl0, [p0, #0]; nopa ; nops ; nopx ; vshuffle x6, x1, x3, r3; vmac.f bmh0, bmh0, x4, x8, r2 + ; CHECK-NEXT: vldb wl9, [p1], m5; nopa ; nops ; nopx ; vshuffle x7, x1, x3, r16; vmac.f bmh2, bmh2, x5, x8, r2 + ; CHECK-NEXT: vldb wh0, [p0, #32]; nopa ; nops ; nopx ; vshuffle x10, x10, x10, r6; vmac.f bmh1, bmh1, x6, x8, r2 + ; CHECK-NEXT: vldb wl1, [p0, #64]; nopa ; nops ; nopxm ; vmac.f bmh3, bmh3, x7, x8, r2 + ; CHECK-NEXT: vldb wh1, [p0, #96]; nopa ; padds [p0], m4; nopx ; vshuffle x11, x11, x11, r6; vmac.f bmh4, bmh0, x4, x9, r2 + ; CHECK-NEXT: vldb wh9, [p1], m6; nopa ; padds [p0], #128; nopxm ; vmac.f bmh6, bmh2, x5, x9, r2 + ; CHECK-NEXT: nopa ; vldb wl2, [p0], #32; nopx ; vshuffle x8, x8, x8, r6; vmac.f bmh5, bmh1, x6, x9, r2 + ; CHECK-NEXT: vldb wh2, [p0], #32; vmac.f bmh7, bmh3, x7, x9, r2 + ; CHECK-NEXT: vldb wl3, [p0], #32; vmac.f bmh8, bmh0, x4, x10, r2 + ; CHECK-NEXT: vldb.3d wh3, [p0], d0; vmac.f bml0, bmh1, x6, x10, r2 ; CHECK-NEXT: vldb wl10, [p1], m5; vmac.f bml1, bmh2, x5, x10, r2 ; CHECK-NEXT: vldb wh10, [p1], m6; vmac.f bml2, bmh3, x7, x10, r2 ; CHECK-NEXT: vldb wl11, [p1], m5; vshuffle x9, x9, x9, r6; vmac.f bml3, bmh0, x4, x11, r2 diff --git a/llvm/test/CodeGen/AIE/aie2/schedule/postpipeliner/gemm-feasibleRA.mir b/llvm/test/CodeGen/AIE/aie2/schedule/postpipeliner/gemm-feasibleRA.mir index ff3118090cd9..120b38fc8a1f 100644 --- a/llvm/test/CodeGen/AIE/aie2/schedule/postpipeliner/gemm-feasibleRA.mir +++ 
b/llvm/test/CodeGen/AIE/aie2/schedule/postpipeliner/gemm-feasibleRA.mir @@ -25,8 +25,8 @@ ; CHECK-NEXT: nop // Delay Slot 2 ; CHECK-NEXT: nop // Delay Slot 1 ; CHECK-NEXT: // %bb.1: // %for.body.preheader - ; CHECK-NEXT: vldb wl0, [p0], #32; nopa ; nops ; nopxm ; nopv - ; CHECK-NEXT: vldb wh0, [p0], #32; nopx + ; CHECK-NEXT: vldb wl0, [p0], #32 + ; CHECK-NEXT: vldb wh0, [p0], #32 ; CHECK-NEXT: vldb wl1, [p0], #32 ; CHECK-NEXT: vldb wh1, [p0], #32 ; CHECK-NEXT: vldb wl8, [p1], m5; padds [p0], m4 @@ -46,27 +46,27 @@ ; CHECK-NEXT: vldb wl1, [p0], #32; vshuffle x7, x1, x3, r16 ; CHECK-NEXT: vldb wh1, [p0], #32; vshuffle x9, x9, x9, r6; vmac.f bmh1, bmh1, x6, x8, r2 ; CHECK-NEXT: padds [p0], m4; vldb wl8, [p1], m5; vshuffle x10, x10, x10, r6; vmac.f bmh3, bmh3, x7, x8, r2 - ; CHECK-NEXT: vldb wl2, [p0], #32; add.nc lc, r0, #-2; vmac.f bmh4, bmh0, x4, x9, r2 + ; CHECK-NEXT: vldb wl2, [p0], #32; vmac.f bmh4, bmh0, x4, x9, r2 ; CHECK-NEXT: vldb wh2, [p0], #32; vshuffle x11, x11, x11, r6; vmac.f bmh6, bmh2, x5, x9, r2 - ; CHECK-NEXT: vldb wh8, [p1], m6; movxm ls, #.LBB0_2; vmac.f bmh5, bmh1, x6, x9, r2 - ; CHECK-NEXT: vldb wl3, [p0], #32; movxm le, #.L_LEnd0; vmac.f bmh7, bmh3, x7, x9, r2 - ; CHECK-NEXT: vldb wl9, [p1], m5; nopa ; nops ; nopxm ; vmac.f bmh8, bmh0, x4, x10, r2 - ; CHECK-NEXT: vldb.3d wh3, [p0], d0; nopa ; nops ; nopxm ; vmac.f bml0, bmh1, x6, x10, r2 - ; CHECK-NEXT: vldb wh9, [p1], m6; nopa ; nops ; nopxm ; vmac.f bml1, bmh2, x5, x10, r2 - ; CHECK-NEXT: vldb wl10, [p1], m5; nopa ; nops ; nopxm ; vmac.f bml2, bmh3, x7, x10, r2 - ; CHECK-NEXT: vldb wh10, [p1], m6; nopa ; nops ; nopx ; vshuffle x4, x0, x2, r3; vmac.f bml3, bmh0, x4, x11, r2 - ; CHECK-NEXT: vldb wl11, [p1], m5; nopa ; nops ; nopx ; vshuffle x8, x8, x8, r6; vmac.f bml4, bmh1, x6, x11, r2 - ; CHECK-NEXT: vldb.3d wh11, [p1], d1; nopa ; nops ; nopx ; vshuffle x5, x0, x2, r16; vmac.f bml5, bmh2, x5, x11, r2 + ; CHECK-NEXT: vldb wh8, [p1], m6; vmac.f bmh5, bmh1, x6, x9, r2 + ; CHECK-NEXT: vldb wl3, 
[p0], #32; vmac.f bmh7, bmh3, x7, x9, r2 + ; CHECK-NEXT: vldb wl9, [p1], m5; vmac.f bmh8, bmh0, x4, x10, r2 + ; CHECK-NEXT: vldb.3d wh3, [p0], d0; add.nc lc, r0, #-2; vmac.f bml0, bmh1, x6, x10, r2 + ; CHECK-NEXT: vldb wh9, [p1], m6; movxm ls, #.LBB0_2; vmac.f bml1, bmh2, x5, x10, r2 + ; CHECK-NEXT: vldb wl10, [p1], m5; movxm le, #.L_LEnd0; vmac.f bml2, bmh3, x7, x10, r2 + ; CHECK-NEXT: vldb wh10, [p1], m6; vshuffle x4, x0, x2, r3; vmac.f bml3, bmh0, x4, x11, r2 + ; CHECK-NEXT: vldb wl11, [p1], m5; vshuffle x8, x8, x8, r6; vmac.f bml4, bmh1, x6, x11, r2 + ; CHECK-NEXT: vldb.3d wh11, [p1], d1; vshuffle x5, x0, x2, r16; vmac.f bml5, bmh2, x5, x11, r2 ; CHECK-NEXT: .p2align 4 ; CHECK-NEXT: .LBB0_2: // %for.body ; CHECK-NEXT: // =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: vldb wl0, [p0], #32; nopa ; nops ; nopxm ; vmac.f bmh0, bmh0, x4, x8, r2 - ; CHECK-NEXT: vldb wh0, [p0], #32; vshuffle x6, x1, x3, r3; vmac.f bmh2, bmh2, x5, x8, r2 - ; CHECK-NEXT: vldb wl1, [p0], #32; vshuffle x7, x1, x3, r16; vmac.f bml6, bmh3, x7, x11, r2 - ; CHECK-NEXT: vldb wh1, [p0], #32; vshuffle x9, x9, x9, r6; vmac.f bmh1, bmh1, x6, x8, r2 - ; CHECK-NEXT: padds [p0], m4; vldb wl8, [p1], m5; vshuffle x10, x10, x10, r6; vmac.f bmh3, bmh3, x7, x8, r2 - ; CHECK-NEXT: vldb wl2, [p0], #32; vmac.f bmh4, bmh0, x4, x9, r2 - ; CHECK-NEXT: vldb wh2, [p0], #32; vshuffle x11, x11, x11, r6; vmac.f bmh6, bmh2, x5, x9, r2 + ; CHECK-NEXT: vldb wh0, [p0], #32; nopa ; nops ; nopx ; vshuffle x6, x1, x3, r3; vmac.f bmh2, bmh2, x5, x8, r2 + ; CHECK-NEXT: vldb wl1, [p0], #32; nopa ; nops ; nopx ; vshuffle x7, x1, x3, r16; vmac.f bml6, bmh3, x7, x11, r2 + ; CHECK-NEXT: vldb wh1, [p0], #32; nopa ; nops ; nopx ; vshuffle x9, x9, x9, r6; vmac.f bmh1, bmh1, x6, x8, r2 + ; CHECK-NEXT: vldb wl8, [p1], m5; nopa ; padds [p0], m4; nopx ; vshuffle x10, x10, x10, r6; vmac.f bmh3, bmh3, x7, x8, r2 + ; CHECK-NEXT: vldb wl2, [p0], #32; nopa ; nops ; nopxm ; vmac.f bmh4, bmh0, x4, x9, r2 + ; CHECK-NEXT: vldb wh2, [p0], #32; 
nopx ; vshuffle x11, x11, x11, r6; vmac.f bmh6, bmh2, x5, x9, r2 ; CHECK-NEXT: vldb wh8, [p1], m6; vmac.f bmh5, bmh1, x6, x9, r2 ; CHECK-NEXT: vldb wl3, [p0], #32; vmac.f bmh7, bmh3, x7, x9, r2 ; CHECK-NEXT: vldb wl9, [p1], m5; vmac.f bmh8, bmh0, x4, x10, r2 diff --git a/llvm/test/CodeGen/AIE/aie2/schedule/postpipeliner/gemm-nooffset.mir b/llvm/test/CodeGen/AIE/aie2/schedule/postpipeliner/gemm-nooffset.mir index f0c39d42e26d..c6586063b0bb 100644 --- a/llvm/test/CodeGen/AIE/aie2/schedule/postpipeliner/gemm-nooffset.mir +++ b/llvm/test/CodeGen/AIE/aie2/schedule/postpipeliner/gemm-nooffset.mir @@ -24,8 +24,8 @@ ; CHECK-NEXT: nop // Delay Slot 2 ; CHECK-NEXT: nop // Delay Slot 1 ; CHECK-NEXT: // %bb.1: // %for.body.preheader - ; CHECK-NEXT: nopa ; vldb wl8, [p0], #32; nopxm ; nops - ; CHECK-NEXT: vldb wh8, [p0], #32 + ; CHECK-NEXT: vldb wl8, [p0], #32; nopa ; nops ; nopxm ; nopv + ; CHECK-NEXT: vldb wh8, [p0], #32; nopx ; CHECK-NEXT: vldb wl11, [p1], m5 ; CHECK-NEXT: vldb wl1, [p0], #32 ; CHECK-NEXT: vldb wh1, [p0], #32 @@ -48,27 +48,27 @@ ; CHECK-NEXT: vldb wh1, [p0], #32; vshuffle x3, x11, x11, r2; vmac.f bmh6, bmh6, x10, x5, r3 ; CHECK-NEXT: paddb [p0], m4; vshuffle x0, x0, x0, r2; vmac.f bmh7, bmh7, x9, x5, r3 ; CHECK-NEXT: vldb wh11, [p1], m6; vshuffle x7, x7, x7, r2; vmac.f bmh0, bmh0, x6, x3, r3 - ; CHECK-NEXT: vldb wl0, [p0], #32; add.nc lc, r0, #-2; vmac.f bmh1, bmh1, x2, x3, r3 - ; CHECK-NEXT: vldb wh0, [p0], #32; movxm ls, #.LBB0_2; vmac.f bmh2, bmh2, x10, x3, r3 - ; CHECK-NEXT: vldb wl5, [p1], m5; movxm le, #.L_LEnd0; vmac.f bmh3, bmh3, x9, x3, r3 - ; CHECK-NEXT: nopb ; vlda wh5, [p1], m6; nops ; nopxm ; vmac.f bmh8, bmh8, x6, x0, r3 - ; CHECK-NEXT: vldb wl3, [p0], #32; nopa ; nops ; nopxm ; vmac.f bml0, bml0, x2, x0, r3 - ; CHECK-NEXT: vldb.3d wh3, [p0], d0; nopa ; nops ; nopxm ; vmac.f bml1, bml1, x10, x0, r3 - ; CHECK-NEXT: vldb wl0, [p1], m5; nopa ; nops ; nopxm ; vmac.f bml2, bml2, x9, x0, r3 - ; CHECK-NEXT: vldb wh0, [p1], m6; nopa ; nops ; nopxm ; 
vmac.f bml3, bml3, x6, x7, r3 - ; CHECK-NEXT: vldb wl7, [p1], m5; nopa ; nops ; nopx ; vshuffle x6, x8, x0, r4; vmac.f bml5, bml5, x2, x7, r3 - ; CHECK-NEXT: vldb.3d wh7, [p1], d1; nopa ; nops ; nopx ; vshuffle x2, x8, x0, r16; vmac.f bml6, bml6, x10, x7, r3 + ; CHECK-NEXT: vldb wl0, [p0], #32; vmac.f bmh1, bmh1, x2, x3, r3 + ; CHECK-NEXT: vldb wh0, [p0], #32; vmac.f bmh2, bmh2, x10, x3, r3 + ; CHECK-NEXT: vldb wl5, [p1], m5; vmac.f bmh3, bmh3, x9, x3, r3 + ; CHECK-NEXT: vlda wh5, [p1], m6; vmac.f bmh8, bmh8, x6, x0, r3 + ; CHECK-NEXT: vldb wl3, [p0], #32; vmac.f bml0, bml0, x2, x0, r3 + ; CHECK-NEXT: vldb.3d wh3, [p0], d0; add.nc lc, r0, #-2; vmac.f bml1, bml1, x10, x0, r3 + ; CHECK-NEXT: vldb wl0, [p1], m5; movxm ls, #.LBB0_2; vmac.f bml2, bml2, x9, x0, r3 + ; CHECK-NEXT: vldb wh0, [p1], m6; movxm le, #.L_LEnd0; vmac.f bml3, bml3, x6, x7, r3 + ; CHECK-NEXT: vldb wl7, [p1], m5; vshuffle x6, x8, x0, r4; vmac.f bml5, bml5, x2, x7, r3 + ; CHECK-NEXT: vldb.3d wh7, [p1], d1; vshuffle x2, x8, x0, r16; vmac.f bml6, bml6, x10, x7, r3 ; CHECK-NEXT: .p2align 4 ; CHECK-NEXT: .LBB0_2: // %for.body ; CHECK-NEXT: // =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: vldb wl8, [p0], #32; nopa ; nops ; nopx ; vshuffle x5, x5, x5, r2; vmac.f bml4, bml4, x9, x7, r3 - ; CHECK-NEXT: vldb wh8, [p0], #32 - ; CHECK-NEXT: vldb wl11, [p1], m5; vshuffle x10, x1, x3, r4; vmac.f bmh4, bmh4, x6, x5, r3 - ; CHECK-NEXT: vldb wl1, [p0], #32; vshuffle x9, x1, x3, r16; vmac.f bmh5, bmh5, x2, x5, r3 - ; CHECK-NEXT: vldb wh1, [p0], #32; vshuffle x3, x11, x11, r2; vmac.f bmh6, bmh6, x10, x5, r3 - ; CHECK-NEXT: paddb [p0], m4; vshuffle x0, x0, x0, r2; vmac.f bmh7, bmh7, x9, x5, r3 - ; CHECK-NEXT: vldb wh11, [p1], m6; vshuffle x7, x7, x7, r2; vmac.f bmh0, bmh0, x6, x3, r3 - ; CHECK-NEXT: vldb wl0, [p0], #32; vmac.f bmh1, bmh1, x2, x3, r3 + ; CHECK-NEXT: vldb wh8, [p0], #32; nopa ; nops ; nopxm ; nopv + ; CHECK-NEXT: vldb wl11, [p1], m5; nopa ; nops ; nopx ; vshuffle x10, x1, x3, r4; vmac.f bmh4, bmh4, x6, 
x5, r3 + ; CHECK-NEXT: vldb wl1, [p0], #32; nopa ; nops ; nopx ; vshuffle x9, x1, x3, r16; vmac.f bmh5, bmh5, x2, x5, r3 + ; CHECK-NEXT: vldb wh1, [p0], #32; nopa ; nops ; nopx ; vshuffle x3, x11, x11, r2; vmac.f bmh6, bmh6, x10, x5, r3 + ; CHECK-NEXT: paddb [p0], m4; nopa ; nops ; nopx ; vshuffle x0, x0, x0, r2; vmac.f bmh7, bmh7, x9, x5, r3 + ; CHECK-NEXT: vldb wh11, [p1], m6; nopa ; nops ; nopx ; vshuffle x7, x7, x7, r2; vmac.f bmh0, bmh0, x6, x3, r3 + ; CHECK-NEXT: nopa ; vldb wl0, [p0], #32; nopxm ; vmac.f bmh1, bmh1, x2, x3, r3 ; CHECK-NEXT: vldb wh0, [p0], #32; vmac.f bmh2, bmh2, x10, x3, r3 ; CHECK-NEXT: vldb wl5, [p1], m5; vmac.f bmh3, bmh3, x9, x3, r3 ; CHECK-NEXT: vlda wh5, [p1], m6; vmac.f bmh8, bmh8, x6, x0, r3 diff --git a/llvm/test/CodeGen/AIE/aie2/schedule/postpipeliner/gemm-nopresched-nobanks.mir b/llvm/test/CodeGen/AIE/aie2/schedule/postpipeliner/gemm-nopresched-nobanks.mir index c72293a8eb09..6d9fe93630eb 100644 --- a/llvm/test/CodeGen/AIE/aie2/schedule/postpipeliner/gemm-nopresched-nobanks.mir +++ b/llvm/test/CodeGen/AIE/aie2/schedule/postpipeliner/gemm-nopresched-nobanks.mir @@ -25,7 +25,7 @@ ; CHECK-NEXT: nop // Delay Slot 2 ; CHECK-NEXT: nop // Delay Slot 1 ; CHECK-NEXT: // %bb.1: // %for.body.preheader - ; CHECK-NEXT: nopa ; vldb wl8, [p0, #0]; nopxm + ; CHECK-NEXT: vldb wl8, [p0, #0]; nopx ; CHECK-NEXT: vldb wh8, [p0, #32] ; CHECK-NEXT: vldb wl1, [p0, #64] ; CHECK-NEXT: vlda wh1, [p0, #96]; paddb [p0], m4 @@ -33,26 +33,26 @@ ; CHECK-NEXT: vldb wl10, [p0], #32 ; CHECK-NEXT: vldb wh10, [p0], #32 ; CHECK-NEXT: vldb wl3, [p0], #32 - ; CHECK-NEXT: vldb.3d wh3, [p0], d0; add.nc lc, r0, #-1 - ; CHECK-NEXT: movxm ls, #.LBB0_2 - ; CHECK-NEXT: vldb wl3, [p1], m5; movxm le, #.L_LEnd0 - ; CHECK-NEXT: vldb wh3, [p1], m6; nopa ; nops ; nopxm ; nopv - ; CHECK-NEXT: vldb wl5, [p1], m5; nopa ; nops ; nopxm ; nopv - ; CHECK-NEXT: vldb wh5, [p1], m6; nopa ; nops ; nopx ; vshuffle x6, x8, x10, r3; nopv - ; CHECK-NEXT: vldb wl0, [p1], m5; nopa ; nops ; nopx ; 
vshuffle x11, x8, x10, r16; nopv - ; CHECK-NEXT: vldb wh0, [p1], m6; nopa ; nops ; nopx ; vshuffle x0, x1, x3, r3; nopv - ; CHECK-NEXT: vldb wl7, [p1], m5; nopa ; nops ; nopx ; vshuffle x9, x1, x3, r16; nopv - ; CHECK-NEXT: vldb.3d wh7, [p1], d1; nopa ; nops ; nopxm ; nopv + ; CHECK-NEXT: vldb.3d wh3, [p0], d0 + ; CHECK-NEXT: nop + ; CHECK-NEXT: vldb wl3, [p1], m5 + ; CHECK-NEXT: vldb wh3, [p1], m6; add.nc lc, r0, #-1 + ; CHECK-NEXT: vldb wl5, [p1], m5; movxm ls, #.LBB0_2 + ; CHECK-NEXT: vldb wh5, [p1], m6; vshuffle x6, x8, x10, r3 + ; CHECK-NEXT: vldb wl0, [p1], m5; vshuffle x11, x8, x10, r16 + ; CHECK-NEXT: vldb wh0, [p1], m6; vshuffle x0, x1, x3, r3 + ; CHECK-NEXT: vldb wl7, [p1], m5; vshuffle x9, x1, x3, r16 + ; CHECK-NEXT: vldb.3d wh7, [p1], d1; movxm le, #.L_LEnd0 ; CHECK-NEXT: .p2align 4 ; CHECK-NEXT: .LBB0_2: // %for.body ; CHECK-NEXT: // =>This Inner Loop Header: Depth=1 - ; CHECK-NEXT: vldb wl8, [p0, #0]; vshuffle x3, x3, x3, r6 - ; CHECK-NEXT: vldb wh8, [p0, #32]; nopx - ; CHECK-NEXT: vldb wl1, [p0, #64]; vshuffle x5, x5, x5, r6; vmac.f bmh0, bmh0, x6, x3, r2 - ; CHECK-NEXT: vlda wh1, [p0, #96]; paddb [p0], m4; vmac.f bmh1, bmh1, x11, x3, r2 - ; CHECK-NEXT: paddb [p0], #128; vshuffle x0, x0, x0, r6; vmac.f bmh3, bmh3, x9, x3, r2 - ; CHECK-NEXT: vldb wl10, [p0], #32; vmac.f bmh4, bmh4, x6, x5, r2 - ; CHECK-NEXT: vldb wh10, [p0], #32; vshuffle x7, x7, x7, r6; vmac.f bmh2, bmh2, x0, x3, r2 + ; CHECK-NEXT: vldb wl8, [p0, #0]; nopa ; nops ; nopx ; vshuffle x3, x3, x3, r6; nopv + ; CHECK-NEXT: vldb wh8, [p0, #32]; nopa ; nops ; nopxm ; nopv + ; CHECK-NEXT: vldb wl1, [p0, #64]; nopa ; nops ; nopx ; vshuffle x5, x5, x5, r6; vmac.f bmh0, bmh0, x6, x3, r2 + ; CHECK-NEXT: paddb [p0], m4; vlda wh1, [p0, #96]; nops ; nopxm ; vmac.f bmh1, bmh1, x11, x3, r2 + ; CHECK-NEXT: paddb [p0], #128; nopa ; nops ; nopx ; vshuffle x0, x0, x0, r6; vmac.f bmh3, bmh3, x9, x3, r2 + ; CHECK-NEXT: vldb wl10, [p0], #32; nopa ; nops ; nopxm ; vmac.f bmh4, bmh4, x6, x5, r2 + ; CHECK-NEXT: 
vldb wh10, [p0], #32; nopx ; vshuffle x7, x7, x7, r6; vmac.f bmh2, bmh2, x0, x3, r2 ; CHECK-NEXT: vldb wl3, [p0], #32; vmac.f bmh5, bmh5, x11, x5, r2 ; CHECK-NEXT: vldb.3d wh3, [p0], d0; vmac.f bmh6, bmh6, x0, x5, r2 ; CHECK-NEXT: vmac.f bmh7, bmh7, x9, x5, r2 diff --git a/llvm/test/CodeGen/AIE/aie2/schedule/postpipeliner/gemm-nopresched.mir b/llvm/test/CodeGen/AIE/aie2/schedule/postpipeliner/gemm-nopresched.mir index 3ac6b465b7ad..7f26d21fa3f2 100644 --- a/llvm/test/CodeGen/AIE/aie2/schedule/postpipeliner/gemm-nopresched.mir +++ b/llvm/test/CodeGen/AIE/aie2/schedule/postpipeliner/gemm-nopresched.mir @@ -25,32 +25,32 @@ ; CHECK-NEXT: nop // Delay Slot 2 ; CHECK-NEXT: nop // Delay Slot 1 ; CHECK-NEXT: // %bb.1: // %for.body.preheader - ; CHECK-NEXT: vlda wl11, [p1], m5; vldb wl8, [p0], #32; nopm + ; CHECK-NEXT: vlda wl11, [p1], m5; vldb wl8, [p0], #32; nopxm ; CHECK-NEXT: vlda wh11, [p1], m6; vldb wh8, [p0], #32 ; CHECK-NEXT: vlda wl5, [p1], m5; vldb wl1, [p0], #32 ; CHECK-NEXT: vlda wh5, [p1], m6; vldb wh1, [p0], #32 ; CHECK-NEXT: paddb [p0], m4 ; CHECK-NEXT: vldb wl0, [p0], #32 - ; CHECK-NEXT: vldb wh0, [p0], #32; add.nc lc, r0, #-1 - ; CHECK-NEXT: vldb wl3, [p0], #32; movxm ls, #.LBB0_2 - ; CHECK-NEXT: vlda wl0, [p1], m5; vldb.3d wh3, [p0], d0; movxm le, #.L_LEnd0 - ; CHECK-NEXT: nopb ; vlda wh0, [p1], m6; nops ; nopxm ; nopv - ; CHECK-NEXT: nopb ; vlda wl7, [p1], m5; nops ; nopx ; vshuffle x5, x5, x5, r6; nopv - ; CHECK-NEXT: nopb ; vlda.3d wh7, [p1], d1; nops ; nopxm ; nopv - ; CHECK-NEXT: nopb ; nopa ; nops ; nopxm ; nopv - ; CHECK-NEXT: nopb ; nopa ; nops ; nopx ; vshuffle x6, x8, x0, r3; nopv - ; CHECK-NEXT: nopb ; nopa ; nops ; nopx ; vshuffle x2, x8, x0, r16; nopv - ; CHECK-NEXT: nopb ; nopa ; nops ; nopx ; vshuffle x10, x1, x3, r3; vmac.f bmh4, bmh4, x6, x5, r2 + ; CHECK-NEXT: vldb wh0, [p0], #32 + ; CHECK-NEXT: vldb wl3, [p0], #32 + ; CHECK-NEXT: vlda wl0, [p1], m5; vldb.3d wh3, [p0], d0 + ; CHECK-NEXT: vlda wh0, [p1], m6; add.nc lc, r0, #-1 + ; 
CHECK-NEXT: vlda wl7, [p1], m5; vshuffle x5, x5, x5, r6 + ; CHECK-NEXT: vlda.3d wh7, [p1], d1; movxm ls, #.LBB0_2 + ; CHECK-NEXT: movxm le, #.L_LEnd0 + ; CHECK-NEXT: vshuffle x6, x8, x0, r3 + ; CHECK-NEXT: vshuffle x2, x8, x0, r16 + ; CHECK-NEXT: vshuffle x10, x1, x3, r3; vmac.f bmh4, bmh4, x6, x5, r2 ; CHECK-NEXT: .p2align 4 ; CHECK-NEXT: .LBB0_2: // %for.body ; CHECK-NEXT: // =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: vldb wl8, [p0], #32; vlda wl11, [p1], m5; nops ; nopx ; vshuffle x9, x1, x3, r16; vmac.f bmh5, bmh5, x2, x5, r2 ; CHECK-NEXT: vldb wh8, [p0], #32; vlda wh11, [p1], m6; nops ; nopx ; vshuffle x3, x11, x11, r6; vmac.f bmh6, bmh6, x10, x5, r2 - ; CHECK-NEXT: vlda wl5, [p1], m5; vldb wl1, [p0], #32; vshuffle x0, x0, x0, r6; vmac.f bmh7, bmh7, x9, x5, r2 - ; CHECK-NEXT: vlda wh5, [p1], m6; vldb wh1, [p0], #32; vshuffle x7, x7, x7, r6; vmac.f bmh0, bmh0, x6, x3, r2 - ; CHECK-NEXT: paddb [p0], m4; vmac.f bmh1, bmh1, x2, x3, r2 - ; CHECK-NEXT: vldb wl0, [p0], #32; vmac.f bmh2, bmh2, x10, x3, r2 - ; CHECK-NEXT: vldb wh0, [p0], #32; vmac.f bmh3, bmh3, x9, x3, r2 + ; CHECK-NEXT: vldb wl1, [p0], #32; vlda wl5, [p1], m5; nops ; nopx ; vshuffle x0, x0, x0, r6; vmac.f bmh7, bmh7, x9, x5, r2 + ; CHECK-NEXT: vldb wh1, [p0], #32; vlda wh5, [p1], m6; nops ; nopx ; vshuffle x7, x7, x7, r6; vmac.f bmh0, bmh0, x6, x3, r2 + ; CHECK-NEXT: paddb [p0], m4; nopa ; nops ; nopxm ; vmac.f bmh1, bmh1, x2, x3, r2 + ; CHECK-NEXT: vldb wl0, [p0], #32; nopa ; nops ; nopxm ; vmac.f bmh2, bmh2, x10, x3, r2 + ; CHECK-NEXT: vldb wh0, [p0], #32; nopa ; nops ; nopxm ; vmac.f bmh3, bmh3, x9, x3, r2 ; CHECK-NEXT: vldb wl3, [p0], #32; vmac.f bmh8, bmh8, x6, x0, r2 ; CHECK-NEXT: vlda wl0, [p1], m5; vldb.3d wh3, [p0], d0; vmac.f bml0, bml0, x2, x0, r2 ; CHECK-NEXT: vlda wh0, [p1], m6; vmac.f bml1, bml1, x10, x0, r2 diff --git a/llvm/test/CodeGen/AIE/aie2/schedule/postpipeliner/gemm-waw.mir b/llvm/test/CodeGen/AIE/aie2/schedule/postpipeliner/gemm-waw.mir index 9d785e292122..216e4356fdb0 
100644 --- a/llvm/test/CodeGen/AIE/aie2/schedule/postpipeliner/gemm-waw.mir +++ b/llvm/test/CodeGen/AIE/aie2/schedule/postpipeliner/gemm-waw.mir @@ -26,7 +26,7 @@ ; CHECK-NEXT: nop // Delay Slot 2 ; CHECK-NEXT: nop // Delay Slot 1 ; CHECK-NEXT: // %bb.1: // %for.body.preheader - ; CHECK-NEXT: vldb wl9, [p1], m5; nopa ; nops ; nopxm ; nopv + ; CHECK-NEXT: vldb wl9, [p1], m5; nopx ; CHECK-NEXT: vldb wh9, [p1], m6 ; CHECK-NEXT: vldb wl5, [p1], m5 ; CHECK-NEXT: vldb wh5, [p1], m6 @@ -37,28 +37,28 @@ ; CHECK-NEXT: vldb wl6, [p0, #64]; vshuffle x11, x9, x9, r2 ; CHECK-NEXT: vldb wh6, [p0, #96]; padds [p0], m4 ; CHECK-NEXT: padds [p0], #128; vshuffle x5, x5, x5, r2 - ; CHECK-NEXT: vldb wl10, [p0], #32; add.nc lc, r0, #-1 + ; CHECK-NEXT: vldb wl10, [p0], #32 ; CHECK-NEXT: vldb wh10, [p0], #32; vshuffle x7, x7, x7, r2 - ; CHECK-NEXT: vldb wl3, [p0], #32; movxm ls, #.LBB0_2 - ; CHECK-NEXT: vldb.3d wh3, [p0], d0; movxm le, #.L_LEnd0 - ; CHECK-NEXT: nopb ; nopa ; nops ; nopxm ; nopv - ; CHECK-NEXT: vldb wl3, [p1], m5; nopa ; nops ; nopxm ; nopv - ; CHECK-NEXT: vldb.3d wh3, [p1], d1; nopa ; nops ; nopxm ; nopv - ; CHECK-NEXT: nopb ; nopa ; nops ; nopxm ; nopv - ; CHECK-NEXT: nopb ; nopa ; nops ; nopx ; vshuffle x1, x8, x10, r4; nopv - ; CHECK-NEXT: nopb ; nopa ; nops ; nopx ; vshuffle x8, x8, x10, r16; nopv - ; CHECK-NEXT: nopb ; nopa ; nops ; nopx ; vshuffle x10, x6, x3, r4; vmac.f bmh0, bmh0, x1, x11, r3 + ; CHECK-NEXT: vldb wl3, [p0], #32 + ; CHECK-NEXT: vldb.3d wh3, [p0], d0 + ; CHECK-NEXT: nop + ; CHECK-NEXT: vldb wl3, [p1], m5; add.nc lc, r0, #-1 + ; CHECK-NEXT: vldb.3d wh3, [p1], d1; movxm ls, #.LBB0_2 + ; CHECK-NEXT: movxm le, #.L_LEnd0 + ; CHECK-NEXT: vshuffle x1, x8, x10, r4 + ; CHECK-NEXT: vshuffle x8, x8, x10, r16 + ; CHECK-NEXT: vshuffle x10, x6, x3, r4; vmac.f bmh0, bmh0, x1, x11, r3 ; CHECK-NEXT: .p2align 4 ; CHECK-NEXT: .LBB0_2: // %for.body ; CHECK-NEXT: // =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: vldb wl9, [p1], m5; nopa ; nops ; nopx ; vshuffle x6, x6, 
x3, r16; vmac.f bmh1, bmh1, x8, x11, r3 - ; CHECK-NEXT: vldb wh9, [p1], m6; nopx ; vmac.f bmh2, bmh2, x10, x11, r3 - ; CHECK-NEXT: vldb wl5, [p1], m5; vshuffle x3, x3, x3, r2; vmac.f bmh3, bmh3, x6, x11, r3 - ; CHECK-NEXT: vldb wh5, [p1], m6; vmac.f bmh4, bmh4, x1, x5, r3 - ; CHECK-NEXT: vldb wl7, [p1], m5; vmac.f bmh5, bmh5, x8, x5, r3 - ; CHECK-NEXT: vldb wh7, [p1], m6; vmac.f bmh6, bmh6, x10, x5, r3 - ; CHECK-NEXT: vldb wl8, [p0, #0]; vmac.f bmh7, bmh7, x6, x5, r3 - ; CHECK-NEXT: vldb wh8, [p0, #32]; vmac.f bmh8, bmh8, x1, x7, r3 + ; CHECK-NEXT: vldb wh9, [p1], m6; nopa ; nops ; nopxm ; vmac.f bmh2, bmh2, x10, x11, r3 + ; CHECK-NEXT: vldb wl5, [p1], m5; nopa ; nops ; nopx ; vshuffle x3, x3, x3, r2; vmac.f bmh3, bmh3, x6, x11, r3 + ; CHECK-NEXT: vldb wh5, [p1], m6; nopa ; nops ; nopxm ; vmac.f bmh4, bmh4, x1, x5, r3 + ; CHECK-NEXT: vldb wl7, [p1], m5; nopa ; nops ; nopxm ; vmac.f bmh5, bmh5, x8, x5, r3 + ; CHECK-NEXT: vldb wh7, [p1], m6; nopa ; nops ; nopxm ; vmac.f bmh6, bmh6, x10, x5, r3 + ; CHECK-NEXT: vldb wl8, [p0, #0]; nopa ; nops ; nopxm ; vmac.f bmh7, bmh7, x6, x5, r3 + ; CHECK-NEXT: nopa ; vldb wh8, [p0, #32]; nopx ; vmac.f bmh8, bmh8, x1, x7, r3 ; CHECK-NEXT: vldb wl6, [p0, #64]; vshuffle x11, x9, x9, r2; vmac.f bml0, bml0, x8, x7, r3 ; CHECK-NEXT: padds [p0], m4; vldb wh6, [p0, #96]; vmac.f bml1, bml1, x10, x7, r3 ; CHECK-NEXT: padds [p0], #128; vshuffle x5, x5, x5, r2; vmac.f bml2, bml2, x6, x7, r3 diff --git a/llvm/test/CodeGen/AIE/aie2/schedule/postpipeliner/gemm.mir b/llvm/test/CodeGen/AIE/aie2/schedule/postpipeliner/gemm.mir index 800e684c08b7..ba0916b41f66 100644 --- a/llvm/test/CodeGen/AIE/aie2/schedule/postpipeliner/gemm.mir +++ b/llvm/test/CodeGen/AIE/aie2/schedule/postpipeliner/gemm.mir @@ -24,7 +24,7 @@ ; CHECK-NEXT: nop // Delay Slot 2 ; CHECK-NEXT: nop // Delay Slot 1 ; CHECK-NEXT: // %bb.1: // %for.body.preheader - ; CHECK-NEXT: vldb wl9, [p1], m5; nopa ; nops ; nopxm ; nopv + ; CHECK-NEXT: vldb wl9, [p1], m5; nopx ; CHECK-NEXT: vldb wh9, 
[p1], m6 ; CHECK-NEXT: vldb wl5, [p1], m5 ; CHECK-NEXT: vldb wh5, [p1], m6 @@ -35,28 +35,28 @@ ; CHECK-NEXT: vldb wl6, [p0, #64]; vshuffle x11, x9, x9, r2 ; CHECK-NEXT: vldb wh6, [p0, #96]; padds [p0], m4 ; CHECK-NEXT: padds [p0], #128; vshuffle x5, x5, x5, r2 - ; CHECK-NEXT: vldb wl10, [p0], #32; add.nc lc, r0, #-1 + ; CHECK-NEXT: vldb wl10, [p0], #32 ; CHECK-NEXT: vldb wh10, [p0], #32; vshuffle x7, x7, x7, r2 - ; CHECK-NEXT: vldb wl3, [p0], #32; movxm ls, #.LBB0_2 - ; CHECK-NEXT: vldb.3d wh3, [p0], d0; movxm le, #.L_LEnd0 - ; CHECK-NEXT: nopb ; nopa ; nops ; nopxm ; nopv - ; CHECK-NEXT: vldb wl3, [p1], m5; nopa ; nops ; nopxm ; nopv - ; CHECK-NEXT: vldb.3d wh3, [p1], d1; nopa ; nops ; nopxm ; nopv - ; CHECK-NEXT: nopb ; nopa ; nops ; nopxm ; nopv - ; CHECK-NEXT: nopb ; nopa ; nops ; nopx ; vshuffle x1, x8, x10, r4; nopv - ; CHECK-NEXT: nopb ; nopa ; nops ; nopx ; vshuffle x8, x8, x10, r16; nopv - ; CHECK-NEXT: nopb ; nopa ; nops ; nopx ; vshuffle x10, x6, x3, r4; vmac.f bmh0, bmh0, x1, x11, r3 + ; CHECK-NEXT: vldb wl3, [p0], #32 + ; CHECK-NEXT: vldb.3d wh3, [p0], d0 + ; CHECK-NEXT: nop + ; CHECK-NEXT: vldb wl3, [p1], m5; add.nc lc, r0, #-1 + ; CHECK-NEXT: vldb.3d wh3, [p1], d1; movxm ls, #.LBB0_2 + ; CHECK-NEXT: movxm le, #.L_LEnd0 + ; CHECK-NEXT: vshuffle x1, x8, x10, r4 + ; CHECK-NEXT: vshuffle x8, x8, x10, r16 + ; CHECK-NEXT: vshuffle x10, x6, x3, r4; vmac.f bmh0, bmh0, x1, x11, r3 ; CHECK-NEXT: .p2align 4 ; CHECK-NEXT: .LBB0_2: // %for.body ; CHECK-NEXT: // =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: vldb wl9, [p1], m5; nopa ; nops ; nopx ; vshuffle x6, x6, x3, r16; vmac.f bmh1, bmh1, x8, x11, r3 - ; CHECK-NEXT: vldb wh9, [p1], m6; nopx ; vmac.f bmh2, bmh2, x10, x11, r3 - ; CHECK-NEXT: vldb wl5, [p1], m5; vshuffle x3, x3, x3, r2; vmac.f bmh3, bmh3, x6, x11, r3 - ; CHECK-NEXT: vldb wh5, [p1], m6; vmac.f bmh4, bmh4, x1, x5, r3 - ; CHECK-NEXT: vldb wl7, [p1], m5; vmac.f bmh5, bmh5, x8, x5, r3 - ; CHECK-NEXT: vldb wh7, [p1], m6; vmac.f bmh6, bmh6, x10, x5, r3 
- ; CHECK-NEXT: vldb wl8, [p0, #0]; vmac.f bmh7, bmh7, x6, x5, r3 - ; CHECK-NEXT: vldb wh8, [p0, #32]; vmac.f bmh8, bmh8, x1, x7, r3 + ; CHECK-NEXT: vldb wh9, [p1], m6; nopa ; nops ; nopxm ; vmac.f bmh2, bmh2, x10, x11, r3 + ; CHECK-NEXT: vldb wl5, [p1], m5; nopa ; nops ; nopx ; vshuffle x3, x3, x3, r2; vmac.f bmh3, bmh3, x6, x11, r3 + ; CHECK-NEXT: vldb wh5, [p1], m6; nopa ; nops ; nopxm ; vmac.f bmh4, bmh4, x1, x5, r3 + ; CHECK-NEXT: vldb wl7, [p1], m5; nopa ; nops ; nopxm ; vmac.f bmh5, bmh5, x8, x5, r3 + ; CHECK-NEXT: vldb wh7, [p1], m6; nopa ; nops ; nopxm ; vmac.f bmh6, bmh6, x10, x5, r3 + ; CHECK-NEXT: vldb wl8, [p0, #0]; nopa ; nops ; nopxm ; vmac.f bmh7, bmh7, x6, x5, r3 + ; CHECK-NEXT: nopa ; vldb wh8, [p0, #32]; nopx ; vmac.f bmh8, bmh8, x1, x7, r3 ; CHECK-NEXT: vldb wl6, [p0, #64]; vshuffle x11, x9, x9, r2; vmac.f bml0, bml0, x8, x7, r3 ; CHECK-NEXT: padds [p0], m4; vldb wh6, [p0, #96]; vmac.f bml1, bml1, x10, x7, r3 ; CHECK-NEXT: padds [p0], #128; vshuffle x5, x5, x5, r2; vmac.f bml2, bml2, x6, x7, r3 diff --git a/llvm/test/CodeGen/AIE/aie2/schedule/postpipeliner/hardsigmoid-templated-double.mir b/llvm/test/CodeGen/AIE/aie2/schedule/postpipeliner/hardsigmoid-templated-double.mir index b5ea91baa30e..38c681b4a8b1 100644 --- a/llvm/test/CodeGen/AIE/aie2/schedule/postpipeliner/hardsigmoid-templated-double.mir +++ b/llvm/test/CodeGen/AIE/aie2/schedule/postpipeliner/hardsigmoid-templated-double.mir @@ -28,8 +28,8 @@ ; CHECK-NEXT: nopb ; nopa ; nops ; nopx ; vmin_ge.s16 x8, r16, x6, x0; nopv ; CHECK-NEXT: vldb.unpack.s16.s8 x6, [p0], m0; nopa ; nops ; nopx ; vmax_lt.s16 x10, r16, x8, x2; vmac cm1, cm0, x10, x4, r0 ; CHECK-NEXT: nopb ; nopa ; nops ; nopx ; vmin_ge.s16 x8, r16, x6, x0; nopv - ; CHECK-NEXT: vldb.unpack.s16.s8 x6, [p0], m0; nopa ; nops ; nopx ; vmax_lt.s16 x10, r16, x8, x2; vmac cm1, cm0, x10, x4, r0 - ; CHECK-NEXT: nopb ; nopa ; nops ; nopx ; vmin_ge.s16 x8, r16, x6, x0; nopv + ; CHECK-NEXT: vldb.unpack.s16.s8 x6, [p0], m0; nopx ; vmax_lt.s16 
x10, r16, x8, x2; vmac cm1, cm0, x10, x4, r0 + ; CHECK-NEXT: vmin_ge.s16 x8, r16, x6, x0 ; CHECK-NEXT: .LBB0_1: // %for.body ; CHECK-NEXT: // =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: vldb.unpack.s16.s8 x6, [p0], m0; nopa ; nops ; nopx ; vmax_lt.s16 x10, r16, x8, x2; vmac cm1, cm0, x10, x4, r0 @@ -52,8 +52,8 @@ ; CHECK-NEXT: nopb ; nopa ; vst.srs.s8.s32 cm1, s0, [p1], #32; nopxm ; nopv ; CHECK-NEXT: nopb ; nopa ; nops ; nopxm ; nopv ; CHECK-NEXT: nopb ; nopa ; nops ; nopxm ; nopv - ; CHECK-NEXT: vldb.unpack.s16.s8 x6, [p0], m0; nopa ; nops ; nopxm ; nopv - ; CHECK-NEXT: nopa ; nopx + ; CHECK-NEXT: vldb.unpack.s16.s8 x6, [p0], m0 + ; CHECK-NEXT: nop ; CHECK-NEXT: vldb.unpack.s16.s8 x6, [p0], m0 ; CHECK-NEXT: nop ; CHECK-NEXT: vldb.unpack.s16.s8 x6, [p0], m0 diff --git a/llvm/test/CodeGen/AIE/aie2/schedule/postpipeliner/hardsigmoid-templated.mir b/llvm/test/CodeGen/AIE/aie2/schedule/postpipeliner/hardsigmoid-templated.mir index 781f7ecb1bad..25354709ab4d 100644 --- a/llvm/test/CodeGen/AIE/aie2/schedule/postpipeliner/hardsigmoid-templated.mir +++ b/llvm/test/CodeGen/AIE/aie2/schedule/postpipeliner/hardsigmoid-templated.mir @@ -25,8 +25,8 @@ ; CHECK-NEXT: nopb ; nopa ; nops ; nopx ; vmin_ge.s16 x8, r16, x6, x0; nopv ; CHECK-NEXT: vldb.unpack.s16.s8 x6, [p0], m0; nopa ; nops ; nopx ; vmax_lt.s16 x10, r16, x8, x2; vmac cm1, cm0, x10, x4, r0 ; CHECK-NEXT: nopb ; nopa ; nops ; nopx ; vmin_ge.s16 x8, r16, x6, x0; nopv - ; CHECK-NEXT: vldb.unpack.s16.s8 x6, [p0], m0; nopa ; nops ; nopx ; vmax_lt.s16 x10, r16, x8, x2; vmac cm1, cm0, x10, x4, r0 - ; CHECK-NEXT: nopb ; nopa ; nops ; nopx ; vmin_ge.s16 x8, r16, x6, x0; nopv + ; CHECK-NEXT: vldb.unpack.s16.s8 x6, [p0], m0; nopx ; vmax_lt.s16 x10, r16, x8, x2; vmac cm1, cm0, x10, x4, r0 + ; CHECK-NEXT: vmin_ge.s16 x8, r16, x6, x0 ; CHECK-NEXT: .LBB0_1: // %for.body ; CHECK-NEXT: // =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: vldb.unpack.s16.s8 x6, [p0], m0; nopa ; nops ; nopx ; vmax_lt.s16 x10, r16, x8, x2; vmac cm1, 
cm0, x10, x4, r0 diff --git a/llvm/test/CodeGen/AIE/aie2/schedule/postpipeliner/interleave-prologue.mir b/llvm/test/CodeGen/AIE/aie2/schedule/postpipeliner/interleave-prologue.mir index 9b1be7c00997..b989e765b51a 100644 --- a/llvm/test/CodeGen/AIE/aie2/schedule/postpipeliner/interleave-prologue.mir +++ b/llvm/test/CodeGen/AIE/aie2/schedule/postpipeliner/interleave-prologue.mir @@ -320,19 +320,19 @@ body: | ; CHECK-NEXT: $r8 = LDA_dms_lda_idx_imm killed $p3, 0 ; CHECK-NEXT: $bmh2 = VLDA_UPS_S32_S16_ag_idx_imm $s0, $p2, 32, implicit-def $srups_of, implicit $crsat ; CHECK-NEXT: $bml2 = VLDA_UPS_S32_S16_ag_idx_imm $s0, killed $p2, 0, implicit-def $srups_of, implicit $crsat - ; CHECK-NEXT: BUNDLE implicit-def $wh6, implicit-def $lc, implicit $p0, implicit killed $r0 { - ; CHECK-NEXT: $wh6 = VLDA_dmw_lda_w_ag_idx_imm $p0, 32 + ; CHECK-NEXT: $wh6 = VLDA_dmw_lda_w_ag_idx_imm $p0, 32 + ; CHECK-NEXT: BUNDLE implicit-def $wl6, implicit-def $p0, implicit-def $lc, implicit killed $p0, implicit $m0, implicit killed $r0 { + ; CHECK-NEXT: $wl6, $p0 = VLDA_dmw_lda_w_ag_pstm_nrm_imm killed $p0, $m0 ; CHECK-NEXT: $lc = ADD_NC killed $r0, -5 ; CHECK-NEXT: } - ; CHECK-NEXT: BUNDLE implicit-def $wl6, implicit-def $p0, implicit-def $ls, implicit killed $p0, implicit $m0 { - ; CHECK-NEXT: $wl6, $p0 = VLDA_dmw_lda_w_ag_pstm_nrm_imm killed $p0, $m0 + ; CHECK-NEXT: BUNDLE implicit-def $wh6, implicit-def $ls, implicit $p0 { + ; CHECK-NEXT: $wh6 = VLDA_dmw_lda_w_ag_idx_imm $p0, 32 ; CHECK-NEXT: $ls = MOVXM_lng_cg %bb.2 ; CHECK-NEXT: } - ; CHECK-NEXT: BUNDLE implicit-def $wh6, implicit-def $le, implicit $p0 { - ; CHECK-NEXT: $wh6 = VLDA_dmw_lda_w_ag_idx_imm $p0, 32 + ; CHECK-NEXT: BUNDLE implicit-def $wl6, implicit-def $p0, implicit-def $le, implicit killed $p0, implicit $m0 { + ; CHECK-NEXT: $wl6, $p0 = VLDA_dmw_lda_w_ag_pstm_nrm_imm killed $p0, $m0 ; CHECK-NEXT: $le = MOVXM_lng_cg ; CHECK-NEXT: } - ; CHECK-NEXT: $wl6, $p0 = VLDA_dmw_lda_w_ag_pstm_nrm_imm killed $p0, $m0 ; CHECK-NEXT: $wh6 = 
VLDA_dmw_lda_w_ag_idx_imm $p0, 32 ; CHECK-NEXT: $wl6, $p0 = VLDA_dmw_lda_w_ag_pstm_nrm_imm killed $p0, $m0 ; CHECK-NEXT: BUNDLE implicit-def $wh6, implicit-def $x1, implicit-def $wl1, implicit-def $wh1, implicit $p0, implicit killed $r1 { diff --git a/llvm/test/CodeGen/AIE/aie2/schedule/postpipeliner/large-II.mir b/llvm/test/CodeGen/AIE/aie2/schedule/postpipeliner/large-II.mir index 793d9bb52aa2..23660cafff3a 100644 --- a/llvm/test/CodeGen/AIE/aie2/schedule/postpipeliner/large-II.mir +++ b/llvm/test/CodeGen/AIE/aie2/schedule/postpipeliner/large-II.mir @@ -28,26 +28,25 @@ ; CHECK-NEXT: nop // Delay Slot 2 ; CHECK-NEXT: nop // Delay Slot 1 ; CHECK-NEXT: // %bb.1: // %for.body.preheader - ; CHECK-NEXT: nopb ; nopa ; nops ; movxm ls, #.LBB0_2; nopv - ; CHECK-NEXT: nopa ; add.nc lc, r0, #-1 - ; CHECK-NEXT: lda r0, [p0], #4; movxm le, #.L_LEnd0 - ; CHECK-NEXT: nopb ; lda r1, [p0], #4; nops ; nopxm ; nopv - ; CHECK-NEXT: nopb ; lda r2, [p0], #4; nops ; nopxm ; nopv - ; CHECK-NEXT: nopb ; lda r3, [p0], #4; nops ; nopxm ; nopv - ; CHECK-NEXT: nopb ; lda r4, [p0], #4; nops ; nopxm ; nopv - ; CHECK-NEXT: nopb ; lda r5, [p0], #4; nops ; nopxm ; nopv - ; CHECK-NEXT: nopb ; lda r6, [p0], #4; nops ; nopxm ; nopv - ; CHECK-NEXT: nopb ; lda r7, [p0], #4; st r0, [p1], #4; nopxm ; nopv + ; CHECK-NEXT: nopb ; nopa ; nops ; nopx ; add.nc lc, r0, #-1; nopv + ; CHECK-NEXT: lda r0, [p0], #4; nopx + ; CHECK-NEXT: lda r1, [p0], #4 + ; CHECK-NEXT: lda r2, [p0], #4 + ; CHECK-NEXT: lda r3, [p0], #4 + ; CHECK-NEXT: lda r4, [p0], #4 + ; CHECK-NEXT: lda r5, [p0], #4 + ; CHECK-NEXT: lda r6, [p0], #4; movxm ls, #.LBB0_2 + ; CHECK-NEXT: lda r7, [p0], #4; st r0, [p1], #4; movxm le, #.L_LEnd0 ; CHECK-NEXT: .p2align 4 ; CHECK-NEXT: .LBB0_2: // %for.body ; CHECK-NEXT: // =>This Inner Loop Header: Depth=1 - ; CHECK-NEXT: lda r0, [p0], #4; nopb ; nopx ; st r1, [p1], #4 - ; CHECK-NEXT: lda r1, [p0], #4; st r2, [p1], #4 - ; CHECK-NEXT: lda r2, [p0], #4; st r3, [p1], #4 - ; CHECK-NEXT: lda r3, [p0], #4; st 
r4, [p1], #4 - ; CHECK-NEXT: lda r4, [p0], #4; st r5, [p1], #4 - ; CHECK-NEXT: lda r5, [p0], #4; st r6, [p1], #4 - ; CHECK-NEXT: lda r6, [p0], #4; st r7, [p1], #4 + ; CHECK-NEXT: nopb ; lda r0, [p0], #4; st r1, [p1], #4; nopxm ; nopv + ; CHECK-NEXT: nopb ; lda r1, [p0], #4; st r2, [p1], #4; nopxm ; nopv + ; CHECK-NEXT: nopb ; lda r2, [p0], #4; st r3, [p1], #4; nopxm ; nopv + ; CHECK-NEXT: nopb ; lda r3, [p0], #4; st r4, [p1], #4; nopxm ; nopv + ; CHECK-NEXT: nopb ; lda r4, [p0], #4; st r5, [p1], #4; nopxm ; nopv + ; CHECK-NEXT: nopb ; lda r5, [p0], #4; st r6, [p1], #4; nopxm ; nopv + ; CHECK-NEXT: nopb ; lda r6, [p0], #4; st r7, [p1], #4; nopxm ; nopv ; CHECK-NEXT: .L_LEnd0: ; CHECK-NEXT: nopb ; lda r7, [p0], #4; st r0, [p1], #4; nopxm ; nopv ; CHECK-NEXT: // %bb.3: // %for.cond.cleanup diff --git a/llvm/test/CodeGen/AIE/aie2/schedule/postpipeliner/load-add-or-store.mir b/llvm/test/CodeGen/AIE/aie2/schedule/postpipeliner/load-add-or-store.mir index 57aa650906ff..4bc51ed1dd62 100644 --- a/llvm/test/CodeGen/AIE/aie2/schedule/postpipeliner/load-add-or-store.mir +++ b/llvm/test/CodeGen/AIE/aie2/schedule/postpipeliner/load-add-or-store.mir @@ -25,10 +25,9 @@ ; CHECK-NEXT: nop // Delay Slot 2 ; CHECK-NEXT: nop // Delay Slot 1 ; CHECK-NEXT: // %bb.1: // %for.body.preheader - ; CHECK-NEXT: nopb ; nopa ; nops ; movxm ls, #.LBB0_2; nopv - ; CHECK-NEXT: nopa ; add.nc lc, r0, #-4 - ; CHECK-NEXT: lda r0, [p1], #4; movxm le, #.L_LEnd0 - ; CHECK-NEXT: nopb ; nopa ; nops ; nopxm ; nopv + ; CHECK-NEXT: nopb ; nopa ; nops ; nopx ; add.nc lc, r0, #-4; nopv + ; CHECK-NEXT: lda r0, [p1], #4; movxm ls, #.LBB0_2 + ; CHECK-NEXT: movxm le, #.L_LEnd0 ; CHECK-NEXT: nopb ; lda r0, [p1], #4; nops ; nopxm ; nopv ; CHECK-NEXT: nopb ; nopa ; nops ; nopxm ; nopv ; CHECK-NEXT: nopb ; lda r0, [p1], #4; nops ; nopxm ; nopv diff --git a/llvm/test/CodeGen/AIE/aie2/schedule/postpipeliner/load-add-store-renamed.prepiplined.mir 
b/llvm/test/CodeGen/AIE/aie2/schedule/postpipeliner/load-add-store-renamed.prepiplined.mir index 2edd74165d76..540dd84da96b 100644 --- a/llvm/test/CodeGen/AIE/aie2/schedule/postpipeliner/load-add-store-renamed.prepiplined.mir +++ b/llvm/test/CodeGen/AIE/aie2/schedule/postpipeliner/load-add-store-renamed.prepiplined.mir @@ -28,23 +28,16 @@ ; CHECK-NEXT: add.nc lc, r0, #-1 ; CHECK-NEXT: movxm ls, #.LBB0_2 ; CHECK-NEXT: movxm le, #.L_LEnd0 + ; CHECK-NEXT: .p2align 4 + ; CHECK-NEXT: .LBB0_2: // %for.body + ; CHECK-NEXT: // =>This Inner Loop Header: Depth=1 + ; CHECK-NEXT: nopb ; lda r0, [p1], #4; nops ; nopxm ; nopv ; CHECK-NEXT: nopb ; nopa ; nops ; nopxm ; nopv ; CHECK-NEXT: nopb ; nopa ; nops ; nopxm ; nopv ; CHECK-NEXT: nopb ; nopa ; nops ; nopxm ; nopv ; CHECK-NEXT: nopb ; nopa ; nops ; nopxm ; nopv ; CHECK-NEXT: nopb ; nopa ; nops ; nopxm ; nopv - ; CHECK-NEXT: nopb ; nopa ; nops ; nopxm ; nopv - ; CHECK-NEXT: nopb ; nopa ; nops ; nopxm ; nopv - ; CHECK-NEXT: .p2align 4 - ; CHECK-NEXT: .LBB0_2: // %for.body - ; CHECK-NEXT: // =>This Inner Loop Header: Depth=1 - ; CHECK-NEXT: nopb ; lda r0, [p1], #4; nops ; nopxm ; nopv - ; CHECK-NEXT: nop - ; CHECK-NEXT: nop - ; CHECK-NEXT: nop - ; CHECK-NEXT: nop - ; CHECK-NEXT: nop - ; CHECK-NEXT: nop + ; CHECK-NEXT: nopa ; nopb ; nopxm ; CHECK-NEXT: add r1, r0, #1 ; CHECK-NEXT: .L_LEnd0: ; CHECK-NEXT: nopb ; nopa ; st r1, [p0], #4; nopxm ; nopv diff --git a/llvm/test/CodeGen/AIE/aie2/schedule/postpipeliner/load-add-store.mir b/llvm/test/CodeGen/AIE/aie2/schedule/postpipeliner/load-add-store.mir index 236f58737c6a..63c276d728ef 100644 --- a/llvm/test/CodeGen/AIE/aie2/schedule/postpipeliner/load-add-store.mir +++ b/llvm/test/CodeGen/AIE/aie2/schedule/postpipeliner/load-add-store.mir @@ -25,10 +25,9 @@ ; CHECK-NEXT: nop // Delay Slot 2 ; CHECK-NEXT: nop // Delay Slot 1 ; CHECK-NEXT: // %bb.1: // %for.body.preheader - ; CHECK-NEXT: nopb ; nopa ; nops ; movxm ls, #.LBB0_2; nopv - ; CHECK-NEXT: nopa ; add.nc lc, r0, #-4 - ; 
CHECK-NEXT: lda r0, [p1], #4; movxm le, #.L_LEnd0 - ; CHECK-NEXT: nopb ; nopa ; nops ; nopxm ; nopv + ; CHECK-NEXT: nopb ; nopa ; nops ; nopx ; add.nc lc, r0, #-4; nopv + ; CHECK-NEXT: lda r0, [p1], #4; movxm ls, #.LBB0_2 + ; CHECK-NEXT: movxm le, #.L_LEnd0 ; CHECK-NEXT: nopb ; lda r0, [p1], #4; nops ; nopxm ; nopv ; CHECK-NEXT: nopb ; nopa ; nops ; nopxm ; nopv ; CHECK-NEXT: nopb ; lda r0, [p1], #4; nops ; nopxm ; nopv diff --git a/llvm/test/CodeGen/AIE/aie2/schedule/postpipeliner/load-mac-store.mir b/llvm/test/CodeGen/AIE/aie2/schedule/postpipeliner/load-mac-store.mir index 236f58737c6a..63c276d728ef 100644 --- a/llvm/test/CodeGen/AIE/aie2/schedule/postpipeliner/load-mac-store.mir +++ b/llvm/test/CodeGen/AIE/aie2/schedule/postpipeliner/load-mac-store.mir @@ -25,10 +25,9 @@ ; CHECK-NEXT: nop // Delay Slot 2 ; CHECK-NEXT: nop // Delay Slot 1 ; CHECK-NEXT: // %bb.1: // %for.body.preheader - ; CHECK-NEXT: nopb ; nopa ; nops ; movxm ls, #.LBB0_2; nopv - ; CHECK-NEXT: nopa ; add.nc lc, r0, #-4 - ; CHECK-NEXT: lda r0, [p1], #4; movxm le, #.L_LEnd0 - ; CHECK-NEXT: nopb ; nopa ; nops ; nopxm ; nopv + ; CHECK-NEXT: nopb ; nopa ; nops ; nopx ; add.nc lc, r0, #-4; nopv + ; CHECK-NEXT: lda r0, [p1], #4; movxm ls, #.LBB0_2 + ; CHECK-NEXT: movxm le, #.L_LEnd0 ; CHECK-NEXT: nopb ; lda r0, [p1], #4; nops ; nopxm ; nopv ; CHECK-NEXT: nopb ; nopa ; nops ; nopxm ; nopv ; CHECK-NEXT: nopb ; lda r0, [p1], #4; nops ; nopxm ; nopv diff --git a/llvm/test/CodeGen/AIE/aie2/schedule/postpipeliner/round-memdep.mir b/llvm/test/CodeGen/AIE/aie2/schedule/postpipeliner/round-memdep.mir index c97a7d2b7ad6..5d32aae31158 100644 --- a/llvm/test/CodeGen/AIE/aie2/schedule/postpipeliner/round-memdep.mir +++ b/llvm/test/CodeGen/AIE/aie2/schedule/postpipeliner/round-memdep.mir @@ -29,7 +29,7 @@ ; CHECK-NEXT: nop // Delay Slot 2 ; CHECK-NEXT: nop // Delay Slot 1 ; CHECK-NEXT: // %bb.1: // %for.body.preheader - ; CHECK-NEXT: nop + ; CHECK-NEXT: nopa ; nopb ; nopxm ; CHECK-NEXT: nop ; CHECK-NEXT: nop ; 
CHECK-NEXT: nop @@ -39,12 +39,12 @@ ; CHECK-NEXT: vlda.ups.s32.s8 cm0, s0, [p0], #32 ; CHECK-NEXT: vlda.ups.s32.s8 cm1, s0, [p0], #32 ; CHECK-NEXT: nop + ; CHECK-NEXT: nop + ; CHECK-NEXT: vlda.ups.s32.s8 cm0, s0, [p0], #32 + ; CHECK-NEXT: vlda.ups.s32.s8 cm1, s0, [p0], #32 ; CHECK-NEXT: add.nc lc, r0, #-5 - ; CHECK-NEXT: vlda.ups.s32.s8 cm0, s0, [p0], #32; movxm ls, #.LBB0_2 - ; CHECK-NEXT: vlda.ups.s32.s8 cm1, s0, [p0], #32; movxm le, #.L_LEnd0 - ; CHECK-NEXT: nopb ; nopa ; nops ; nopxm ; nopv - ; CHECK-NEXT: nopb ; nopa ; nops ; nopxm ; nopv - ; CHECK-NEXT: nopb ; vlda.ups.s32.s8 cm0, s0, [p0], #32; nops ; nopxm ; nopv + ; CHECK-NEXT: movxm ls, #.LBB0_2 + ; CHECK-NEXT: vlda.ups.s32.s8 cm0, s0, [p0], #32; movxm le, #.L_LEnd0 ; CHECK-NEXT: nopb ; vlda.ups.s32.s8 cm1, s0, [p0], #32; vsrs.s8.s32 wh0, cm0, s1; nopxm ; nopv ; CHECK-NEXT: nopb ; nopa ; nops ; nopxm ; nopv ; CHECK-NEXT: nopb ; nopa ; vsrs.s8.s32 wh2, cm1, s1; nopxm ; nopv @@ -52,9 +52,9 @@ ; CHECK-NEXT: .p2align 4 ; CHECK-NEXT: .LBB0_2: // %for.body ; CHECK-NEXT: // =>This Inner Loop Header: Depth=1 - ; CHECK-NEXT: vlda.ups.s32.s8 cm1, s0, [p0], #32; vsrs.s8.s32 wh0, cm0, s1; vups.s32.s8 cm2, wh0, s1 - ; CHECK-NEXT: nop - ; CHECK-NEXT: vsrs.s8.s32 wh2, cm1, s1 + ; CHECK-NEXT: nopb ; vlda.ups.s32.s8 cm1, s0, [p0], #32; vsrs.s8.s32 wh0, cm0, s1; nopx ; vups.s32.s8 cm2, wh0, s1; nopv + ; CHECK-NEXT: nopb ; nopa ; nops ; nopxm ; nopv + ; CHECK-NEXT: nopb ; nopa ; vsrs.s8.s32 wh2, cm1, s1; nopxm ; nopv ; CHECK-NEXT: .L_LEnd0: ; CHECK-NEXT: nopb ; vlda.ups.s32.s8 cm0, s0, [p0], #32; vst.srs.d8.s32 cm2, s0, [p1], #32; nopx ; vups.s32.s8 cm3, wh2, s1; nopv ; CHECK-NEXT: // %bb.3: // %loop.exit diff --git a/llvm/test/CodeGen/AIE/aie2/schedule/postpipeliner/round-war.mir b/llvm/test/CodeGen/AIE/aie2/schedule/postpipeliner/round-war.mir index 6681d4623bc1..7c839c2f4719 100644 --- a/llvm/test/CodeGen/AIE/aie2/schedule/postpipeliner/round-war.mir +++ b/llvm/test/CodeGen/AIE/aie2/schedule/postpipeliner/round-war.mir 
@@ -28,18 +28,18 @@ ; CHECK-NEXT: nop // Delay Slot 2 ; CHECK-NEXT: nop // Delay Slot 1 ; CHECK-NEXT: // %bb.1: // %for.body.preheader - ; CHECK-NEXT: vlda.ups.s32.s8 cm0, s0, [p0], #32; nopxm + ; CHECK-NEXT: nopb ; vlda.ups.s32.s8 cm0, s0, [p0], #32; nops ; nopxm ; nopv ; CHECK-NEXT: nop ; CHECK-NEXT: nop ; CHECK-NEXT: vlda.ups.s32.s8 cm1, s0, [p0], #32 ; CHECK-NEXT: vlda.ups.s32.s8 cm0, s0, [p0], #32 ; CHECK-NEXT: nop - ; CHECK-NEXT: add.nc lc, r0, #-4 - ; CHECK-NEXT: vlda.ups.s32.s8 cm1, s0, [p0], #32; movxm ls, #.LBB0_2 - ; CHECK-NEXT: vlda.ups.s32.s8 cm0, s0, [p0], #32; movxm le, #.L_LEnd0 - ; CHECK-NEXT: nopb ; nopa ; vsrs.s8.s32 wh0, cm0, s1; nopxm ; nopv - ; CHECK-NEXT: nopb ; nopa ; nops ; nopxm ; nopv - ; CHECK-NEXT: nopb ; vlda.ups.s32.s8 cm1, s0, [p0], #32; nops ; nopxm ; nopv + ; CHECK-NEXT: nop + ; CHECK-NEXT: vlda.ups.s32.s8 cm1, s0, [p0], #32 + ; CHECK-NEXT: vlda.ups.s32.s8 cm0, s0, [p0], #32 + ; CHECK-NEXT: vsrs.s8.s32 wh0, cm0, s1; add.nc lc, r0, #-4 + ; CHECK-NEXT: movxm ls, #.LBB0_2 + ; CHECK-NEXT: vlda.ups.s32.s8 cm1, s0, [p0], #32; movxm le, #.L_LEnd0 ; CHECK-NEXT: nopb ; vlda.ups.s32.s8 cm0, s0, [p0], #32; vsrs.s8.s32 wh2, cm1, s1; nopxm ; nopv ; CHECK-NEXT: nopb ; nopa ; vsrs.s8.s32 wh0, cm0, s1; nopxm ; nopv ; CHECK-NEXT: nopb ; nopa ; nops ; nopx ; mov crSRSSign, r0; nopv @@ -48,8 +48,8 @@ ; CHECK-NEXT: .LBB0_2: // %for.body ; CHECK-NEXT: // =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: nopb ; vlda.ups.s32.s8 cm0, s0, [p0], #32; vsrs.s8.s32 wh2, cm1, s1; nopx ; vups.s32.s8 cm3, wh2, s1; nopv - ; CHECK-NEXT: nopa ; nopb ; nopx ; vsrs.s8.s32 wh0, cm0, s1 - ; CHECK-NEXT: vst.srs.d8.s32 cm2, s0, [p1], #32 + ; CHECK-NEXT: nopb ; nopa ; vsrs.s8.s32 wh0, cm0, s1; nopxm ; nopv + ; CHECK-NEXT: nopb ; nopa ; vst.srs.d8.s32 cm2, s0, [p1], #32; nopxm ; nopv ; CHECK-NEXT: .L_LEnd0: ; CHECK-NEXT: nopb ; vlda.ups.s32.s8 cm1, s0, [p0], #32; vst.srs.d8.s32 cm3, s0, [p1], #32; nopx ; vups.s32.s8 cm2, wh0, s1; nopv ; CHECK-NEXT: // %bb.3: // %loop.exit 
diff --git a/llvm/test/CodeGen/AIE/aie2/schedule/postpipeliner/round.mir b/llvm/test/CodeGen/AIE/aie2/schedule/postpipeliner/round.mir index 78c033e73a37..b80b580b1fa2 100644 --- a/llvm/test/CodeGen/AIE/aie2/schedule/postpipeliner/round.mir +++ b/llvm/test/CodeGen/AIE/aie2/schedule/postpipeliner/round.mir @@ -34,18 +34,18 @@ ; CHECK-NEXT: nop // Delay Slot 2 ; CHECK-NEXT: nop // Delay Slot 1 ; CHECK-NEXT: // %bb.1: // %for.body.preheader - ; CHECK-NEXT: vlda.ups.s32.s8 cm0, s0, [p0], #32; nopxm + ; CHECK-NEXT: nopb ; vlda.ups.s32.s8 cm0, s0, [p0], #32; nops ; nopxm ; nopv ; CHECK-NEXT: nop ; CHECK-NEXT: nop ; CHECK-NEXT: vlda.ups.s32.s8 cm1, s0, [p0], #32 ; CHECK-NEXT: vlda.ups.s32.s8 cm0, s0, [p0], #32 ; CHECK-NEXT: nop - ; CHECK-NEXT: add.nc lc, r0, #-4 - ; CHECK-NEXT: vlda.ups.s32.s8 cm1, s0, [p0], #32; movxm ls, #.LBB0_2 - ; CHECK-NEXT: vlda.ups.s32.s8 cm0, s0, [p0], #32; movxm le, #.L_LEnd0 - ; CHECK-NEXT: nopb ; nopa ; vsrs.s8.s32 wh0, cm0, s1; nopxm ; nopv - ; CHECK-NEXT: nopb ; nopa ; nops ; nopxm ; nopv - ; CHECK-NEXT: nopb ; vlda.ups.s32.s8 cm1, s0, [p0], #32; nops ; nopxm ; nopv + ; CHECK-NEXT: nop + ; CHECK-NEXT: vlda.ups.s32.s8 cm1, s0, [p0], #32 + ; CHECK-NEXT: vlda.ups.s32.s8 cm0, s0, [p0], #32 + ; CHECK-NEXT: vsrs.s8.s32 wh0, cm0, s1; add.nc lc, r0, #-4 + ; CHECK-NEXT: movxm ls, #.LBB0_2 + ; CHECK-NEXT: vlda.ups.s32.s8 cm1, s0, [p0], #32; movxm le, #.L_LEnd0 ; CHECK-NEXT: nopb ; vlda.ups.s32.s8 cm0, s0, [p0], #32; vsrs.s8.s32 wh2, cm1, s1; nopxm ; nopv ; CHECK-NEXT: nopb ; nopa ; vsrs.s8.s32 wh0, cm0, s1; nopxm ; nopv ; CHECK-NEXT: nopb ; nopa ; nops ; nopxm ; nopv @@ -54,8 +54,8 @@ ; CHECK-NEXT: .LBB0_2: // %for.body ; CHECK-NEXT: // =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: nopb ; vlda.ups.s32.s8 cm0, s0, [p0], #32; vsrs.s8.s32 wh2, cm1, s1; nopx ; vups.s32.s8 cm3, wh2, s1; nopv - ; CHECK-NEXT: nopa ; nopb ; nopx ; vsrs.s8.s32 wh0, cm0, s1 - ; CHECK-NEXT: vst.srs.s8.s32 cm2, s0, [p1], #32 + ; CHECK-NEXT: nopb ; nopa ; vsrs.s8.s32 wh0, cm0, 
s1; nopxm ; nopv + ; CHECK-NEXT: nopb ; nopa ; vst.srs.s8.s32 cm2, s0, [p1], #32; nopxm ; nopv ; CHECK-NEXT: .L_LEnd0: ; CHECK-NEXT: nopb ; vlda.ups.s32.s8 cm1, s0, [p0], #32; vst.srs.s8.s32 cm3, s0, [p1], #32; nopx ; vups.s32.s8 cm2, wh0, s1; nopv ; CHECK-NEXT: // %bb.3: // %for.cond.cleanup diff --git a/llvm/test/CodeGen/AIE/aie2/schedule/postpipeliner/small-II.mir b/llvm/test/CodeGen/AIE/aie2/schedule/postpipeliner/small-II.mir index dd4da83e4766..51dde6de55fb 100644 --- a/llvm/test/CodeGen/AIE/aie2/schedule/postpipeliner/small-II.mir +++ b/llvm/test/CodeGen/AIE/aie2/schedule/postpipeliner/small-II.mir @@ -28,12 +28,11 @@ ; CHECK-NEXT: nop // Delay Slot 2 ; CHECK-NEXT: nop // Delay Slot 1 ; CHECK-NEXT: // %bb.1: // %for.body.preheader - ; CHECK-NEXT: nopb ; nopa ; nops ; movxm ls, #.LBB0_2; nopv - ; CHECK-NEXT: nopa ; add.nc lc, r0, #-2 - ; CHECK-NEXT: lda r0, [p0], #4; movxm le, #.L_LEnd0 - ; CHECK-NEXT: nopb ; lda r1, [p0], #4; nops ; nopxm ; nopv - ; CHECK-NEXT: nopb ; lda r2, [p0], #4; nops ; nopxm ; nopv - ; CHECK-NEXT: nopb ; lda r3, [p0], #4; nops ; nopxm ; nopv + ; CHECK-NEXT: add.nc lc, r0, #-2 + ; CHECK-NEXT: lda r0, [p0], #4 + ; CHECK-NEXT: lda r1, [p0], #4 + ; CHECK-NEXT: lda r2, [p0], #4; movxm ls, #.LBB0_2 + ; CHECK-NEXT: lda r3, [p0], #4; movxm le, #.L_LEnd0 ; CHECK-NEXT: nopb ; lda r0, [p0], #4; nops ; nopxm ; nopv ; CHECK-NEXT: nopb ; lda r1, [p0], #4; nops ; nopxm ; nopv ; CHECK-NEXT: nopb ; lda r2, [p0], #4; nops ; nopxm ; nopv @@ -42,8 +41,8 @@ ; CHECK-NEXT: .LBB0_2: // %for.body ; CHECK-NEXT: // =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: nopb ; lda r0, [p0], #4; st r1, [p1], #4; nopxm ; nopv - ; CHECK-NEXT: lda r1, [p0], #4; st r2, [p1], #4; nopx - ; CHECK-NEXT: lda r2, [p0], #4; st r3, [p1], #4 + ; CHECK-NEXT: nopb ; lda r1, [p0], #4; st r2, [p1], #4; nopxm ; nopv + ; CHECK-NEXT: nopb ; lda r2, [p0], #4; st r3, [p1], #4; nopxm ; nopv ; CHECK-NEXT: .L_LEnd0: ; CHECK-NEXT: nopb ; lda r3, [p0], #4; st r0, [p1], #4; nopxm ; nopv ; 
CHECK-NEXT: // %bb.3: // %for.cond.cleanup diff --git a/llvm/test/CodeGen/AIE/aie2/schedule/postpipeliner/unpack-extract-block-cycles.mir b/llvm/test/CodeGen/AIE/aie2/schedule/postpipeliner/unpack-extract-block-cycles.mir index ee7a91f3973b..3010ba1bc1bc 100644 --- a/llvm/test/CodeGen/AIE/aie2/schedule/postpipeliner/unpack-extract-block-cycles.mir +++ b/llvm/test/CodeGen/AIE/aie2/schedule/postpipeliner/unpack-extract-block-cycles.mir @@ -14,51 +14,6 @@ --- | define dso_local void @unpackExtract(ptr addrspace(5) noalias nocapture writeonly %d, i32 noundef %n) local_unnamed_addr #0 { - ; CHECK-LABEL: unpackExtract: - ; CHECK: .p2align 4 - ; CHECK-NEXT: // %bb.0: // %entry - ; CHECK-NEXT: mova r1, #0; nopb ; nopxm - ; CHECK-NEXT: ge r1, r1, r0 - ; CHECK-NEXT: jnz r1, #.LBB0_4 - ; CHECK-NEXT: nop // Delay Slot 5 - ; CHECK-NEXT: nop // Delay Slot 4 - ; CHECK-NEXT: nop // Delay Slot 3 - ; CHECK-NEXT: nop // Delay Slot 2 - ; CHECK-NEXT: nop // Delay Slot 1 - ; CHECK-NEXT: // %bb.1: // %for.body.preheader - ; CHECK-NEXT: add.nc lc, r0, #-1 - ; CHECK-NEXT: movxm ls, #.LBB0_2 - ; CHECK-NEXT: movxm le, #.L_LEnd0 - ; CHECK-NEXT: nopb ; nopa ; nops ; nopxm ; nopv - ; CHECK-NEXT: nopb ; nopa ; nops ; nopxm ; nopv - ; CHECK-NEXT: nopb ; nopa ; nops ; nopxm ; nopv - ; CHECK-NEXT: vunpack.d16.d8 x3, wh1; nopa ; nops ; nopxm ; nopv - ; CHECK-NEXT: nopb ; nopa ; nops ; nopxm ; nopv - ; CHECK-NEXT: nopb ; nopa ; nops ; nopxm ; nopv - ; CHECK-NEXT: nopb ; nopa ; nops ; movx r1, #0; nopm ; nopv - ; CHECK-NEXT: .p2align 4 - ; CHECK-NEXT: .LBB0_2: // %for.body - ; CHECK-NEXT: // =>This Inner Loop Header: Depth=1 - ; CHECK-NEXT: nopa ; vunpack.d16.d8 x3, wh1; nopxm - ; CHECK-NEXT: nop - ; CHECK-NEXT: nop - ; CHECK-NEXT: .L_LEnd0: - ; CHECK-NEXT: nopb ; nopa ; nops ; nopx ; vextract.s8 r1, x3, r16; nopv - ; CHECK-NEXT: // %bb.3: // %loop.exit - ; CHECK-NEXT: nop - ; CHECK-NEXT: nop - ; CHECK-NEXT: nop - ; CHECK-NEXT: vextract.s8 r1, x3, r16 - ; CHECK-NEXT: nop - ; CHECK-NEXT: vmov wh4, wh2 
- ; CHECK-NEXT: .p2align 4 - ; CHECK-NEXT: .LBB0_4: // %for.cond.cleanup - ; CHECK-NEXT: nopa ; ret lr - ; CHECK-NEXT: nop // Delay Slot 5 - ; CHECK-NEXT: nop // Delay Slot 4 - ; CHECK-NEXT: nop // Delay Slot 3 - ; CHECK-NEXT: nop // Delay Slot 2 - ; CHECK-NEXT: nop // Delay Slot 1 entry: %cmp5 = icmp sgt i32 %n, 0 br i1 %cmp5, label %for.body.preheader, label %for.cond.cleanup @@ -147,3 +102,5 @@ body: | DelayedSchedBarrier ... +## NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line: +# CHECK: {{.*}} diff --git a/llvm/test/CodeGen/AIE/aie2/schedule/postpipeliner/unpack-extract-war.mir b/llvm/test/CodeGen/AIE/aie2/schedule/postpipeliner/unpack-extract-war.mir index fd1ab0d1477a..5e4da036e4c3 100644 --- a/llvm/test/CodeGen/AIE/aie2/schedule/postpipeliner/unpack-extract-war.mir +++ b/llvm/test/CodeGen/AIE/aie2/schedule/postpipeliner/unpack-extract-war.mir @@ -14,51 +14,6 @@ --- | define dso_local void @unpackExtract(ptr addrspace(5) noalias nocapture writeonly %d, i32 noundef %n) local_unnamed_addr #0 { - ; CHECK-LABEL: unpackExtract: - ; CHECK: .p2align 4 - ; CHECK-NEXT: // %bb.0: // %entry - ; CHECK-NEXT: mova r1, #0; nopb ; nopxm - ; CHECK-NEXT: ge r1, r1, r0 - ; CHECK-NEXT: jnz r1, #.LBB0_4 - ; CHECK-NEXT: nop // Delay Slot 5 - ; CHECK-NEXT: nop // Delay Slot 4 - ; CHECK-NEXT: nop // Delay Slot 3 - ; CHECK-NEXT: nop // Delay Slot 2 - ; CHECK-NEXT: nop // Delay Slot 1 - ; CHECK-NEXT: // %bb.1: // %for.body.preheader - ; CHECK-NEXT: add.nc lc, r0, #-7 - ; CHECK-NEXT: movxm ls, #.LBB0_2 - ; CHECK-NEXT: movxm le, #.L_LEnd0 - ; CHECK-NEXT: vunpack.d16.d8 x3, wh1; nopa ; nops ; nopxm ; nopv - ; CHECK-NEXT: vunpack.d16.d8 x3, wh1; nopa ; nops ; nopxm ; nopv - ; CHECK-NEXT: vunpack.d16.d8 x3, wh1; nopa ; nops ; nopxm ; nopv - ; CHECK-NEXT: vunpack.d16.d8 x3, wh1; nopa ; nops ; nopxm ; nopv - ; CHECK-NEXT: vunpack.d16.d8 x3, wh1; nopa ; nops ; nopxm ; nopv - ; CHECK-NEXT: vunpack.d16.d8 x3, wh1; nopa ; nops ; nopxm ; nopv - ; 
CHECK-NEXT: vunpack.d16.d8 x3, wh1; nopa ; nops ; movx r1, #0; nopm ; nopv - ; CHECK-NEXT: .p2align 4 - ; CHECK-NEXT: .LBB0_2: // %for.body - ; CHECK-NEXT: // =>This Inner Loop Header: Depth=1 - ; CHECK-NEXT: .L_LEnd0: - ; CHECK-NEXT: vunpack.d16.d8 x3, wh1; nopa ; nops ; nopx ; vextract.s8 r1, x3, r16; nopv - ; CHECK-NEXT: // %bb.3: // %loop.exit - ; CHECK-NEXT: nopb ; nopa ; nops ; nopx ; vextract.s8 r1, x3, r16; nopv - ; CHECK-NEXT: nopa ; vextract.s8 r1, x3, r16 - ; CHECK-NEXT: vextract.s8 r1, x3, r16 - ; CHECK-NEXT: vextract.s8 r1, x3, r16 - ; CHECK-NEXT: vextract.s8 r1, x3, r16 - ; CHECK-NEXT: vextract.s8 r1, x3, r16 - ; CHECK-NEXT: vextract.s8 r1, x3, r16 - ; CHECK-NEXT: nop - ; CHECK-NEXT: vmov wh1, wh2 - ; CHECK-NEXT: .p2align 4 - ; CHECK-NEXT: .LBB0_4: // %for.cond.cleanup - ; CHECK-NEXT: nopa ; ret lr - ; CHECK-NEXT: nop // Delay Slot 5 - ; CHECK-NEXT: nop // Delay Slot 4 - ; CHECK-NEXT: nop // Delay Slot 3 - ; CHECK-NEXT: nop // Delay Slot 2 - ; CHECK-NEXT: nop // Delay Slot 1 entry: %cmp5 = icmp sgt i32 %n, 0 br i1 %cmp5, label %for.body.preheader, label %for.cond.cleanup @@ -147,3 +102,5 @@ body: | DelayedSchedBarrier ... +## NOTE: These prefixes are unused and the list is autogenerated. 
Do not add tests below this line: +# CHECK: {{.*}} diff --git a/llvm/test/CodeGen/AIE/aie2p/AA-unroll-iterations.mir b/llvm/test/CodeGen/AIE/aie2p/AA-unroll-iterations.mir index d02739b09fc6..a265951c6ca5 100644 --- a/llvm/test/CodeGen/AIE/aie2p/AA-unroll-iterations.mir +++ b/llvm/test/CodeGen/AIE/aie2p/AA-unroll-iterations.mir @@ -16,16 +16,15 @@ ; CHECK-LABEL: _Z1fPii: ; CHECK: .p2align 4 ; CHECK-NEXT: // %bb.0: // %entry - ; CHECK-NEXT: nopa ; nopb ; nops ; movxm ls, #.LBB0_1; nopv - ; CHECK-NEXT: mova m0, #4; nopx + ; CHECK-NEXT: mova m0, #4; nopb ; nopxm ; nops ; CHECK-NEXT: mova dn0, #16; mov m1, m0 ; CHECK-NEXT: mova dc1, #0; movx r1, #4; mov dn1, dn0 ; CHECK-NEXT: movs p1, p0; add.nc lc, r1, #-2 ; CHECK-NEXT: movs dj1, m0; mov dc0, dc1 - ; CHECK-NEXT: lda.2d r1, [p1], d1; movxm le, #.L_LEnd0 - ; CHECK-NEXT: nopa ; nopb ; nops ; nopxm ; nopv - ; CHECK-NEXT: nopa ; nopb ; nops ; nopxm ; nopv - ; CHECK-NEXT: nopa ; nopb ; nops ; nopxm ; nopv + ; CHECK-NEXT: lda.2d r1, [p1], d1 + ; CHECK-NEXT: nop + ; CHECK-NEXT: movxm ls, #.LBB0_1 + ; CHECK-NEXT: movxm le, #.L_LEnd0 ; CHECK-NEXT: lda.2d r1, [p1], d1; nopb ; nops ; nopxm ; nopv ; CHECK-NEXT: nopa ; nopb ; nops ; nopxm ; nopv ; CHECK-NEXT: nopa ; nopb ; nops ; movx r0, #10; nopm ; nopv @@ -33,9 +32,9 @@ ; CHECK-NEXT: .p2align 4 ; CHECK-NEXT: .LBB0_1: // %for.body ; CHECK-NEXT: // =>This Inner Loop Header: Depth=1 - ; CHECK-NEXT: lda.2d r1, [p1], d1; nopxm - ; CHECK-NEXT: st.2d r1, [p0], d0 - ; CHECK-NEXT: nop + ; CHECK-NEXT: lda.2d r1, [p1], d1; nopb ; nops ; nopxm ; nopv + ; CHECK-NEXT: nopa ; nopb ; st.2d r1, [p0], d0; nopxm ; nopv + ; CHECK-NEXT: nopa ; nopb ; nops ; nopxm ; nopv ; CHECK-NEXT: .L_LEnd0: ; CHECK-NEXT: nopa ; nopb ; nops ; mul r1, r1, r0; nopm ; nopv ; CHECK-NEXT: // %bb.2: // %for.cond.cleanup diff --git a/llvm/test/CodeGen/AIE/aie2p/end-to-end/conv2d_bfp16_convert.ll b/llvm/test/CodeGen/AIE/aie2p/end-to-end/conv2d_bfp16_convert.ll index 0fc92db52f5e..222a3917387c 100644 --- 
a/llvm/test/CodeGen/AIE/aie2p/end-to-end/conv2d_bfp16_convert.ll +++ b/llvm/test/CodeGen/AIE/aie2p/end-to-end/conv2d_bfp16_convert.ll @@ -17,7 +17,7 @@ define weak_odr dso_local void @convert_bf16_to_bfp16(ptr noalias %in, ptr noali ; CHECK: .p2align 4 ; CHECK-NEXT: // %bb.0: // %entry ; CHECK-NEXT: lda r0, [p2, #0]; nopb ; nops ; nopx ; mov m0, #4; nopv -; CHECK-NEXT: padda [p2], m0; nopx +; CHECK-NEXT: padda [p2], m0 ; CHECK-NEXT: lda dn0, [p2], #4 ; CHECK-NEXT: lda m1, [p2], #4 ; CHECK-NEXT: nop @@ -26,25 +26,25 @@ define weak_odr dso_local void @convert_bf16_to_bfp16(ptr noalias %in, ptr noali ; CHECK-NEXT: mova dj0, #0; mov r26, r24 ; CHECK-NEXT: vldb.fill.512 [p0, lf0, r24]; mov dj1, dj0 ; CHECK-NEXT: movs dc1, dj0; vldb.pop.512 x0, [p0, lf0, r24]; mov dn1, dn0 -; CHECK-NEXT: vldb.pop.512.2d x2, [p0, lf0, r24, d1]; movxm ls, #.LBB0_1 -; CHECK-NEXT: movxm le, #.L_LEnd0 -; CHECK-NEXT: add.nc lc, r0, #-2 -; CHECK-NEXT: lda m0, [p2, #0]; nopb ; nops ; nopxm ; nopv -; CHECK-NEXT: nopa ; vldb.fill.512 [p0, lf0, r24]; nops ; nopxm ; nopv -; CHECK-NEXT: nopa ; vldb.pop.512 x0, [p0, lf0, r24]; nops ; nopxm ; nopv -; CHECK-NEXT: nopa ; vldb.pop.512.2d x2, [p0, lf0, r24, d1]; nops ; nopxm ; nopv +; CHECK-NEXT: vldb.pop.512.2d x2, [p0, lf0, r24, d1] +; CHECK-NEXT: nop +; CHECK-NEXT: nop +; CHECK-NEXT: lda m0, [p2, #0] +; CHECK-NEXT: vldb.fill.512 [p0, lf0, r24]; movxm ls, #.LBB0_1 +; CHECK-NEXT: vldb.pop.512 x0, [p0, lf0, r24]; movxm le, #.L_LEnd0 +; CHECK-NEXT: vldb.pop.512.2d x2, [p0, lf0, r24, d1]; add.nc lc, r0, #-2 ; CHECK-NEXT: nopa ; nopb ; nops ; nopx ; vconv.fp32.bf16 cml0, x0; nopv -; CHECK-NEXT: nopa ; nopb ; nops ; nopx ; vconv.fp32.bf16 cmh0, x2; nopv -; CHECK-NEXT: nopa ; nopb ; movs dc0, dj0; nopx ; mov p2, p1; nopv +; CHECK-NEXT: nops ; vconv.fp32.bf16 cmh0, x2 +; CHECK-NEXT: movs dc0, dj0; mov p2, p1 ; CHECK-NEXT: // implicit-def: $sf ; CHECK-NEXT: .p2align 4 ; CHECK-NEXT: .LBB0_1: // %for.body ; CHECK-NEXT: // =>This Inner Loop Header: Depth=1 -; 
CHECK-NEXT: nopa ; vldb.fill.512 [p0, lf0, r24]; nopx ; vst.push.576.conv.bfp16ebs8.fp32 dm0, [p2, sf, r26] -; CHECK-NEXT: vldb.pop.512 x0, [p0, lf0, r24]; vst.flush.512.conv [p2, sf, r26] -; CHECK-NEXT: vldb.pop.512.2d x2, [p0, lf0, r24, d1]; vst.flush.512.conv.2d [p2, sf, r26, d0] -; CHECK-NEXT: vconv.fp32.bf16 cml0, x0 -; CHECK-NEXT: vconv.fp32.bf16 cmh0, x2 +; CHECK-NEXT: nopa ; vldb.fill.512 [p0, lf0, r24]; vst.push.576.conv.bfp16ebs8.fp32 dm0, [p2, sf, r26]; nopxm ; nopv +; CHECK-NEXT: nopa ; vldb.pop.512 x0, [p0, lf0, r24]; vst.flush.512.conv [p2, sf, r26]; nopxm ; nopv +; CHECK-NEXT: nopa ; vldb.pop.512.2d x2, [p0, lf0, r24, d1]; vst.flush.512.conv.2d [p2, sf, r26, d0]; nopxm ; nopv +; CHECK-NEXT: nopa ; nopb ; nops ; nopx ; vconv.fp32.bf16 cml0, x0; nopv +; CHECK-NEXT: nopa ; nopb ; nops ; nopx ; vconv.fp32.bf16 cmh0, x2; nopv ; CHECK-NEXT: .L_LEnd0: ; CHECK-NEXT: nopa ; nopb ; nops ; nopxm ; nopv ; CHECK-NEXT: // %bb.2: // %for.cond.cleanup diff --git a/llvm/test/CodeGen/AIE/aie2p/end-to-end/conv2d_bfp16_kernel_red.ll b/llvm/test/CodeGen/AIE/aie2p/end-to-end/conv2d_bfp16_kernel_red.ll index d2ff374d2d14..015d4367e595 100644 --- a/llvm/test/CodeGen/AIE/aie2p/end-to-end/conv2d_bfp16_kernel_red.ll +++ b/llvm/test/CodeGen/AIE/aie2p/end-to-end/conv2d_bfp16_kernel_red.ll @@ -19,8 +19,8 @@ define dso_local void @conv2d_bfp16.for.body90.i(<32 x i32> %fW.sroa.0.1489.i, i ; CHECK-LABEL: conv2d_bfp16.for.body90.i: ; CHECK: .p2align 4 ; CHECK-NEXT: // %bb.0: // %newFuncRoot -; CHECK-NEXT: paddxm [sp], #64 -; CHECK-NEXT: st p6, [sp, #-60]; nopx // 4-byte Folded Spill +; CHECK-NEXT: nopa ; paddxm [sp], #64 +; CHECK-NEXT: st p6, [sp, #-60] // 4-byte Folded Spill ; CHECK-NEXT: mov p6, sp ; CHECK-NEXT: padda [p6], #-320 ; CHECK-NEXT: vlda bmll3, [p6, #0] @@ -65,12 +65,12 @@ define dso_local void @conv2d_bfp16.for.body90.i(<32 x i32> %fW.sroa.0.1489.i, i ; CHECK-NEXT: vlda.fill.512 [p0, lf0, r24]; vldb.pop.576 ex0, [p1, lf1, r25]; movs dc0, dc4; mov m0, p2 ; CHECK-NEXT: 
vlda.pop.576 ex4, [p0, lf0, r24]; vldb.pop.576.3d ex2, [p1, lf1, r25, d0] ; CHECK-NEXT: vldb.fill.512 [p1, lf1, r25] -; CHECK-NEXT: vlda.pop.576 ex6, [p0, lf0, r24, m1]; vldb.fill.512 [p1, lf1, r25]; add r1, r6, #-1 -; CHECK-NEXT: vlda.fill.512 [p0, lf0, r24]; vldb.pop.576 ex0, [p1, lf1, r25]; movxm ls, #.LBB0_1 -; CHECK-NEXT: vlda.pop.576 ex4, [p0, lf0, r24]; vldb.pop.576.3d ex2, [p1, lf1, r25, d0]; movxm le, #.L_LEnd0 -; CHECK-NEXT: vldb.fill.512 [p1, lf1, r25]; add.nc lc, r1, #-4 -; CHECK-NEXT: vlda.pop.576 ex6, [p0, lf0, r24, m1]; vldb.fill.512 [p1, lf1, r25]; nops ; nopxm ; nopv -; CHECK-NEXT: vlda.fill.512 [p0, lf0, r24]; vldb.pop.576 ex0, [p1, lf1, r25]; nops ; nopxm ; nopv +; CHECK-NEXT: vlda.pop.576 ex6, [p0, lf0, r24, m1]; vldb.fill.512 [p1, lf1, r25] +; CHECK-NEXT: vlda.fill.512 [p0, lf0, r24]; vldb.pop.576 ex0, [p1, lf1, r25] +; CHECK-NEXT: vlda.pop.576 ex4, [p0, lf0, r24]; vldb.pop.576.3d ex2, [p1, lf1, r25, d0]; add r1, r6, #-1 +; CHECK-NEXT: vldb.fill.512 [p1, lf1, r25]; movxm ls, #.LBB0_1 +; CHECK-NEXT: vlda.pop.576 ex6, [p0, lf0, r24, m1]; vldb.fill.512 [p1, lf1, r25]; movxm le, #.L_LEnd0 +; CHECK-NEXT: vlda.fill.512 [p0, lf0, r24]; vldb.pop.576 ex0, [p1, lf1, r25]; add.nc lc, r1, #-4 ; CHECK-NEXT: vlda.pop.576 ex4, [p0, lf0, r24]; vldb.pop.576.3d ex2, [p1, lf1, r25, d0]; nops ; nopx ; vshuffle ex8, ex0, ex2, r4; nopv ; CHECK-NEXT: mova r0, #780; vldb.fill.512 [p1, lf1, r25]; nops ; nopx ; vshuffle ex10, ex0, ex2, r5; nopv ; CHECK-NEXT: vlda.pop.576 ex6, [p0, lf0, r24, m1]; vldb.fill.512 [p1, lf1, r25]; nops ; nopxm ; vmac.f dm0, dm0, ex8, ex4, r0 @@ -79,9 +79,9 @@ define dso_local void @conv2d_bfp16.for.body90.i(<32 x i32> %fW.sroa.0.1489.i, i ; CHECK-NEXT: .p2align 4 ; CHECK-NEXT: .LBB0_1: // %for.body90.i ; CHECK-NEXT: // =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: vldb.fill.512 [p1, lf1, r25]; nopx ; vshuffle ex10, ex0, ex2, r5; vmac.f dm3, dm3, ex10, ex6, r0 -; CHECK-NEXT: vlda.pop.576 ex6, [p0, lf0, r24, m1]; vldb.fill.512 [p1, lf1, 
r25]; vmac.f dm0, dm0, ex8, ex4, r0 -; CHECK-NEXT: vlda.fill.512 [p0, lf0, r24]; vldb.pop.576 ex0, [p1, lf1, r25]; vmac.f dm1, dm1, ex10, ex4, r0 +; CHECK-NEXT: nopa ; vldb.fill.512 [p1, lf1, r25]; nops ; nopx ; vshuffle ex10, ex0, ex2, r5; vmac.f dm3, dm3, ex10, ex6, r0 +; CHECK-NEXT: vlda.pop.576 ex6, [p0, lf0, r24, m1]; vldb.fill.512 [p1, lf1, r25]; nops ; nopxm ; vmac.f dm0, dm0, ex8, ex4, r0 +; CHECK-NEXT: vlda.fill.512 [p0, lf0, r24]; vldb.pop.576 ex0, [p1, lf1, r25]; nops ; nopxm ; vmac.f dm1, dm1, ex10, ex4, r0 ; CHECK-NEXT: .L_LEnd0: ; CHECK-NEXT: vlda.pop.576 ex4, [p0, lf0, r24]; vldb.pop.576.3d ex2, [p1, lf1, r25, d0]; nops ; nopx ; vshuffle ex8, ex0, ex2, r4; vmac.f dm2, dm2, ex8, ex6, r0 ; CHECK-NEXT: // %bb.2: // %for.cond.cleanup89.i.exitStub diff --git a/llvm/test/CodeGen/AIE/aie2p/end-to-end/gelu-templated.ll b/llvm/test/CodeGen/AIE/aie2p/end-to-end/gelu-templated.ll index 4a7786cdde4c..22e201c59a13 100644 --- a/llvm/test/CodeGen/AIE/aie2p/end-to-end/gelu-templated.ll +++ b/llvm/test/CodeGen/AIE/aie2p/end-to-end/gelu-templated.ll @@ -50,14 +50,14 @@ define void @gelu_fn(ptr noalias %ifm, ptr noalias %ofm, ptr nonnull align 64 de ; CHECK-NEXT: vconv.bf16.fp32 x10, cml3; vmov cml2, cml0; vmul.f dm3, x7, x2, r2 ; CHECK-NEXT: vlda.conv.fp32.bf16 cml1, [p0], #64 ; CHECK-NEXT: nop -; CHECK-NEXT: mova r3, #0; vconv.bf16.fp32 x5, cml1; movxm ls, #.LBB0_1 -; CHECK-NEXT: mova r4, #-5; nopb ; vconv.bf16.fp32 x8, cml4; movxm le, #.L_LEnd0; vmul.f dm4, x10, x4, r2 -; CHECK-NEXT: vconv.bf16.fp32 x7, cml2; lshl r4, r1, r4; mov s0, r3; vmul.f dm4, x5, x4, r2 -; CHECK-NEXT: vconv.bf16.fp32 x5, cml3; add.nc lc, r4, #-7; vadd.f dm2, dm1, dm2, r0 -; CHECK-NEXT: nopa ; nopb ; nops ; nopx ; vmov cml2, cml0; vmul.f dm3, x7, x2, r2 -; CHECK-NEXT: vlda.conv.fp32.bf16 cml1, [p0], #64; nopb ; vfloor.s32.bf16 x1, wl8, s0; nopxm ; vmul.f dm4, x5, x4, r2 -; CHECK-NEXT: nopa ; nopb ; vfloor.s32.bf16 x3, wh8, s0; nopxm ; nopv -; CHECK-NEXT: mova r1, #2; nopb ; vconv.bf16.fp32 
x10, cml4; nopx ; vbcst.16 x6, r3; nopv +; CHECK-NEXT: vconv.bf16.fp32 x5, cml1 +; CHECK-NEXT: vconv.bf16.fp32 x8, cml4; movxm ls, #.LBB0_1; vmul.f dm4, x10, x4, r2 +; CHECK-NEXT: mova r3, #0; nopb ; vconv.bf16.fp32 x7, cml2; movxm le, #.L_LEnd0; vmul.f dm4, x5, x4, r2 +; CHECK-NEXT: vconv.bf16.fp32 x5, cml3; mov s0, r3; vadd.f dm2, dm1, dm2, r0 +; CHECK-NEXT: vmov cml2, cml0; vmul.f dm3, x7, x2, r2 +; CHECK-NEXT: vlda.conv.fp32.bf16 cml1, [p0], #64; vfloor.s32.bf16 x1, wl8, s0; movx r4, #-5; vmul.f dm4, x5, x4, r2 +; CHECK-NEXT: vfloor.s32.bf16 x3, wh8, s0; lshl r4, r1, r4; vbcst.16 x6, r3 +; CHECK-NEXT: mova r1, #2; vconv.bf16.fp32 x10, cml4; add.nc lc, r4, #-7 ; CHECK-NEXT: nopa ; nopb ; nops ; nopx ; vshuffle x1, x1, x3, r1; nopv ; CHECK-NEXT: nopa ; nopb ; vfloor.s32.bf16 x1, wl10, s0; nopx ; vmin_ge.16 x3, r16, x1, x0, vaddsign1; nopv ; CHECK-NEXT: vconv.bf16.fp32 x8, cml4; nopx ; vmax_lt.16 x3, r16, x3, x6, vaddsign1 @@ -65,11 +65,11 @@ define void @gelu_fn(ptr noalias %ifm, ptr noalias %ofm, ptr nonnull align 64 de ; CHECK-NEXT: .p2align 4 ; CHECK-NEXT: .LBB0_1: // %for.body ; CHECK-NEXT: // =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: nopa ; vconv.bf16.fp32 x3, cml2; nopx ; vadd.f dm2, dm1, dm2, r0 -; CHECK-NEXT: vlda.conv.fp32.bf16 cml1, [p0], #64; vst x3, [p1], #64; vmov cml2, cml0 -; CHECK-NEXT: vconv.bf16.fp32 x5, cml3; vshuffle x1, x1, x10, r1; vmul.f dm3, x3, x2, r2 -; CHECK-NEXT: vfloor.s32.bf16 x10, wh8, s0; vmin_ge.16 x7, r16, x1, x0, vaddsign1 -; CHECK-NEXT: vfloor.s32.bf16 x1, wl8, s0; vmax_lt.16 x3, r16, x7, x6, vaddsign1 +; CHECK-NEXT: nopa ; nopb ; vconv.bf16.fp32 x3, cml2; nopxm ; vadd.f dm2, dm1, dm2, r0 +; CHECK-NEXT: vlda.conv.fp32.bf16 cml1, [p0], #64; nopb ; vst x3, [p1], #64; nopx ; vmov cml2, cml0; nopv +; CHECK-NEXT: nopa ; nopb ; vconv.bf16.fp32 x5, cml3; nopx ; vshuffle x1, x1, x10, r1; vmul.f dm3, x3, x2, r2 +; CHECK-NEXT: nopa ; nopb ; vfloor.s32.bf16 x10, wh8, s0; nopx ; vmin_ge.16 x7, r16, x1, x0, vaddsign1; nopv +; 
CHECK-NEXT: nopa ; nopb ; vfloor.s32.bf16 x1, wl8, s0; nopx ; vmax_lt.16 x3, r16, x7, x6, vaddsign1; nopv ; CHECK-NEXT: .L_LEnd0: ; CHECK-NEXT: nopa ; nopb ; vconv.bf16.fp32 x8, cml4; nopxm ; vmul.f dm4, x5, x4, r2 ; CHECK-NEXT: // %bb.2: diff --git a/llvm/test/CodeGen/AIE/aie2p/schedule/postpipeliner/Conv2D_bfp16_conv.mir b/llvm/test/CodeGen/AIE/aie2p/schedule/postpipeliner/Conv2D_bfp16_conv.mir index 5d9b58313f94..39efa78f6d84 100644 --- a/llvm/test/CodeGen/AIE/aie2p/schedule/postpipeliner/Conv2D_bfp16_conv.mir +++ b/llvm/test/CodeGen/AIE/aie2p/schedule/postpipeliner/Conv2D_bfp16_conv.mir @@ -30,7 +30,7 @@ ; CONSERVATIVE-NEXT: nop // Delay Slot 2 ; CONSERVATIVE-NEXT: nop // Delay Slot 1 ; CONSERVATIVE-NEXT: // %bb.1: - ; CONSERVATIVE-NEXT: mova m0, #12; nopb ; nops ; nopx ; mov p3, p2; nopv + ; CONSERVATIVE-NEXT: mova m0, #12; nopb ; nopx ; mov p3, p2 ; CONSERVATIVE-NEXT: padda [p3], m0 ; CONSERVATIVE-NEXT: lda m0, [p3], #-4 ; CONSERVATIVE-NEXT: lda m1, [p3], #-4 @@ -42,24 +42,24 @@ ; CONSERVATIVE-NEXT: mov r26, r24 ; CONSERVATIVE-NEXT: vldb.fill.512 [p0, lf0, r24]; mov dc1, dj0 ; CONSERVATIVE-NEXT: vldb.pop.512 x0, [p0, lf0, r24]; mov dn1, dn0 - ; CONSERVATIVE-NEXT: vldb.pop.512.2d x2, [p0, lf0, r24, d1]; add.nc lc, r0, #-2 - ; CONSERVATIVE-NEXT: movxm ls, #.LBB0_2 - ; CONSERVATIVE-NEXT: movxm le, #.L_LEnd0 - ; CONSERVATIVE-NEXT: nopa ; nopb ; nops ; nopxm ; nopv - ; CONSERVATIVE-NEXT: nopa ; vldb.fill.512 [p0, lf0, r24]; nops ; nopxm ; nopv - ; CONSERVATIVE-NEXT: nopa ; vldb.pop.512 x0, [p0, lf0, r24]; nops ; nopxm ; nopv - ; CONSERVATIVE-NEXT: nopa ; vldb.pop.512.2d x2, [p0, lf0, r24, d1]; nops ; nopx ; mov p2, p1; nopv + ; CONSERVATIVE-NEXT: vldb.pop.512.2d x2, [p0, lf0, r24, d1] + ; CONSERVATIVE-NEXT: nop + ; CONSERVATIVE-NEXT: nop + ; CONSERVATIVE-NEXT: mov p2, p1 + ; CONSERVATIVE-NEXT: vldb.fill.512 [p0, lf0, r24]; add.nc lc, r0, #-2 + ; CONSERVATIVE-NEXT: vldb.pop.512 x0, [p0, lf0, r24]; movxm ls, #.LBB0_2 + ; CONSERVATIVE-NEXT: vldb.pop.512.2d x2, [p0, 
lf0, r24, d1]; movxm le, #.L_LEnd0 ; CONSERVATIVE-NEXT: nopa ; nopb ; nops ; nopx ; vconv.fp32.bf16 cml0, x0; nopv - ; CONSERVATIVE-NEXT: nopa ; nopb ; nops ; nopx ; vconv.fp32.bf16 cmh0, x2; nopv - ; CONSERVATIVE-NEXT: nopa ; nopb ; nops ; nopx ; mov dc0, dj0; nopv + ; CONSERVATIVE-NEXT: nopa ; nopb ; nopx ; vconv.fp32.bf16 cmh0, x2 + ; CONSERVATIVE-NEXT: mov dc0, dj0 ; CONSERVATIVE-NEXT: // implicit-def: $sf ; CONSERVATIVE-NEXT: .p2align 4 ; CONSERVATIVE-NEXT: .LBB0_2: // =>This Inner Loop Header: Depth=1 - ; CONSERVATIVE-NEXT: nopa ; vldb.fill.512 [p0, lf0, r24]; nopx ; vst.push.576.conv.bfp16ebs8.fp32 dm0, [p2, sf, r26] - ; CONSERVATIVE-NEXT: vldb.pop.512 x0, [p0, lf0, r24]; vst.flush.512.conv [p2, sf, r26] - ; CONSERVATIVE-NEXT: vldb.pop.512.2d x2, [p0, lf0, r24, d1]; vst.flush.512.conv.2d [p2, sf, r26, d0] - ; CONSERVATIVE-NEXT: vconv.fp32.bf16 cml0, x0 - ; CONSERVATIVE-NEXT: vconv.fp32.bf16 cmh0, x2 + ; CONSERVATIVE-NEXT: nopa ; vldb.fill.512 [p0, lf0, r24]; vst.push.576.conv.bfp16ebs8.fp32 dm0, [p2, sf, r26]; nopxm ; nopv + ; CONSERVATIVE-NEXT: nopa ; vldb.pop.512 x0, [p0, lf0, r24]; vst.flush.512.conv [p2, sf, r26]; nopxm ; nopv + ; CONSERVATIVE-NEXT: nopa ; vldb.pop.512.2d x2, [p0, lf0, r24, d1]; vst.flush.512.conv.2d [p2, sf, r26, d0]; nopxm ; nopv + ; CONSERVATIVE-NEXT: nopa ; nopb ; nops ; nopx ; vconv.fp32.bf16 cml0, x0; nopv + ; CONSERVATIVE-NEXT: nopa ; nopb ; nops ; nopx ; vconv.fp32.bf16 cmh0, x2; nopv ; CONSERVATIVE-NEXT: .L_LEnd0: ; CONSERVATIVE-NEXT: nopa ; nopb ; nops ; nopxm ; nopv ; CONSERVATIVE-NEXT: // %bb.3: @@ -102,8 +102,8 @@ ; OPTIMISTIC-NEXT: nop // Delay Slot 2 ; OPTIMISTIC-NEXT: nop // Delay Slot 1 ; OPTIMISTIC-NEXT: // %bb.1: - ; OPTIMISTIC-NEXT: mova m0, #12; nopb ; nopx ; mov p3, p2 - ; OPTIMISTIC-NEXT: padda [p3], m0 + ; OPTIMISTIC-NEXT: mova m0, #12; nopb ; nops ; nopx ; mov p3, p2; nopv + ; OPTIMISTIC-NEXT: padda [p3], m0; nopx ; OPTIMISTIC-NEXT: lda m0, [p3], #-4 ; OPTIMISTIC-NEXT: lda m1, [p3], #-4 ; OPTIMISTIC-NEXT: lda 
dn0, [p3, #0] @@ -114,12 +114,12 @@ ; OPTIMISTIC-NEXT: mov r26, r24 ; OPTIMISTIC-NEXT: vldb.fill.512 [p0, lf0, r24]; mov dc1, dj0 ; OPTIMISTIC-NEXT: vldb.pop.512 x0, [p0, lf0, r24]; mov dn1, dn0 - ; OPTIMISTIC-NEXT: vldb.pop.512.2d x2, [p0, lf0, r24, d1]; add.nc lc, r0, #-3 - ; OPTIMISTIC-NEXT: movxm ls, #.LBB0_2 - ; OPTIMISTIC-NEXT: vldb.fill.512 [p0, lf0, r24]; movxm le, #.L_LEnd0 - ; OPTIMISTIC-NEXT: nopa ; vldb.pop.512 x0, [p0, lf0, r24]; nops ; nopxm ; nopv - ; OPTIMISTIC-NEXT: nopa ; vldb.pop.512.2d x2, [p0, lf0, r24, d1]; nops ; nopxm ; nopv - ; OPTIMISTIC-NEXT: nopa ; nopb ; nops ; nopxm ; nopv + ; OPTIMISTIC-NEXT: vldb.pop.512.2d x2, [p0, lf0, r24, d1] + ; OPTIMISTIC-NEXT: nop + ; OPTIMISTIC-NEXT: vldb.fill.512 [p0, lf0, r24] + ; OPTIMISTIC-NEXT: vldb.pop.512 x0, [p0, lf0, r24]; add.nc lc, r0, #-3 + ; OPTIMISTIC-NEXT: vldb.pop.512.2d x2, [p0, lf0, r24, d1]; movxm ls, #.LBB0_2 + ; OPTIMISTIC-NEXT: movxm le, #.L_LEnd0 ; OPTIMISTIC-NEXT: nopa ; vldb.fill.512 [p0, lf0, r24]; nops ; nopx ; mov p2, p1; nopv ; OPTIMISTIC-NEXT: nopa ; vldb.pop.512 x0, [p0, lf0, r24]; nops ; nopx ; vconv.fp32.bf16 cml0, x0; nopv ; OPTIMISTIC-NEXT: nopa ; vldb.pop.512.2d x2, [p0, lf0, r24, d1]; nops ; nopx ; vconv.fp32.bf16 cmh0, x2; nopv @@ -127,9 +127,9 @@ ; OPTIMISTIC-NEXT: // implicit-def: $sf ; OPTIMISTIC-NEXT: .p2align 4 ; OPTIMISTIC-NEXT: .LBB0_2: // =>This Inner Loop Header: Depth=1 - ; OPTIMISTIC-NEXT: nopa ; vldb.fill.512 [p0, lf0, r24]; nopx ; vst.push.576.conv.bfp16ebs8.fp32 dm0, [p2, sf, r26] - ; OPTIMISTIC-NEXT: vst.flush.512.conv [p2, sf, r26]; vldb.pop.512 x0, [p0, lf0, r24]; vconv.fp32.bf16 cml0, x0 - ; OPTIMISTIC-NEXT: vst.flush.512.conv.2d [p2, sf, r26, d0]; vldb.pop.512.2d x2, [p0, lf0, r24, d1]; vconv.fp32.bf16 cmh0, x2 + ; OPTIMISTIC-NEXT: nopa ; vldb.fill.512 [p0, lf0, r24]; vst.push.576.conv.bfp16ebs8.fp32 dm0, [p2, sf, r26]; nopxm ; nopv + ; OPTIMISTIC-NEXT: nopa ; vldb.pop.512 x0, [p0, lf0, r24]; vst.flush.512.conv [p2, sf, r26]; nopx ; vconv.fp32.bf16 
cml0, x0; nopv + ; OPTIMISTIC-NEXT: nopa ; vldb.pop.512.2d x2, [p0, lf0, r24, d1]; vst.flush.512.conv.2d [p2, sf, r26, d0]; nopx ; vconv.fp32.bf16 cmh0, x2; nopv ; OPTIMISTIC-NEXT: .L_LEnd0: ; OPTIMISTIC-NEXT: nopa ; nopb ; nops ; nopxm ; nopv ; OPTIMISTIC-NEXT: // %bb.3: diff --git a/llvm/test/CodeGen/AIE/aie2p/schedule/postpipeliner/Conv2D_bfp16_kernel_fixedslot.mir b/llvm/test/CodeGen/AIE/aie2p/schedule/postpipeliner/Conv2D_bfp16_kernel_fixedslot.mir index 3e3334a98a0a..e67fb3e63991 100644 --- a/llvm/test/CodeGen/AIE/aie2p/schedule/postpipeliner/Conv2D_bfp16_kernel_fixedslot.mir +++ b/llvm/test/CodeGen/AIE/aie2p/schedule/postpipeliner/Conv2D_bfp16_kernel_fixedslot.mir @@ -14,18 +14,18 @@ ; CHECK-LABEL: conv2d_bfp16: ; CHECK: .p2align 4 ; CHECK-NEXT: // %bb.0: // %entry - ; CHECK-NEXT: mova r25, #0; nopb ; nops ; nopxm ; nopv + ; CHECK-NEXT: mova r25, #0; nopx ; CHECK-NEXT: vlda.fill.512 [p1, lf1, r25] ; CHECK-NEXT: vlda.fill.512 [p1, lf1, r25]; movx r24, #0 ; CHECK-NEXT: vlda.pop.576 ex0, [p1, lf1, r25]; vldb.fill.512 [p0, lf0, r24] ; CHECK-NEXT: vlda.pop.576.3d ex4, [p1, lf1, r25, d0]; vldb.pop.576 ex2, [p0, lf0, r24] ; CHECK-NEXT: vlda.fill.512 [p1, lf1, r25] ; CHECK-NEXT: vlda.fill.512 [p1, lf1, r25]; vldb.pop.576 ex6, [p0, lf0, r24, m1] - ; CHECK-NEXT: vlda.pop.576 ex0, [p1, lf1, r25]; vldb.fill.512 [p0, lf0, r24]; add.nc lc, r1, #-4 - ; CHECK-NEXT: vlda.pop.576.3d ex4, [p1, lf1, r25, d0]; vldb.pop.576 ex2, [p0, lf0, r24]; movxm ls, #.LBB0_1 - ; CHECK-NEXT: vlda.fill.512 [p1, lf1, r25]; movxm le, #.L_LEnd0 - ; CHECK-NEXT: vlda.fill.512 [p1, lf1, r25]; vldb.pop.576 ex6, [p0, lf0, r24, m1]; nops ; nopxm ; nopv - ; CHECK-NEXT: vlda.pop.576 ex0, [p1, lf1, r25]; vldb.fill.512 [p0, lf0, r24]; nops ; nopxm ; nopv + ; CHECK-NEXT: vlda.pop.576 ex0, [p1, lf1, r25]; vldb.fill.512 [p0, lf0, r24] + ; CHECK-NEXT: vlda.pop.576.3d ex4, [p1, lf1, r25, d0]; vldb.pop.576 ex2, [p0, lf0, r24] + ; CHECK-NEXT: vlda.fill.512 [p1, lf1, r25]; add.nc lc, r1, #-4 + ; CHECK-NEXT: 
vlda.fill.512 [p1, lf1, r25]; vldb.pop.576 ex6, [p0, lf0, r24, m1]; movxm ls, #.LBB0_1 + ; CHECK-NEXT: vlda.pop.576 ex0, [p1, lf1, r25]; vldb.fill.512 [p0, lf0, r24]; movxm le, #.L_LEnd0 ; CHECK-NEXT: vlda.pop.576.3d ex4, [p1, lf1, r25, d0]; vldb.pop.576 ex2, [p0, lf0, r24]; nops ; nopx ; vshuffle ex8, ex0, ex4, r4; nopv ; CHECK-NEXT: vlda.fill.512 [p1, lf1, r25]; nopb ; nops ; movx r0, #780; vshuffle ex10, ex0, ex4, r5; nopv ; CHECK-NEXT: vlda.fill.512 [p1, lf1, r25]; vldb.pop.576 ex6, [p0, lf0, r24, m1]; nops ; nopxm ; vmac.f dm0, dm0, ex8, ex2, r0 @@ -33,9 +33,9 @@ ; CHECK-NEXT: vlda.pop.576.3d ex4, [p1, lf1, r25, d0]; vldb.pop.576 ex2, [p0, lf0, r24]; nops ; nopx ; vshuffle ex8, ex0, ex4, r4; vmac.f dm2, dm2, ex8, ex6, r0 ; CHECK-NEXT: .p2align 4 ; CHECK-NEXT: .LBB0_1: // =>This Inner Loop Header: Depth=1 - ; CHECK-NEXT: vlda.fill.512 [p1, lf1, r25]; nopx ; vshuffle ex10, ex0, ex4, r5; vmac.f dm3, dm3, ex10, ex6, r0 - ; CHECK-NEXT: vlda.fill.512 [p1, lf1, r25]; vldb.pop.576 ex6, [p0, lf0, r24, m1]; vmac.f dm0, dm0, ex8, ex2, r0 - ; CHECK-NEXT: vlda.pop.576 ex0, [p1, lf1, r25]; vldb.fill.512 [p0, lf0, r24]; vmac.f dm1, dm1, ex10, ex2, r0 + ; CHECK-NEXT: vlda.fill.512 [p1, lf1, r25]; nopb ; nops ; nopx ; vshuffle ex10, ex0, ex4, r5; vmac.f dm3, dm3, ex10, ex6, r0 + ; CHECK-NEXT: vlda.fill.512 [p1, lf1, r25]; vldb.pop.576 ex6, [p0, lf0, r24, m1]; nops ; nopxm ; vmac.f dm0, dm0, ex8, ex2, r0 + ; CHECK-NEXT: vlda.pop.576 ex0, [p1, lf1, r25]; vldb.fill.512 [p0, lf0, r24]; nops ; nopxm ; vmac.f dm1, dm1, ex10, ex2, r0 ; CHECK-NEXT: .L_LEnd0: ; CHECK-NEXT: vlda.pop.576.3d ex4, [p1, lf1, r25, d0]; vldb.pop.576 ex2, [p0, lf0, r24]; nops ; nopx ; vshuffle ex8, ex0, ex4, r4; vmac.f dm2, dm2, ex8, ex6, r0 ; CHECK-NEXT: // %bb.2: diff --git a/llvm/test/CodeGen/AIE/aie2p/schedule/postpipeliner/Conv2D_bfp16_kernel_multislot.mir b/llvm/test/CodeGen/AIE/aie2p/schedule/postpipeliner/Conv2D_bfp16_kernel_multislot.mir index 51c2c4f51a89..c41cc5d15ab2 100644 --- 
a/llvm/test/CodeGen/AIE/aie2p/schedule/postpipeliner/Conv2D_bfp16_kernel_multislot.mir +++ b/llvm/test/CodeGen/AIE/aie2p/schedule/postpipeliner/Conv2D_bfp16_kernel_multislot.mir @@ -14,18 +14,18 @@ ; CHECK-LABEL: conv2d_bfp16: ; CHECK: .p2align 4 ; CHECK-NEXT: // %bb.0: // %entry - ; CHECK-NEXT: mova r25, #0; nopb ; nops ; nopxm ; nopv + ; CHECK-NEXT: mova r25, #0; nopx ; CHECK-NEXT: vldb.fill.512 [p1, lf1, r25] ; CHECK-NEXT: mova r24, #0; vldb.fill.512 [p1, lf1, r25] ; CHECK-NEXT: vlda.fill.512 [p0, lf0, r24]; vldb.pop.576 ex0, [p1, lf1, r25] ; CHECK-NEXT: vlda.pop.576 ex2, [p0, lf0, r24]; vldb.pop.576.3d ex4, [p1, lf1, r25, d0] ; CHECK-NEXT: vldb.fill.512 [p1, lf1, r25] ; CHECK-NEXT: vlda.pop.576 ex6, [p0, lf0, r24, m1]; vldb.fill.512 [p1, lf1, r25] - ; CHECK-NEXT: vlda.fill.512 [p0, lf0, r24]; vldb.pop.576 ex0, [p1, lf1, r25]; add.nc lc, r1, #-4 - ; CHECK-NEXT: vlda.pop.576 ex2, [p0, lf0, r24]; vldb.pop.576.3d ex4, [p1, lf1, r25, d0]; movxm ls, #.LBB0_1 - ; CHECK-NEXT: vldb.fill.512 [p1, lf1, r25]; movxm le, #.L_LEnd0 - ; CHECK-NEXT: vlda.pop.576 ex6, [p0, lf0, r24, m1]; vldb.fill.512 [p1, lf1, r25]; nops ; nopxm ; nopv - ; CHECK-NEXT: vlda.fill.512 [p0, lf0, r24]; vldb.pop.576 ex0, [p1, lf1, r25]; nops ; nopxm ; nopv + ; CHECK-NEXT: vlda.fill.512 [p0, lf0, r24]; vldb.pop.576 ex0, [p1, lf1, r25] + ; CHECK-NEXT: vlda.pop.576 ex2, [p0, lf0, r24]; vldb.pop.576.3d ex4, [p1, lf1, r25, d0] + ; CHECK-NEXT: vldb.fill.512 [p1, lf1, r25]; add.nc lc, r1, #-4 + ; CHECK-NEXT: vlda.pop.576 ex6, [p0, lf0, r24, m1]; vldb.fill.512 [p1, lf1, r25]; movxm ls, #.LBB0_1 + ; CHECK-NEXT: vlda.fill.512 [p0, lf0, r24]; vldb.pop.576 ex0, [p1, lf1, r25]; movxm le, #.L_LEnd0 ; CHECK-NEXT: vlda.pop.576 ex2, [p0, lf0, r24]; vldb.pop.576.3d ex4, [p1, lf1, r25, d0]; nops ; nopx ; vshuffle ex8, ex0, ex4, r4; nopv ; CHECK-NEXT: mova r0, #780; vldb.fill.512 [p1, lf1, r25]; nops ; nopx ; vshuffle ex10, ex0, ex4, r5; nopv ; CHECK-NEXT: vlda.pop.576 ex6, [p0, lf0, r24, m1]; vldb.fill.512 [p1, lf1, 
r25]; nops ; nopxm ; vmac.f dm0, dm0, ex8, ex2, r0 @@ -33,9 +33,9 @@ ; CHECK-NEXT: vlda.pop.576 ex2, [p0, lf0, r24]; vldb.pop.576.3d ex4, [p1, lf1, r25, d0]; nops ; nopx ; vshuffle ex8, ex0, ex4, r4; vmac.f dm2, dm2, ex8, ex6, r0 ; CHECK-NEXT: .p2align 4 ; CHECK-NEXT: .LBB0_1: // =>This Inner Loop Header: Depth=1 - ; CHECK-NEXT: vldb.fill.512 [p1, lf1, r25]; nopx ; vshuffle ex10, ex0, ex4, r5; vmac.f dm3, dm3, ex10, ex6, r0 - ; CHECK-NEXT: vlda.pop.576 ex6, [p0, lf0, r24, m1]; vldb.fill.512 [p1, lf1, r25]; vmac.f dm0, dm0, ex8, ex2, r0 - ; CHECK-NEXT: vlda.fill.512 [p0, lf0, r24]; vldb.pop.576 ex0, [p1, lf1, r25]; vmac.f dm1, dm1, ex10, ex2, r0 + ; CHECK-NEXT: nopa ; vldb.fill.512 [p1, lf1, r25]; nops ; nopx ; vshuffle ex10, ex0, ex4, r5; vmac.f dm3, dm3, ex10, ex6, r0 + ; CHECK-NEXT: vlda.pop.576 ex6, [p0, lf0, r24, m1]; vldb.fill.512 [p1, lf1, r25]; nops ; nopxm ; vmac.f dm0, dm0, ex8, ex2, r0 + ; CHECK-NEXT: vlda.fill.512 [p0, lf0, r24]; vldb.pop.576 ex0, [p1, lf1, r25]; nops ; nopxm ; vmac.f dm1, dm1, ex10, ex2, r0 ; CHECK-NEXT: .L_LEnd0: ; CHECK-NEXT: vlda.pop.576 ex2, [p0, lf0, r24]; vldb.pop.576.3d ex4, [p1, lf1, r25, d0]; nops ; nopx ; vshuffle ex8, ex0, ex4, r4; vmac.f dm2, dm2, ex8, ex6, r0 ; CHECK-NEXT: // %bb.2: diff --git a/llvm/test/CodeGen/AIE/aie2p/schedule/postpipeliner/multiSlotAssignment/future_conflict_assignment.mir b/llvm/test/CodeGen/AIE/aie2p/schedule/postpipeliner/multiSlotAssignment/future_conflict_assignment.mir index 5dcb8b921eda..3d7974fe1e38 100644 --- a/llvm/test/CodeGen/AIE/aie2p/schedule/postpipeliner/multiSlotAssignment/future_conflict_assignment.mir +++ b/llvm/test/CodeGen/AIE/aie2p/schedule/postpipeliner/multiSlotAssignment/future_conflict_assignment.mir @@ -48,7 +48,6 @@ ; CHECK-NEXT: nopa ; nopb ; nops ; nopxm ; nopv ; CHECK-NEXT: nopa ; nopb ; nops ; nopxm ; nopv ; CHECK-NEXT: nopa ; nopb ; nops ; nopxm ; nopv - ; CHECK-NEXT: nopa ; nopb ; nops ; nopxm ; nopv ; CHECK-NEXT: .p2align 4 ; CHECK-NEXT: .LBB0_2: // =>This Inner Loop 
Header: Depth=1 ; CHECK-NEXT: vlda.fill.512 [p0, lf0, r24]; vldb.fill.512 [p1, lf1, r25]; nops ; nopxm ; nopv diff --git a/llvm/test/CodeGen/AIE/aie2p/schedule/postpipeliner/multiSlotAssignment/non_overlapping_addrspace.mir b/llvm/test/CodeGen/AIE/aie2p/schedule/postpipeliner/multiSlotAssignment/non_overlapping_addrspace.mir index a0be6ced869a..4dbba171f09d 100644 --- a/llvm/test/CodeGen/AIE/aie2p/schedule/postpipeliner/multiSlotAssignment/non_overlapping_addrspace.mir +++ b/llvm/test/CodeGen/AIE/aie2p/schedule/postpipeliner/multiSlotAssignment/non_overlapping_addrspace.mir @@ -42,14 +42,11 @@ ; CHECK-NEXT: nopa ; nopb ; nops ; nopxm ; nopv ; CHECK-NEXT: nopa ; nopb ; nops ; nopxm ; nopv ; CHECK-NEXT: nopa ; nopb ; nops ; nopxm ; nopv - ; CHECK-NEXT: nopa ; nopb ; nops ; nopxm ; nopv - ; CHECK-NEXT: nopa ; nopb ; nops ; nopxm ; nopv - ; CHECK-NEXT: nopa ; nopb ; nops ; nopxm ; nopv ; CHECK-NEXT: .p2align 4 ; CHECK-NEXT: .LBB0_2: // =>This Inner Loop Header: Depth=1 - ; CHECK-NEXT: vlda.fill.512 [p1, lf1, r25]; vldb.fill.512 [p0, lf0, r24]; nopx - ; CHECK-NEXT: vldb.fill.512 [p0, lf0, r24] - ; CHECK-NEXT: vldb.pop.576 ex6, [p0, lf0, r24] + ; CHECK-NEXT: vlda.fill.512 [p1, lf1, r25]; vldb.fill.512 [p0, lf0, r24]; nops ; nopxm ; nopv + ; CHECK-NEXT: nopa ; vldb.fill.512 [p0, lf0, r24]; nops ; nopxm ; nopv + ; CHECK-NEXT: nopa ; vldb.pop.576 ex6, [p0, lf0, r24]; nops ; nopxm ; nopv ; CHECK-NEXT: .L_LEnd0: ; CHECK-NEXT: nopa ; vldb.pop.576.3d ex1, [p0, lf0, r24, d3]; nops ; nopxm ; nopv ; CHECK-NEXT: // %bb.3: diff --git a/llvm/test/CodeGen/AIE/aie2p/schedule/postpipeliner/multiSlotAssignment/partially_materialized.mir b/llvm/test/CodeGen/AIE/aie2p/schedule/postpipeliner/multiSlotAssignment/partially_materialized.mir index 28b033b8ae81..877bffe8cf81 100644 --- a/llvm/test/CodeGen/AIE/aie2p/schedule/postpipeliner/multiSlotAssignment/partially_materialized.mir +++ b/llvm/test/CodeGen/AIE/aie2p/schedule/postpipeliner/multiSlotAssignment/partially_materialized.mir @@ -40,12 
+40,10 @@ ; CHECK-NEXT: nopa ; nopb ; nops ; nopxm ; nopv ; CHECK-NEXT: nopa ; nopb ; nops ; nopxm ; nopv ; CHECK-NEXT: nopa ; nopb ; nops ; nopxm ; nopv - ; CHECK-NEXT: nopa ; nopb ; nops ; nopxm ; nopv - ; CHECK-NEXT: nopa ; nopb ; nops ; nopxm ; nopv ; CHECK-NEXT: .p2align 4 ; CHECK-NEXT: .LBB0_2: // =>This Inner Loop Header: Depth=1 - ; CHECK-NEXT: vlda.fill.512 [p0, lf0, r24]; vldb.fill.512 [p1, lf1, r25]; nopm - ; CHECK-NEXT: vlda.fill.512 [p0, lf0, r24]; vldb.fill.512 [p1, lf1, r25] + ; CHECK-NEXT: vlda.fill.512 [p0, lf0, r24]; vldb.fill.512 [p1, lf1, r25]; nops ; nopxm ; nopv + ; CHECK-NEXT: vlda.fill.512 [p0, lf0, r24]; vldb.fill.512 [p1, lf1, r25]; nops ; nopxm ; nopv ; CHECK-NEXT: .L_LEnd0: ; CHECK-NEXT: vlda.pop.576 ex6, [p0, lf0, r24]; nopb ; nops ; nopxm ; nopv ; CHECK-NEXT: // %bb.3: diff --git a/llvm/test/CodeGen/AIE/aie2p/schedule/postpipeliner/multiSlotAssignment/reassign_slots.mir b/llvm/test/CodeGen/AIE/aie2p/schedule/postpipeliner/multiSlotAssignment/reassign_slots.mir index 576d53378c16..b46a8828a828 100644 --- a/llvm/test/CodeGen/AIE/aie2p/schedule/postpipeliner/multiSlotAssignment/reassign_slots.mir +++ b/llvm/test/CodeGen/AIE/aie2p/schedule/postpipeliner/multiSlotAssignment/reassign_slots.mir @@ -39,7 +39,6 @@ ; CHECK-NEXT: nopa ; nopb ; nops ; nopxm ; nopv ; CHECK-NEXT: nopa ; nopb ; nops ; nopxm ; nopv ; CHECK-NEXT: nopa ; nopb ; nops ; nopxm ; nopv - ; CHECK-NEXT: nopa ; nopb ; nops ; nopxm ; nopv ; CHECK-NEXT: .p2align 4 ; CHECK-NEXT: .LBB0_2: // =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: vlda.fill.512 [p1, lf1, r25]; vldb.fill.512 [p0, lf0, r24]; nops ; nopxm ; nopv diff --git a/llvm/test/CodeGen/AIE/aie2p/schedule/postpipeliner/softmax-aa.mir b/llvm/test/CodeGen/AIE/aie2p/schedule/postpipeliner/softmax-aa.mir index 7555ab84f037..3347860cdff4 100644 --- a/llvm/test/CodeGen/AIE/aie2p/schedule/postpipeliner/softmax-aa.mir +++ b/llvm/test/CodeGen/AIE/aie2p/schedule/postpipeliner/softmax-aa.mir @@ -32,14 +32,14 @@ ; CHECK-NEXT: 
.LBB0_2: // %for.cond2.preheader ; CHECK-NEXT: // =>This Loop Header: Depth=1 ; CHECK-NEXT: // Child Loop BB0_3 Depth 2 - ; CHECK-NEXT: vldb wl2, [p1], #32; nopxm + ; CHECK-NEXT: nopa ; vldb wl2, [p1], #32; nops ; nopxm ; nopv + ; CHECK-NEXT: nopx ; CHECK-NEXT: nop + ; CHECK-NEXT: nop + ; CHECK-NEXT: vldb wl2, [p1], #32 ; CHECK-NEXT: add.nc lc, r1, #-3 ; CHECK-NEXT: movxm ls, #.LBB0_3 - ; CHECK-NEXT: vldb wl2, [p1], #32; movxm le, #.L_LEnd0 - ; CHECK-NEXT: nopa ; nopb ; nops ; nopxm ; nopv - ; CHECK-NEXT: nopa ; nopb ; nops ; nopxm ; nopv - ; CHECK-NEXT: nopa ; nopb ; nops ; nopxm ; vmul.f dm0, x2, x0, r2 + ; CHECK-NEXT: movxm le, #.L_LEnd0; vmul.f dm0, x2, x0, r2 ; CHECK-NEXT: nopa ; vldb wl2, [p1], #32; nops ; nopxm ; nopv ; CHECK-NEXT: nopa ; nopb ; nops ; nopxm ; nopv ; CHECK-NEXT: nopa ; nopb ; nops ; nopxm ; nopv @@ -48,9 +48,9 @@ ; CHECK-NEXT: .LBB0_3: // %for.body5 ; CHECK-NEXT: // Parent Loop BB0_2 Depth=1 ; CHECK-NEXT: // => This Inner Loop Header: Depth=2 - ; CHECK-NEXT: vldb wl2, [p1], #32; nopxm - ; CHECK-NEXT: vst.conv.bf16.fp32 bmll0, [p0], #32 - ; CHECK-NEXT: nop + ; CHECK-NEXT: nopa ; vldb wl2, [p1], #32; nops ; nopxm ; nopv + ; CHECK-NEXT: nopa ; nopb ; vst.conv.bf16.fp32 bmll0, [p0], #32; nopxm ; nopv + ; CHECK-NEXT: nopa ; nopb ; nops ; nopxm ; nopv ; CHECK-NEXT: .L_LEnd0: ; CHECK-NEXT: nopa ; nopb ; nops ; nopxm ; vmul.f dm0, x2, x0, r2 ; CHECK-NEXT: // %bb.4: // %for.cond.cleanup4 @@ -140,16 +140,16 @@ ; CHECK-NEXT: .LBB1_2: // %for.cond2.preheader ; CHECK-NEXT: // =>This Loop Header: Depth=1 ; CHECK-NEXT: // Child Loop BB1_3 Depth 2 - ; CHECK-NEXT: padda [p1], m0 + ; CHECK-NEXT: padda [p1], m0; nopb ; nopxm ; CHECK-NEXT: nop ; CHECK-NEXT: vldb wl2, [p1], #32 ; CHECK-NEXT: nop + ; CHECK-NEXT: nop + ; CHECK-NEXT: nop + ; CHECK-NEXT: vldb wl2, [p1], #32 ; CHECK-NEXT: add.nc lc, r1, #-3 ; CHECK-NEXT: movxm ls, #.LBB1_3 - ; CHECK-NEXT: vldb wl2, [p1], #32; movxm le, #.L_LEnd1 - ; CHECK-NEXT: nopa ; nopb ; nops ; nopxm ; nopv - ; CHECK-NEXT: nopa ; 
nopb ; nops ; nopxm ; nopv - ; CHECK-NEXT: nopa ; nopb ; nops ; nopxm ; vmul.f dm0, x2, x0, r2 + ; CHECK-NEXT: movxm le, #.L_LEnd1; vmul.f dm0, x2, x0, r2 ; CHECK-NEXT: nopa ; vldb wl2, [p1], #32; nops ; nopxm ; nopv ; CHECK-NEXT: nopa ; nopb ; nops ; nopxm ; nopv ; CHECK-NEXT: nopa ; nopb ; nops ; nopxm ; nopv @@ -158,9 +158,9 @@ ; CHECK-NEXT: .LBB1_3: // %for.body5 ; CHECK-NEXT: // Parent Loop BB1_2 Depth=1 ; CHECK-NEXT: // => This Inner Loop Header: Depth=2 - ; CHECK-NEXT: vldb wl2, [p1], #32; nopxm - ; CHECK-NEXT: vst.conv.bf16.fp32 bmll0, [p0], #32 - ; CHECK-NEXT: nop + ; CHECK-NEXT: nopa ; vldb wl2, [p1], #32; nops ; nopxm ; nopv + ; CHECK-NEXT: nopa ; nopb ; vst.conv.bf16.fp32 bmll0, [p0], #32; nopxm ; nopv + ; CHECK-NEXT: nopa ; nopb ; nops ; nopxm ; nopv ; CHECK-NEXT: .L_LEnd1: ; CHECK-NEXT: nopa ; nopb ; nops ; nopxm ; vmul.f dm0, x2, x0, r2 ; CHECK-NEXT: // %bb.4: // %for.cond.cleanup4 @@ -255,23 +255,16 @@ ; CHECK-NEXT: add.nc lc, r1, #0 ; CHECK-NEXT: movxm ls, #.LBB2_3 ; CHECK-NEXT: movxm le, #.L_LEnd2 + ; CHECK-NEXT: .p2align 4 + ; CHECK-NEXT: .LBB2_3: // %for.body5 + ; CHECK-NEXT: // Parent Loop BB2_2 Depth=1 + ; CHECK-NEXT: // => This Inner Loop Header: Depth=2 + ; CHECK-NEXT: nopa ; vldb wl2, [p1], #32; nops ; nopxm ; nopv ; CHECK-NEXT: nopa ; nopb ; nops ; nopxm ; nopv ; CHECK-NEXT: nopa ; nopb ; nops ; nopxm ; nopv ; CHECK-NEXT: nopa ; nopb ; nops ; nopxm ; nopv ; CHECK-NEXT: nopa ; nopb ; nops ; nopxm ; nopv ; CHECK-NEXT: nopa ; nopb ; nops ; nopxm ; nopv - ; CHECK-NEXT: nopa ; nopb ; nops ; nopxm ; nopv - ; CHECK-NEXT: nopa ; nopb ; nops ; nopxm ; nopv - ; CHECK-NEXT: .p2align 4 - ; CHECK-NEXT: .LBB2_3: // %for.body5 - ; CHECK-NEXT: // Parent Loop BB2_2 Depth=1 - ; CHECK-NEXT: // => This Inner Loop Header: Depth=2 - ; CHECK-NEXT: vldb wl2, [p1], #32; nopx - ; CHECK-NEXT: nop - ; CHECK-NEXT: nop - ; CHECK-NEXT: nop - ; CHECK-NEXT: nop - ; CHECK-NEXT: nop ; CHECK-NEXT: nop ; CHECK-NEXT: vmul.f dm0, x2, x0, r2 ; CHECK-NEXT: nop @@ -360,26 +353,19 
@@ ; CHECK-NEXT: .LBB3_2: // %for.cond2.preheader ; CHECK-NEXT: // =>This Loop Header: Depth=1 ; CHECK-NEXT: // Child Loop BB3_3 Depth 2 - ; CHECK-NEXT: add.nc lc, r1, #0 + ; CHECK-NEXT: nopa ; nopb ; nops ; nopx ; add.nc lc, r1, #0; nopv ; CHECK-NEXT: movxm ls, #.LBB3_3 - ; CHECK-NEXT: movxm le, #.L_LEnd3 - ; CHECK-NEXT: nopa ; nopb ; nops ; nopxm ; nopv + ; CHECK-NEXT: padda [p1], m0; movxm le, #.L_LEnd3 + ; CHECK-NEXT: .p2align 4 + ; CHECK-NEXT: .LBB3_3: // %for.body5 + ; CHECK-NEXT: // Parent Loop BB3_2 Depth=1 + ; CHECK-NEXT: // => This Inner Loop Header: Depth=2 + ; CHECK-NEXT: nopa ; vldb wl2, [p1], #32; nops ; nopxm ; nopv ; CHECK-NEXT: nopa ; nopb ; nops ; nopxm ; nopv ; CHECK-NEXT: nopa ; nopb ; nops ; nopxm ; nopv ; CHECK-NEXT: nopa ; nopb ; nops ; nopxm ; nopv ; CHECK-NEXT: nopa ; nopb ; nops ; nopxm ; nopv ; CHECK-NEXT: nopa ; nopb ; nops ; nopxm ; nopv - ; CHECK-NEXT: padda [p1], m0; nopb ; nops ; nopxm ; nopv - ; CHECK-NEXT: .p2align 4 - ; CHECK-NEXT: .LBB3_3: // %for.body5 - ; CHECK-NEXT: // Parent Loop BB3_2 Depth=1 - ; CHECK-NEXT: // => This Inner Loop Header: Depth=2 - ; CHECK-NEXT: vldb wl2, [p1], #32; nopx - ; CHECK-NEXT: nop - ; CHECK-NEXT: nop - ; CHECK-NEXT: nop - ; CHECK-NEXT: nop - ; CHECK-NEXT: nop ; CHECK-NEXT: nop ; CHECK-NEXT: vmul.f dm0, x2, x0, r2 ; CHECK-NEXT: nop