Skip to content

[AIE] Improve Zero-overhead loop 112-bytes padding #440

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Open
wants to merge 1 commit into
base: aie-public
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
61 changes: 55 additions & 6 deletions llvm/lib/Target/AIE/AIEBaseInstrInfo.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1046,20 +1046,57 @@ const PacketFormats &AIEBaseInstrInfo::getPacketFormats() const {
return FormatInterface->getPacketFormats();
}

// Returns the number of iterator steps between the first non-debug
// instruction of MBB and its last non-debug instruction, provided that the
// last one is a hardware-loop-end instruction. The loop-end instruction itself
// is not included in the count.
//
// Returns 0 when MBB contains no non-debug instruction, or when its last
// non-debug instruction is not a hardware loop end.
unsigned
AIEBaseInstrInfo::getZOLBundlesCount(const MachineBasicBlock &MBB) const {
  const auto BlockEnd = MBB.end();
  const auto Head = MBB.getFirstNonDebugInstr(false);
  const auto Tail = MBB.getLastNonDebugInstr(false);

  // Both lookups yield end() when there is nothing but debug instructions
  // (or nothing at all); Tail is only dereferenced once that is ruled out.
  const bool HasInstrs = Head != BlockEnd && Tail != BlockEnd;
  if (!HasInstrs || !isHardwareLoopEnd(Tail->getOpcode()))
    return 0;

  return std::distance(Head, Tail);
}

// Returns true when the last non-debug instruction of MBB is a
// hardware-loop-end instruction; returns false for a block that has no
// non-debug instruction at all.
bool AIEBaseInstrInfo::isZOLBody(const MachineBasicBlock &MBB) const {
  const auto Tail = MBB.getLastNonDebugInstr(false);

  // Short-circuit keeps us from dereferencing end() on an empty block.
  return Tail != MBB.end() && isHardwareLoopEnd(Tail->getOpcode());
}

std::vector<MachineBasicBlock::iterator>
AIEBaseInstrInfo::getAlignmentBoundaries(MachineBasicBlock &MBB) const {
std::vector<MachineBasicBlock::iterator> AlgnCandidates;
unsigned DelaySlot = 0;

unsigned DelaySlot = 0;
// LoopSetupDistance will be set to number of instructions (7). In
// PostRAScheduler, this is enforced by setting the exit latency in the
// scheduler dag mutator
// scheduler dag mutator.
unsigned LoopSetupDistance = 0;
unsigned ZOLBundlesCount = 0;
unsigned ZOLBodyRegionsCount = 0;
bool IsCall = false;
auto ZOLSupport = getZOLSupport();
bool IsZOLBody = isZOLBody(MBB);
if (IsZOLBody) {
// Exclude the LoopEnd bundle since it must reside in its own
// standalone region to ensure it points to a 128-bit aligned instruction.
ZOLBundlesCount = getZOLBundlesCount(MBB) - 1;
if (ZOLBundlesCount < ZOLSupport->LoopSetupDistance)
ZOLBodyRegionsCount = ZOLBundlesCount;
else
ZOLBodyRegionsCount = ZOLSupport->LoopSetupDistance;
}
for (auto MI = MBB.begin(), End = MBB.end(); MI != End; ++MI) {
if (MI->isBundle()) {
// Return Address Candidate
// Return Address Candidate.
IsCall = isCallBundle(MI);
if (IsCall && DelaySlot > 0)
llvm_unreachable("Cannot have branch in branch delay slot!\n");
Expand All @@ -1079,14 +1116,26 @@ AIEBaseInstrInfo::getAlignmentBoundaries(MachineBasicBlock &MBB) const {
LoopSetupDistance--;
}

if (IsZOLBody && ZOLBodyRegionsCount > 0) {
AlgnCandidates.emplace_back(MI);
ZOLBodyRegionsCount--;
}

if (IsCall)
DelaySlot = getNumDelaySlots(*MI);

// Distance in terms of fully-expanded 128-bit bundles that
// loop setup should maintain. We force each of these bundles to an
// alignment boundary, so that they will occupy 16 bytes.
if (ZOLSupport && isZOLSetupBundle(MI) && isLastZOLSetupBundleInMBB(MI))
LoopSetupDistance = ZOLSupport->LoopSetupDistance;
if (ZOLSupport && isZOLSetupBundle(MI) && isLastZOLSetupBundleInMBB(MI)) {
// If MBB has exactly one successor, that successor must be the loop.
if (MBB.succ_size() == 1) {
const MachineBasicBlock *LoopSucc = *MBB.successors().begin();
ZOLBundlesCount = getZOLBundlesCount(*LoopSucc) - 1;
}
if (ZOLBundlesCount < ZOLSupport->LoopSetupDistance)
LoopSetupDistance = ZOLSupport->LoopSetupDistance - ZOLBundlesCount;
}
} else if (isHardwareLoopEnd(MI->getOpcode())) {
if (DelaySlot > 0)
llvm_unreachable("Cannot have HWLoopEnd in branch delay slot!\n");
Expand All @@ -1095,7 +1144,7 @@ AIEBaseInstrInfo::getAlignmentBoundaries(MachineBasicBlock &MBB) const {
AlgnCandidates.emplace_back(std::prev(MI));
} else if (!MI->isMetaInstruction()) {
// single instruction, there should not be any
// after Bundle Finalization Pass
// after Bundle Finalization Pass.
llvm_unreachable("Found an un-expected standalone instruction !");
}
}
Expand Down
5 changes: 4 additions & 1 deletion llvm/lib/Target/AIE/AIEBaseInstrInfo.h
Original file line number Diff line number Diff line change
Expand Up @@ -31,7 +31,6 @@ namespace llvm {

struct AIEBaseInstrInfo : public TargetInstrInfo {
using TargetInstrInfo::TargetInstrInfo;

// This codifies the model of ZeroOverheadLoops
class ZOLSupport {
public:
Expand Down Expand Up @@ -313,6 +312,10 @@ struct AIEBaseInstrInfo : public TargetInstrInfo {
// registers(lc, le, ls, etc.) and the end of the loop,
virtual unsigned getLoopSetupDistance() const;

virtual unsigned getZOLBundlesCount(const MachineBasicBlock &MBB) const;

bool isZOLBody(const MachineBasicBlock &MBB) const;

// Return the vector of Alignment Region Boundaries.
virtual std::vector<MachineBasicBlock::iterator>
getAlignmentBoundaries(MachineBasicBlock &MBB) const;
Expand Down
15 changes: 13 additions & 2 deletions llvm/lib/Target/AIE/AIEBaseSubtarget.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -260,6 +260,8 @@ class RegionEndEdges : public ScheduleDAGMutation {
}
void apply(ScheduleDAGInstrs *DAG) override {
AIE::MaxLatencyFinder MaxLatency(DAG);
MachineBasicBlock *PrologueMBB = DAG->getBB();
unsigned int ZOLBundlesCount = 0;

// Default edges to ExitSU are conservative, and can't be shrunk.
// We really should know what we're doing here, so just remove and
Expand Down Expand Up @@ -296,9 +298,18 @@ class RegionEndEdges : public ScheduleDAGMutation {
if (TII->isZeroOverheadLoopSetupInstr(MI)) {
auto ZOLSupport = TII->getZOLSupport();
assert(ZOLSupport);
EdgeLatency = std::max(EdgeLatency, ZOLSupport->LoopSetupDistance + 1);
if (PrologueMBB && PrologueMBB->succ_size() == 1) {
// If the prologue MBB has exactly one successor, it must be the loop.
MachineBasicBlock *LoopSucc = *PrologueMBB->successors().begin();
// Exclude the LoopEnd bundle since it must reside in its own
// standalone region to ensure it points to a 128-bit aligned
// instruction.
ZOLBundlesCount = TII->getZOLBundlesCount(*LoopSucc) - 1;
}
if (ZOLBundlesCount < ZOLSupport->LoopSetupDistance)
EdgeLatency = std::max(EdgeLatency, ZOLSupport->LoopSetupDistance +
1 - ZOLBundlesCount);
}

ExitDep.setLatency(EdgeLatency);
DAG->ExitSU.addPred(ExitDep, /*Required=*/true);
}
Expand Down
61 changes: 18 additions & 43 deletions llvm/test/CodeGen/AIE/aie2/elongate/zol_112bytes_elongate2.mir
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,7 @@ body: |
; CHECK-NEXT: NOPV
; CHECK-NEXT: }
; CHECK-NEXT: BUNDLE implicit-def $r3 {
; CHECK-NEXT: NOPA
; CHECK-NEXT: NOPX
; CHECK-NEXT: renamable $r3 = MOV_mv_cg 2
; CHECK-NEXT: }
Expand All @@ -36,66 +37,45 @@ body: |
; CHECK-NEXT: $ls = MOVXM_lng_cg %bb.1
; CHECK-NEXT: }
; CHECK-NEXT: BUNDLE {
; CHECK-NEXT: NOPB
; CHECK-NEXT: NOPA
; CHECK-NEXT: NOPS
; CHECK-NEXT: NOPXM
; CHECK-NEXT: NOPV
; CHECK-NEXT: NOP
; CHECK-NEXT: }
; CHECK-NEXT: BUNDLE {
; CHECK-NEXT: NOPB
; CHECK-NEXT: NOPA
; CHECK-NEXT: NOPS
; CHECK-NEXT: NOPXM
; CHECK-NEXT: NOPV
; CHECK-NEXT: NOP
; CHECK-NEXT: }
; CHECK-NEXT: BUNDLE {
; CHECK-NEXT: NOPB
; CHECK-NEXT: NOPA
; CHECK-NEXT: NOPS
; CHECK-NEXT: NOPXM
; CHECK-NEXT: NOPV
; CHECK-NEXT: NOP
; CHECK-NEXT: }
; CHECK-NEXT: BUNDLE {
; CHECK-NEXT: NOPB
; CHECK-NEXT: NOPA
; CHECK-NEXT: NOPS
; CHECK-NEXT: NOPXM
; CHECK-NEXT: NOPV
; CHECK-NEXT: NOP
; CHECK-NEXT: }
; CHECK-NEXT: BUNDLE {
; CHECK-NEXT: NOPB
; CHECK-NEXT: NOPA
; CHECK-NEXT: NOPS
; CHECK-NEXT: NOPXM
; CHECK-NEXT: NOPV
; CHECK-NEXT: NOP
; CHECK-NEXT: }
; CHECK-NEXT: BUNDLE {
; CHECK-NEXT: NOPB
; CHECK-NEXT: NOPA
; CHECK-NEXT: NOPS
; CHECK-NEXT: NOPXM
; CHECK-NEXT: NOPV
; CHECK-NEXT: NOP
; CHECK-NEXT: }
; CHECK-NEXT: BUNDLE {
; CHECK-NEXT: NOPB
; CHECK-NEXT: NOPA
; CHECK-NEXT: NOPS
; CHECK-NEXT: NOPXM
; CHECK-NEXT: NOPV
; CHECK-NEXT: NOP
; CHECK-NEXT: }
; CHECK-NEXT: {{ $}}
; CHECK-NEXT: bb.1 (align 16):
; CHECK-NEXT: successors: %bb.1(0x7c000000), %bb.2(0x04000000)
; CHECK-NEXT: liveins: $p0, $p1, $r0, $r1, $r2, $r3
; CHECK-NEXT: {{ $}}
; CHECK-NEXT: BUNDLE implicit-def $r4, implicit $r2, implicit $r3 {
; CHECK-NEXT: NOPB
; CHECK-NEXT: NOPA
; CHECK-NEXT: NOPS
; CHECK-NEXT: renamable $r4 = LSHL renamable $r2, renamable $r3
; CHECK-NEXT: NOPM
; CHECK-NEXT: NOPV
; CHECK-NEXT: }
; CHECK-NEXT: BUNDLE implicit-def $le {
; CHECK-NEXT: NOPB
; CHECK-NEXT: NOPA
; CHECK-NEXT: NOPS
; CHECK-NEXT: $le = MOVXM_lng_cg <mcsymbol .L_1120>
; CHECK-NEXT: NOPV
; CHECK-NEXT: }
; CHECK-NEXT: BUNDLE implicit-def $dj0, implicit killed $r4 {
; CHECK-NEXT: NOPB
Expand Down Expand Up @@ -127,18 +107,13 @@ body: |
; CHECK-NEXT: NOPV
; CHECK-NEXT: }
; CHECK-NEXT: BUNDLE {
; CHECK-NEXT: NOPB
; CHECK-NEXT: NOPA
; CHECK-NEXT: NOPS
; CHECK-NEXT: NOPB
; CHECK-NEXT: NOPXM
; CHECK-NEXT: NOPV
; CHECK-NEXT: NOPS
; CHECK-NEXT: }
; CHECK-NEXT: BUNDLE {
; CHECK-NEXT: NOPB
; CHECK-NEXT: NOPA
; CHECK-NEXT: NOPS
; CHECK-NEXT: NOPXM
; CHECK-NEXT: NOPV
; CHECK-NEXT: NOP
; CHECK-NEXT: }
; CHECK-NEXT: BUNDLE {
; CHECK-NEXT: NOPB
Expand Down
33 changes: 29 additions & 4 deletions llvm/test/CodeGen/AIE/aie2/elongate/zol_elongate1.mir
Original file line number Diff line number Diff line change
Expand Up @@ -69,22 +69,47 @@ body: |
; CHECK-NEXT: NOPV
; CHECK-NEXT: }
; CHECK-NEXT: BUNDLE implicit-def $dj0, implicit killed $r4 {
; CHECK-NEXT: NOPB
; CHECK-NEXT: NOPA
; CHECK-NEXT: NOPS
; CHECK-NEXT: NOPX
; CHECK-NEXT: $dj0 = MOV_mv_scl killed $r4
; CHECK-NEXT: NOPV
; CHECK-NEXT: }
; CHECK-NEXT: BUNDLE implicit-def $r4, implicit $p1, implicit killed $dj0 {
; CHECK-NEXT: NOPB
; CHECK-NEXT: renamable $r4 = LDA_dms_lda_idx renamable $p1, killed renamable $dj0 :: (load (s32) from %ir.arrayidx)
; CHECK-NEXT: NOPS
; CHECK-NEXT: NOPXM
; CHECK-NEXT: NOPV
; CHECK-NEXT: }
; CHECK-NEXT: BUNDLE {
; CHECK-NEXT: NOP
; CHECK-NEXT: NOPB
; CHECK-NEXT: NOPA
; CHECK-NEXT: NOPS
; CHECK-NEXT: NOPXM
; CHECK-NEXT: NOPV
; CHECK-NEXT: }
; CHECK-NEXT: BUNDLE {
; CHECK-NEXT: NOP
; CHECK-NEXT: NOPB
; CHECK-NEXT: NOPA
; CHECK-NEXT: NOPS
; CHECK-NEXT: NOPXM
; CHECK-NEXT: NOPV
; CHECK-NEXT: }
; CHECK-NEXT: BUNDLE {
; CHECK-NEXT: NOP
; CHECK-NEXT: NOPB
; CHECK-NEXT: NOPA
; CHECK-NEXT: NOPS
; CHECK-NEXT: NOPXM
; CHECK-NEXT: NOPV
; CHECK-NEXT: }
; CHECK-NEXT: BUNDLE {
; CHECK-NEXT: NOP
; CHECK-NEXT: NOPB
; CHECK-NEXT: NOPA
; CHECK-NEXT: NOPS
; CHECK-NEXT: NOPXM
; CHECK-NEXT: NOPV
; CHECK-NEXT: }
; CHECK-NEXT: BUNDLE {
; CHECK-NEXT: NOP
Expand Down
Loading