Skip to content
Open
Show file tree
Hide file tree
Changes from 4 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
25 changes: 19 additions & 6 deletions llvm/lib/Target/AIE/AIEInterBlockScheduling.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -618,7 +618,14 @@ SchedulingStage InterBlockScheduling::updateScheduling(BlockState &BS) {
if (BS.getRegions().size() == 1) {
auto &PostSWP = BS.getPostSWP();
if (PostSWP.isPostPipelineCandidate(*BS.TheBlock)) {
BS.FixPoint.II = PostSWP.getResMII(*BS.TheBlock);
// A CLI --aie-postpipeliner-target-ii is a hard limit: start at
// exactly that II (bypassing --aie-postpipeliner-maxii) and let
// updatePipelining one-shot it. A pragma-driven TargetII is a soft
// hint: start at ResMII and iterate normally; the solver fallback at
// II == TargetII is handled inside the post-pipeliner.
BS.FixPoint.II = PostSWP.isTargetIIHardLimit()
? PostSWP.getTargetII()
: PostSWP.getResMII(*BS.TheBlock);
Copy link
Copy Markdown
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think we should simplify. When we drive an example from the command line, we want to say where we start, where we stop. Orthogonal to that, we want to say which approaches to enable. In my branch I have introduced a MinII CLI.

BS.FixPoint.IITries = 1;
return SchedulingStage::Pipelining;
}
Expand All @@ -632,11 +639,17 @@ SchedulingStage InterBlockScheduling::updatePipelining(BlockState &BS) {
return BS.FixPoint.Stage;
}

// Otherwise try a larger II.
// We cut off at larger IIs to prevent excessive compilation time.
if (++BS.FixPoint.II <= PostPipelinerMaxII &&
++BS.FixPoint.IITries <= PostPipelinerMaxTryII) {
return SchedulingStage::Pipelining;
// A CLI --aie-postpipeliner-target-ii is one-shot: try only the requested
// II, even if it exceeds --aie-postpipeliner-maxii. If that attempt
// failed, do not try any other II. A pragma-driven TargetII keeps the
// normal iteration (ResMII..MaxII).
if (!BS.getPostSWP().isTargetIIHardLimit()) {
// Otherwise try a larger II.
// We cut off at larger IIs to prevent excessive compilation time.
if (++BS.FixPoint.II <= PostPipelinerMaxII &&
++BS.FixPoint.IITries <= PostPipelinerMaxTryII) {
return SchedulingStage::Pipelining;
}
Copy link
Copy Markdown
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

So, a one-shot attempt can be built from MinII and MaxII and enabling the algorithms that you want to act on it.

}

auto *BB = BS.TheBlock;
Expand Down
97 changes: 70 additions & 27 deletions llvm/lib/Target/AIE/AIEPostPipeliner.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -44,9 +44,16 @@ static cl::opt<int>
cl::desc("Number of runs for heuristics that converge"),
cl::init(20), cl::Hidden);

static cl::opt<int> PresetII("aie-postpipeliner-target-ii",
cl::desc("II for which to allow the solver"),
cl::init(0), cl::Hidden);
static cl::opt<bool>
UseSolver("aie-postpipeliner-solver",
cl::desc("Use the solver as fallback after heuristics fail"),
cl::init(false), cl::Hidden);

static cl::opt<int>
PresetII("aie-postpipeliner-target-ii",
cl::desc("Run solver-only at this II; bypasses MaxII and "
"skips heuristics"),
cl::init(0), cl::Hidden);

PipelineScheduleVisitor::~PipelineScheduleVisitor() {}

Expand Down Expand Up @@ -160,16 +167,35 @@ bool PostPipeliner::isPostPipelineCandidate(MachineBasicBlock &LoopBlock) {
return false;
}

if (PresetII) {
TargetII = PresetII;
// No solver backend compiled in: TargetII/--aie-postpipeliner-solver
// are no-ops. Keep pre-commit behavior (heuristics only).
if (!Solver::hasSolver()) {
const bool AnyRequest =
PresetII || UseSolver || getInitiationInterval(getLoopID(LoopBlock));
if (AnyRequest) {
DEBUG_SUMMARY(
dbgs() << " PostPipeliner: ignoring TargetII/solver request, "
"no solver compiled in\n");
}
return true;
}
auto ParsedInitiationInterval = getInitiationInterval(getLoopID(LoopBlock));
if (ParsedInitiationInterval) {
TargetII = *ParsedInitiationInterval;
DEBUG_SUMMARY(dbgs() << " PostPipeliner: TargetII=" << TargetII << "\n");

// --aie-postpipeliner-target-ii: hard one-shot. Bypasses MaxII and
// skips heuristics; only the solver runs at exactly this II.
if (PresetII) {
TargetII = PresetII;
TargetIIIsHardLimit = true;
} else if (!UseSolver) {
// Pragma soft hint: heuristics iterate normally and the solver runs
// at II == TargetII. --aie-postpipeliner-solver overrides this.
if (const auto Pragma = getInitiationInterval(getLoopID(LoopBlock)))
TargetII = *Pragma;
}

if (TargetII)
DEBUG_SUMMARY(dbgs() << " PostPipeliner: TargetII=" << TargetII
<< (TargetIIIsHardLimit ? " (hard)" : " (soft)")
<< "\n");
return true;
}

Expand Down Expand Up @@ -1431,8 +1457,7 @@ static const ConfigStrategy::Configuration Heuristics[] = {
{1, false, false, 1, {Prio::NodeNum}, {}}, // pure bottom up
};

bool PostPipeliner::tryApproaches() {
DEBUG_SUMMARY(dbgs() << "-- MinLength=" << MinLength << "\n");
bool PostPipeliner::runHeuristics() {
int HeuristicIndex = 0;
for (const auto &Config : Heuristics) {
if (Heuristic >= 0 && Heuristic != HeuristicIndex++) {
Expand All @@ -1459,27 +1484,45 @@ bool PostPipeliner::tryApproaches() {
}
DEBUG_SUMMARY(dbgs() << " Strategy " << S.name() << " failed\n");
}
// Last-chance heuristic: relax the iteration-count constraint.
IterCountSlackStrategy Relaxed(*DAG, Info, MinLength + II);
resetSchedule(/*FullReset=*/true);
if (scheduleWithStrategy(Relaxed)) {
return scheduleWithStrategy(Relaxed);
}

bool PostPipeliner::runSolverFallback() {
const SolverData Data = createSolverData();
const int NS = MinLength / II;
if (solve(Data, NS, false)) {
return true;
}

// TargetII is the OK from the user to spend some time reaching this II.
// Therefore, if we haven't found a solution yet, bring in the big guns.
if (II == TargetII) {
const SolverData Data = createSolverData();
int NS = MinLength / II;
if (solve(Data, NS, false)) {
return true;
}
if (NS == MinTripCount) {
// Only try this at the boundary case
if (solve(Data, NS + 1, true)) {
return true;
}
}
// Let's try SEF solution.
if (solve(Data, NS + 1, true)) {
return true;
}
// Marsshot: last try with full NS + 1.
return solve(Data, NS + 1, false);
}

bool PostPipeliner::tryApproaches() {
DEBUG_SUMMARY(dbgs() << "-- MinLength=" << MinLength << "\n");

// CLI --aie-postpipeliner-target-ii: solver-only, skip heuristics.
const bool SolverOnly = TargetIIIsHardLimit;
const bool RunHeuristics = !SolverOnly;

// Solver runs at this II if the user asked for solver fallback at every
// II, or this II matches a TargetII (CLI hard or pragma soft hint).
const bool SolverAtThisII =
UseSolver || SolverOnly || (TargetII != 0 && II == TargetII);
// Belt-and-braces re-check: never call solve() with no backend, even
// though isPostPipelineCandidate already filtered the request out.
const bool RunSolver = Solver::hasSolver() && SolverAtThisII;

if (RunHeuristics && runHeuristics())
return true;
if (RunSolver && runSolverFallback())
return true;

DEBUG_SUMMARY(dbgs() << "=== II=" << II << " Failed ===\n");
return false;
Expand Down
25 changes: 22 additions & 3 deletions llvm/lib/Target/AIE/AIEPostPipeliner.h
Original file line number Diff line number Diff line change
Expand Up @@ -247,11 +247,14 @@ class PostPipeliner {
/// The minimum tripcount, read from the pragma, or from an LC initialization.
int MinTripCount = 0;

/// The II requested by a pragma. This will trigger expensive algorithms
/// like solvers or exhaustive searches to be run if the heuristic methods
/// don't find a solution.
/// User/pragma-requested II at which the solver is additionally run.
/// Stays 0 when no solver backend is compiled in.
int TargetII = 0;

/// True when TargetII is a hard CLI one-shot (skip heuristics, bypass
/// MaxII), false when it's a soft pragma hint.
bool TargetIIIsHardLimit = false;

/// The Preheader of the loop.
MachineBasicBlock *Preheader = nullptr;

Expand Down Expand Up @@ -316,6 +319,14 @@ class PostPipeliner {
/// If it returns true, a valid schedule is laid down in Info.
bool tryApproaches();

/// Run the heuristic strategies (each ConfigStrategy plus the relaxed
/// IterCountSlackStrategy fallback) at the current II.
bool runHeuristics();

/// Run the solver-based last-resort attempts at the current II:
/// (NS, !SEF), (NS+1, SEF), (NS+1, !SEF).
bool runSolverFallback();

/// Find the first available unscheduled instruction with the highest
/// priority.
int mostUrgent(PostPipelinerStrategy &Strategy);
Expand Down Expand Up @@ -346,6 +357,14 @@ class PostPipeliner {
/// \pre isPostPipelineCandidate has returned true
int getResMII(MachineBasicBlock &LoopBlock);

/// Return the II requested by the user (command-line option or loop pragma).
/// Returns 0 if no such request was recorded.
/// \pre isPostPipelineCandidate has returned true
int getTargetII() const { return TargetII; }

/// Return true when TargetII is a hard one-shot limit set from the command
/// line, and false when it is a soft pragma hint or no TargetII is set.
/// \pre isPostPipelineCandidate has returned true
bool isTargetIIHardLimit() const { return TargetIIIsHardLimit; }

// Schedule using the given InitiationInterval. Return true when successful.
// In that case calls to the query methods below are legitimate.
bool schedule(ScheduleDAGMI &DAG, int InitiationInterval,
Expand Down
8 changes: 8 additions & 0 deletions llvm/lib/Target/AIE/AIESWPSolver.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -58,6 +58,14 @@ std::vector<std::unique_ptr<SWPSolver>> getSolvers() {
return Solvers;
}

/// Report whether a solver backend is compiled into this build.
/// The answer is fixed at compile time by the LLVM_WITH_Z3 macro.
bool hasSolver() {
#if LLVM_WITH_Z3
  constexpr bool BackendAvailable = true;
#else
  constexpr bool BackendAvailable = false;
#endif // LLVM_WITH_Z3
  return BackendAvailable;
}

Slot &SolverData::addSlot(int N) {
auto It = Slots.emplace(N, Slot(N)).first;
return It->second;
Expand Down
7 changes: 6 additions & 1 deletion llvm/lib/Target/AIE/AIESWPSolver.h
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
// (c) Copyright 2025 Advanced Micro Devices, Inc. or its affiliates
// (c) Copyright 2025-2026 Advanced Micro Devices, Inc. or its affiliates
//
//===----------------------------------------------------------------------===//

Expand Down Expand Up @@ -183,6 +183,11 @@ class SWPSolver {
// Return the set of solvers to try
std::vector<std::unique_ptr<SWPSolver>> getSolvers();

/// Return true if at least one SWP solver backend is compiled into this
/// build. When false, getSolvers() returns an empty vector and any code
/// path that depends on the solver must fall back gracefully.
bool hasSolver();

#if LLVM_WITH_Z3
class Z3Solver : public SWPSolver {
protected:
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -3,13 +3,14 @@
# See https://llvm.org/LICENSE.txt for license information.
# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
#
# (c) Copyright 2025 Advanced Micro Devices, Inc. or its affiliates
# (c) Copyright 2025-2026 Advanced Micro Devices, Inc. or its affiliates

# REQUIRES: enable_z3_solver

# RUN: llc -verify-machineinstrs --mtriple=aie2p -O2 %s \
# RUN: --start-before=postmisched \
# RUN: --aie-postpipeliner-heuristic-runs=1 \
# RUN: --aie-postpipeliner-solver \
# RUN: -o - | FileCheck %s


Expand All @@ -22,65 +23,64 @@
; CHECK-LABEL: gemm:
; CHECK: .p2align 4
; CHECK-NEXT: // %bb.0: // %entry
; CHECK-NEXT: nopa ; vldb x4, [p7, #64]; nopxm
; CHECK-NEXT: vldb x4, [p7, #64]; nopx
; CHECK-NEXT: vldb.3d x7, [p7], d0; movs p4, p7
; CHECK-NEXT: paddb [p4], m4
; CHECK-NEXT: vldb x9, [p4, #0]
; CHECK-NEXT: vldb x5, [p4, #64]
; CHECK-NEXT: vldb x9, [p4, #0]
; CHECK-NEXT: nop
; CHECK-NEXT: nop
; CHECK-NEXT: nop
; CHECK-NEXT: vldb x4, [p7, #64]; mov p5, p6
; CHECK-NEXT: movs p4, p7; vldb.3d x7, [p7], d0; vshuffle x6, x7, x4, r0
; CHECK-NEXT: vlda.conv.fp32.bf16 cmh4, [p6, #64]; paddb [p4], m4; vshuffle x7, x7, x4, r1
; CHECK-NEXT: vlda.3d.conv.fp32.bf16 cml4, [p6], d1; vldb x9, [p4, #0]; vshuffle x8, x9, x5, r0
; CHECK-NEXT: vldb x5, [p4, #64]; vshuffle x9, x9, x5, r1
; CHECK-NEXT: vlda.3d.conv.fp32.bf16 cml4, [p6], d1; vldb x5, [p4, #64]; vshuffle x8, x9, x5, r0
; CHECK-NEXT: vldb x9, [p4, #0]; vshuffle x9, x9, x5, r1
; CHECK-NEXT: padda [p5], m5; add.nc lc, r0, #-3; vmul.f dm4, y3, y5, r2
; CHECK-NEXT: vlda.conv.fp32.bf16 cml4, [p5, #0]; movxm ls, #.LBB0_1; vmul.f dm4, y4, y5, r2
; CHECK-NEXT: vlda.conv.fp32.bf16 cmh4, [p5, #64]; movxm le, #.L_LEnd0
; CHECK-NEXT: vlda.conv.fp32.bf16 cmh4, [p5, #64]; movxm ls, #.LBB0_1; vmul.f dm4, y4, y5, r2
; CHECK-NEXT: vlda.conv.fp32.bf16 cml4, [p5, #0]; movxm le, #.L_LEnd0
; CHECK-NEXT: vldb x4, [p7, #64]; mov p5, p6
; CHECK-NEXT: movs p4, p7; vldb.3d x7, [p7], d0; vshuffle x6, x7, x4, r0
; CHECK-NEXT: vlda.conv.fp32.bf16 cmh4, [p6, #64]; paddb [p4], m4; vconv.bfp16ebs8.fp32 ex0, dm4; vshuffle x7, x7, x4, r1
; CHECK-NEXT: vlda.3d.conv.fp32.bf16 cml4, [p6], d1; vldb x9, [p4, #0]; vconv.bfp16ebs8.fp32 ex1, dm4; vshuffle x8, x9, x5, r0
; CHECK-NEXT: vconv.bfp16ebs8.fp32 ex3, dm4; vldb x5, [p4, #64]; vshuffle x9, x9, x5, r1
; CHECK-NEXT: vlda.3d.conv.fp32.bf16 cml4, [p6], d1; vldb x5, [p4, #64]; vconv.bfp16ebs8.fp32 ex1, dm4; vshuffle x8, x9, x5, r0
; CHECK-NEXT: .p2align 4
; CHECK-NEXT: .LBB0_1: // %for.body
; CHECK-NEXT: // =>This Inner Loop Header: Depth=1
; CHECK-NEXT: padda [p5], m5; nopb ; nops ; nopxm ; vmul.f dm4, y3, y5, r2
; CHECK-NEXT: vlda.conv.fp32.bf16 cml4, [p5, #0]; vconv.bfp16ebs8.fp32 ex2, dm4; vmul.f dm4, y4, y5, r2
; CHECK-NEXT: vlda.conv.fp32.bf16 cmh4, [p5, #64]
; CHECK-NEXT: vldb x4, [p7, #64]; mov p5, p6
; CHECK-NEXT: movs p4, p7; vldb.3d x7, [p7], d0; vshuffle x6, x7, x4, r0; vmac.f dm3, dm3, ex0, ex1, r3
; CHECK-NEXT: vlda.conv.fp32.bf16 cmh4, [p6, #64]; paddb [p4], m4; vconv.bfp16ebs8.fp32 ex0, dm4; nopx ; vshuffle x7, x7, x4, r1; vmac.f dm0, dm0, ex2, ex3, r3
; CHECK-NEXT: vlda.3d.conv.fp32.bf16 cml4, [p6], d1; vldb x9, [p4, #0]; vconv.bfp16ebs8.fp32 ex1, dm4; nopx ; vshuffle x8, x9, x5, r0; vmac.f dm1, dm1, ex2, ex1, r3
; CHECK-NEXT: nopa ; vldb x9, [p4, #0]; vconv.bfp16ebs8.fp32 ex3, dm4; nopx ; vshuffle x9, x9, x5, r1; nopv
; CHECK-NEXT: padda [p5], m5; nopb ; nopx ; vmul.f dm4, y3, y5, r2
; CHECK-NEXT: vlda.conv.fp32.bf16 cmh4, [p5, #64]; vconv.bfp16ebs8.fp32 ex2, dm4; vmul.f dm4, y4, y5, r2
; CHECK-NEXT: vlda.conv.fp32.bf16 cml4, [p5, #0]
; CHECK-NEXT: vldb x4, [p7, #64]; mov p5, p6; vmac.f dm3, dm3, ex0, ex1, r3
; CHECK-NEXT: movs p4, p7; vldb.3d x7, [p7], d0; vshuffle x6, x7, x4, r0; vmac.f dm2, dm2, ex0, ex3, r3
; CHECK-NEXT: vlda.conv.fp32.bf16 cmh4, [p6, #64]; paddb [p4], m4; vconv.bfp16ebs8.fp32 ex0, dm4; nopx ; vshuffle x7, x7, x4, r1; vmac.f dm1, dm1, ex2, ex1, r3
; CHECK-NEXT: .L_LEnd0:
; CHECK-NEXT: nopa ; vldb x5, [p4, #64]; vconv.bfp16ebs8.fp32 ex3, dm4; nopx ; vshuffle x9, x9, x5, r1; vmac.f dm2, dm2, ex0, ex3, r3
; CHECK-NEXT: vlda.3d.conv.fp32.bf16 cml4, [p6], d1; vldb x5, [p4, #64]; vconv.bfp16ebs8.fp32 ex1, dm4; nopx ; vshuffle x8, x9, x5, r0; vmac.f dm0, dm0, ex2, ex3, r3
; CHECK-NEXT: .p2align 4
; CHECK-NEXT: // %bb.2:
; CHECK-NEXT: padda [p5], m5; nopb ; nops ; nopxm ; vmul.f dm4, y3, y5, r2
; CHECK-NEXT: vlda.conv.fp32.bf16 cml4, [p5, #0]; nopb ; nopx ; vconv.bfp16ebs8.fp32 ex2, dm4; vmul.f dm4, y4, y5, r2
; CHECK-NEXT: vlda.conv.fp32.bf16 cmh4, [p5, #64]
; CHECK-NEXT: mov p5, p6
; CHECK-NEXT: vshuffle x6, x7, x4, r0; vmac.f dm3, dm3, ex0, ex1, r3
; CHECK-NEXT: vlda.conv.fp32.bf16 cmh4, [p6, #64]; vconv.bfp16ebs8.fp32 ex0, dm4; vshuffle x7, x7, x4, r1; vmac.f dm0, dm0, ex2, ex3, r3
; CHECK-NEXT: vlda.3d.conv.fp32.bf16 cml4, [p6], d1; vconv.bfp16ebs8.fp32 ex1, dm4; vshuffle x8, x9, x5, r0; vmac.f dm1, dm1, ex2, ex1, r3
; CHECK-NEXT: vconv.bfp16ebs8.fp32 ex3, dm4; vshuffle x9, x9, x5, r1; vmac.f dm2, dm2, ex0, ex3, r3
; CHECK-NEXT: nopa ; vldb x9, [p4, #0]; nopx ; vshuffle x9, x9, x5, r1; vconv.bfp16ebs8.fp32 ex3, dm4
; CHECK-NEXT: padda [p5], m5; vmul.f dm4, y3, y5, r2
; CHECK-NEXT: vlda.conv.fp32.bf16 cml4, [p5, #0]; vconv.bfp16ebs8.fp32 ex2, dm4; vmul.f dm4, y4, y5, r2
; CHECK-NEXT: vlda.conv.fp32.bf16 cmh4, [p5, #64]
; CHECK-NEXT: nop
; CHECK-NEXT: vlda.conv.fp32.bf16 cmh4, [p5, #64]; vconv.bfp16ebs8.fp32 ex2, dm4; vmul.f dm4, y4, y5, r2
; CHECK-NEXT: vlda.conv.fp32.bf16 cml4, [p5, #0]
; CHECK-NEXT: mov p5, p6; vmac.f dm3, dm3, ex0, ex1, r3
; CHECK-NEXT: vshuffle x6, x7, x4, r0; vmac.f dm2, dm2, ex0, ex3, r3
; CHECK-NEXT: vlda.conv.fp32.bf16 cmh4, [p6, #64]; vconv.bfp16ebs8.fp32 ex0, dm4; vshuffle x7, x7, x4, r1; vmac.f dm1, dm1, ex2, ex1, r3
; CHECK-NEXT: vlda.3d.conv.fp32.bf16 cml4, [p6], d1; vconv.bfp16ebs8.fp32 ex1, dm4; vshuffle x8, x9, x5, r0; vmac.f dm0, dm0, ex2, ex3, r3
; CHECK-NEXT: vconv.bfp16ebs8.fp32 ex3, dm4; vshuffle x9, x9, x5, r1
; CHECK-NEXT: padda [p5], m5; vmul.f dm4, y3, y5, r2
; CHECK-NEXT: vlda.conv.fp32.bf16 cmh4, [p5, #64]; vconv.bfp16ebs8.fp32 ex2, dm4; vmul.f dm4, y4, y5, r2
; CHECK-NEXT: vlda.conv.fp32.bf16 cml4, [p5, #0]
; CHECK-NEXT: vmac.f dm3, dm3, ex0, ex1, r3
; CHECK-NEXT: vconv.bfp16ebs8.fp32 ex0, dm4; vmac.f dm0, dm0, ex2, ex3, r3
; CHECK-NEXT: vconv.bfp16ebs8.fp32 ex1, dm4; vmac.f dm1, dm1, ex2, ex1, r3
; CHECK-NEXT: vconv.bfp16ebs8.fp32 ex3, dm4; vmac.f dm2, dm2, ex0, ex3, r3
; CHECK-NEXT: vmac.f dm2, dm2, ex0, ex3, r3
; CHECK-NEXT: vconv.bfp16ebs8.fp32 ex0, dm4; vmac.f dm1, dm1, ex2, ex1, r3
; CHECK-NEXT: vconv.bfp16ebs8.fp32 ex1, dm4; vmac.f dm0, dm0, ex2, ex3, r3
; CHECK-NEXT: vconv.bfp16ebs8.fp32 ex3, dm4
; CHECK-NEXT: nop
; CHECK-NEXT: vconv.bfp16ebs8.fp32 ex2, dm4
; CHECK-NEXT: nop
; CHECK-NEXT: nop
; CHECK-NEXT: vmac.f dm3, dm3, ex0, ex1, r3
; CHECK-NEXT: vmac.f dm0, dm0, ex2, ex3, r3
; CHECK-NEXT: vmac.f dm1, dm1, ex2, ex1, r3
; CHECK-NEXT: vmac.f dm2, dm2, ex0, ex3, r3
; CHECK-NEXT: vmac.f dm1, dm1, ex2, ex1, r3
; CHECK-NEXT: vmac.f dm0, dm0, ex2, ex3, r3
; CHECK-NEXT: ret lr
; CHECK-NEXT: nop // Delay Slot 5
; CHECK-NEXT: nop // Delay Slot 4
Expand Down
Loading