Xilinx · F-Stuckmann · May 13, 2026 · Apr 15, 2026 · Apr 15, 2026 · Apr 15, 2026
@@ -618,7 +618,14 @@ SchedulingStage InterBlockScheduling::updateScheduling(BlockState &BS) {
   if (BS.getRegions().size() == 1) {
     auto &PostSWP = BS.getPostSWP();
     if (PostSWP.isPostPipelineCandidate(*BS.TheBlock)) {
-      BS.FixPoint.II = PostSWP.getResMII(*BS.TheBlock);
+      // A CLI --aie-postpipeliner-target-ii is a hard limit: start at
+      // exactly that II (bypassing --aie-postpipeliner-maxii) and let
+      // updatePipelining one-shot it. A pragma-driven TargetII is a soft
+      // hint: start at ResMII and iterate normally; the solver fallback at
+      // II == TargetII is handled inside the post-pipeliner.
+      BS.FixPoint.II = PostSWP.isTargetIIHardLimit()
+                           ? PostSWP.getTargetII()
+                           : PostSWP.getResMII(*BS.TheBlock);
       BS.FixPoint.IITries = 1;
       return SchedulingStage::Pipelining;
     }
@@ -632,11 +639,17 @@ SchedulingStage InterBlockScheduling::updatePipelining(BlockState &BS) {
     return BS.FixPoint.Stage;
   }
 
-  // Otherwise try a larger II.
-  // We cut off at larger IIs to prevent excessive compilation time.
-  if (++BS.FixPoint.II <= PostPipelinerMaxII &&
-      ++BS.FixPoint.IITries <= PostPipelinerMaxTryII) {
-    return SchedulingStage::Pipelining;
+  // A CLI --aie-postpipeliner-target-ii is one-shot: try only the requested
+  // II, even if it exceeds --aie-postpipeliner-maxii. If that attempt
+  // failed, do not try any other II. A pragma-driven TargetII keeps the
+  // normal iteration (ResMII..MaxII).
+  if (!BS.getPostSWP().isTargetIIHardLimit()) {
+    // Otherwise try a larger II.
+    // We cut off at larger IIs to prevent excessive compilation time.
+    if (++BS.FixPoint.II <= PostPipelinerMaxII &&
+        ++BS.FixPoint.IITries <= PostPipelinerMaxTryII) {
+      return SchedulingStage::Pipelining;
+    }
   }
 
   auto *BB = BS.TheBlock;

@@ -44,9 +44,16 @@ static cl::opt<int>
                   cl::desc("Number of runs for heuristics that converge"),
                   cl::init(20), cl::Hidden);
 
-static cl::opt<int> PresetII("aie-postpipeliner-target-ii",
-                             cl::desc("II for which to allow the solver"),
-                             cl::init(0), cl::Hidden);
+static cl::opt<bool>
+    UseSolver("aie-postpipeliner-solver",
+              cl::desc("Use the solver as fallback after heuristics fail"),
+              cl::init(false), cl::Hidden);
+
+static cl::opt<int>
+    PresetII("aie-postpipeliner-target-ii",
+             cl::desc("Run solver-only at this II; bypasses MaxII and "
+                      "skips heuristics"),
+             cl::init(0), cl::Hidden);
 
 PipelineScheduleVisitor::~PipelineScheduleVisitor() {}
 
@@ -160,16 +167,35 @@ bool PostPipeliner::isPostPipelineCandidate(MachineBasicBlock &LoopBlock) {
     return false;
   }
 
-  if (PresetII) {
-    TargetII = PresetII;
+  // No solver backend compiled in: TargetII/--aie-postpipeliner-solver
+  // are no-ops, so TargetII stays 0 and only the heuristics run.
+  if (!Solver::hasSolver()) {
+    const bool AnyRequest =
+        PresetII || UseSolver || getInitiationInterval(getLoopID(LoopBlock));
+    if (AnyRequest) {
+      DEBUG_SUMMARY(
+          dbgs() << " PostPipeliner: ignoring TargetII/solver request, "
+                    "no solver compiled in\n");
+    }
     return true;
   }
-  auto ParsedInitiationInterval = getInitiationInterval(getLoopID(LoopBlock));
-  if (ParsedInitiationInterval) {
-    TargetII = *ParsedInitiationInterval;
-    DEBUG_SUMMARY(dbgs() << " PostPipeliner: TargetII=" << TargetII << "\n");
+
+  // --aie-postpipeliner-target-ii: hard one-shot. Bypasses MaxII and
+  // skips heuristics; only the solver runs at exactly this II.
+  if (PresetII) {
+    TargetII = PresetII;
+    TargetIIIsHardLimit = true;
+  } else if (!UseSolver) {
+    // Pragma soft hint: heuristics iterate normally and the solver runs
+    // at II == TargetII. --aie-postpipeliner-solver overrides this.
+    if (const auto Pragma = getInitiationInterval(getLoopID(LoopBlock)))
+      TargetII = *Pragma;
   }
 
+  if (TargetII)
+    DEBUG_SUMMARY(dbgs() << " PostPipeliner: TargetII=" << TargetII
+                         << (TargetIIIsHardLimit ? " (hard)" : " (soft)")
+                         << "\n");
   return true;
 }
 
@@ -1431,8 +1457,7 @@ static const ConfigStrategy::Configuration Heuristics[] = {
     {1, false, false, 1, {Prio::NodeNum}, {}}, // pure bottom up
 };
 
-bool PostPipeliner::tryApproaches() {
-  DEBUG_SUMMARY(dbgs() << "-- MinLength=" << MinLength << "\n");
+bool PostPipeliner::runHeuristics() {
   int HeuristicIndex = 0;
   for (const auto &Config : Heuristics) {
     if (Heuristic >= 0 && Heuristic != HeuristicIndex++) {
@@ -1459,27 +1484,45 @@ bool PostPipeliner::tryApproaches() {
     }
     DEBUG_SUMMARY(dbgs() << "    Strategy " << S.name() << " failed\n");
   }
+  // Last-chance heuristic: relax the iteration-count constraint.
   IterCountSlackStrategy Relaxed(*DAG, Info, MinLength + II);
   resetSchedule(/*FullReset=*/true);
-  if (scheduleWithStrategy(Relaxed)) {
+  return scheduleWithStrategy(Relaxed);
+}
+
+bool PostPipeliner::runSolverFallback() {
+  const SolverData Data = createSolverData();
+  const int NS = MinLength / II;
+  if (solve(Data, NS, false)) {
     return true;
   }
-
-  // TargetII is the OK from the user to spend some time reaching this II.
-  // Therefore, if we haven't found a solution yet, bring in the big guns.
-  if (II == TargetII) {
-    const SolverData Data = createSolverData();
-    int NS = MinLength / II;
-    if (solve(Data, NS, false)) {
-      return true;
-    }
-    if (NS == MinTripCount) {
-      // Only try this at the boundary case
-      if (solve(Data, NS + 1, true)) {
-        return true;
-      }
-    }
+  // Next, retry at NS + 1 with the SEF flag set.
+  if (solve(Data, NS + 1, true)) {
+    return true;
   }
+  // Last resort: retry at NS + 1 with the SEF flag cleared.
+  return solve(Data, NS + 1, false);
+}
+
+bool PostPipeliner::tryApproaches() {
+  DEBUG_SUMMARY(dbgs() << "-- MinLength=" << MinLength << "\n");
+
+  // CLI --aie-postpipeliner-target-ii: solver-only, skip heuristics.
+  const bool SolverOnly = TargetIIIsHardLimit;
+  const bool RunHeuristics = !SolverOnly;
+
+  // Solver runs at this II if the user asked for solver fallback at every
+  // II, or this II matches a TargetII (CLI hard or pragma soft hint).
+  const bool SolverAtThisII =
+      UseSolver || SolverOnly || (TargetII != 0 && II == TargetII);
+  // Never call solve() with no backend. isPostPipelineCandidate already
+  // drops TargetII requests in that case, but UseSolver is only gated here.
+  const bool RunSolver = Solver::hasSolver() && SolverAtThisII;
+
+  if (RunHeuristics && runHeuristics())
+    return true;
+  if (RunSolver && runSolverFallback())
+    return true;
 
   DEBUG_SUMMARY(dbgs() << "=== II=" << II << " Failed ===\n");
   return false;

@@ -247,11 +247,14 @@ class PostPipeliner {
   /// The minimum tripcount, read from the pragma, or from an LC initialization.
   int MinTripCount = 0;
 
-  /// The II requested by a pragma. This will trigger expensive algorithms
-  /// like solvers or exhaustive searches to be run if the heuristic methods
-  /// don't find a solution.
+  /// User/pragma-requested II at which the solver is additionally run.
+  /// Stays 0 when no solver backend is compiled in.
   int TargetII = 0;
 
+  /// True when TargetII is a hard CLI one-shot (skip heuristics, bypass
+  /// MaxII), false when it's a soft pragma hint.
+  bool TargetIIIsHardLimit = false;
+
   /// The Preheader of the loop.
   MachineBasicBlock *Preheader = nullptr;
 
@@ -316,6 +319,14 @@ class PostPipeliner {
   /// If it returns true, a valid schedule is laid down in Info.
   bool tryApproaches();
 
+  /// Run the heuristic strategies (each ConfigStrategy plus the relaxed
+  /// IterCountSlackStrategy fallback) at the current II.
+  bool runHeuristics();
+
+  /// Run the solver-based last-resort attempts at the current II:
+  /// (NS, !SEF), (NS+1, SEF), (NS+1, !SEF).
+  bool runSolverFallback();
+
   /// Find the first available unscheduled instruction with the highest
   /// priority.
   int mostUrgent(PostPipelinerStrategy &Strategy);
@@ -346,6 +357,14 @@ class PostPipeliner {
   /// \pre isPostPipelineCandidate has returned true
   int getResMII(MachineBasicBlock &LoopBlock);
 
+  /// Return the user/pragma-requested II, or 0 if none was set.
+  /// \pre isPostPipelineCandidate has returned true
+  int getTargetII() const { return TargetII; }
+
+  /// True when TargetII is a hard CLI one-shot (vs a soft pragma hint).
+  /// \pre isPostPipelineCandidate has returned true
+  bool isTargetIIHardLimit() const { return TargetIIIsHardLimit; }
+
   // Schedule using the given InitiationInterval. Return true when successful.
   // In that case calls to the query methods below are legitimate.
   bool schedule(ScheduleDAGMI &DAG, int InitiationInterval,

@@ -58,6 +58,14 @@ std::vector<std::unique_ptr<SWPSolver>> getSolvers() {
   return Solvers;
 }
 
+bool hasSolver() {
+#if LLVM_WITH_Z3
+  return true;
+#else
+  return false;
+#endif // LLVM_WITH_Z3
+}
+
 Slot &SolverData::addSlot(int N) {
   auto It = Slots.emplace(N, Slot(N)).first;
   return It->second;

@@ -4,7 +4,7 @@
 // See https://llvm.org/LICENSE.txt for license information.
 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 //
-// (c) Copyright 2025 Advanced Micro Devices, Inc. or its affiliates
+// (c) Copyright 2025-2026 Advanced Micro Devices, Inc. or its affiliates
 //
 //===----------------------------------------------------------------------===//
 
@@ -183,6 +183,11 @@ class SWPSolver {
 // Return the set of solvers to try
 std::vector<std::unique_ptr<SWPSolver>> getSolvers();
 
+/// Return true if at least one SWP solver backend is compiled into this
+/// build. When false, getSolvers() returns an empty vector and any code
+/// path that depends on the solver must fall back gracefully.
+bool hasSolver();
+
 #if LLVM_WITH_Z3
 class Z3Solver : public SWPSolver {
 protected:

@@ -3,13 +3,14 @@
 # See https://llvm.org/LICENSE.txt for license information.
 # SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 #
-# (c) Copyright 2025 Advanced Micro Devices, Inc. or its affiliates
+# (c) Copyright 2025-2026 Advanced Micro Devices, Inc. or its affiliates
 
 # REQUIRES: enable_z3_solver
 
 # RUN: llc -verify-machineinstrs --mtriple=aie2p -O2 %s \
 # RUN:   --start-before=postmisched \
 # RUN:   --aie-postpipeliner-heuristic-runs=1 \
+# RUN:   --aie-postpipeliner-solver \
 # RUN:   -o - | FileCheck %s
 
 
@@ -22,65 +23,64 @@
   ; CHECK-LABEL: gemm:
   ; CHECK:         .p2align 4
   ; CHECK-NEXT:  // %bb.0: // %entry
-  ; CHECK-NEXT:    nopa ; vldb x4, [p7, #64]; nopxm
+  ; CHECK-NEXT:    vldb x4, [p7, #64]; nopx
   ; CHECK-NEXT:    vldb.3d x7, [p7], d0; movs p4, p7
   ; CHECK-NEXT:    paddb [p4], m4
-  ; CHECK-NEXT:    vldb x9, [p4, #0]
   ; CHECK-NEXT:    vldb x5, [p4, #64]
+  ; CHECK-NEXT:    vldb x9, [p4, #0]
   ; CHECK-NEXT:    nop
   ; CHECK-NEXT:    nop
   ; CHECK-NEXT:    nop
   ; CHECK-NEXT:    vldb x4, [p7, #64]; mov p5, p6
   ; CHECK-NEXT:    movs p4, p7; vldb.3d x7, [p7], d0; vshuffle x6, x7, x4, r0
   ; CHECK-NEXT:    vlda.conv.fp32.bf16 cmh4, [p6, #64]; paddb [p4], m4; vshuffle x7, x7, x4, r1
-  ; CHECK-NEXT:    vlda.3d.conv.fp32.bf16 cml4, [p6], d1; vldb x9, [p4, #0]; vshuffle x8, x9, x5, r0
-  ; CHECK-NEXT:    vldb x5, [p4, #64]; vshuffle x9, x9, x5, r1
+  ; CHECK-NEXT:    vlda.3d.conv.fp32.bf16 cml4, [p6], d1; vldb x5, [p4, #64]; vshuffle x8, x9, x5, r0
+  ; CHECK-NEXT:    vldb x9, [p4, #0]; vshuffle x9, x9, x5, r1
   ; CHECK-NEXT:    padda [p5], m5; add.nc lc, r0, #-3; vmul.f dm4, y3, y5, r2
-  ; CHECK-NEXT:    vlda.conv.fp32.bf16 cml4, [p5, #0]; movxm ls, #.LBB0_1; vmul.f dm4, y4, y5, r2
-  ; CHECK-NEXT:    vlda.conv.fp32.bf16 cmh4, [p5, #64]; movxm le, #.L_LEnd0
+  ; CHECK-NEXT:    vlda.conv.fp32.bf16 cmh4, [p5, #64]; movxm ls, #.LBB0_1; vmul.f dm4, y4, y5, r2
+  ; CHECK-NEXT:    vlda.conv.fp32.bf16 cml4, [p5, #0]; movxm le, #.L_LEnd0
   ; CHECK-NEXT:    vldb x4, [p7, #64]; mov p5, p6
   ; CHECK-NEXT:    movs p4, p7; vldb.3d x7, [p7], d0; vshuffle x6, x7, x4, r0
   ; CHECK-NEXT:    vlda.conv.fp32.bf16 cmh4, [p6, #64]; paddb [p4], m4; vconv.bfp16ebs8.fp32 ex0, dm4; vshuffle x7, x7, x4, r1
-  ; CHECK-NEXT:    vlda.3d.conv.fp32.bf16 cml4, [p6], d1; vldb x9, [p4, #0]; vconv.bfp16ebs8.fp32 ex1, dm4; vshuffle x8, x9, x5, r0
-  ; CHECK-NEXT:    vconv.bfp16ebs8.fp32 ex3, dm4; vldb x5, [p4, #64]; vshuffle x9, x9, x5, r1
+  ; CHECK-NEXT:    vlda.3d.conv.fp32.bf16 cml4, [p6], d1; vldb x5, [p4, #64]; vconv.bfp16ebs8.fp32 ex1, dm4; vshuffle x8, x9, x5, r0
   ; CHECK-NEXT:    .p2align 4
   ; CHECK-NEXT:  .LBB0_1: // %for.body
   ; CHECK-NEXT:    // =>This Inner Loop Header: Depth=1
-  ; CHECK-NEXT:    padda [p5], m5; nopb ; nops ; nopxm ; vmul.f dm4, y3, y5, r2
-  ; CHECK-NEXT:    vlda.conv.fp32.bf16 cml4, [p5, #0]; vconv.bfp16ebs8.fp32 ex2, dm4; vmul.f dm4, y4, y5, r2
-  ; CHECK-NEXT:    vlda.conv.fp32.bf16 cmh4, [p5, #64]
-  ; CHECK-NEXT:    vldb x4, [p7, #64]; mov p5, p6
-  ; CHECK-NEXT:    movs p4, p7; vldb.3d x7, [p7], d0; vshuffle x6, x7, x4, r0; vmac.f dm3, dm3, ex0, ex1, r3
-  ; CHECK-NEXT:    vlda.conv.fp32.bf16 cmh4, [p6, #64]; paddb [p4], m4; vconv.bfp16ebs8.fp32 ex0, dm4; nopx ; vshuffle x7, x7, x4, r1; vmac.f dm0, dm0, ex2, ex3, r3
-  ; CHECK-NEXT:    vlda.3d.conv.fp32.bf16 cml4, [p6], d1; vldb x9, [p4, #0]; vconv.bfp16ebs8.fp32 ex1, dm4; nopx ; vshuffle x8, x9, x5, r0; vmac.f dm1, dm1, ex2, ex1, r3
+  ; CHECK-NEXT:    nopa ; vldb x9, [p4, #0]; vconv.bfp16ebs8.fp32 ex3, dm4; nopx ; vshuffle x9, x9, x5, r1; nopv
+  ; CHECK-NEXT:    padda [p5], m5; nopb ; nopx ; vmul.f dm4, y3, y5, r2
+  ; CHECK-NEXT:    vlda.conv.fp32.bf16 cmh4, [p5, #64]; vconv.bfp16ebs8.fp32 ex2, dm4; vmul.f dm4, y4, y5, r2
+  ; CHECK-NEXT:    vlda.conv.fp32.bf16 cml4, [p5, #0]
+  ; CHECK-NEXT:    vldb x4, [p7, #64]; mov p5, p6; vmac.f dm3, dm3, ex0, ex1, r3
+  ; CHECK-NEXT:    movs p4, p7; vldb.3d x7, [p7], d0; vshuffle x6, x7, x4, r0; vmac.f dm2, dm2, ex0, ex3, r3
+  ; CHECK-NEXT:    vlda.conv.fp32.bf16 cmh4, [p6, #64]; paddb [p4], m4; vconv.bfp16ebs8.fp32 ex0, dm4; nopx ; vshuffle x7, x7, x4, r1; vmac.f dm1, dm1, ex2, ex1, r3
   ; CHECK-NEXT:  .L_LEnd0:
-  ; CHECK-NEXT:    nopa ; vldb x5, [p4, #64]; vconv.bfp16ebs8.fp32 ex3, dm4; nopx ; vshuffle x9, x9, x5, r1; vmac.f dm2, dm2, ex0, ex3, r3
+  ; CHECK-NEXT:    vlda.3d.conv.fp32.bf16 cml4, [p6], d1; vldb x5, [p4, #64]; vconv.bfp16ebs8.fp32 ex1, dm4; nopx ; vshuffle x8, x9, x5, r0; vmac.f dm0, dm0, ex2, ex3, r3
   ; CHECK-NEXT:    .p2align 4
   ; CHECK-NEXT:  // %bb.2:
-  ; CHECK-NEXT:    padda [p5], m5; nopb ; nops ; nopxm ; vmul.f dm4, y3, y5, r2
-  ; CHECK-NEXT:    vlda.conv.fp32.bf16 cml4, [p5, #0]; nopb ; nopx ; vconv.bfp16ebs8.fp32 ex2, dm4; vmul.f dm4, y4, y5, r2
-  ; CHECK-NEXT:    vlda.conv.fp32.bf16 cmh4, [p5, #64]
-  ; CHECK-NEXT:    mov p5, p6
-  ; CHECK-NEXT:    vshuffle x6, x7, x4, r0; vmac.f dm3, dm3, ex0, ex1, r3
-  ; CHECK-NEXT:    vlda.conv.fp32.bf16 cmh4, [p6, #64]; vconv.bfp16ebs8.fp32 ex0, dm4; vshuffle x7, x7, x4, r1; vmac.f dm0, dm0, ex2, ex3, r3
-  ; CHECK-NEXT:    vlda.3d.conv.fp32.bf16 cml4, [p6], d1; vconv.bfp16ebs8.fp32 ex1, dm4; vshuffle x8, x9, x5, r0; vmac.f dm1, dm1, ex2, ex1, r3
-  ; CHECK-NEXT:    vconv.bfp16ebs8.fp32 ex3, dm4; vshuffle x9, x9, x5, r1; vmac.f dm2, dm2, ex0, ex3, r3
+  ; CHECK-NEXT:    nopa ; vldb x9, [p4, #0]; nopx ; vshuffle x9, x9, x5, r1; vconv.bfp16ebs8.fp32 ex3, dm4
   ; CHECK-NEXT:    padda [p5], m5; vmul.f dm4, y3, y5, r2
-  ; CHECK-NEXT:    vlda.conv.fp32.bf16 cml4, [p5, #0]; vconv.bfp16ebs8.fp32 ex2, dm4; vmul.f dm4, y4, y5, r2
-  ; CHECK-NEXT:    vlda.conv.fp32.bf16 cmh4, [p5, #64]
-  ; CHECK-NEXT:    nop
+  ; CHECK-NEXT:    vlda.conv.fp32.bf16 cmh4, [p5, #64]; vconv.bfp16ebs8.fp32 ex2, dm4; vmul.f dm4, y4, y5, r2
+  ; CHECK-NEXT:    vlda.conv.fp32.bf16 cml4, [p5, #0]
+  ; CHECK-NEXT:    mov p5, p6; vmac.f dm3, dm3, ex0, ex1, r3
+  ; CHECK-NEXT:    vshuffle x6, x7, x4, r0; vmac.f dm2, dm2, ex0, ex3, r3
+  ; CHECK-NEXT:    vlda.conv.fp32.bf16 cmh4, [p6, #64]; vconv.bfp16ebs8.fp32 ex0, dm4; vshuffle x7, x7, x4, r1; vmac.f dm1, dm1, ex2, ex1, r3
+  ; CHECK-NEXT:    vlda.3d.conv.fp32.bf16 cml4, [p6], d1; vconv.bfp16ebs8.fp32 ex1, dm4; vshuffle x8, x9, x5, r0; vmac.f dm0, dm0, ex2, ex3, r3
+  ; CHECK-NEXT:    vconv.bfp16ebs8.fp32 ex3, dm4; vshuffle x9, x9, x5, r1
+  ; CHECK-NEXT:    padda [p5], m5; vmul.f dm4, y3, y5, r2
+  ; CHECK-NEXT:    vlda.conv.fp32.bf16 cmh4, [p5, #64]; vconv.bfp16ebs8.fp32 ex2, dm4; vmul.f dm4, y4, y5, r2
+  ; CHECK-NEXT:    vlda.conv.fp32.bf16 cml4, [p5, #0]
   ; CHECK-NEXT:    vmac.f dm3, dm3, ex0, ex1, r3
-  ; CHECK-NEXT:    vconv.bfp16ebs8.fp32 ex0, dm4; vmac.f dm0, dm0, ex2, ex3, r3
-  ; CHECK-NEXT:    vconv.bfp16ebs8.fp32 ex1, dm4; vmac.f dm1, dm1, ex2, ex1, r3
-  ; CHECK-NEXT:    vconv.bfp16ebs8.fp32 ex3, dm4; vmac.f dm2, dm2, ex0, ex3, r3
+  ; CHECK-NEXT:    vmac.f dm2, dm2, ex0, ex3, r3
+  ; CHECK-NEXT:    vconv.bfp16ebs8.fp32 ex0, dm4; vmac.f dm1, dm1, ex2, ex1, r3
+  ; CHECK-NEXT:    vconv.bfp16ebs8.fp32 ex1, dm4; vmac.f dm0, dm0, ex2, ex3, r3
+  ; CHECK-NEXT:    vconv.bfp16ebs8.fp32 ex3, dm4
   ; CHECK-NEXT:    nop
   ; CHECK-NEXT:    vconv.bfp16ebs8.fp32 ex2, dm4
   ; CHECK-NEXT:    nop
-  ; CHECK-NEXT:    nop
   ; CHECK-NEXT:    vmac.f dm3, dm3, ex0, ex1, r3
-  ; CHECK-NEXT:    vmac.f dm0, dm0, ex2, ex3, r3
-  ; CHECK-NEXT:    vmac.f dm1, dm1, ex2, ex1, r3
   ; CHECK-NEXT:    vmac.f dm2, dm2, ex0, ex3, r3
+  ; CHECK-NEXT:    vmac.f dm1, dm1, ex2, ex1, r3
+  ; CHECK-NEXT:    vmac.f dm0, dm0, ex2, ex3, r3
   ; CHECK-NEXT:    ret lr
   ; CHECK-NEXT:    nop // Delay Slot 5
   ; CHECK-NEXT:    nop // Delay Slot 4