espressif
diff --git a/llvm/lib/Target/RISCV/RISCVDotprodSplitter.cpp b/llvm/lib/Target/RISCV/RISCVDotprodSplitter.cpp
Lines changed: 61 additions & 10 deletions
diff --git a/llvm/lib/Target/RISCV/RISCVDotprodSplitter.h b/llvm/lib/Target/RISCV/RISCVDotprodSplitter.h
Lines changed: 18 additions & 10 deletions
diff --git a/llvm/lib/Target/RISCV/RISCVTargetMachine.cpp b/llvm/lib/Target/RISCV/RISCVTargetMachine.cpp
Lines changed: 9 additions & 1 deletion
@@ -13,6 +13,7 @@
 //===----------------------------------------------------------------------===//
 
 #include "RISCVDotprodSplitter.h"
+#include "llvm/ADT/SmallPtrSet.h"
 #include "llvm/ADT/SmallVector.h"
 #include "llvm/ADT/Statistic.h"
 #include "llvm/Analysis/AssumptionCache.h"
@@ -675,6 +676,46 @@ static bool isValidDotProductMultiply(Instruction &MulInst, Loop *L,
   return true;
 }
 
+/// Inner loops produced by Clang for stepped image/filter dot products often
+/// use \c mul(iv, invariant_step) for GEP indices and a separate integer MAC
+/// (\c mul(add(...), ...) -> widen -> add to i64 phi). That shape does not
+/// satisfy \c isValidDotProductMultiply (two load operands). \returns true iff
+/// \p L has a \c hasAccumulationPattern mul and >= 2 simple forward loads.
+static bool hasMacMulWithAffineLoadsPattern(Loop *L, ScalarEvolution &SE) {
+  bool HasMacMul = false; // Saw a mul accepted by hasAccumulationPattern.
+  unsigned AffineLoadCount = 0; // Loads accepted by isSimpleForwardAccess.
+  for (BasicBlock *BB : L->getBlocks()) { // Full scan; no early exit.
+    for (Instruction &I : *BB) {
+      if (I.getOpcode() == Instruction::Mul && hasAccumulationPattern(I)) {
+        HasMacMul = true;
+        LLVM_DEBUG(dbgs() << "Found MAC-style multiply for extraction gate: "
+                          << I << "\n");
+      }
+      auto *Ld = dyn_cast<LoadInst>(&I);
+      if (!Ld)
+        continue;
+      if (!isSimpleForwardAccess(Ld, L, SE))
+        continue;
+      ++AffineLoadCount;
+      LLVM_DEBUG(dbgs() << "  Affine sequential load: " << *Ld << "\n");
+    }
+  }
+  if (!HasMacMul) {
+    LLVM_DEBUG(dbgs() << "No mul with accumulation chain in loop\n");
+    return false;
+  }
+
+  if (AffineLoadCount < 2) {
+    LLVM_DEBUG(dbgs() << "MAC gate: need >= 2 affine loads, have "
+                      << AffineLoadCount << "\n");
+    return false;
+  }
+
+  LLVM_DEBUG(dbgs() << "Matched Clang-style dotprod loop (MAC + "
+                    << AffineLoadCount << " affine loads)\n");
+  return true;
+}
+
 // Check for multiply-accumulate pattern (refactored for better readability)
 static bool hasMultiplyAccumulatePattern(Loop *L, ScalarEvolution &SE) {
   LLVM_DEBUG(
@@ -691,7 +732,11 @@ static bool hasMultiplyAccumulatePattern(Loop *L, ScalarEvolution &SE) {
   }
 
   LLVM_DEBUG(dbgs() << "No standard pattern found, checking offset pattern\n");
-  return hasOffsetDotProductPattern(L);
+  if (hasOffsetDotProductPattern(L))
+    return true;
+
+  LLVM_DEBUG(dbgs() << "Checking Clang-style MAC + affine loads pattern\n");
+  return hasMacMulWithAffineLoadsPattern(L, SE);
 }
 
 // Conditional LoopExtractor implementation
@@ -985,15 +1030,21 @@ static bool hasNestedLoopsWithProcessablePatterns(Function &F) {
 
   ScalarEvolution SE(F, TLI, AC, DT, LI);
 
-  // Check nested loops for multiply-accumulate patterns
-  for (Loop *L : LI.getLoopsInPreorder()) {
-    if (!L->getSubLoops().empty()) {
-      // Has nested loops, check if inner loops have multiply-accumulate pattern
-      for (Loop *InnerL : L->getSubLoops()) {
-        if (hasMultiplyAccumulatePattern(InnerL, SE)) {
-          return true;
-        }
-      }
+  // Walk every inner loop (descendant of a top-level loop) once; deeper nests
+  // may hold the per-x dot body.
+  SmallPtrSet<Loop *, 16> Seen;
+  SmallVector<Loop *, 8> Stack;
+  for (Loop *Top : LI) {
+    for (Loop *SL : Top->getSubLoops())
+      Stack.push_back(SL);
+    while (!Stack.empty()) {
+      Loop *Cur = Stack.pop_back_val();
+      if (!Seen.insert(Cur).second)
+        continue;
+      if (hasMultiplyAccumulatePattern(Cur, SE))
+        return true;
+      for (Loop *Child : Cur->getSubLoops())
+        Stack.push_back(Child);
     }
   }
 
 
@@ -7,11 +7,11 @@
 //===----------------------------------------------------------------------===//
 //
 // This file declares the RISCVDotprodSplitterPass class.
-// This pass identifies a specific pattern often associated with calls to inner
-// dot product computation functions, where the result is passed via a pointer
-// argument (typically an alloca in the caller). The pattern involves a
-// sequence of lifetime start, the call instruction, a load from the result
-// pointer, and lifetime end, all within the same basic block.
+// This pass identifies calls to inner dot-product helpers that pass the
+// running accumulator through a pointer (usually an entry-block alloca). The
+// expected caller shape is: optional @llvm.lifetime.start, the call, a single
+// reload from that slot, optional @llvm.lifetime.end, in one basic block; if
+// the reload was sunk to the unique successor, the pass can hoist it back.
 //
 // If this unique pattern is found, the pass restructures the control flow
 // graph (CFG) to create specialized paths for common constant "step" or
@@ -57,16 +57,24 @@ struct RISCVDotprodSplitterPass
 
   static bool isRequired() { return true; }
 
-  /// Check if the function contains patterns that can be processed by this
-  /// pass.
+  /// True if the function has a nested loop that looks like a dot-product inner
+  /// loop (legacy two-load \c mul, offset variant, or Clang-style MAC with
+  /// multiple affine loads). Used by the conditional loop extractor gate and
+  /// related tooling.
   static bool hasProcessablePattern(Function &F);
 };
 
 /// Conditional LoopExtractor Pass that only runs when dotprod patterns exist.
 ///
-/// This pass runs LoopExtractor only on modules that contain processable
-/// dot product patterns, avoiding unnecessary loop extraction on modules
-/// that won't benefit from the dotprod splitter optimization.
+/// This pass runs selective CodeExtractor-based loop extraction only on
+/// modules that contain processable dot product patterns (same heuristic as
+/// \c RISCVDotprodSplitterPass::hasProcessablePattern), avoiding work on other
+/// modules. Enable with the same \c -riscv-dotprod-splitter flag as the
+/// splitter pass.
+///
+/// Pipeline name (RISC-V target): \c riscv-dotprod-conditional-loop-extractor
+/// (module pass). Run before \c riscv-dotprod-splitter when extracting inner
+/// loops from Clang output is required.
 struct RISCVConditionalLoopExtractorPass
     : public PassInfoMixin<RISCVConditionalLoopExtractorPass> {
 
 
@@ -719,7 +719,15 @@ void RISCVTargetMachine::registerPassBuilderCallbacks(PassBuilder &PB) {
         }
         return false;
       });
-
+  PB.registerPipelineParsingCallback(
+      [](StringRef Name, ModulePassManager &MPM,
+         ArrayRef<PassBuilder::PipelineElement>) {
+        if (Name == "riscv-dotprod-conditional-loop-extractor") {
+          MPM.addPass(RISCVConditionalLoopExtractorPass());
+          return true;
+        }
+        return false;
+      });
   PB.registerOptimizerLastEPCallback([](ModulePassManager &PM,
                                         OptimizationLevel Level,
                                         ThinOrFullLTOPhase Phase) {