diff --git a/llvm/include/llvm/Analysis/IVUsers.h b/llvm/include/llvm/Analysis/IVUsers.h
index 2af3e389446c..8ff708b3a8ef 100644
--- a/llvm/include/llvm/Analysis/IVUsers.h
+++ b/llvm/include/llvm/Analysis/IVUsers.h
@@ -4,6 +4,9 @@
 // See https://llvm.org/LICENSE.txt for license information.
 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 //
+// Modifications (c) Copyright 2026 Advanced Micro Devices, Inc. or its
+// affiliates
+//
 //===----------------------------------------------------------------------===//
 //
 // This file implements bookkeeping for "interesting" users of expressions
@@ -26,6 +29,7 @@ class AssumptionCache;
 class DominatorTree;
 class ScalarEvolution;
 class SCEV;
+class TargetTransformInfo;
 class IVUsers;
 
 /// IVStrideUse - Keep track of one use of a strided induction variable.
@@ -95,6 +99,7 @@ class IVUsers {
   LoopInfo *LI;
   DominatorTree *DT;
   ScalarEvolution *SE;
+  const TargetTransformInfo *TTI;
   SmallPtrSet<Instruction*, 16> Processed;
 
   /// IVUses - A list of all tracked IV uses of induction variable expressions
@@ -106,12 +111,19 @@ class IVUsers {
 
 public:
+  /// Construct the analysis for \p L. \p TTI is optional: when it is null,
+  /// AddUsersIfInteresting falls back to the target-independent width check
+  /// and never looks through an instruction.
   IVUsers(Loop *L, AssumptionCache *AC, LoopInfo *LI, DominatorTree *DT,
-          ScalarEvolution *SE);
+          ScalarEvolution *SE, const TargetTransformInfo *TTI = nullptr);
 
+  /// Move constructor. Carries over every analysis pointer, including LI
+  /// (which AddUsersIfInteresting passes to isInteresting), and re-parents
+  /// each moved IVStrideUse to the new object.
   IVUsers(IVUsers &&X)
-      : L(std::move(X.L)), AC(std::move(X.AC)), DT(std::move(X.DT)),
-        SE(std::move(X.SE)), Processed(std::move(X.Processed)),
-        IVUses(std::move(X.IVUses)), EphValues(std::move(X.EphValues)) {
+      : L(std::move(X.L)), AC(std::move(X.AC)), LI(std::move(X.LI)),
+        DT(std::move(X.DT)), SE(std::move(X.SE)), TTI(std::move(X.TTI)),
+        Processed(std::move(X.Processed)), IVUses(std::move(X.IVUses)),
+        EphValues(std::move(X.EphValues)) {
     for (IVStrideUse &U : IVUses)
       U.Parent = this;
   }
@@ -121,10 +127,11 @@ class IVUsers {
 
   Loop *getLoop() const { return L; }
 
-  /// AddUsersIfInteresting - Inspect the specified Instruction.  If it is a
-  /// reducible SCEV, recursively add its users to the IVUsesByStride set and
-  /// return true.  Otherwise, return false.
-  bool AddUsersIfInteresting(Instruction *I);
+  /// Inspect the specified Instruction. If it is a reducible SCEV, recursively
+  /// add its users to the IVUses list and return true. Otherwise, return
+  /// false. If \p BypassWidthCheck is true, skip the type width validation
+  /// (used for GEPs chosen by TargetTransformInfo::shouldIVUsersLookThroughInst).
+  bool AddUsersIfInteresting(Instruction *I, bool BypassWidthCheck = false);
 
   IVStrideUse &AddUser(Instruction *User, Value *Operand);
 
diff --git a/llvm/include/llvm/Analysis/TargetTransformInfo.h b/llvm/include/llvm/Analysis/TargetTransformInfo.h
index 8b69a8c16287..8c6d1b7e5875 100644
--- a/llvm/include/llvm/Analysis/TargetTransformInfo.h
+++ b/llvm/include/llvm/Analysis/TargetTransformInfo.h
@@ -4,7 +4,7 @@
 // See https://llvm.org/LICENSE.txt for license information.
 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 //
-// Modifications (c) Copyright 2023-2024 Advanced Micro Devices, Inc. or its
+// Modifications (c) Copyright 2023-2026 Advanced Micro Devices, Inc. or its
 // affiliates
 //
 //===----------------------------------------------------------------------===//
@@ -66,6 +66,8 @@ class SmallBitVector;
 class StoreInst;
 class SwitchInst;
 class TargetLibraryInfo;
+class TruncInst;
+class GetElementPtrInst;
 class Type;
 class VPIntrinsic;
 struct KnownBits;
@@ -798,6 +800,22 @@ class TargetTransformInfo {
   AddressingModeKind getPreferredAddressingMode(const Loop *L,
                                                 ScalarEvolution *SE) const;
 
+  /// Return true if IVUsers() should look through the instruction to collect
+  /// its users instead. If true, populates GEPsToProcess with the GEP
+  /// instructions to process as IV users.
+  /// This is useful for targets where pointer and integer bit sizes differ
+  /// (e.g., 20-bit pointers with 32-bit integers), causing truncs to index
+  /// size that feed GEP indices.
+  bool shouldIVUsersLookThroughInst(
+      Instruction *I,
+      SmallVectorImpl<GetElementPtrInst *> &GEPsToProcess) const;
+
+  /// Return true if the given type is valid for IV user collection.
+  /// By default, only legal integer widths up to 64 bits are allowed.
+  /// Targets where pointer and integer bit sizes differ may override this
+  /// to allow index-sized integers or pointers.
+  bool isValidIVUserType(Type *Ty) const;
+
   /// Return true if the target supports masked store.
   bool isLegalMaskedStore(Type *DataType, Align Alignment) const;
   /// Return true if the target supports masked load.
@@ -2009,6 +2027,10 @@ class TargetTransformInfo::Concept {
                           TargetLibraryInfo *LibInfo) = 0;
   virtual AddressingModeKind
     getPreferredAddressingMode(const Loop *L, ScalarEvolution *SE) const = 0;
+  virtual bool shouldIVUsersLookThroughInst(
+      Instruction *I,
+      SmallVectorImpl<GetElementPtrInst *> &GEPsToProcess) const = 0;
+  virtual bool isValidIVUserType(Type *Ty) const = 0;
   virtual bool isLegalMaskedStore(Type *DataType, Align Alignment) = 0;
   virtual bool isLegalMaskedLoad(Type *DataType, Align Alignment) = 0;
   virtual bool isLegalNTStore(Type *DataType, Align Alignment) = 0;
@@ -2553,6 +2575,14 @@ class TargetTransformInfo::Model final : public TargetTransformInfo::Concept {
                                ScalarEvolution *SE) const override {
     return Impl.getPreferredAddressingMode(L, SE);
   }
+  bool shouldIVUsersLookThroughInst(
+      Instruction *I,
+      SmallVectorImpl<GetElementPtrInst *> &GEPsToProcess) const override {
+    return Impl.shouldIVUsersLookThroughInst(I, GEPsToProcess);
+  }
+  bool isValidIVUserType(Type *Ty) const override {
+    return Impl.isValidIVUserType(Ty);
+  }
   bool isLegalMaskedStore(Type *DataType, Align Alignment) override {
     return Impl.isLegalMaskedStore(DataType, Alignment);
   }
diff --git a/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h b/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h
index 5d3e83ed537e..74c14a0e610d 100644
--- a/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h
+++ b/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h
@@ -4,7 +4,7 @@
 // See https://llvm.org/LICENSE.txt for license information.
 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 //
-// Modifications (c) Copyright 2023-2024 Advanced Micro Devices, Inc. or its
+// Modifications (c) Copyright 2023-2026 Advanced Micro Devices, Inc. or its
 // affiliates
 //
 //===----------------------------------------------------------------------===//
@@ -283,6 +283,30 @@ class TargetTransformInfoImplBase {
     return TTI::AMK_None;
   }
 
+  /// Default IVUsers look-through policy: never look through \p I.
+  /// \p GEPsToProcess is left untouched.
+  bool shouldIVUsersLookThroughInst(
+      Instruction *I,
+      SmallVectorImpl<GetElementPtrInst *> &GEPsToProcess) const {
+    return false;
+  }
+
+  /// Default IV-user type check: accept an integer or pointer type whose
+  /// width is at most 64 bits and is a legal integer width for the target.
+  ///
+  /// A pointer is measured by its index width, as
+  /// ScalarEvolution::getTypeSizeInBits does; IVUsers used that function for
+  /// this check before the hook existed, so targets whose index width differs
+  /// from the pointer storage width keep their previous behaviour.
+  bool isValidIVUserType(Type *Ty) const {
+    if (!Ty->isIntegerTy() && !Ty->isPointerTy())
+      return false;
+    const uint64_t Width =
+        Ty->isPointerTy() ? DL.getIndexTypeSizeInBits(Ty)
+                          : DL.getTypeSizeInBits(Ty).getFixedValue();
+    return Width <= 64 && DL.isLegalInteger(Width);
+  }
+
   bool isLegalMaskedStore(Type *DataType, Align Alignment) const {
     return false;
   }
diff --git a/llvm/lib/Analysis/IVUsers.cpp b/llvm/lib/Analysis/IVUsers.cpp
index 0880701d8308..1da358753295 100644
--- a/llvm/lib/Analysis/IVUsers.cpp
+++ b/llvm/lib/Analysis/IVUsers.cpp
@@ -4,6 +4,9 @@
 // See https://llvm.org/LICENSE.txt for license information.
 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 //
+// Modifications (c) Copyright 2026 Advanced Micro Devices, Inc. or its
+// affiliates
+//
 //===----------------------------------------------------------------------===//
 //
 // This file implements bookkeeping for "interesting" users of expressions
@@ -18,6 +21,7 @@
 #include "llvm/Analysis/LoopInfo.h"
 #include "llvm/Analysis/LoopPass.h"
 #include "llvm/Analysis/ScalarEvolutionExpressions.h"
+#include "llvm/Analysis/TargetTransformInfo.h"
 #include "llvm/Analysis/ValueTracking.h"
 #include "llvm/Config/llvm-config.h"
 #include "llvm/IR/DataLayout.h"
@@ -35,7 +39,7 @@ AnalysisKey IVUsersAnalysis::Key;
 
 IVUsers IVUsersAnalysis::run(Loop &L, LoopAnalysisManager &AM,
                              LoopStandardAnalysisResults &AR) {
-  return IVUsers(&L, &AR.AC, &AR.LI, &AR.DT, &AR.SE);
+  return IVUsers(&L, &AR.AC, &AR.LI, &AR.DT, &AR.SE, &AR.TTI);
 }
 
 char IVUsersWrapperPass::ID = 0;
@@ -133,7 +137,7 @@ static bool IVUseShouldUsePostIncValue(Instruction *User, Value *Operand,
 /// Inspect the specified instruction.  If it is a reducible SCEV, recursively
 /// add its users to the IVUsesByStride set and return true.  Otherwise, return
 /// false.
-bool IVUsers::AddUsersIfInteresting(Instruction *I) {
+bool IVUsers::AddUsersIfInteresting(Instruction *I, bool BypassWidthCheck) {
   const DataLayout &DL = I->getDataLayout();
 
   // Add this IV user to the Processed set before returning false to ensure that
@@ -153,9 +157,16 @@ bool IVUsers::AddUsersIfInteresting(Instruction *I) {
   // LSR is not APInt clean, do not touch integers bigger than 64-bits.
   // Also avoid creating IVs of non-native types. For example, we don't want a
   // 64-bit IV in 32-bit code just because the loop has one 64-bit cast.
-  uint64_t Width = SE->getTypeSizeInBits(I->getType());
-  if (Width > 64 || !DL.isLegalInteger(Width))
-    return false;
+  // Use TTI hook if available to allow targets where pointer and integer bit
+  // sizes differ (e.g., 20-bit pointers with 32-bit integers) to enable IV
+  // user collection for index-sized types.
+  if (!BypassWidthCheck) {
+    const uint64_t Width = SE->getTypeSizeInBits(I->getType());
+    const bool IsValidType = TTI ? TTI->isValidIVUserType(I->getType())
+                                 : (Width <= 64 && DL.isLegalInteger(Width));
+    if (!IsValidType)
+      return false;
+  }
 
   // Don't attempt to promote ephemeral values to indvars. They will be removed
   // later anyway.
@@ -170,6 +181,18 @@ bool IVUsers::AddUsersIfInteresting(Instruction *I) {
   if (!isInteresting(ISE, I, L, SE, LI))
     return false;
 
+  // Allow targets to look through certain instructions (e.g., truncs to index
+  // size on targets where pointer and integer bit sizes differ) to collect
+  // their users instead. This enables LSR to create pointer PHIs.
+  SmallVector<GetElementPtrInst *, 4> GEPsToProcess;
+  if (TTI && TTI->shouldIVUsersLookThroughInst(I, GEPsToProcess)) {
+    LLVM_DEBUG(dbgs() << "Looking through instruction: " << *I << '\n');
+    bool AnyInteresting = false;
+    for (GetElementPtrInst *GEP : GEPsToProcess)
+      AnyInteresting |= AddUsersIfInteresting(GEP, /*BypassWidthCheck=*/true);
+    return AnyInteresting;
+  }
+
   SmallPtrSet<Instruction *, 4> UniqueUsers;
   for (Use &U : I->uses()) {
     Instruction *User = cast<Instruction>(U.getUser());
@@ -249,8 +272,8 @@ IVStrideUse &IVUsers::AddUser(Instruction *User, Value *Operand) {
 }
 
 IVUsers::IVUsers(Loop *L, AssumptionCache *AC, LoopInfo *LI, DominatorTree *DT,
-                 ScalarEvolution *SE)
-    : L(L), AC(AC), LI(LI), DT(DT), SE(SE) {
+                 ScalarEvolution *SE, const TargetTransformInfo *TTI)
+    : L(L), AC(AC), LI(LI), DT(DT), SE(SE), TTI(TTI) {
   // Collect ephemeral values so that AddUsersIfInteresting skips them.
   EphValues.clear();
   CodeMetrics::collectEphemeralValues(L, AC, EphValues);
@@ -306,6 +329,7 @@ void IVUsersWrapperPass::getAnalysisUsage(AnalysisUsage &AU) const {
   AU.addRequired<LoopInfoWrapperPass>();
   AU.addRequired<DominatorTreeWrapperPass>();
   AU.addRequired<ScalarEvolutionWrapperPass>();
+  AU.addRequired<TargetTransformInfoWrapperPass>();
   AU.setPreservesAll();
 }
 
@@ -315,8 +339,10 @@ bool IVUsersWrapperPass::runOnLoop(Loop *L, LPPassManager &LPM) {
   auto *LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
   auto *DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree();
   auto *SE = &getAnalysis<ScalarEvolutionWrapperPass>().getSE();
+  auto *TTI = &getAnalysis<TargetTransformInfoWrapperPass>().getTTI(
+      *L->getHeader()->getParent());
 
-  IU.reset(new IVUsers(L, AC, LI, DT, SE));
+  IU.reset(new IVUsers(L, AC, LI, DT, SE, TTI));
   return false;
 }
 
diff --git a/llvm/lib/Analysis/TargetTransformInfo.cpp b/llvm/lib/Analysis/TargetTransformInfo.cpp
index 7f770fd1efce..e44a493b9298 100644
--- a/llvm/lib/Analysis/TargetTransformInfo.cpp
+++ b/llvm/lib/Analysis/TargetTransformInfo.cpp
@@ -4,7 +4,7 @@
 // See https://llvm.org/LICENSE.txt for license information.
 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 //
-// Modifications (c) Copyright 2023-2024 Advanced Micro Devices, Inc. or its
+// Modifications (c) Copyright 2023-2026 Advanced Micro Devices, Inc. or its
 // affiliates
 //
 //===----------------------------------------------------------------------===//
@@ -474,6 +474,15 @@ TargetTransformInfo::getPreferredAddressingMode(const Loop *L,
   return TTIImpl->getPreferredAddressingMode(L, SE);
 }
 
+bool TargetTransformInfo::shouldIVUsersLookThroughInst(
+    Instruction *I, SmallVectorImpl<GetElementPtrInst *> &GEPsToProcess) const {
+  return TTIImpl->shouldIVUsersLookThroughInst(I, GEPsToProcess);
+}
+
+bool TargetTransformInfo::isValidIVUserType(Type *Ty) const {
+  return TTIImpl->isValidIVUserType(Ty);
+}
+
 bool TargetTransformInfo::isLegalMaskedStore(Type *DataType,
                                              Align Alignment) const {
   return TTIImpl->isLegalMaskedStore(DataType, Alignment);
diff --git a/llvm/lib/Target/AIE/AIEBaseTargetTransformInfo.h b/llvm/lib/Target/AIE/AIEBaseTargetTransformInfo.h
index 4bad608ecae9..b37b38985074 100644
--- a/llvm/lib/Target/AIE/AIEBaseTargetTransformInfo.h
+++ b/llvm/lib/Target/AIE/AIEBaseTargetTransformInfo.h
@@ -4,7 +4,7 @@
 // See https://llvm.org/LICENSE.txt for license information.
 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 //
-// (c) Copyright 2024-2025 Advanced Micro Devices, Inc. or its affiliates
+// (c) Copyright 2024-2026 Advanced Micro Devices, Inc. or its affiliates
 //
 //===----------------------------------------------------------------------===//
 //
@@ -64,6 +64,10 @@ template <typename T> class AIEBaseTTIImpl : public BasicTTIImplBase<T> {
   virtual ~AIEBaseTTIImpl() = default;
 
 public:
+  //===--------------------------------------------------------------------===//
+  // Cost Model
+  //===--------------------------------------------------------------------===//
+
   int getIntImmCost(const APInt &Imm, Type *Ty, TTI::TargetCostKind CostKind) {
     // TODO Handle Target Specific constant cost
     //  Larger constants require an add.
@@ -76,6 +80,11 @@ template <typename T> class AIEBaseTTIImpl : public BasicTTIImplBase<T> {
     // cost?
     return TTI::TCC_Basic;
   }
+
+  //===--------------------------------------------------------------------===//
+  // Loop Optimization
+  //===--------------------------------------------------------------------===//
+
   void adjustUnrollingPreferences(Loop *L, ScalarEvolution &SE,
                                   TTI::UnrollingPreferences &UP,
                                   OptimizationRemarkEmitter *ORE);
@@ -83,6 +92,10 @@ template <typename T> class AIEBaseTTIImpl : public BasicTTIImplBase<T> {
                                 AssumptionCache &AC, TargetLibraryInfo *LibInfo,
                                 HardwareLoopInfo &HWLoopInfo);
 
+  //===--------------------------------------------------------------------===//
+  // Vectorization
+  //===--------------------------------------------------------------------===//
+
   // We define a store vector factor of  4 for 8-bit and 2 for 16-bit. This
   // allows combining 2 16-bit stores or 4 8-bit stores into a single 32-bit
   // vector store. This is deemed beneficial because of the LMS nature of
@@ -114,6 +127,108 @@ template <typename T> class AIEBaseTTIImpl : public BasicTTIImplBase<T> {
     // Default return of allowsMisalignedMemoryAccesses is false.
     return ChainSizeInBytes >= 4;
   }
+
+  //===--------------------------------------------------------------------===//
+  // Loop Strength Reduction (LSR)
+  //
+  // AIE has 20-bit pointers but 32-bit integers, and post-increment load/store
+  // instructions (VLD_pstm, VST_pstm). These hooks enable LSR to generate
+  // pointer recurrences that the backend can combine with post-increment ops.
+  //===--------------------------------------------------------------------===//
+
+  /// Check if type is an integer matching the target's index size (e.g., i20).
+  /// Note: uses address space 0; all AIE address spaces share the same index
+  /// width.
+  static bool isIndexSizedInteger(Type *Ty, const DataLayout &DL) {
+    return Ty->isIntegerTy() &&
+           Ty->getIntegerBitWidth() == DL.getIndexSizeInBits(0);
+  }
+
+  /// Collect the GEP users of \p Trunc into \p GEPs.
+  ///
+  /// Succeeds only if every use of \p Trunc is a GEP index operand (never the
+  /// pointer operand). A GEP that uses \p Trunc for several indices is
+  /// appended once, so callers never process the same GEP twice. On failure
+  /// \p GEPs is restored to its incoming contents and false is returned.
+  static bool collectGEPIndices(const TruncInst *Trunc,
+                                SmallVectorImpl<GetElementPtrInst *> &GEPs) {
+    const size_t NumIncoming = GEPs.size();
+    for (const Use &U : Trunc->uses()) {
+      auto *GEP = dyn_cast<GetElementPtrInst>(U.getUser());
+      if (!GEP || U.getOperandNo() == 0) {
+        // Drop the partial result so a rejected trunc leaves no trace.
+        GEPs.truncate(NumIncoming);
+        return false;
+      }
+      if (!is_contained(GEPs, GEP))
+        GEPs.push_back(GEP);
+    }
+    return true;
+  }
+
+  /// Prefer pointer-based recurrences over scalar offset + base formulations.
+  TTI::AddressingModeKind
+  getPreferredAddressingMode(const Loop *L, ScalarEvolution *SE) const {
+    return TTI::AMK_PostIndexed;
+  }
+
+  /// Enable post-increment addressing for index-sized integers (i20).
+  bool isIndexedLoadLegal(TTI::MemIndexedMode Mode, Type *Ty,
+                          const DataLayout &DL) const {
+    return Mode == TTI::MIM_PostInc && isIndexSizedInteger(Ty, DL);
+  }
+
+  /// Store counterpart of isIndexedLoadLegal: only post-increment stores of
+  /// index-sized integers are reported legal.
+  bool isIndexedStoreLegal(TTI::MemIndexedMode Mode, Type *Ty,
+                           const DataLayout &DL) const {
+    return Mode == TTI::MIM_PostInc && isIndexSizedInteger(Ty, DL);
+  }
+
+  /// Look through truncs to index size that feed GEP indices.
+  ///
+  /// Array indexing generates: %trunc = trunc i32 %idx to i20
+  /// Without this hook, IVUsers() stops at the trunc (i20 not legal).
+  /// With this hook, IVUsers() continues to the GEP, collecting pointer SCEVs.
+  ///
+  /// \returns true, with the GEPs appended to \p GEPsToProcess, only when
+  /// \p I is a scalar trunc to the index width, that width is not a legal
+  /// integer, and every use of the trunc is a GEP index.
+  bool shouldIVUsersLookThroughInst(
+      Instruction *I,
+      SmallVectorImpl<GetElementPtrInst *> &GEPsToProcess) const {
+    auto *Trunc = dyn_cast<TruncInst>(I);
+    if (!Trunc)
+      return false;
+
+    // All AIE address spaces share the same index width, so the shared helper
+    // (which queries address space 0) applies. It also rejects vector truncs.
+    const DataLayout &DL = Trunc->getDataLayout();
+    Type *TruncTy = Trunc->getType();
+    if (!isIndexSizedInteger(TruncTy, DL))
+      return false;
+
+    // A legal index width already passes the default IV-user type check.
+    if (DL.isLegalInteger(TruncTy->getIntegerBitWidth()))
+      return false;
+
+    return collectGEPIndices(Trunc, GEPsToProcess);
+  }
+
+  /// Prioritize fewer loop-body adds over fewer recurrences.
+  /// For VLIW, extra adds hurt II while extra PHIs execute in parallel.
+  bool isLSRCostLess(const TTI::LSRCost &C1, const TTI::LSRCost &C2) const {
+    return std::tie(C1.NumRegs, C1.Insns, C1.NumBaseAdds, C1.AddRecCost,
+                    C1.NumIVMuls, C1.ScaleCost, C1.ImmCost, C1.SetupCost) <
+           std::tie(C2.NumRegs, C2.Insns, C2.NumBaseAdds, C2.AddRecCost,
+                    C2.NumIVMuls, C2.ScaleCost, C2.ImmCost, C2.SetupCost);
+  }
+
+  /// Extend valid IV user types to include index-sized integers (i20).
+  bool isValidIVUserType(Type *Ty) const {
+    return BaseT::isValidIVUserType(Ty) ||
+           isIndexSizedInteger(Ty, BaseT::getDataLayout());
+  }
 };
 
 } // end namespace llvm
diff --git a/llvm/lib/Target/AIE/AIEClusterBaseAddress.cpp b/llvm/lib/Target/AIE/AIEClusterBaseAddress.cpp
index b5e0f70b0e74..4abc4ac49765 100644
--- a/llvm/lib/Target/AIE/AIEClusterBaseAddress.cpp
+++ b/llvm/lib/Target/AIE/AIEClusterBaseAddress.cpp
@@ -4,7 +4,7 @@
 // See https://llvm.org/LICENSE.txt for license information.
 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 //
-// (c) Copyright 2023-2025 Advanced Micro Devices, Inc. or its affiliates
+// (c) Copyright 2023-2026 Advanced Micro Devices, Inc. or its affiliates
 //
 //===----------------------------------------------------------------------===//
 //
@@ -198,6 +198,13 @@ class AIEClusterBaseAddress : public MachineFunctionPass {
                         std::optional<ValueAndVReg> OffsetA,
                         std::optional<ValueAndVReg> OffsetB);
 
+  // Insert G_PTR_ADD with zero offset for load/store instructions that
+  // directly use a pointer register which also has G_PTR_ADD users.
+  // This ensures bare offset-0 accesses participate in post-increment
+  // chaining built by buildChain.
+  bool insertPtrAddForBareMemOps(MachineBasicBlock &MBB, MachineIRBuilder &MIB,
+                                 GISelObserverWrapper &Observer);
+
   // Return true if the instructions are used by both loads and stores.
   bool hasMixedLoadStoreUse(SmallVector<MachineInstr *, 2> Instrs);
 
@@ -263,6 +270,8 @@ bool AIEClusterBaseAddress::processBasicBlock(MachineBasicBlock &MBB,
 
   bool Changed = false;
 
+  Changed |= insertPtrAddForBareMemOps(MBB, MIB, Observer);
+
   // Get all G_PTR_ADDs that use the same pointer.
   RegUseMap RegAndUses = collectPtrUses(MBB);
 
@@ -291,6 +300,99 @@ bool AIEClusterBaseAddress::processBasicBlock(MachineBasicBlock &MBB,
   return Changed;
 }
 
+/// Give offset-0 accesses through a pointer PHI an explicit G_PTR_ADD.
+///
+/// For each pointer-typed PHI at the top of \p MBB, scan the non-PHI
+/// instructions of \p MBB whose operand 1 is the PHI register. When at least
+/// two G_PTR_ADDs off the PHI feed a load/store, every load/store that uses
+/// the PHI directly and follows the first such G_PTR_ADD gets its address
+/// replaced by a new `G_PTR_ADD PHI, 0` built right before it.
+///
+/// \returns true if any instruction was rewritten.
+bool AIEClusterBaseAddress::insertPtrAddForBareMemOps(
+    MachineBasicBlock &MBB, MachineIRBuilder &MIB,
+    GISelObserverWrapper &Observer) {
+  // Operand index of the base pointer of a G_PTR_ADD and of the address of the
+  // load/store instructions handled here.
+  constexpr unsigned AddrIdx = 1;
+  bool Changed = false;
+
+  for (MachineInstr &PHI : MBB) {
+    // Only the leading PHIs are of interest; stop at the first other
+    // instruction.
+    if (!PHI.isPHI())
+      break;
+
+    const Register PhiReg = PHI.getOperand(0).getReg();
+    const LLT PtrTy = MRI->getType(PhiReg);
+    if (!PtrTy.isPointer())
+      continue;
+
+    // Single walk over the MBB in program order to collect:
+    // - PtrAddFeedingMemOpCount: G_PTR_ADDs from this PHI that feed mem ops
+    // - BareMemOps: load/store instructions that use the PHI directly
+    // - Whether each bare mem op precedes the first such G_PTR_ADD
+    unsigned PtrAddFeedingMemOpCount = 0;
+    SmallVector<std::pair<MachineInstr *, bool>, 4> BareMemOps;
+
+    for (MachineInstr &MI : MBB) {
+      if (MI.isPHI())
+        continue;
+
+      const bool UsesPhiAsBase = MI.getNumOperands() > AddrIdx &&
+                                 MI.getOperand(AddrIdx).isReg() &&
+                                 MI.getOperand(AddrIdx).getReg() == PhiReg;
+      if (!UsesPhiAsBase)
+        continue;
+
+      if (MI.getOpcode() == TargetOpcode::G_PTR_ADD) {
+        const bool FeedsMemOp =
+            any_of(MRI->use_nodbg_instructions(MI.getOperand(0).getReg()),
+                   [](const MachineInstr &U) { return U.mayLoadOrStore(); });
+        if (FeedsMemOp)
+          ++PtrAddFeedingMemOpCount;
+        continue;
+      }
+
+      if (MI.mayLoadOrStore() && MI.getNumMemOperands() > 0) {
+        const bool PrecedesChain = PtrAddFeedingMemOpCount == 0;
+        BareMemOps.push_back({&MI, PrecedesChain});
+      }
+    }
+
+    // We need at least 2 PTR_ADDs feeding memory ops to form a chain.
+    // With only 1, the inserted PTR_ADD +0 has no chain partner and
+    // survives as a redundant padda #0.
+    if (PtrAddFeedingMemOpCount < 2 || BareMemOps.empty())
+      continue;
+
+    // The zero offset is a scalar as wide as the pointer, derived from the
+    // PHI's own type rather than hard-coded.
+    const LLT OffsetTy = LLT::scalar(PtrTy.getScalarSizeInBits());
+
+    for (auto &[MemOp, PrecedesChain] : BareMemOps) {
+      // Skip insertion if the bare mem op precedes all G_PTR_ADDs from
+      // this PHI that feed memory ops. Such a mem op is the "first" user
+      // of the PHI pointer and can be combined with a post-increment
+      // update (e.g., add.2d/add.3d) into a single post-increment load.
+      // Inserting G_PTR_ADD +0 would break this combination.
+      if (PrecedesChain)
+        continue;
+
+      MIB.setInsertPt(MBB, MemOp->getIterator());
+      const auto ZeroOffset = MIB.buildConstant(OffsetTy, 0);
+      const auto NewPtr = MIB.buildInstr(TargetOpcode::G_PTR_ADD, {PtrTy},
+                                         {PhiReg, ZeroOffset.getReg(0)});
+      Observer.changingInstr(*MemOp);
+      MemOp->getOperand(AddrIdx).setReg(NewPtr.getReg(0));
+      Observer.changedInstr(*MemOp);
+      Changed = true;
+    }
+  }
+
+  return Changed;
+}
+
 /// Recursively search bottom up for Load instrs in the use chain of \p MI .
 /// Stop the search when Exiting \p MBB . Return all found Load MachineInstr in
 /// \p LoadsFeedingInstrs .
diff --git a/llvm/test/CodeGen/AIE/GlobalISel/cluster-base-address-bare-memop-e2e.mir b/llvm/test/CodeGen/AIE/GlobalISel/cluster-base-address-bare-memop-e2e.mir
new file mode 100644
index 000000000000..fcbe4e1f0d0c
--- /dev/null
+++ b/llvm/test/CodeGen/AIE/GlobalISel/cluster-base-address-bare-memop-e2e.mir
@@ -0,0 +1,147 @@
+# This file is licensed under the Apache License v2.0 with LLVM Exceptions.
+# See https://llvm.org/LICENSE.txt for license information.
+# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+#
+# (c) Copyright 2026 Advanced Micro Devices, Inc. or its affiliates
+#
+# End-to-end test: run from legalizer through custom combiner to verify that
+# PTR_ADD +0 inserted by AIEClusterBaseAddress gets fully absorbed into
+# post-increment chains with no surviving padda #0.
+#
+# RUN: llc -mtriple aie2 -start-after=legalizer \
+# RUN:   -stop-after=aie2-postlegalizer-custom-combiner \
+# RUN:   %s -verify-machineinstrs -o - | FileCheck %s
+
+# Bare load at offset 0 is in the MIDDLE of the access pattern (-128, 0, +128).
+# The PTR_ADD +0 should be chained and combined into post-increment loads.
+# No G_PTR_ADD with i20 0 should remain.
+---
+name:            bare_load_middle
+legalized:       true
+tracksRegLiveness: true
+body:             |
+  ; CHECK-LABEL: name: bare_load_middle
+  ; CHECK-NOT: G_PTR_ADD {{.*}}, {{.*}}i20 0
+  bb.0:
+    successors: %bb.1
+    liveins: $p0, $r0
+    %0:_(p0) = COPY $p0
+    %1:_(s32) = COPY $r0
+    %100:_(s20) = G_CONSTANT i20 256
+    %2:_(p0) = G_PTR_ADD %0, %100(s20)
+    G_BR %bb.1
+
+  bb.1:
+    successors: %bb.2, %bb.1
+    %3:_(p0) = G_PHI %10(p0), %bb.1, %2(p0), %bb.0
+    %13:_(s32) = G_PHI %15(s32), %bb.1, %1(s32), %bb.0
+    %4:_(s20) = G_CONSTANT i20 -128
+    %5:_(p0) = G_PTR_ADD %3, %4(s20)
+    %6:_(<16 x s32>) = G_LOAD %5(p0) :: (load (<16 x s32>), align 64)
+    %7:_(<16 x s32>) = G_LOAD %3(p0) :: (load (<16 x s32>), align 64)
+    %8:_(s20) = G_CONSTANT i20 128
+    %9:_(p0) = G_PTR_ADD %3, %8(s20)
+    %11:_(<16 x s32>) = G_LOAD %9(p0) :: (load (<16 x s32>), align 64)
+    %12:_(s20) = G_CONSTANT i20 512
+    %10:_(p0) = G_PTR_ADD %3, %12(s20)
+    %14:_(s32) = G_CONSTANT i32 -1
+    %15:_(s32) = G_ADD %13, %14
+    %16:_(s32) = G_CONSTANT i32 0
+    %17:_(s1) = G_ICMP intpred(eq), %15(s32), %16
+    G_BRCOND %17(s1), %bb.2
+    G_BR %bb.1
+
+  bb.2:
+    $x0 = COPY %6
+    $x2 = COPY %7
+    $x4 = COPY %11
+    PseudoRET implicit $lr, implicit $x0, implicit $x2, implicit $x4
+...
+
+# Bare load at offset 0 is at the START of the access pattern (0, +128, +256).
+# It precedes every PTR_ADD feeding a load, so no PTR_ADD +0 is inserted for it.
+---
+name:            bare_load_start
+legalized:       true
+tracksRegLiveness: true
+body:             |
+  ; CHECK-LABEL: name: bare_load_start
+  ; CHECK-NOT: G_PTR_ADD {{.*}}, {{.*}}i20 0
+  bb.0:
+    successors: %bb.1
+    liveins: $p0, $r0
+    %0:_(p0) = COPY $p0
+    %1:_(s32) = COPY $r0
+    G_BR %bb.1
+
+  bb.1:
+    successors: %bb.2, %bb.1
+    %3:_(p0) = G_PHI %10(p0), %bb.1, %0(p0), %bb.0
+    %13:_(s32) = G_PHI %15(s32), %bb.1, %1(s32), %bb.0
+    %7:_(<16 x s32>) = G_LOAD %3(p0) :: (load (<16 x s32>), align 64)
+    %8:_(s20) = G_CONSTANT i20 128
+    %5:_(p0) = G_PTR_ADD %3, %8(s20)
+    %6:_(<16 x s32>) = G_LOAD %5(p0) :: (load (<16 x s32>), align 64)
+    %18:_(s20) = G_CONSTANT i20 256
+    %9:_(p0) = G_PTR_ADD %3, %18(s20)
+    %11:_(<16 x s32>) = G_LOAD %9(p0) :: (load (<16 x s32>), align 64)
+    %12:_(s20) = G_CONSTANT i20 512
+    %10:_(p0) = G_PTR_ADD %3, %12(s20)
+    %14:_(s32) = G_CONSTANT i32 -1
+    %15:_(s32) = G_ADD %13, %14
+    %16:_(s32) = G_CONSTANT i32 0
+    %17:_(s1) = G_ICMP intpred(eq), %15(s32), %16
+    G_BRCOND %17(s1), %bb.2
+    G_BR %bb.1
+
+  bb.2:
+    $x0 = COPY %7
+    $x2 = COPY %6
+    $x4 = COPY %11
+    PseudoRET implicit $lr, implicit $x0, implicit $x2, implicit $x4
+...
+
+# Bare load at offset 0 is at the END of the access pattern (-256, -128, 0).
+# The PTR_ADD +0 should be chained and combined into post-increment loads.
+---
+name:            bare_load_end
+legalized:       true
+tracksRegLiveness: true
+body:             |
+  ; CHECK-LABEL: name: bare_load_end
+  ; CHECK-NOT: G_PTR_ADD {{.*}}, {{.*}}i20 0
+  bb.0:
+    successors: %bb.1
+    liveins: $p0, $r0
+    %0:_(p0) = COPY $p0
+    %1:_(s32) = COPY $r0
+    %100:_(s20) = G_CONSTANT i20 512
+    %2:_(p0) = G_PTR_ADD %0, %100(s20)
+    G_BR %bb.1
+
+  bb.1:
+    successors: %bb.2, %bb.1
+    %3:_(p0) = G_PHI %10(p0), %bb.1, %2(p0), %bb.0
+    %13:_(s32) = G_PHI %15(s32), %bb.1, %1(s32), %bb.0
+    %4:_(s20) = G_CONSTANT i20 -256
+    %5:_(p0) = G_PTR_ADD %3, %4(s20)
+    %6:_(<16 x s32>) = G_LOAD %5(p0) :: (load (<16 x s32>), align 64)
+    %18:_(s20) = G_CONSTANT i20 -128
+    %19:_(p0) = G_PTR_ADD %3, %18(s20)
+    %20:_(<16 x s32>) = G_LOAD %19(p0) :: (load (<16 x s32>), align 64)
+    %7:_(<16 x s32>) = G_LOAD %3(p0) :: (load (<16 x s32>), align 64)
+    %12:_(s20) = G_CONSTANT i20 512
+    %10:_(p0) = G_PTR_ADD %3, %12(s20)
+    %14:_(s32) = G_CONSTANT i32 -1
+    %15:_(s32) = G_ADD %13, %14
+    %16:_(s32) = G_CONSTANT i32 0
+    %17:_(s1) = G_ICMP intpred(eq), %15(s32), %16
+    G_BRCOND %17(s1), %bb.2
+    G_BR %bb.1
+
+  bb.2:
+    $x0 = COPY %6
+    $x2 = COPY %20
+    $x4 = COPY %7
+    PseudoRET implicit $lr, implicit $x0, implicit $x2, implicit $x4
+...
diff --git a/llvm/test/CodeGen/AIE/GlobalISel/cluster-base-address-bare-memop.mir b/llvm/test/CodeGen/AIE/GlobalISel/cluster-base-address-bare-memop.mir
new file mode 100644
index 000000000000..f531ef2ebe57
--- /dev/null
+++ b/llvm/test/CodeGen/AIE/GlobalISel/cluster-base-address-bare-memop.mir
@@ -0,0 +1,363 @@
+# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 5
+# This file is licensed under the Apache License v2.0 with LLVM Exceptions.
+# See https://llvm.org/LICENSE.txt for license information.
+# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+#
+# (c) Copyright 2026 Advanced Micro Devices, Inc. or its affiliates
+# RUN: llc -mtriple aie2 -run-pass=aie-cluster-base-address %s -verify-machineinstrs -o - | FileCheck %s
+
+# Test that bare load/store instructions using a PHI-defined pointer (offset 0)
+# get a G_PTR_ADD +0 inserted when the PHI also has G_PTR_ADD users whose
+# outputs feed load/store instructions, enabling post-increment chaining.
+
+# Positive test: PHI with 3 PTR_ADD users: -128 and +128 feed loads, +512 is
+# the loop increment; plus a bare load at offset 0. The bare load gets
+# PTR_ADD +0 and all loads end up chained.
+---
+name:            phi_load_bare_offset_zero
+legalized:       true
+tracksRegLiveness: true
+body:             |
+  ; CHECK-LABEL: name: phi_load_bare_offset_zero
+  ; CHECK: bb.0:
+  ; CHECK-NEXT:   successors: %bb.1(0x80000000)
+  ; CHECK-NEXT:   liveins: $p0, $r0
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   [[COPY:%[0-9]+]]:_(p0) = COPY $p0
+  ; CHECK-NEXT:   [[COPY1:%[0-9]+]]:_(s32) = COPY $r0
+  ; CHECK-NEXT:   [[C:%[0-9]+]]:_(s20) = G_CONSTANT i20 256
+  ; CHECK-NEXT:   [[PTR_ADD:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C]](s20)
+  ; CHECK-NEXT:   G_BR %bb.1
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.1:
+  ; CHECK-NEXT:   successors: %bb.2(0x40000000), %bb.1(0x40000000)
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   [[PHI:%[0-9]+]]:_(p0) = G_PHI %5(p0), %bb.1, [[PTR_ADD]](p0), %bb.0
+  ; CHECK-NEXT:   [[PHI1:%[0-9]+]]:_(s32) = G_PHI %7(s32), %bb.1, [[COPY1]](s32), %bb.0
+  ; CHECK-NEXT:   [[C1:%[0-9]+]]:_(s20) = G_CONSTANT i20 -128
+  ; CHECK-NEXT:   [[PTR_ADD1:%[0-9]+]]:_(p0) = G_PTR_ADD [[PHI]], [[C1]](s20)
+  ; CHECK-NEXT:   [[LOAD:%[0-9]+]]:_(<16 x s32>) = G_LOAD [[PTR_ADD1]](p0) :: (load (<16 x s32>))
+  ; CHECK-NEXT:   [[C2:%[0-9]+]]:_(s20) = G_CONSTANT i20 0
+  ; CHECK-NEXT:   [[C3:%[0-9]+]]:_(s20) = G_CONSTANT i20 128
+  ; CHECK-NEXT:   [[PTR_ADD2:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD1]], [[C3]](s20)
+  ; CHECK-NEXT:   [[LOAD1:%[0-9]+]]:_(<16 x s32>) = G_LOAD [[PTR_ADD2]](p0) :: (load (<16 x s32>))
+  ; CHECK-NEXT:   [[C4:%[0-9]+]]:_(s20) = G_CONSTANT i20 128
+  ; CHECK-NEXT:   [[C5:%[0-9]+]]:_(s20) = G_CONSTANT i20 128
+  ; CHECK-NEXT:   [[PTR_ADD3:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD2]], [[C5]](s20)
+  ; CHECK-NEXT:   [[LOAD2:%[0-9]+]]:_(<16 x s32>) = G_LOAD [[PTR_ADD3]](p0) :: (load (<16 x s32>))
+  ; CHECK-NEXT:   [[C6:%[0-9]+]]:_(s20) = G_CONSTANT i20 512
+  ; CHECK-NEXT:   [[C7:%[0-9]+]]:_(s20) = G_CONSTANT i20 384
+  ; CHECK-NEXT:   [[PTR_ADD4:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD3]], [[C7]](s20)
+  ; CHECK-NEXT:   [[C8:%[0-9]+]]:_(s32) = G_CONSTANT i32 -1
+  ; CHECK-NEXT:   [[ADD:%[0-9]+]]:_(s32) = G_ADD [[PHI1]], [[C8]]
+  ; CHECK-NEXT:   [[C9:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
+  ; CHECK-NEXT:   [[ICMP:%[0-9]+]]:_(s1) = G_ICMP intpred(eq), [[ADD]](s32), [[C9]]
+  ; CHECK-NEXT:   G_BRCOND [[ICMP]](s1), %bb.2
+  ; CHECK-NEXT:   G_BR %bb.1
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.2:
+  ; CHECK-NEXT:   $x0 = COPY [[LOAD]](<16 x s32>)
+  ; CHECK-NEXT:   $x2 = COPY [[LOAD1]](<16 x s32>)
+  ; CHECK-NEXT:   $x4 = COPY [[LOAD2]](<16 x s32>)
+  ; CHECK-NEXT:   PseudoRET implicit $lr, implicit $x0, implicit $x2, implicit $x4
+  bb.0:
+    successors: %bb.1
+    liveins: $p0, $r0
+    %0:_(p0) = COPY $p0
+    %1:_(s32) = COPY $r0
+    %100:_(s20) = G_CONSTANT i20 256
+    %2:_(p0) = G_PTR_ADD %0, %100(s20)
+    G_BR %bb.1
+
+  bb.1:
+    successors: %bb.2, %bb.1
+    %3:_(p0) = G_PHI %10(p0), %bb.1, %2(p0), %bb.0
+    %13:_(s32) = G_PHI %15(s32), %bb.1, %1(s32), %bb.0
+    %4:_(s20) = G_CONSTANT i20 -128
+    %5:_(p0) = G_PTR_ADD %3, %4(s20)
+    %6:_(<16 x s32>) = G_LOAD %5(p0) :: (load (<16 x s32>), align 64)
+    %7:_(<16 x s32>) = G_LOAD %3(p0) :: (load (<16 x s32>), align 64)
+    %8:_(s20) = G_CONSTANT i20 128
+    %9:_(p0) = G_PTR_ADD %3, %8(s20)
+    %11:_(<16 x s32>) = G_LOAD %9(p0) :: (load (<16 x s32>), align 64)
+    %12:_(s20) = G_CONSTANT i20 512
+    %10:_(p0) = G_PTR_ADD %3, %12(s20)
+    %14:_(s32) = G_CONSTANT i32 -1
+    %15:_(s32) = G_ADD %13, %14
+    %16:_(s32) = G_CONSTANT i32 0
+    %17:_(s1) = G_ICMP intpred(eq), %15(s32), %16
+    G_BRCOND %17(s1), %bb.2
+    G_BR %bb.1
+
+  bb.2:
+    $x0 = COPY %6
+    $x2 = COPY %7
+    $x4 = COPY %11
+    PseudoRET implicit $lr, implicit $x0, implicit $x2, implicit $x4
+...
+
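+# Negative test: PHI has only 1 PTR_ADD feeding a load (the other is the
+# loop increment). No PTR_ADD +0 should be inserted. NOTE(review): this input is identical to phi_bare_load_before_ptr_adds_no_insert below, so the two conditions are not exercised independently.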
+---
+name:            phi_load_single_ptr_add_feeding_load
+legalized:       true
+tracksRegLiveness: true
+body:             |
+  ; CHECK-LABEL: name: phi_load_single_ptr_add_feeding_load
+  ; CHECK: bb.0:
+  ; CHECK-NEXT:   successors: %bb.1(0x80000000)
+  ; CHECK-NEXT:   liveins: $p0, $r0
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   [[COPY:%[0-9]+]]:_(p0) = COPY $p0
+  ; CHECK-NEXT:   [[COPY1:%[0-9]+]]:_(s32) = COPY $r0
+  ; CHECK-NEXT:   [[C:%[0-9]+]]:_(s20) = G_CONSTANT i20 256
+  ; CHECK-NEXT:   [[PTR_ADD:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C]](s20)
+  ; CHECK-NEXT:   G_BR %bb.1
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.1:
+  ; CHECK-NEXT:   successors: %bb.2(0x40000000), %bb.1(0x40000000)
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   [[PHI:%[0-9]+]]:_(p0) = G_PHI %5(p0), %bb.1, [[PTR_ADD]](p0), %bb.0
+  ; CHECK-NEXT:   [[PHI1:%[0-9]+]]:_(s32) = G_PHI %7(s32), %bb.1, [[COPY1]](s32), %bb.0
+  ; CHECK-NEXT:   [[LOAD:%[0-9]+]]:_(<16 x s32>) = G_LOAD [[PHI]](p0) :: (load (<16 x s32>))
+  ; CHECK-NEXT:   [[C1:%[0-9]+]]:_(s20) = G_CONSTANT i20 128
+  ; CHECK-NEXT:   [[PTR_ADD1:%[0-9]+]]:_(p0) = G_PTR_ADD [[PHI]], [[C1]](s20)
+  ; CHECK-NEXT:   [[LOAD1:%[0-9]+]]:_(<16 x s32>) = G_LOAD [[PTR_ADD1]](p0) :: (load (<16 x s32>))
+  ; CHECK-NEXT:   [[C2:%[0-9]+]]:_(s20) = G_CONSTANT i20 256
+  ; CHECK-NEXT:   [[C3:%[0-9]+]]:_(s20) = G_CONSTANT i20 128
+  ; CHECK-NEXT:   [[PTR_ADD2:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD1]], [[C3]](s20)
+  ; CHECK-NEXT:   [[C4:%[0-9]+]]:_(s32) = G_CONSTANT i32 -1
+  ; CHECK-NEXT:   [[ADD:%[0-9]+]]:_(s32) = G_ADD [[PHI1]], [[C4]]
+  ; CHECK-NEXT:   [[C5:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
+  ; CHECK-NEXT:   [[ICMP:%[0-9]+]]:_(s1) = G_ICMP intpred(eq), [[ADD]](s32), [[C5]]
+  ; CHECK-NEXT:   G_BRCOND [[ICMP]](s1), %bb.2
+  ; CHECK-NEXT:   G_BR %bb.1
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.2:
+  ; CHECK-NEXT:   $x0 = COPY [[LOAD]](<16 x s32>)
+  ; CHECK-NEXT:   $x2 = COPY [[LOAD1]](<16 x s32>)
+  ; CHECK-NEXT:   PseudoRET implicit $lr, implicit $x0, implicit $x2
+  bb.0:
+    successors: %bb.1
+    liveins: $p0, $r0
+    %0:_(p0) = COPY $p0
+    %1:_(s32) = COPY $r0
+    %100:_(s20) = G_CONSTANT i20 256
+    %2:_(p0) = G_PTR_ADD %0, %100(s20)
+    G_BR %bb.1
+
+  bb.1:
+    successors: %bb.2, %bb.1
+    %3:_(p0) = G_PHI %10(p0), %bb.1, %2(p0), %bb.0
+    %13:_(s32) = G_PHI %15(s32), %bb.1, %1(s32), %bb.0
+    %7:_(<16 x s32>) = G_LOAD %3(p0) :: (load (<16 x s32>), align 64)
+    %8:_(s20) = G_CONSTANT i20 128
+    %9:_(p0) = G_PTR_ADD %3, %8(s20)
+    %11:_(<16 x s32>) = G_LOAD %9(p0) :: (load (<16 x s32>), align 64)
+    %12:_(s20) = G_CONSTANT i20 256
+    %10:_(p0) = G_PTR_ADD %3, %12(s20)
+    %14:_(s32) = G_CONSTANT i32 -1
+    %15:_(s32) = G_ADD %13, %14
+    %16:_(s32) = G_CONSTANT i32 0
+    %17:_(s1) = G_ICMP intpred(eq), %15(s32), %16
+    G_BRCOND %17(s1), %bb.2
+    G_BR %bb.1
+
+  bb.2:
+    $x0 = COPY %7
+    $x2 = COPY %11
+    PseudoRET implicit $lr, implicit $x0, implicit $x2
+...
+
+# Negative test: no PTR_ADD +0 when address is not PHI-defined.
+---
+name:            non_phi_load_no_insert
+legalized:       true
+tracksRegLiveness: true
+body:             |
+  bb.0:
+    liveins: $p0
+    ; CHECK-LABEL: name: non_phi_load_no_insert
+    ; CHECK: liveins: $p0
+    ; CHECK-NEXT: {{  $}}
+    ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $p0
+    ; CHECK-NEXT: [[C:%[0-9]+]]:_(s20) = G_CONSTANT i20 -64
+    ; CHECK-NEXT: [[PTR_ADD:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C]](s20)
+    ; CHECK-NEXT: [[LOAD:%[0-9]+]]:_(<16 x s32>) = G_LOAD [[PTR_ADD]](p0) :: (load (<16 x s32>))
+    ; CHECK-NEXT: [[LOAD1:%[0-9]+]]:_(<16 x s32>) = G_LOAD [[COPY]](p0) :: (load (<16 x s32>))
+    ; CHECK-NEXT: [[C1:%[0-9]+]]:_(s20) = G_CONSTANT i20 64
+    ; CHECK-NEXT: [[C2:%[0-9]+]]:_(s20) = G_CONSTANT i20 128
+    ; CHECK-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD]], [[C2]](s20)
+    ; CHECK-NEXT: [[LOAD2:%[0-9]+]]:_(<16 x s32>) = G_LOAD [[PTR_ADD1]](p0) :: (load (<16 x s32>))
+    ; CHECK-NEXT: $x0 = COPY [[LOAD]](<16 x s32>)
+    ; CHECK-NEXT: $x2 = COPY [[LOAD1]](<16 x s32>)
+    ; CHECK-NEXT: $x4 = COPY [[LOAD2]](<16 x s32>)
+    ; CHECK-NEXT: PseudoRET implicit $lr, implicit $x0, implicit $x2, implicit $x4
+    %0:_(p0) = COPY $p0
+    %1:_(s20) = G_CONSTANT i20 -64
+    %2:_(p0) = G_PTR_ADD %0, %1(s20)
+    %3:_(<16 x s32>) = G_LOAD %2(p0) :: (load (<16 x s32>), align 64)
+    %4:_(<16 x s32>) = G_LOAD %0(p0) :: (load (<16 x s32>), align 64)
+    %5:_(s20) = G_CONSTANT i20 64
+    %6:_(p0) = G_PTR_ADD %0, %5(s20)
+    %7:_(<16 x s32>) = G_LOAD %6(p0) :: (load (<16 x s32>), align 64)
+    $x0 = COPY %3
+    $x2 = COPY %4
+    $x4 = COPY %7
+    PseudoRET implicit $lr, implicit $x0, implicit $x2, implicit $x4
+...
+
+# Positive test: a bare store at offset 0 from the PHI also gets PTR_ADD +0.
+---
+name:            phi_store_bare_offset_zero
+legalized:       true
+tracksRegLiveness: true
+body:             |
+  ; CHECK-LABEL: name: phi_store_bare_offset_zero
+  ; CHECK: bb.0:
+  ; CHECK-NEXT:   successors: %bb.1(0x80000000)
+  ; CHECK-NEXT:   liveins: $p0, $r0, $x0
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   [[COPY:%[0-9]+]]:_(p0) = COPY $p0
+  ; CHECK-NEXT:   [[COPY1:%[0-9]+]]:_(s32) = COPY $r0
+  ; CHECK-NEXT:   [[COPY2:%[0-9]+]]:_(<8 x s64>) = COPY $x0
+  ; CHECK-NEXT:   [[C:%[0-9]+]]:_(s20) = G_CONSTANT i20 128
+  ; CHECK-NEXT:   [[PTR_ADD:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C]](s20)
+  ; CHECK-NEXT:   G_BR %bb.1
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.1:
+  ; CHECK-NEXT:   successors: %bb.2(0x40000000), %bb.1(0x40000000)
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   [[PHI:%[0-9]+]]:_(p0) = G_PHI %6(p0), %bb.1, [[PTR_ADD]](p0), %bb.0
+  ; CHECK-NEXT:   [[PHI1:%[0-9]+]]:_(s32) = G_PHI %8(s32), %bb.1, [[COPY1]](s32), %bb.0
+  ; CHECK-NEXT:   [[C1:%[0-9]+]]:_(s20) = G_CONSTANT i20 -64
+  ; CHECK-NEXT:   [[PTR_ADD1:%[0-9]+]]:_(p0) = G_PTR_ADD [[PHI]], [[C1]](s20)
+  ; CHECK-NEXT:   G_STORE [[COPY2]](<8 x s64>), [[PTR_ADD1]](p0) :: (store (<8 x s64>))
+  ; CHECK-NEXT:   [[C2:%[0-9]+]]:_(s20) = G_CONSTANT i20 0
+  ; CHECK-NEXT:   [[C3:%[0-9]+]]:_(s20) = G_CONSTANT i20 64
+  ; CHECK-NEXT:   [[PTR_ADD2:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD1]], [[C3]](s20)
+  ; CHECK-NEXT:   G_STORE [[COPY2]](<8 x s64>), [[PTR_ADD2]](p0) :: (store (<8 x s64>))
+  ; CHECK-NEXT:   [[C4:%[0-9]+]]:_(s20) = G_CONSTANT i20 64
+  ; CHECK-NEXT:   [[C5:%[0-9]+]]:_(s20) = G_CONSTANT i20 64
+  ; CHECK-NEXT:   [[PTR_ADD3:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD2]], [[C5]](s20)
+  ; CHECK-NEXT:   G_STORE [[COPY2]](<8 x s64>), [[PTR_ADD3]](p0) :: (store (<8 x s64>))
+  ; CHECK-NEXT:   [[C6:%[0-9]+]]:_(s20) = G_CONSTANT i20 256
+  ; CHECK-NEXT:   [[C7:%[0-9]+]]:_(s20) = G_CONSTANT i20 192
+  ; CHECK-NEXT:   [[PTR_ADD4:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD3]], [[C7]](s20)
+  ; CHECK-NEXT:   [[C8:%[0-9]+]]:_(s32) = G_CONSTANT i32 -1
+  ; CHECK-NEXT:   [[ADD:%[0-9]+]]:_(s32) = G_ADD [[PHI1]], [[C8]]
+  ; CHECK-NEXT:   [[C9:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
+  ; CHECK-NEXT:   [[ICMP:%[0-9]+]]:_(s1) = G_ICMP intpred(eq), [[ADD]](s32), [[C9]]
+  ; CHECK-NEXT:   G_BRCOND [[ICMP]](s1), %bb.2
+  ; CHECK-NEXT:   G_BR %bb.1
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.2:
+  ; CHECK-NEXT:   PseudoRET implicit $lr
+  bb.0:
+    successors: %bb.1
+    liveins: $p0, $r0, $x0
+    %0:_(p0) = COPY $p0
+    %1:_(s32) = COPY $r0
+    %20:_(<8 x s64>) = COPY $x0
+    %100:_(s20) = G_CONSTANT i20 128
+    %2:_(p0) = G_PTR_ADD %0, %100(s20)
+    G_BR %bb.1
+
+  bb.1:
+    successors: %bb.2, %bb.1
+    %3:_(p0) = G_PHI %10(p0), %bb.1, %2(p0), %bb.0
+    %13:_(s32) = G_PHI %15(s32), %bb.1, %1(s32), %bb.0
+    %4:_(s20) = G_CONSTANT i20 -64
+    %5:_(p0) = G_PTR_ADD %3, %4(s20)
+    G_STORE %20(<8 x s64>), %5(p0) :: (store (<8 x s64>), align 64)
+    G_STORE %20(<8 x s64>), %3(p0) :: (store (<8 x s64>), align 64)
+    %8:_(s20) = G_CONSTANT i20 64
+    %9:_(p0) = G_PTR_ADD %3, %8(s20)
+    G_STORE %20(<8 x s64>), %9(p0) :: (store (<8 x s64>), align 64)
+    %12:_(s20) = G_CONSTANT i20 256
+    %10:_(p0) = G_PTR_ADD %3, %12(s20)
+    %14:_(s32) = G_CONSTANT i32 -1
+    %15:_(s32) = G_ADD %13, %14
+    %16:_(s32) = G_CONSTANT i32 0
+    %17:_(s1) = G_ICMP intpred(eq), %15(s32), %16
+    G_BRCOND %17(s1), %bb.2
+    G_BR %bb.1
+
+  bb.2:
+    PseudoRET implicit $lr
+...
+
+# Negative test: bare load from PHI appears BEFORE all G_PTR_ADDs that feed
+# loads. The bare load is the first user of the PHI pointer and should NOT
+# get G_PTR_ADD +0 inserted, because it can be combined with a post-increment
+# update (e.g., add.2d/add.3d) into a single post-increment load instruction.
+---
+name:            phi_bare_load_before_ptr_adds_no_insert
+legalized:       true
+tracksRegLiveness: true
+body:             |
+  ; CHECK-LABEL: name: phi_bare_load_before_ptr_adds_no_insert
+  ; CHECK: bb.0:
+  ; CHECK-NEXT:   successors: %bb.1(0x80000000)
+  ; CHECK-NEXT:   liveins: $p0, $r0
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   [[COPY:%[0-9]+]]:_(p0) = COPY $p0
+  ; CHECK-NEXT:   [[COPY1:%[0-9]+]]:_(s32) = COPY $r0
+  ; CHECK-NEXT:   [[C:%[0-9]+]]:_(s20) = G_CONSTANT i20 256
+  ; CHECK-NEXT:   [[PTR_ADD:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C]](s20)
+  ; CHECK-NEXT:   G_BR %bb.1
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.1:
+  ; CHECK-NEXT:   successors: %bb.2(0x40000000), %bb.1(0x40000000)
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   [[PHI:%[0-9]+]]:_(p0) = G_PHI %5(p0), %bb.1, [[PTR_ADD]](p0), %bb.0
+  ; CHECK-NEXT:   [[PHI1:%[0-9]+]]:_(s32) = G_PHI %7(s32), %bb.1, [[COPY1]](s32), %bb.0
+  ; CHECK-NEXT:   [[LOAD:%[0-9]+]]:_(<16 x s32>) = G_LOAD [[PHI]](p0) :: (load (<16 x s32>))
+  ; CHECK-NEXT:   [[C1:%[0-9]+]]:_(s20) = G_CONSTANT i20 128
+  ; CHECK-NEXT:   [[PTR_ADD1:%[0-9]+]]:_(p0) = G_PTR_ADD [[PHI]], [[C1]](s20)
+  ; CHECK-NEXT:   [[LOAD1:%[0-9]+]]:_(<16 x s32>) = G_LOAD [[PTR_ADD1]](p0) :: (load (<16 x s32>))
+  ; CHECK-NEXT:   [[C2:%[0-9]+]]:_(s20) = G_CONSTANT i20 256
+  ; CHECK-NEXT:   [[C3:%[0-9]+]]:_(s20) = G_CONSTANT i20 128
+  ; CHECK-NEXT:   [[PTR_ADD2:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD1]], [[C3]](s20)
+  ; CHECK-NEXT:   [[C4:%[0-9]+]]:_(s32) = G_CONSTANT i32 -1
+  ; CHECK-NEXT:   [[ADD:%[0-9]+]]:_(s32) = G_ADD [[PHI1]], [[C4]]
+  ; CHECK-NEXT:   [[C5:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
+  ; CHECK-NEXT:   [[ICMP:%[0-9]+]]:_(s1) = G_ICMP intpred(eq), [[ADD]](s32), [[C5]]
+  ; CHECK-NEXT:   G_BRCOND [[ICMP]](s1), %bb.2
+  ; CHECK-NEXT:   G_BR %bb.1
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.2:
+  ; CHECK-NEXT:   $x0 = COPY [[LOAD]](<16 x s32>)
+  ; CHECK-NEXT:   $x2 = COPY [[LOAD1]](<16 x s32>)
+  ; CHECK-NEXT:   PseudoRET implicit $lr, implicit $x0, implicit $x2
+  bb.0:
+    successors: %bb.1
+    liveins: $p0, $r0
+    %0:_(p0) = COPY $p0
+    %1:_(s32) = COPY $r0
+    %100:_(s20) = G_CONSTANT i20 256
+    %2:_(p0) = G_PTR_ADD %0, %100(s20)
+    G_BR %bb.1
+
+  bb.1:
+    successors: %bb.2, %bb.1
+    %3:_(p0) = G_PHI %10(p0), %bb.1, %2(p0), %bb.0
+    %13:_(s32) = G_PHI %15(s32), %bb.1, %1(s32), %bb.0
+    %7:_(<16 x s32>) = G_LOAD %3(p0) :: (load (<16 x s32>), align 64)
+    %8:_(s20) = G_CONSTANT i20 128
+    %9:_(p0) = G_PTR_ADD %3, %8(s20)
+    %11:_(<16 x s32>) = G_LOAD %9(p0) :: (load (<16 x s32>), align 64)
+    %12:_(s20) = G_CONSTANT i20 256
+    %10:_(p0) = G_PTR_ADD %3, %12(s20)
+    %14:_(s32) = G_CONSTANT i32 -1
+    %15:_(s32) = G_ADD %13, %14
+    %16:_(s32) = G_CONSTANT i32 0
+    %17:_(s1) = G_ICMP intpred(eq), %15(s32), %16
+    G_BRCOND %17(s1), %bb.2
+    G_BR %bb.1
+
+  bb.2:
+    $x0 = COPY %7
+    $x2 = COPY %11
+    PseudoRET implicit $lr, implicit $x0, implicit $x2
+...
diff --git a/llvm/test/CodeGen/AIE/hardware-loops/irtranslator-zol.ll b/llvm/test/CodeGen/AIE/hardware-loops/irtranslator-zol.ll
index 0dfde4656137..bee67bf03f78 100644
--- a/llvm/test/CodeGen/AIE/hardware-loops/irtranslator-zol.ll
+++ b/llvm/test/CodeGen/AIE/hardware-loops/irtranslator-zol.ll
@@ -4,7 +4,7 @@
 ; See https://llvm.org/LICENSE.txt for license information.
 ; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 ;
-; (c) Copyright 2024-2025 Advanced Micro Devices, Inc. or its affiliates
+; (c) Copyright 2024-2026 Advanced Micro Devices, Inc. or its affiliates
 
 ; RUN: llc -O2 -mtriple=aie2 -stop-after=irtranslator --enable-aie-hardware-loops --enable-aie-zero-overhead-loops \
 ; RUN:    --aie-force-hl-gen=true %s -o - | FileCheck %s --check-prefix=AIE2
@@ -25,7 +25,6 @@ define void @simple(ptr nocapture %out, ptr nocapture readonly %in, i32 noundef
   ; AIE2-NEXT:   [[COPY1:%[0-9]+]]:_(p0) = COPY $p1
   ; AIE2-NEXT:   [[COPY2:%[0-9]+]]:_(s32) = COPY $r0
   ; AIE2-NEXT:   [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 1
-  ; AIE2-NEXT:   [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
   ; AIE2-NEXT:   [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p0) :: (load (s32) from %ir.out)
   ; AIE2-NEXT:   G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.set.loop.iterations), [[COPY2]](s32)
   ; AIE2-NEXT:   G_BR %bb.3
@@ -36,17 +35,13 @@ define void @simple(ptr nocapture %out, ptr nocapture readonly %in, i32 noundef
   ; AIE2-NEXT: bb.3.for.body:
   ; AIE2-NEXT:   successors: %bb.3(0x7c000000), %bb.2(0x04000000)
   ; AIE2-NEXT: {{  $}}
-  ; AIE2-NEXT:   [[PHI:%[0-9]+]]:_(s32) = G_PHI [[LOAD]](s32), %bb.1, %12(s32), %bb.3
-  ; AIE2-NEXT:   [[PHI1:%[0-9]+]]:_(s32) = G_PHI [[C1]](s32), %bb.1, %14(s32), %bb.3
-  ; AIE2-NEXT:   [[TRUNC:%[0-9]+]]:_(s20) = G_TRUNC [[PHI1]](s32)
-  ; AIE2-NEXT:   [[C2:%[0-9]+]]:_(s20) = G_CONSTANT i20 4
-  ; AIE2-NEXT:   [[MUL:%[0-9]+]]:_(s20) = G_MUL [[TRUNC]], [[C2]]
-  ; AIE2-NEXT:   [[PTR_ADD:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY1]], [[MUL]](s20)
-  ; AIE2-NEXT:   [[COPY3:%[0-9]+]]:_(p0) = COPY [[PTR_ADD]](p0)
-  ; AIE2-NEXT:   [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[COPY3]](p0) :: (load (s32) from %ir.arrayidx)
-  ; AIE2-NEXT:   [[ADD:%[0-9]+]]:_(s32) = nsw G_ADD [[PHI]], [[LOAD1]]
+  ; AIE2-NEXT:   [[PHI:%[0-9]+]]:_(p0) = G_PHI %9(p0), %bb.3, [[COPY1]](p0), %bb.1
+  ; AIE2-NEXT:   [[PHI1:%[0-9]+]]:_(s32) = G_PHI [[LOAD]](s32), %bb.1, %7(s32), %bb.3
+  ; AIE2-NEXT:   [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PHI]](p0) :: (load (s32) from %ir.lsr.iv1)
+  ; AIE2-NEXT:   [[ADD:%[0-9]+]]:_(s32) = nsw G_ADD [[PHI1]], [[LOAD1]]
   ; AIE2-NEXT:   G_STORE [[ADD]](s32), [[COPY]](p0) :: (store (s32) into %ir.out)
-  ; AIE2-NEXT:   [[ADD1:%[0-9]+]]:_(s32) = nuw nsw G_ADD [[PHI1]], [[C]]
+  ; AIE2-NEXT:   [[C1:%[0-9]+]]:_(s20) = G_CONSTANT i20 4
+  ; AIE2-NEXT:   [[PTR_ADD:%[0-9]+]]:_(p0) = G_PTR_ADD [[PHI]], [[C1]](s20)
   ; AIE2-NEXT:   [[INT:%[0-9]+]]:_(s1) = G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.loop.decrement), [[C]](s32)
   ; AIE2-NEXT:   G_BRCOND [[INT]](s1), %bb.3
   ; AIE2-NEXT:   G_BR %bb.2
@@ -60,7 +55,6 @@ define void @simple(ptr nocapture %out, ptr nocapture readonly %in, i32 noundef
   ; AIE2p-NEXT:   [[COPY1:%[0-9]+]]:_(p0) = COPY $p1
   ; AIE2p-NEXT:   [[COPY2:%[0-9]+]]:_(s32) = COPY $r0
   ; AIE2p-NEXT:   [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 1
-  ; AIE2p-NEXT:   [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
   ; AIE2p-NEXT:   [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p0) :: (load (s32) from %ir.out)
   ; AIE2p-NEXT:   G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.set.loop.iterations), [[COPY2]](s32)
   ; AIE2p-NEXT:   G_BR %bb.3
@@ -71,17 +65,13 @@ define void @simple(ptr nocapture %out, ptr nocapture readonly %in, i32 noundef
   ; AIE2p-NEXT: bb.3.for.body:
   ; AIE2p-NEXT:   successors: %bb.3(0x7c000000), %bb.2(0x04000000)
   ; AIE2p-NEXT: {{  $}}
-  ; AIE2p-NEXT:   [[PHI:%[0-9]+]]:_(s32) = G_PHI [[LOAD]](s32), %bb.1, %12(s32), %bb.3
-  ; AIE2p-NEXT:   [[PHI1:%[0-9]+]]:_(s32) = G_PHI [[C1]](s32), %bb.1, %14(s32), %bb.3
-  ; AIE2p-NEXT:   [[TRUNC:%[0-9]+]]:_(s20) = G_TRUNC [[PHI1]](s32)
-  ; AIE2p-NEXT:   [[C2:%[0-9]+]]:_(s20) = G_CONSTANT i20 4
-  ; AIE2p-NEXT:   [[MUL:%[0-9]+]]:_(s20) = G_MUL [[TRUNC]], [[C2]]
-  ; AIE2p-NEXT:   [[PTR_ADD:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY1]], [[MUL]](s20)
-  ; AIE2p-NEXT:   [[COPY3:%[0-9]+]]:_(p0) = COPY [[PTR_ADD]](p0)
-  ; AIE2p-NEXT:   [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[COPY3]](p0) :: (load (s32) from %ir.arrayidx)
-  ; AIE2p-NEXT:   [[ADD:%[0-9]+]]:_(s32) = nsw G_ADD [[PHI]], [[LOAD1]]
+  ; AIE2p-NEXT:   [[PHI:%[0-9]+]]:_(p0) = G_PHI %9(p0), %bb.3, [[COPY1]](p0), %bb.1
+  ; AIE2p-NEXT:   [[PHI1:%[0-9]+]]:_(s32) = G_PHI [[LOAD]](s32), %bb.1, %7(s32), %bb.3
+  ; AIE2p-NEXT:   [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PHI]](p0) :: (load (s32) from %ir.lsr.iv1)
+  ; AIE2p-NEXT:   [[ADD:%[0-9]+]]:_(s32) = nsw G_ADD [[PHI1]], [[LOAD1]]
   ; AIE2p-NEXT:   G_STORE [[ADD]](s32), [[COPY]](p0) :: (store (s32) into %ir.out)
-  ; AIE2p-NEXT:   [[ADD1:%[0-9]+]]:_(s32) = nuw nsw G_ADD [[PHI1]], [[C]]
+  ; AIE2p-NEXT:   [[C1:%[0-9]+]]:_(s20) = G_CONSTANT i20 4
+  ; AIE2p-NEXT:   [[PTR_ADD:%[0-9]+]]:_(p0) = G_PTR_ADD [[PHI]], [[C1]](s20)
   ; AIE2p-NEXT:   [[INT:%[0-9]+]]:_(s1) = G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.loop.decrement), [[C]](s32)
   ; AIE2p-NEXT:   G_BRCOND [[INT]](s1), %bb.3
   ; AIE2p-NEXT:   G_BR %bb.2
diff --git a/llvm/test/CodeGen/AIE/hardware-loops/nested.ll b/llvm/test/CodeGen/AIE/hardware-loops/nested.ll
index 39aa8210245f..0a9498dd5504 100644
--- a/llvm/test/CodeGen/AIE/hardware-loops/nested.ll
+++ b/llvm/test/CodeGen/AIE/hardware-loops/nested.ll
@@ -22,45 +22,39 @@
 define void @nested(ptr nocapture %out, ptr nocapture readonly %in, i32 noundef %size, i32 noundef %size2) {
 ; AIE2-LABEL: nested:
 ; AIE2:       // %bb.0: // %for.cond3.preheader.lr.ph
-; AIE2-NEXT:    nopb ; mova r3, #0; nops ; nopxm ; nopv
-; AIE2-NEXT:    mova r4, #2; nopx
-; AIE2-NEXT:    movxm p2, #.LBB0_2
+; AIE2-NEXT:    nopa ; nopb ; movxm p2, #.LBB0_2
 ; AIE2-NEXT:    lda r2, [p0, #0]
 ; AIE2-NEXT:  .LBB0_1: // %for.cond3.preheader
 ; AIE2-NEXT:    // =>This Loop Header: Depth=1
 ; AIE2-NEXT:    // Child Loop BB0_2 Depth 2
-; AIE2-NEXT:    nopa ; lshl r5, r3, r4; nopm
-; AIE2-NEXT:    mov dj0, r5
-; AIE2-NEXT:    lda p3, [p1, dj0]
+; AIE2-NEXT:    nopb ; lda p3, [p1, #0]; nops ; nopxm ; nopv
+; AIE2-NEXT:    nopx
 ; AIE2-NEXT:    nop
 ; AIE2-NEXT:    nop
 ; AIE2-NEXT:    nop
-; AIE2-NEXT:    mova r6, #0
-; AIE2-NEXT:    add.nc r5, r1, #-1
+; AIE2-NEXT:    nop
+; AIE2-NEXT:    add.nc r3, r1, #-1
 ; AIE2-NEXT:  .LBB0_2: // %for.body6
 ; AIE2-NEXT:    // Parent Loop BB0_1 Depth=1
 ; AIE2-NEXT:    // => This Inner Loop Header: Depth=2
-; AIE2-NEXT:    nopa ; lshl r7, r6, r4; nopm
-; AIE2-NEXT:    mov dj0, r7
-; AIE2-NEXT:    lda r7, [p3, dj0]
+; AIE2-NEXT:    lda r4, [p3], #4; nopx
 ; AIE2-NEXT:    nop
 ; AIE2-NEXT:    nop
-; AIE2-NEXT:    jnzd r5, r5, p2
+; AIE2-NEXT:    jnzd r3, r3, p2
 ; AIE2-NEXT:    nop // Delay Slot 5
 ; AIE2-NEXT:    nop // Delay Slot 4
-; AIE2-NEXT:    add r6, r6, #1 // Delay Slot 3
-; AIE2-NEXT:    add r2, r2, r7 // Delay Slot 2
+; AIE2-NEXT:    nop // Delay Slot 3
+; AIE2-NEXT:    add r2, r2, r4 // Delay Slot 2
 ; AIE2-NEXT:    st r2, [p0, #0] // Delay Slot 1
 ; AIE2-NEXT:  // %bb.3: // %for.cond3.for.cond.cleanup5_crit_edge
 ; AIE2-NEXT:    // in Loop: Header=BB0_1 Depth=1
-; AIE2-NEXT:    add r3, r3, #1
-; AIE2-NEXT:    eq r5, r0, r3
-; AIE2-NEXT:    jz r5, #.LBB0_1
+; AIE2-NEXT:    add r0, r0, #-1
+; AIE2-NEXT:    jnz r0, #.LBB0_1
 ; AIE2-NEXT:    nop // Delay Slot 5
 ; AIE2-NEXT:    nop // Delay Slot 4
 ; AIE2-NEXT:    nop // Delay Slot 3
 ; AIE2-NEXT:    nop // Delay Slot 2
-; AIE2-NEXT:    nop // Delay Slot 1
+; AIE2-NEXT:    paddb [p1], #4 // Delay Slot 1
 ; AIE2-NEXT:  // %bb.4: // %for.cond.cleanup
 ; AIE2-NEXT:    ret lr
 ; AIE2-NEXT:    nop // Delay Slot 5
@@ -71,44 +65,39 @@ define void @nested(ptr nocapture %out, ptr nocapture readonly %in, i32 noundef
 ;
 ; AIE2P-LABEL: nested:
 ; AIE2P:       // %bb.0: // %for.cond3.preheader.lr.ph
-; AIE2P-NEXT:    mova r3, #0; nopb ; nops ; nopxm ; nopv
-; AIE2P-NEXT:    mova r4, #2; nopx
 ; AIE2P-NEXT:    movxm p2, #.LBB0_2
+; AIE2P-NEXT:    mova m0, #4; nopx
 ; AIE2P-NEXT:    lda r2, [p0, #0]
 ; AIE2P-NEXT:  .LBB0_1: // %for.cond3.preheader
 ; AIE2P-NEXT:    // =>This Loop Header: Depth=1
 ; AIE2P-NEXT:    // Child Loop BB0_2 Depth 2
-; AIE2P-NEXT:    nopa ; lshl r5, r3, r4; nopm
-; AIE2P-NEXT:    mov dj0, r5
-; AIE2P-NEXT:    lda p3, [p1, dj0]
+; AIE2P-NEXT:    lda p3, [p1, #0]; nopb ; nops ; nopxm ; nopv
+; AIE2P-NEXT:    nopx
 ; AIE2P-NEXT:    nop
 ; AIE2P-NEXT:    nop
 ; AIE2P-NEXT:    nop
-; AIE2P-NEXT:    mova r6, #0
-; AIE2P-NEXT:    add.nc r5, r1, #-1
+; AIE2P-NEXT:    nop
+; AIE2P-NEXT:    add.nc r3, r1, #-1
 ; AIE2P-NEXT:  .LBB0_2: // %for.body6
 ; AIE2P-NEXT:    // Parent Loop BB0_1 Depth=1
 ; AIE2P-NEXT:    // => This Inner Loop Header: Depth=2
-; AIE2P-NEXT:    nopa ; lshl r7, r6, r4; nopm
-; AIE2P-NEXT:    mov dj0, r7
-; AIE2P-NEXT:    lda r7, [p3, dj0]
+; AIE2P-NEXT:    lda r4, [p3], #4; nopx
 ; AIE2P-NEXT:    nop
 ; AIE2P-NEXT:    nop
-; AIE2P-NEXT:    jnzd r5, r5, p2
+; AIE2P-NEXT:    jnzd r3, r3, p2
 ; AIE2P-NEXT:    nop // Delay Slot 5
 ; AIE2P-NEXT:    nop // Delay Slot 4
-; AIE2P-NEXT:    add r6, r6, #1 // Delay Slot 3
-; AIE2P-NEXT:    add r2, r2, r7 // Delay Slot 2
+; AIE2P-NEXT:    nop // Delay Slot 3
+; AIE2P-NEXT:    add r2, r2, r4 // Delay Slot 2
 ; AIE2P-NEXT:    st r2, [p0, #0] // Delay Slot 1
 ; AIE2P-NEXT:  // %bb.3: // %for.cond3.for.cond.cleanup5_crit_edge
 ; AIE2P-NEXT:    // in Loop: Header=BB0_1 Depth=1
-; AIE2P-NEXT:    add r3, r3, #1
-; AIE2P-NEXT:    eq r5, r0, r3
-; AIE2P-NEXT:    jz r5, #.LBB0_1
+; AIE2P-NEXT:    add r0, r0, #-1
+; AIE2P-NEXT:    jnz r0, #.LBB0_1
 ; AIE2P-NEXT:    nop // Delay Slot 5
 ; AIE2P-NEXT:    nop // Delay Slot 4
 ; AIE2P-NEXT:    nop // Delay Slot 3
-; AIE2P-NEXT:    nop // Delay Slot 2
+; AIE2P-NEXT:    padda [p1], m0 // Delay Slot 2
 ; AIE2P-NEXT:    nop // Delay Slot 1
 ; AIE2P-NEXT:  // %bb.4: // %for.cond.cleanup
 ; AIE2P-NEXT:    ret lr
@@ -120,45 +109,40 @@ define void @nested(ptr nocapture %out, ptr nocapture readonly %in, i32 noundef
 ;
 ; AIE2PS-LABEL: nested:
 ; AIE2PS:       // %bb.0: // %for.cond3.preheader.lr.ph
-; AIE2PS-NEXT:    mova r4, #0; nopb ; nops ; nopxm ; nopv
-; AIE2PS-NEXT:    mova r6, #2; nopx
 ; AIE2PS-NEXT:    movxm p2, #.LBB0_2
+; AIE2PS-NEXT:    mova m0, #4; nopx
 ; AIE2PS-NEXT:    lda r2, [p0, #0]
 ; AIE2PS-NEXT:  .LBB0_1: // %for.cond3.preheader
 ; AIE2PS-NEXT:    // =>This Loop Header: Depth=1
 ; AIE2PS-NEXT:    // Child Loop BB0_2 Depth 2
-; AIE2PS-NEXT:    nopa ; lshl r16, r4, r6; nopm
-; AIE2PS-NEXT:    mov dj0, r16
-; AIE2PS-NEXT:    lda p3, [p1, dj0]
+; AIE2PS-NEXT:    lda p3, [p1, #0]; nopb ; nops ; nopxm ; nopv
+; AIE2PS-NEXT:    nopx
+; AIE2PS-NEXT:    nop
 ; AIE2PS-NEXT:    nop
 ; AIE2PS-NEXT:    nop
 ; AIE2PS-NEXT:    nop
 ; AIE2PS-NEXT:    addm.nc r3, r1, #-1
-; AIE2PS-NEXT:    mova r16, #0
 ; AIE2PS-NEXT:  .LBB0_2: // %for.body6
 ; AIE2PS-NEXT:    // Parent Loop BB0_1 Depth=1
 ; AIE2PS-NEXT:    // => This Inner Loop Header: Depth=2
-; AIE2PS-NEXT:    nopa ; lshl r18, r16, r6; nopm
-; AIE2PS-NEXT:    mov dj0, r18
-; AIE2PS-NEXT:    lda r18, [p3, dj0]
+; AIE2PS-NEXT:    lda r4, [p3], #4; nopx
 ; AIE2PS-NEXT:    nop
 ; AIE2PS-NEXT:    nop
 ; AIE2PS-NEXT:    jnzd r3, r3, p2
 ; AIE2PS-NEXT:    nop // Delay Slot 5
 ; AIE2PS-NEXT:    nop // Delay Slot 4
-; AIE2PS-NEXT:    add r16, r16, #1 // Delay Slot 3
-; AIE2PS-NEXT:    add r2, r2, r18 // Delay Slot 2
+; AIE2PS-NEXT:    nop // Delay Slot 3
+; AIE2PS-NEXT:    add r2, r2, r4 // Delay Slot 2
 ; AIE2PS-NEXT:    st r2, [p0, #0] // Delay Slot 1
 ; AIE2PS-NEXT:  // %bb.3: // %for.cond3.for.cond.cleanup5_crit_edge
 ; AIE2PS-NEXT:    // in Loop: Header=BB0_1 Depth=1
-; AIE2PS-NEXT:    add r4, r4, #1
-; AIE2PS-NEXT:    eq r16, r0, r4
-; AIE2PS-NEXT:    jz r16, #.LBB0_1
+; AIE2PS-NEXT:    add r0, r0, #-1
+; AIE2PS-NEXT:    jnz r0, #.LBB0_1
 ; AIE2PS-NEXT:    nop // Delay Slot 5
 ; AIE2PS-NEXT:    nop // Delay Slot 4
 ; AIE2PS-NEXT:    nop // Delay Slot 3
 ; AIE2PS-NEXT:    nop // Delay Slot 2
-; AIE2PS-NEXT:    nop // Delay Slot 1
+; AIE2PS-NEXT:    padda [p1], m0 // Delay Slot 1
 ; AIE2PS-NEXT:  // %bb.4: // %for.cond.cleanup
 ; AIE2PS-NEXT:    ret lr
 ; AIE2PS-NEXT:    nop // Delay Slot 5
diff --git a/llvm/test/CodeGen/AIE/hardware-loops/sibling.ll b/llvm/test/CodeGen/AIE/hardware-loops/sibling.ll
index 3a5ea5cf9763..c1bff8247501 100644
--- a/llvm/test/CodeGen/AIE/hardware-loops/sibling.ll
+++ b/llvm/test/CodeGen/AIE/hardware-loops/sibling.ll
@@ -16,42 +16,35 @@
 define void @sibling(ptr nocapture %out, ptr nocapture readonly %in, i32 noundef %size, i32 noundef %size2) {
 ; AIE2-LABEL: sibling:
 ; AIE2:       // %bb.0: // %for.body.lr.ph
-; AIE2-NEXT:    mova r2, #0; nopxm
-; AIE2-NEXT:    add.nc r0, r0, #-1
-; AIE2-NEXT:    mova r4, #2
-; AIE2-NEXT:    movxm p2, #.LBB0_1
-; AIE2-NEXT:    mova r5, #0
-; AIE2-NEXT:    lda r3, [p0, #0]
+; AIE2-NEXT:    nopb ; nopa ; nops ; nopx ; add.nc r0, r0, #-1; nopv
+; AIE2-NEXT:    nop ; movxm p2, #.LBB0_1
+; AIE2-NEXT:    mov p3, p1
+; AIE2-NEXT:    lda r2, [p0, #0]
 ; AIE2-NEXT:  .LBB0_1: // %for.body
 ; AIE2-NEXT:    // =>This Inner Loop Header: Depth=1
-; AIE2-NEXT:    nopa ; nopb ; lshl r6, r5, r4; nopm ; nops
-; AIE2-NEXT:    mov dj0, r6
-; AIE2-NEXT:    lda r6, [p1, dj0]
+; AIE2-NEXT:    lda r3, [p3], #4; nopb ; nopxm
 ; AIE2-NEXT:    nop
 ; AIE2-NEXT:    nop
 ; AIE2-NEXT:    jnzd r0, r0, p2
 ; AIE2-NEXT:    nop // Delay Slot 5
 ; AIE2-NEXT:    nop // Delay Slot 4
-; AIE2-NEXT:    add r5, r5, #1 // Delay Slot 3
-; AIE2-NEXT:    add r3, r3, r6 // Delay Slot 2
-; AIE2-NEXT:    st r3, [p0, #0] // Delay Slot 1
+; AIE2-NEXT:    nop // Delay Slot 3
+; AIE2-NEXT:    add r2, r2, r3 // Delay Slot 2
+; AIE2-NEXT:    st r2, [p0, #0] // Delay Slot 1
 ; AIE2-NEXT:  // %bb.2: // %for.body6.lr.ph
 ; AIE2-NEXT:    add.nc r1, r1, #-1
-; AIE2-NEXT:    mova r3, #2
 ; AIE2-NEXT:    movxm p2, #.LBB0_3
 ; AIE2-NEXT:    lda r0, [p0, #0]
 ; AIE2-NEXT:  .LBB0_3: // %for.body6
 ; AIE2-NEXT:    // =>This Inner Loop Header: Depth=1
-; AIE2-NEXT:    nopb ; nopa ; nops ; lshl r4, r2, r3; nopm ; nopv
-; AIE2-NEXT:    nopa ; mov dj0, r4
-; AIE2-NEXT:    lda r4, [p1, dj0]
+; AIE2-NEXT:    lda r2, [p1], #4; nopb ; nopxm
 ; AIE2-NEXT:    nop
 ; AIE2-NEXT:    nop
 ; AIE2-NEXT:    jnzd r1, r1, p2
 ; AIE2-NEXT:    nop // Delay Slot 5
 ; AIE2-NEXT:    nop // Delay Slot 4
-; AIE2-NEXT:    add r2, r2, #1 // Delay Slot 3
-; AIE2-NEXT:    add r0, r0, r4 // Delay Slot 2
+; AIE2-NEXT:    nop // Delay Slot 3
+; AIE2-NEXT:    add r0, r0, r2 // Delay Slot 2
 ; AIE2-NEXT:    st r0, [p0, #0] // Delay Slot 1
 ; AIE2-NEXT:  // %bb.4: // %for.cond.cleanup5
 ; AIE2-NEXT:    ret lr
@@ -63,42 +56,35 @@ define void @sibling(ptr nocapture %out, ptr nocapture readonly %in, i32 noundef
 ;
 ; AIE2P-LABEL: sibling:
 ; AIE2P:       // %bb.0: // %for.body.lr.ph
-; AIE2P-NEXT:    mova r2, #0; nopxm
-; AIE2P-NEXT:    add.nc r0, r0, #-1
-; AIE2P-NEXT:    mova r4, #2
+; AIE2P-NEXT:    nopa ; nopb ; nops ; nopx ; add.nc r0, r0, #-1; nopv
 ; AIE2P-NEXT:    movxm p2, #.LBB0_1
-; AIE2P-NEXT:    mova r5, #0
-; AIE2P-NEXT:    lda r3, [p0, #0]
+; AIE2P-NEXT:    nopx ; mov p3, p1
+; AIE2P-NEXT:    lda r2, [p0, #0]
 ; AIE2P-NEXT:  .LBB0_1: // %for.body
 ; AIE2P-NEXT:    // =>This Inner Loop Header: Depth=1
-; AIE2P-NEXT:    nopa ; nopb ; lshl r6, r5, r4; nopm ; nops
-; AIE2P-NEXT:    mov dj0, r6
-; AIE2P-NEXT:    lda r6, [p1, dj0]
+; AIE2P-NEXT:    lda r3, [p3], #4; nopb ; nopxm
 ; AIE2P-NEXT:    nop
 ; AIE2P-NEXT:    nop
 ; AIE2P-NEXT:    jnzd r0, r0, p2
 ; AIE2P-NEXT:    nop // Delay Slot 5
 ; AIE2P-NEXT:    nop // Delay Slot 4
-; AIE2P-NEXT:    add r5, r5, #1 // Delay Slot 3
-; AIE2P-NEXT:    add r3, r3, r6 // Delay Slot 2
-; AIE2P-NEXT:    st r3, [p0, #0] // Delay Slot 1
+; AIE2P-NEXT:    nop // Delay Slot 3
+; AIE2P-NEXT:    add r2, r2, r3 // Delay Slot 2
+; AIE2P-NEXT:    st r2, [p0, #0] // Delay Slot 1
 ; AIE2P-NEXT:  // %bb.2: // %for.body6.lr.ph
 ; AIE2P-NEXT:    add.nc r1, r1, #-1
-; AIE2P-NEXT:    mova r3, #2
 ; AIE2P-NEXT:    movxm p2, #.LBB0_3
 ; AIE2P-NEXT:    lda r0, [p0, #0]
 ; AIE2P-NEXT:  .LBB0_3: // %for.body6
 ; AIE2P-NEXT:    // =>This Inner Loop Header: Depth=1
-; AIE2P-NEXT:    nopa ; nopb ; nops ; lshl r4, r2, r3; nopm ; nopv
-; AIE2P-NEXT:    nopx ; mov dj0, r4
-; AIE2P-NEXT:    lda r4, [p1, dj0]
+; AIE2P-NEXT:    lda r2, [p1], #4; nopb ; nopxm
 ; AIE2P-NEXT:    nop
 ; AIE2P-NEXT:    nop
 ; AIE2P-NEXT:    jnzd r1, r1, p2
 ; AIE2P-NEXT:    nop // Delay Slot 5
 ; AIE2P-NEXT:    nop // Delay Slot 4
-; AIE2P-NEXT:    add r2, r2, #1 // Delay Slot 3
-; AIE2P-NEXT:    add r0, r0, r4 // Delay Slot 2
+; AIE2P-NEXT:    nop // Delay Slot 3
+; AIE2P-NEXT:    add r0, r0, r2 // Delay Slot 2
 ; AIE2P-NEXT:    st r0, [p0, #0] // Delay Slot 1
 ; AIE2P-NEXT:  // %bb.4: // %for.cond.cleanup5
 ; AIE2P-NEXT:    ret lr
@@ -110,42 +96,35 @@ define void @sibling(ptr nocapture %out, ptr nocapture readonly %in, i32 noundef
 ;
 ; AIE2PS-LABEL: sibling:
 ; AIE2PS:       // %bb.0: // %for.body.lr.ph
-; AIE2PS-NEXT:    mova r2, #0; nopxm
-; AIE2PS-NEXT:    addm.nc r3, r0, #-1
-; AIE2PS-NEXT:    mova r0, #2
+; AIE2PS-NEXT:    nopa ; nopb ; nops ; nopx ; addm.nc r3, r0, #-1; nopv
 ; AIE2PS-NEXT:    movxm p2, #.LBB0_1
-; AIE2PS-NEXT:    mova r6, #0
-; AIE2PS-NEXT:    lda r4, [p0, #0]
+; AIE2PS-NEXT:    nopx ; mov p3, p1
+; AIE2PS-NEXT:    lda r2, [p0, #0]
 ; AIE2PS-NEXT:  .LBB0_1: // %for.body
 ; AIE2PS-NEXT:    // =>This Inner Loop Header: Depth=1
-; AIE2PS-NEXT:    nopa ; nopb ; lshl r16, r6, r0; nopm ; nops
-; AIE2PS-NEXT:    mov dj0, r16
-; AIE2PS-NEXT:    lda r16, [p1, dj0]
+; AIE2PS-NEXT:    lda r0, [p3], #4; nopb ; nopxm
 ; AIE2PS-NEXT:    nop
 ; AIE2PS-NEXT:    nop
 ; AIE2PS-NEXT:    jnzd r3, r3, p2
 ; AIE2PS-NEXT:    nop // Delay Slot 5
 ; AIE2PS-NEXT:    nop // Delay Slot 4
-; AIE2PS-NEXT:    add r6, r6, #1 // Delay Slot 3
-; AIE2PS-NEXT:    add r4, r4, r16 // Delay Slot 2
-; AIE2PS-NEXT:    st r4, [p0, #0] // Delay Slot 1
+; AIE2PS-NEXT:    nop // Delay Slot 3
+; AIE2PS-NEXT:    add r2, r2, r0 // Delay Slot 2
+; AIE2PS-NEXT:    st r2, [p0, #0] // Delay Slot 1
 ; AIE2PS-NEXT:  // %bb.2: // %for.body6.lr.ph
 ; AIE2PS-NEXT:    addm.nc r1, r1, #-1
-; AIE2PS-NEXT:    mova r4, #2
 ; AIE2PS-NEXT:    movxm p2, #.LBB0_3
 ; AIE2PS-NEXT:    lda r0, [p0, #0]
 ; AIE2PS-NEXT:  .LBB0_3: // %for.body6
 ; AIE2PS-NEXT:    // =>This Inner Loop Header: Depth=1
-; AIE2PS-NEXT:    nopa ; nopb ; nops ; lshl r6, r2, r4; nopm ; nopv
-; AIE2PS-NEXT:    nopx ; mov dj0, r6
-; AIE2PS-NEXT:    lda r6, [p1, dj0]
+; AIE2PS-NEXT:    lda r2, [p1], #4; nopb ; nopxm
 ; AIE2PS-NEXT:    nop
 ; AIE2PS-NEXT:    nop
 ; AIE2PS-NEXT:    jnzd r1, r1, p2
 ; AIE2PS-NEXT:    nop // Delay Slot 5
 ; AIE2PS-NEXT:    nop // Delay Slot 4
-; AIE2PS-NEXT:    add r2, r2, #1 // Delay Slot 3
-; AIE2PS-NEXT:    add r0, r0, r6 // Delay Slot 2
+; AIE2PS-NEXT:    nop // Delay Slot 3
+; AIE2PS-NEXT:    add r0, r0, r2 // Delay Slot 2
 ; AIE2PS-NEXT:    st r0, [p0, #0] // Delay Slot 1
 ; AIE2PS-NEXT:  // %bb.4: // %for.cond.cleanup5
 ; AIE2PS-NEXT:    ret lr
diff --git a/llvm/test/CodeGen/AIE/hardware-loops/simple.ll b/llvm/test/CodeGen/AIE/hardware-loops/simple.ll
index 19be9728fcea..900a895568cb 100644
--- a/llvm/test/CodeGen/AIE/hardware-loops/simple.ll
+++ b/llvm/test/CodeGen/AIE/hardware-loops/simple.ll
@@ -16,23 +16,19 @@
 define void @simple(ptr nocapture %out, ptr nocapture readonly %in, i32 noundef %size) {
 ; AIE2-LABEL: simple:
 ; AIE2:       // %bb.0: // %for.body.lr.ph
-; AIE2-NEXT:    mova r2, #0; nopb ; nopxm ; nops
-; AIE2-NEXT:    add.nc r0, r0, #-1
-; AIE2-NEXT:    mova r3, #2
+; AIE2-NEXT:    nopa ; add.nc r0, r0, #-1
 ; AIE2-NEXT:    movxm p2, #.LBB0_1
 ; AIE2-NEXT:    lda r1, [p0, #0]
 ; AIE2-NEXT:  .LBB0_1: // %for.body
 ; AIE2-NEXT:    // =>This Inner Loop Header: Depth=1
-; AIE2-NEXT:    nopb ; nopa ; nops ; lshl r4, r2, r3; nopm ; nopv
-; AIE2-NEXT:    nopa ; mov dj0, r4
-; AIE2-NEXT:    lda r4, [p1, dj0]
+; AIE2-NEXT:    lda r2, [p1], #4; nopb ; nopxm
 ; AIE2-NEXT:    nop
 ; AIE2-NEXT:    nop
 ; AIE2-NEXT:    jnzd r0, r0, p2
 ; AIE2-NEXT:    nop // Delay Slot 5
 ; AIE2-NEXT:    nop // Delay Slot 4
-; AIE2-NEXT:    add r2, r2, #1 // Delay Slot 3
-; AIE2-NEXT:    add r1, r1, r4 // Delay Slot 2
+; AIE2-NEXT:    nop // Delay Slot 3
+; AIE2-NEXT:    add r1, r1, r2 // Delay Slot 2
 ; AIE2-NEXT:    st r1, [p0, #0] // Delay Slot 1
 ; AIE2-NEXT:  // %bb.2: // %for.cond.cleanup
 ; AIE2-NEXT:    ret lr
@@ -44,23 +40,19 @@ define void @simple(ptr nocapture %out, ptr nocapture readonly %in, i32 noundef
 ;
 ; AIE2P-LABEL: simple:
 ; AIE2P:       // %bb.0: // %for.body.lr.ph
-; AIE2P-NEXT:    mova r2, #0; nopb ; nopxm ; nops
-; AIE2P-NEXT:    add.nc r0, r0, #-1
-; AIE2P-NEXT:    mova r3, #2
+; AIE2P-NEXT:    nopx ; add.nc r0, r0, #-1
 ; AIE2P-NEXT:    movxm p2, #.LBB0_1
 ; AIE2P-NEXT:    lda r1, [p0, #0]
 ; AIE2P-NEXT:  .LBB0_1: // %for.body
 ; AIE2P-NEXT:    // =>This Inner Loop Header: Depth=1
-; AIE2P-NEXT:    nopa ; nopb ; nops ; lshl r4, r2, r3; nopm ; nopv
-; AIE2P-NEXT:    nopx ; mov dj0, r4
-; AIE2P-NEXT:    lda r4, [p1, dj0]
+; AIE2P-NEXT:    lda r2, [p1], #4; nopb ; nopxm
 ; AIE2P-NEXT:    nop
 ; AIE2P-NEXT:    nop
 ; AIE2P-NEXT:    jnzd r0, r0, p2
 ; AIE2P-NEXT:    nop // Delay Slot 5
 ; AIE2P-NEXT:    nop // Delay Slot 4
-; AIE2P-NEXT:    add r2, r2, #1 // Delay Slot 3
-; AIE2P-NEXT:    add r1, r1, r4 // Delay Slot 2
+; AIE2P-NEXT:    nop // Delay Slot 3
+; AIE2P-NEXT:    add r1, r1, r2 // Delay Slot 2
 ; AIE2P-NEXT:    st r1, [p0, #0] // Delay Slot 1
 ; AIE2P-NEXT:  // %bb.2: // %for.cond.cleanup
 ; AIE2P-NEXT:    ret lr
@@ -72,23 +64,19 @@ define void @simple(ptr nocapture %out, ptr nocapture readonly %in, i32 noundef
 ;
 ; AIE2PS-LABEL: simple:
 ; AIE2PS:       // %bb.0: // %for.body.lr.ph
-; AIE2PS-NEXT:    mova r4, #0; nopb ; nopxm ; nops
-; AIE2PS-NEXT:    addm.nc r1, r0, #-1
-; AIE2PS-NEXT:    mova r0, #2
+; AIE2PS-NEXT:    nopx ; addm.nc r1, r0, #-1
 ; AIE2PS-NEXT:    movxm p2, #.LBB0_1
 ; AIE2PS-NEXT:    lda r2, [p0, #0]
 ; AIE2PS-NEXT:  .LBB0_1: // %for.body
 ; AIE2PS-NEXT:    // =>This Inner Loop Header: Depth=1
-; AIE2PS-NEXT:    nopa ; nopb ; nops ; lshl r6, r4, r0; nopm ; nopv
-; AIE2PS-NEXT:    nopx ; mov dj0, r6
-; AIE2PS-NEXT:    lda r6, [p1, dj0]
+; AIE2PS-NEXT:    lda r0, [p1], #4; nopb ; nopxm
 ; AIE2PS-NEXT:    nop
 ; AIE2PS-NEXT:    nop
 ; AIE2PS-NEXT:    jnzd r1, r1, p2
 ; AIE2PS-NEXT:    nop // Delay Slot 5
 ; AIE2PS-NEXT:    nop // Delay Slot 4
-; AIE2PS-NEXT:    add r4, r4, #1 // Delay Slot 3
-; AIE2PS-NEXT:    add r2, r2, r6 // Delay Slot 2
+; AIE2PS-NEXT:    nop // Delay Slot 3
+; AIE2PS-NEXT:    add r2, r2, r0 // Delay Slot 2
 ; AIE2PS-NEXT:    st r2, [p0, #0] // Delay Slot 1
 ; AIE2PS-NEXT:  // %bb.2: // %for.cond.cleanup
 ; AIE2PS-NEXT:    ret lr
diff --git a/llvm/test/CodeGen/AIE/hardware-loops/unknown-tc.ll b/llvm/test/CodeGen/AIE/hardware-loops/unknown-tc.ll
index 7f8c966db823..c37dc3e5e401 100644
--- a/llvm/test/CodeGen/AIE/hardware-loops/unknown-tc.ll
+++ b/llvm/test/CodeGen/AIE/hardware-loops/unknown-tc.ll
@@ -4,41 +4,71 @@
 ; See https://llvm.org/LICENSE.txt for license information.
 ; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 ;
-; (c) Copyright 2023-2025 Advanced Micro Devices, Inc. or its affiliates
+; (c) Copyright 2023-2026 Advanced Micro Devices, Inc. or its affiliates
 
-; RUN: llc -O2 -mtriple=aie2 --issue-limit=1 %s -o - | FileCheck %s
-; RUN: llc -O2 -mtriple=aie2p --issue-limit=1 %s -o - | FileCheck %s
+; RUN: llc -O2 -mtriple=aie2 --issue-limit=1 %s -o - | FileCheck %s --check-prefix=AIE2
+; RUN: llc -O2 -mtriple=aie2p --issue-limit=1 %s -o - | FileCheck %s --check-prefix=AIE2P
 
 define void @cbz_exit(ptr %in, ptr %res) {
-; CHECK-LABEL: cbz_exit:
-; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    mova r0, #-1; nopb ; nopxm
-; CHECK-NEXT:    mova r1, #2
-; CHECK-NEXT:  .LBB0_1: // %loop
-; CHECK-NEXT:    // =>This Inner Loop Header: Depth=1
-; CHECK-NEXT:    add r0, r0, #1
-; CHECK-NEXT:    lshl r2, r0, r1
-; CHECK-NEXT:    mov dj0, r2
-; CHECK-NEXT:    lda r2, [p0, dj0]
-; CHECK-NEXT:    nop
-; CHECK-NEXT:    nop
-; CHECK-NEXT:    nop
-; CHECK-NEXT:    nop
-; CHECK-NEXT:    nop
-; CHECK-NEXT:    nop
-; CHECK-NEXT:    jnz r2, #.LBB0_1
-; CHECK-NEXT:    nop // Delay Slot 5
-; CHECK-NEXT:    nop // Delay Slot 4
-; CHECK-NEXT:    nop // Delay Slot 3
-; CHECK-NEXT:    nop // Delay Slot 2
-; CHECK-NEXT:    nop // Delay Slot 1
-; CHECK-NEXT:  // %bb.2: // %exit
-; CHECK-NEXT:    ret lr
-; CHECK-NEXT:    nop // Delay Slot 5
-; CHECK-NEXT:    nop // Delay Slot 4
-; CHECK-NEXT:    nop // Delay Slot 3
-; CHECK-NEXT:    st r0, [p1, #0] // Delay Slot 2
-; CHECK-NEXT:    nop // Delay Slot 1
+; AIE2-LABEL: cbz_exit:
+; AIE2:       // %bb.0: // %entry
+; AIE2-NEXT:    mova r0, #-1; nopb ; nopxm
+; AIE2-NEXT:    mova r1, #2
+; AIE2-NEXT:  .LBB0_1: // %loop
+; AIE2-NEXT:    // =>This Inner Loop Header: Depth=1
+; AIE2-NEXT:    nopa ; nopb ; add r0, r0, #1
+; AIE2-NEXT:    lshl r2, r0, r1
+; AIE2-NEXT:    mov dj0, r2
+; AIE2-NEXT:    lda r2, [p0, dj0]
+; AIE2-NEXT:    nop
+; AIE2-NEXT:    nop
+; AIE2-NEXT:    nop
+; AIE2-NEXT:    nop
+; AIE2-NEXT:    nop
+; AIE2-NEXT:    nop
+; AIE2-NEXT:    jnz r2, #.LBB0_1
+; AIE2-NEXT:    nop // Delay Slot 5
+; AIE2-NEXT:    nop // Delay Slot 4
+; AIE2-NEXT:    nop // Delay Slot 3
+; AIE2-NEXT:    nop // Delay Slot 2
+; AIE2-NEXT:    nop // Delay Slot 1
+; AIE2-NEXT:  // %bb.2: // %exit
+; AIE2-NEXT:    ret lr
+; AIE2-NEXT:    nop // Delay Slot 5
+; AIE2-NEXT:    nop // Delay Slot 4
+; AIE2-NEXT:    nop // Delay Slot 3
+; AIE2-NEXT:    st r0, [p1, #0] // Delay Slot 2
+; AIE2-NEXT:    nop // Delay Slot 1
+;
+; AIE2P-LABEL: cbz_exit:
+; AIE2P:       // %bb.0: // %entry
+; AIE2P-NEXT:    mova r0, #-1; nopb ; nopxm
+; AIE2P-NEXT:    mova r1, #2
+; AIE2P-NEXT:  .LBB0_1: // %loop
+; AIE2P-NEXT:    // =>This Inner Loop Header: Depth=1
+; AIE2P-NEXT:    nopa ; nopb ; add r0, r0, #1
+; AIE2P-NEXT:    lshl r2, r0, r1
+; AIE2P-NEXT:    mov dj0, r2
+; AIE2P-NEXT:    lda r2, [p0, dj0]
+; AIE2P-NEXT:    nop
+; AIE2P-NEXT:    nop
+; AIE2P-NEXT:    nop
+; AIE2P-NEXT:    nop
+; AIE2P-NEXT:    nop
+; AIE2P-NEXT:    nop
+; AIE2P-NEXT:    jnz r2, #.LBB0_1
+; AIE2P-NEXT:    nop // Delay Slot 5
+; AIE2P-NEXT:    nop // Delay Slot 4
+; AIE2P-NEXT:    nop // Delay Slot 3
+; AIE2P-NEXT:    nop // Delay Slot 2
+; AIE2P-NEXT:    nop // Delay Slot 1
+; AIE2P-NEXT:  // %bb.2: // %exit
+; AIE2P-NEXT:    ret lr
+; AIE2P-NEXT:    nop // Delay Slot 5
+; AIE2P-NEXT:    nop // Delay Slot 4
+; AIE2P-NEXT:    nop // Delay Slot 3
+; AIE2P-NEXT:    st r0, [p1, #0] // Delay Slot 2
+; AIE2P-NEXT:    nop // Delay Slot 1
 entry:
   br label %loop
 
@@ -56,35 +86,65 @@ exit:
 }
 
 define void @cbnz_exit(ptr %in, ptr %res) {
-; CHECK-LABEL: cbnz_exit:
-; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    mova r0, #-1; nopb ; nopxm
-; CHECK-NEXT:    mova r1, #2
-; CHECK-NEXT:  .LBB1_1: // %loop
-; CHECK-NEXT:    // =>This Inner Loop Header: Depth=1
-; CHECK-NEXT:    add r0, r0, #1
-; CHECK-NEXT:    lshl r2, r0, r1
-; CHECK-NEXT:    mov dj0, r2
-; CHECK-NEXT:    lda r2, [p0, dj0]
-; CHECK-NEXT:    nop
-; CHECK-NEXT:    nop
-; CHECK-NEXT:    nop
-; CHECK-NEXT:    nop
-; CHECK-NEXT:    nop
-; CHECK-NEXT:    nop
-; CHECK-NEXT:    jz r2, #.LBB1_1
-; CHECK-NEXT:    nop // Delay Slot 5
-; CHECK-NEXT:    nop // Delay Slot 4
-; CHECK-NEXT:    nop // Delay Slot 3
-; CHECK-NEXT:    nop // Delay Slot 2
-; CHECK-NEXT:    nop // Delay Slot 1
-; CHECK-NEXT:  // %bb.2: // %exit
-; CHECK-NEXT:    ret lr
-; CHECK-NEXT:    nop // Delay Slot 5
-; CHECK-NEXT:    nop // Delay Slot 4
-; CHECK-NEXT:    nop // Delay Slot 3
-; CHECK-NEXT:    st r0, [p1, #0] // Delay Slot 2
-; CHECK-NEXT:    nop // Delay Slot 1
+; AIE2-LABEL: cbnz_exit:
+; AIE2:       // %bb.0: // %entry
+; AIE2-NEXT:    mova r0, #-1; nopb ; nopxm
+; AIE2-NEXT:    mova r1, #2
+; AIE2-NEXT:  .LBB1_1: // %loop
+; AIE2-NEXT:    // =>This Inner Loop Header: Depth=1
+; AIE2-NEXT:    nopa ; nopb ; add r0, r0, #1
+; AIE2-NEXT:    lshl r2, r0, r1
+; AIE2-NEXT:    mov dj0, r2
+; AIE2-NEXT:    lda r2, [p0, dj0]
+; AIE2-NEXT:    nop
+; AIE2-NEXT:    nop
+; AIE2-NEXT:    nop
+; AIE2-NEXT:    nop
+; AIE2-NEXT:    nop
+; AIE2-NEXT:    nop
+; AIE2-NEXT:    jz r2, #.LBB1_1
+; AIE2-NEXT:    nop // Delay Slot 5
+; AIE2-NEXT:    nop // Delay Slot 4
+; AIE2-NEXT:    nop // Delay Slot 3
+; AIE2-NEXT:    nop // Delay Slot 2
+; AIE2-NEXT:    nop // Delay Slot 1
+; AIE2-NEXT:  // %bb.2: // %exit
+; AIE2-NEXT:    ret lr
+; AIE2-NEXT:    nop // Delay Slot 5
+; AIE2-NEXT:    nop // Delay Slot 4
+; AIE2-NEXT:    nop // Delay Slot 3
+; AIE2-NEXT:    st r0, [p1, #0] // Delay Slot 2
+; AIE2-NEXT:    nop // Delay Slot 1
+;
+; AIE2P-LABEL: cbnz_exit:
+; AIE2P:       // %bb.0: // %entry
+; AIE2P-NEXT:    mova r0, #-1; nopb ; nopxm
+; AIE2P-NEXT:    mova r1, #2
+; AIE2P-NEXT:  .LBB1_1: // %loop
+; AIE2P-NEXT:    // =>This Inner Loop Header: Depth=1
+; AIE2P-NEXT:    nopa ; nopb ; add r0, r0, #1
+; AIE2P-NEXT:    lshl r2, r0, r1
+; AIE2P-NEXT:    mov dj0, r2
+; AIE2P-NEXT:    lda r2, [p0, dj0]
+; AIE2P-NEXT:    nop
+; AIE2P-NEXT:    nop
+; AIE2P-NEXT:    nop
+; AIE2P-NEXT:    nop
+; AIE2P-NEXT:    nop
+; AIE2P-NEXT:    nop
+; AIE2P-NEXT:    jz r2, #.LBB1_1
+; AIE2P-NEXT:    nop // Delay Slot 5
+; AIE2P-NEXT:    nop // Delay Slot 4
+; AIE2P-NEXT:    nop // Delay Slot 3
+; AIE2P-NEXT:    nop // Delay Slot 2
+; AIE2P-NEXT:    nop // Delay Slot 1
+; AIE2P-NEXT:  // %bb.2: // %exit
+; AIE2P-NEXT:    ret lr
+; AIE2P-NEXT:    nop // Delay Slot 5
+; AIE2P-NEXT:    nop // Delay Slot 4
+; AIE2P-NEXT:    nop // Delay Slot 3
+; AIE2P-NEXT:    st r0, [p1, #0] // Delay Slot 2
+; AIE2P-NEXT:    nop // Delay Slot 1
 entry:
   br label %loop
 
diff --git a/llvm/test/CodeGen/AIE/hardware-loops/zol-loop.ll b/llvm/test/CodeGen/AIE/hardware-loops/zol-loop.ll
index 7c90718c95ac..6fe05c87c3ee 100644
--- a/llvm/test/CodeGen/AIE/hardware-loops/zol-loop.ll
+++ b/llvm/test/CodeGen/AIE/hardware-loops/zol-loop.ll
@@ -16,9 +16,9 @@
 define void @simple_loop(i32 noundef %n, ptr nocapture readonly %in, ptr nocapture writeonly %out) {
 ; AIE2-LABEL: simple_loop:
 ; AIE2:       // %bb.0: // %entry
-; AIE2-NEXT:    mova r1, #0
-; AIE2-NEXT:    ge r2, r1, r0
-; AIE2-NEXT:    jnz r2, #.LBB0_3
+; AIE2-NEXT:    mova r1, #0; nopb ; nopx
+; AIE2-NEXT:    ge r1, r1, r0
+; AIE2-NEXT:    jnz r1, #.LBB0_3
 ; AIE2-NEXT:    nop // Delay Slot 5
 ; AIE2-NEXT:    nop // Delay Slot 4
 ; AIE2-NEXT:    nop // Delay Slot 3
@@ -26,20 +26,20 @@ define void @simple_loop(i32 noundef %n, ptr nocapture readonly %in, ptr nocaptu
 ; AIE2-NEXT:    nop // Delay Slot 1
 ; AIE2-NEXT:  // %bb.1: // %for.body.preheader
 ; AIE2-NEXT:    add.nc lc, r0, #0
-; AIE2-NEXT:    mova r2, #1; movxm ls, #.LBB0_2
-; AIE2-NEXT:    mova r0, #2; movxm le, #.L_LEnd0
+; AIE2-NEXT:    movxm ls, #.LBB0_2
+; AIE2-NEXT:    mova r1, #1; movxm le, #.L_LEnd0
 ; AIE2-NEXT:  .LBB0_2: // %for.body
 ; AIE2-NEXT:    // =>This Inner Loop Header: Depth=1
-; AIE2-NEXT:    nopb ; lda r3, [p0, #0]; nops ; nopxm ; nopv
+; AIE2-NEXT:    nopb ; lda r0, [p0, #0]; nops ; nopxm ; nopv
 ; AIE2-NEXT:    nopb ; nopa ; nops ; nopxm ; nopv
 ; AIE2-NEXT:    nopb ; nopa ; nops ; nopxm ; nopv
 ; AIE2-NEXT:    nopb ; nopa ; nops ; nopxm ; nopv
 ; AIE2-NEXT:    nopb ; nopa ; nops ; nopxm ; nopv
-; AIE2-NEXT:    nopb ; nopa ; nops ; lshl r4, r1, r0; nopm ; nopv
-; AIE2-NEXT:    nopa ; nopb ; add r1, r1, #1
-; AIE2-NEXT:    add r3, r2, r3; mov dj0, r4
+; AIE2-NEXT:    nopb ; nopa ; nops ; nopxm ; nopv
+; AIE2-NEXT:    nopa ; nopb ; nopxm
+; AIE2-NEXT:    add r0, r1, r0
 ; AIE2-NEXT:  .L_LEnd0:
-; AIE2-NEXT:    nopb ; nopa ; st r3, [p1, dj0]; add r2, r2, #-1; nopm ; nopv
+; AIE2-NEXT:    nopb ; nopa ; st r0, [p1], #4; add r1, r1, #-1; nopm ; nopv
 ; AIE2-NEXT:  .LBB0_3: // %for.cond.cleanup
 ; AIE2-NEXT:    nopa ; ret lr
 ; AIE2-NEXT:    nop // Delay Slot 5
@@ -50,9 +50,9 @@ define void @simple_loop(i32 noundef %n, ptr nocapture readonly %in, ptr nocaptu
 ;
 ; AIE2P-LABEL: simple_loop:
 ; AIE2P:       // %bb.0: // %entry
-; AIE2P-NEXT:    mova r1, #0
-; AIE2P-NEXT:    ge r2, r1, r0
-; AIE2P-NEXT:    jnz r2, #.LBB0_3
+; AIE2P-NEXT:    mova r1, #0; nopb ; nopx
+; AIE2P-NEXT:    ge r1, r1, r0
+; AIE2P-NEXT:    jnz r1, #.LBB0_3
 ; AIE2P-NEXT:    nop // Delay Slot 5
 ; AIE2P-NEXT:    nop // Delay Slot 4
 ; AIE2P-NEXT:    nop // Delay Slot 3
@@ -60,20 +60,20 @@ define void @simple_loop(i32 noundef %n, ptr nocapture readonly %in, ptr nocaptu
 ; AIE2P-NEXT:    nop // Delay Slot 1
 ; AIE2P-NEXT:  // %bb.1: // %for.body.preheader
 ; AIE2P-NEXT:    add.nc lc, r0, #0
-; AIE2P-NEXT:    mova r2, #1; movxm ls, #.LBB0_2
-; AIE2P-NEXT:    mova r0, #2; movxm le, #.L_LEnd0
+; AIE2P-NEXT:    movxm ls, #.LBB0_2
+; AIE2P-NEXT:    mova r1, #1; movxm le, #.L_LEnd0
 ; AIE2P-NEXT:  .LBB0_2: // %for.body
 ; AIE2P-NEXT:    // =>This Inner Loop Header: Depth=1
-; AIE2P-NEXT:    lda r3, [p0, #0]; nopb ; nops ; nopxm ; nopv
+; AIE2P-NEXT:    lda r0, [p0, #0]; nopb ; nops ; nopxm ; nopv
+; AIE2P-NEXT:    nopa ; nopb ; nops ; nopxm ; nopv
 ; AIE2P-NEXT:    nopa ; nopb ; nops ; nopxm ; nopv
 ; AIE2P-NEXT:    nopa ; nopb ; nops ; nopxm ; nopv
 ; AIE2P-NEXT:    nopa ; nopb ; nops ; nopxm ; nopv
 ; AIE2P-NEXT:    nopa ; nopb ; nops ; nopxm ; nopv
-; AIE2P-NEXT:    nopa ; nopb ; nops ; lshl r4, r1, r0; nopm ; nopv
-; AIE2P-NEXT:    nopa ; add r1, r1, #1; nopm
-; AIE2P-NEXT:    add r3, r2, r3; mov dj0, r4
+; AIE2P-NEXT:    nopa ; nopb ; nopxm
+; AIE2P-NEXT:    add r0, r1, r0
 ; AIE2P-NEXT:  .L_LEnd0:
-; AIE2P-NEXT:    nopa ; nopb ; st r3, [p1, dj0]; add r2, r2, #-1; nopm ; nopv
+; AIE2P-NEXT:    nopa ; nopb ; st r0, [p1], #4; add r1, r1, #-1; nopm ; nopv
 ; AIE2P-NEXT:  .LBB0_3: // %for.cond.cleanup
 ; AIE2P-NEXT:    nopa ; ret lr
 ; AIE2P-NEXT:    nop // Delay Slot 5
@@ -84,9 +84,9 @@ define void @simple_loop(i32 noundef %n, ptr nocapture readonly %in, ptr nocaptu
 ;
 ; AIE2PS-LABEL: simple_loop:
 ; AIE2PS:       // %bb.0: // %entry
-; AIE2PS-NEXT:    mova r2, #0
-; AIE2PS-NEXT:    ge r4, r2, r0
-; AIE2PS-NEXT:    jnz r4, #.LBB0_3
+; AIE2PS-NEXT:    mova r2, #0; nopb ; nopx
+; AIE2PS-NEXT:    ge r2, r2, r0
+; AIE2PS-NEXT:    jnz r2, #.LBB0_3
 ; AIE2PS-NEXT:    nop // Delay Slot 5
 ; AIE2PS-NEXT:    nop // Delay Slot 4
 ; AIE2PS-NEXT:    nop // Delay Slot 3
@@ -94,20 +94,20 @@ define void @simple_loop(i32 noundef %n, ptr nocapture readonly %in, ptr nocaptu
 ; AIE2PS-NEXT:    nop // Delay Slot 1
 ; AIE2PS-NEXT:  // %bb.1: // %for.body.preheader
 ; AIE2PS-NEXT:    add.nc lc, r0, #0
-; AIE2PS-NEXT:    mova r4, #1; movxm ls, #.LBB0_2
-; AIE2PS-NEXT:    mova r0, #2; movxm le, #.L_LEnd0
+; AIE2PS-NEXT:    movxm ls, #.LBB0_2
+; AIE2PS-NEXT:    mova r2, #1; movxm le, #.L_LEnd0
 ; AIE2PS-NEXT:  .LBB0_2: // %for.body
 ; AIE2PS-NEXT:    // =>This Inner Loop Header: Depth=1
-; AIE2PS-NEXT:    lda r6, [p0, #0]; nopb ; nops ; nopxm ; nopv
+; AIE2PS-NEXT:    lda r0, [p0, #0]; nopb ; nops ; nopxm ; nopv
+; AIE2PS-NEXT:    nopa ; nopb ; nops ; nopxm ; nopv
 ; AIE2PS-NEXT:    nopa ; nopb ; nops ; nopxm ; nopv
 ; AIE2PS-NEXT:    nopa ; nopb ; nops ; nopxm ; nopv
 ; AIE2PS-NEXT:    nopa ; nopb ; nops ; nopxm ; nopv
 ; AIE2PS-NEXT:    nopa ; nopb ; nops ; nopxm ; nopv
-; AIE2PS-NEXT:    nopa ; nopb ; nops ; lshl r16, r2, r0; nopm ; nopv
-; AIE2PS-NEXT:    nopa ; add r2, r2, #1; nopm
-; AIE2PS-NEXT:    add r6, r4, r6; mov dj0, r16
+; AIE2PS-NEXT:    nopa ; nopb ; nopxm
+; AIE2PS-NEXT:    add r0, r2, r0
 ; AIE2PS-NEXT:  .L_LEnd0:
-; AIE2PS-NEXT:    nopa ; nopb ; st r6, [p1, dj0]; add r4, r4, #-1; nopm ; nopv
+; AIE2PS-NEXT:    nopa ; nopb ; st r0, [p1], #4; add r2, r2, #-1; nopm ; nopv
 ; AIE2PS-NEXT:  .LBB0_3: // %for.cond.cleanup
 ; AIE2PS-NEXT:    nopa ; ret lr
 ; AIE2PS-NEXT:    nop // Delay Slot 5
diff --git a/llvm/test/CodeGen/AIE/opt/lsr-i20-scalar-recurrence.ll b/llvm/test/CodeGen/AIE/opt/lsr-i20-scalar-recurrence.ll
new file mode 100644
index 000000000000..2c63728e3767
--- /dev/null
+++ b/llvm/test/CodeGen/AIE/opt/lsr-i20-scalar-recurrence.ll
@@ -0,0 +1,136 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
+;
+; This file is licensed under the Apache License v2.0 with LLVM Exceptions.
+; See https://llvm.org/LICENSE.txt for license information.
+; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+;
+; (c) Copyright 2026 Advanced Micro Devices, Inc. or its affiliates
+
+; RUN: opt -mtriple=aie2p -passes='print<iv-users>' -disable-output %s 2>&1 | FileCheck %s --check-prefix=IVUSERS
+; RUN: opt -mtriple=aie2p -passes=loop-reduce -S %s | FileCheck %s --check-prefix=LSR
+
+; This test verifies that LSR looks through truncs to collect GEP results as
+; IV users on AIE. The pattern is derived from a post_process kernel where:
+; - Array indices are computed as trunc(4 * i + offset) to i20
+; - GEPs use these i20 indices with large element types (<32 x float>)
+;
+; With the IVUsers fix to look through truncs, LSR now:
+; 1. Collects GEP results (not trunc results) as IV users
+; 2. Gets pointer-typed SCEVs like {%src,+,512} instead of i20 {0,+,4}
+; 3. Creates pointer PHIs with byte-indexed GEPs for post-increment
+
+target datalayout = "e-m:e-p:20:32-i1:8:32-i8:8:32-i16:16:32-i32:32:32-f32:32:32-i64:32-f64:32-a:0:32-n32"
+target triple = "aie2p"
+
+; Check that GEP results (pointers) are collected as IV users
+; IVUSERS: IV Users for loop %for.body with backedge-taken count
+; IVUSERS: %ptr0 = {%src,+,512}<%for.body>
+
+; LSR should create pointer PHIs with byte strides
+
+define void @post_process_pattern(ptr nocapture %src, i32 noundef %len) {
+; LSR-LABEL: define void @post_process_pattern(
+; LSR-SAME: ptr nocapture [[SRC:%.*]], i32 noundef [[LEN:%.*]]) {
+; LSR-NEXT:  [[ENTRY:.*]]:
+; LSR-NEXT:    [[DIV:%.*]] = lshr i32 [[LEN]], 7
+; LSR-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[LEN]], 511
+; LSR-NEXT:    tail call void @llvm.assume(i1 [[CMP]])
+; LSR-NEXT:    [[SCEVGEP:%.*]] = getelementptr i8, ptr [[SRC]], i20 128
+; LSR-NEXT:    [[SCEVGEP6:%.*]] = getelementptr i8, ptr [[SRC]], i20 256
+; LSR-NEXT:    br label %[[FOR_BODY:.*]]
+; LSR:       [[FOR_COND_CLEANUP:.*]]:
+; LSR-NEXT:    ret void
+; LSR:       [[FOR_BODY]]:
+; LSR-NEXT:    [[LSR_IV7:%.*]] = phi ptr [ [[SCEVGEP8:%.*]], %[[FOR_BODY]] ], [ [[SCEVGEP6]], %[[ENTRY]] ]
+; LSR-NEXT:    [[LSR_IV1:%.*]] = phi ptr [ [[SCEVGEP2:%.*]], %[[FOR_BODY]] ], [ [[SCEVGEP]], %[[ENTRY]] ]
+; LSR-NEXT:    [[LSR_IV:%.*]] = phi i32 [ [[LSR_IV_NEXT:%.*]], %[[FOR_BODY]] ], [ [[DIV]], %[[ENTRY]] ]
+; LSR-NEXT:    [[SCEVGEP10:%.*]] = getelementptr i8, ptr [[LSR_IV7]], i20 -256
+; LSR-NEXT:    [[V0:%.*]] = load <32 x float>, ptr [[SCEVGEP10]], align 64
+; LSR-NEXT:    [[R0:%.*]] = tail call <32 x bfloat> @llvm.aie2p.v32accfloat.to.v32bf16(<32 x float> [[V0]])
+; LSR-NEXT:    [[SCEVGEP3:%.*]] = getelementptr i8, ptr [[LSR_IV1]], i20 -128
+; LSR-NEXT:    store <32 x bfloat> [[R0]], ptr [[SCEVGEP3]], align 64
+; LSR-NEXT:    [[SCEVGEP11:%.*]] = getelementptr i8, ptr [[LSR_IV7]], i20 -128
+; LSR-NEXT:    [[V1:%.*]] = load <32 x float>, ptr [[SCEVGEP11]], align 64
+; LSR-NEXT:    [[R1:%.*]] = tail call <32 x bfloat> @llvm.aie2p.v32accfloat.to.v32bf16(<32 x float> [[V1]])
+; LSR-NEXT:    [[SCEVGEP5:%.*]] = getelementptr i8, ptr [[LSR_IV1]], i20 -64
+; LSR-NEXT:    store <32 x bfloat> [[R1]], ptr [[SCEVGEP5]], align 64
+; LSR-NEXT:    [[V2:%.*]] = load <32 x float>, ptr [[LSR_IV7]], align 64
+; LSR-NEXT:    [[R2:%.*]] = tail call <32 x bfloat> @llvm.aie2p.v32accfloat.to.v32bf16(<32 x float> [[V2]])
+; LSR-NEXT:    store <32 x bfloat> [[R2]], ptr [[LSR_IV1]], align 64
+; LSR-NEXT:    [[SCEVGEP9:%.*]] = getelementptr i8, ptr [[LSR_IV7]], i20 128
+; LSR-NEXT:    [[V3:%.*]] = load <32 x float>, ptr [[SCEVGEP9]], align 64
+; LSR-NEXT:    [[R3:%.*]] = tail call <32 x bfloat> @llvm.aie2p.v32accfloat.to.v32bf16(<32 x float> [[V3]])
+; LSR-NEXT:    [[SCEVGEP4:%.*]] = getelementptr i8, ptr [[LSR_IV1]], i20 64
+; LSR-NEXT:    store <32 x bfloat> [[R3]], ptr [[SCEVGEP4]], align 64
+; LSR-NEXT:    [[LSR_IV_NEXT]] = add nsw i32 [[LSR_IV]], -1
+; LSR-NEXT:    [[SCEVGEP2]] = getelementptr i8, ptr [[LSR_IV1]], i20 256
+; LSR-NEXT:    [[SCEVGEP8]] = getelementptr i8, ptr [[LSR_IV7]], i20 512
+; LSR-NEXT:    [[EXITCOND:%.*]] = icmp eq i32 [[LSR_IV_NEXT]], 0
+; LSR-NEXT:    br i1 [[EXITCOND]], label %[[FOR_COND_CLEANUP]], label %[[FOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
+;
+entry:
+  %div = lshr i32 %len, 7
+  %cmp = icmp sgt i32 %len, 511
+  tail call void @llvm.assume(i1 %cmp)
+  br label %for.body
+
+for.cond.cleanup:
+  ret void
+
+for.body:
+  %i = phi i32 [ 0, %entry ], [ %inc, %for.body ]
+
+  ; Base index 4 * i (shl by 2), later truncated from i32 to i20 for each GEP
+  %mul = shl nsw i32 %i, 2
+  %idx0 = trunc i32 %mul to i20
+
+  ; Index 4*i + 0 - load <32 x float>, convert to bf16, store using the same i20 index
+  %ptr0 = getelementptr inbounds <32 x float>, ptr %src, i20 %idx0
+  %v0 = load <32 x float>, ptr %ptr0, align 64
+  %r0 = tail call <32 x bfloat> @llvm.aie2p.v32accfloat.to.v32bf16(<32 x float> %v0)
+  %dst0 = getelementptr inbounds <32 x bfloat>, ptr %src, i20 %idx0
+  store <32 x bfloat> %r0, ptr %dst0, align 64
+
+  ; Index 4*i + 1 (formed with or disjoint) - same load / convert / store sequence
+  %add1 = or disjoint i32 %mul, 1
+  %idx1 = trunc i32 %add1 to i20
+  %ptr1 = getelementptr inbounds <32 x float>, ptr %src, i20 %idx1
+  %v1 = load <32 x float>, ptr %ptr1, align 64
+  %r1 = tail call <32 x bfloat> @llvm.aie2p.v32accfloat.to.v32bf16(<32 x float> %v1)
+  %dst1 = getelementptr inbounds <32 x bfloat>, ptr %src, i20 %idx1
+  store <32 x bfloat> %r1, ptr %dst1, align 64
+
+  ; Index 4*i + 2 (formed with or disjoint) - same load / convert / store sequence
+  %add2 = or disjoint i32 %mul, 2
+  %idx2 = trunc i32 %add2 to i20
+  %ptr2 = getelementptr inbounds <32 x float>, ptr %src, i20 %idx2
+  %v2 = load <32 x float>, ptr %ptr2, align 64
+  %r2 = tail call <32 x bfloat> @llvm.aie2p.v32accfloat.to.v32bf16(<32 x float> %v2)
+  %dst2 = getelementptr inbounds <32 x bfloat>, ptr %src, i20 %idx2
+  store <32 x bfloat> %r2, ptr %dst2, align 64
+
+  ; Index 4*i + 3 (formed with or disjoint) - same load / convert / store sequence
+  %add3 = or disjoint i32 %mul, 3
+  %idx3 = trunc i32 %add3 to i20
+  %ptr3 = getelementptr inbounds <32 x float>, ptr %src, i20 %idx3
+  %v3 = load <32 x float>, ptr %ptr3, align 64
+  %r3 = tail call <32 x bfloat> @llvm.aie2p.v32accfloat.to.v32bf16(<32 x float> %v3)
+  %dst3 = getelementptr inbounds <32 x bfloat>, ptr %src, i20 %idx3
+  store <32 x bfloat> %r3, ptr %dst3, align 64
+
+  %inc = add nuw nsw i32 %i, 1
+  %exitcond = icmp eq i32 %inc, %div
+  br i1 %exitcond, label %for.cond.cleanup, label %for.body, !llvm.loop !0
+}
+
+declare <32 x bfloat> @llvm.aie2p.v32accfloat.to.v32bf16(<32 x float>)
+declare void @llvm.assume(i1 noundef)
+
+!0 = distinct !{!0, !1}
+!1 = !{!"llvm.loop.itercount.range", i64 4}
+;.
+; LSR: [[LOOP0]] = distinct !{[[LOOP0]], [[META1:![0-9]+]]}
+; LSR: [[META1]] = !{!"llvm.loop.itercount.range", i64 4}
+;.
+;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
+; IVUSERS: {{.*}}
diff --git a/llvm/test/CodeGen/AIE/opt/lsr-nested-loop-non-dominating.ll b/llvm/test/CodeGen/AIE/opt/lsr-nested-loop-non-dominating.ll
new file mode 100644
index 000000000000..6fef1abde0b9
--- /dev/null
+++ b/llvm/test/CodeGen/AIE/opt/lsr-nested-loop-non-dominating.ll
@@ -0,0 +1,193 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
+;
+; This file is licensed under the Apache License v2.0 with LLVM Exceptions.
+; See https://llvm.org/LICENSE.txt for license information.
+; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+;
+; (c) Copyright 2026 Advanced Micro Devices, Inc. or its affiliates
+
+; RUN: opt -mtriple=aie2p -passes=loop-reduce -S %s | FileCheck %s
+
+; This test demonstrates a regression pattern where LSR creates pointer PHIs
+; in the outer loop header for GEPs that are only used in the inner loop.
+;
+; Problem: The inner loop GEPs don't dominate the outer loop latch, but LSR
+; creates pointer recurrences for them anyway. This causes:
+;   1. Multiple pointer PHIs in outer loop header
+;   2. Expensive padda/paddb/padds updates in outer loop latch
+;   3. Unconditional pointer updates even when inner loop wasn't entered
+;
+; Desired: LSR should use scalar index recurrence + indexed addressing,
+; keeping pointer computation where it's actually used. NOTE: the CHECK lines
+; below currently record the pointer-recurrence form, not this desired form.
+; Reference (good): vldb x8, [p3, dj1]  with  add r27, r27, r24  in latch
+; Regressed (bad):  vldb x8, [p6], #64  with  padda [p3], m1; padda [p7], m1  in latch
+
+target datalayout = "e-m:e-p:20:32-i1:8:32-i8:8:32-i16:16:32-i32:32:32-f32:32:32-i64:32-f64:32-a:0:32-n32"
+target triple = "aie2p"
+
+; Test: nested loop whose only memory access is a load in the inner loop.
+; Goal: the inner-loop GEP should NOT create an outer loop pointer PHI; the
+; inner loop should use scalar i20 recurrence + indexed addressing.
+; NOTE(review): the CHECK lines below still show ptr PHIs in both loop headers
+; and a scevgep in the outer latch -- confirm this is the intended baseline.
+define void @nested_loop_non_dominating(ptr %base, i32 %outer_n, i32 %inner_n, i32 %stride) {
+; CHECK-LABEL: define void @nested_loop_non_dominating(
+; CHECK-SAME: ptr [[BASE:%.*]], i32 [[OUTER_N:%.*]], i32 [[INNER_N:%.*]], i32 [[STRIDE:%.*]]) {
+; CHECK-NEXT:  [[ENTRY:.*]]:
+; CHECK-NEXT:    [[STRIDE20:%.*]] = trunc i32 [[STRIDE]] to i20
+; CHECK-NEXT:    [[TMP0:%.*]] = trunc i32 [[STRIDE]] to i20
+; CHECK-NEXT:    br label %[[OUTER_HEADER:.*]]
+; CHECK:       [[OUTER_HEADER]]:
+; CHECK-NEXT:    [[LSR_IV:%.*]] = phi ptr [ [[SCEVGEP:%.*]], %[[OUTER_LATCH:.*]] ], [ [[BASE]], %[[ENTRY]] ]
+; CHECK-NEXT:    [[OUTER_I:%.*]] = phi i32 [ 0, %[[ENTRY]] ], [ [[OUTER_NEXT:%.*]], %[[OUTER_LATCH]] ]
+; CHECK-NEXT:    [[OUTER_CMP:%.*]] = icmp slt i32 [[OUTER_I]], [[OUTER_N]]
+; CHECK-NEXT:    br i1 [[OUTER_CMP]], label %[[INNER_PREHEADER:.*]], label %[[EXIT:.*]]
+; CHECK:       [[INNER_PREHEADER]]:
+; CHECK-NEXT:    br label %[[INNER_HEADER:.*]]
+; CHECK:       [[INNER_HEADER]]:
+; CHECK-NEXT:    [[INNER_PTR:%.*]] = phi ptr [ [[SCEVGEP2:%.*]], %[[INNER_HEADER]] ], [ [[LSR_IV]], %[[INNER_PREHEADER]] ]
+; CHECK-NEXT:    [[INNER_I:%.*]] = phi i32 [ 0, %[[INNER_PREHEADER]] ], [ [[INNER_NEXT:%.*]], %[[INNER_HEADER]] ]
+; CHECK-NEXT:    [[VAL:%.*]] = load <32 x i16>, ptr [[INNER_PTR]], align 64
+; CHECK-NEXT:    call void @consume(<32 x i16> [[VAL]])
+; CHECK-NEXT:    [[INNER_NEXT]] = add i32 [[INNER_I]], 1
+; CHECK-NEXT:    [[SCEVGEP2]] = getelementptr i8, ptr [[INNER_PTR]], i20 64
+; CHECK-NEXT:    [[INNER_CMP:%.*]] = icmp slt i32 [[INNER_NEXT]], [[INNER_N]]
+; CHECK-NEXT:    br i1 [[INNER_CMP]], label %[[INNER_HEADER]], label %[[OUTER_LATCH]]
+; CHECK:       [[OUTER_LATCH]]:
+; CHECK-NEXT:    [[OUTER_NEXT]] = add i32 [[OUTER_I]], 1
+; CHECK-NEXT:    [[SCEVGEP]] = getelementptr i8, ptr [[LSR_IV]], i20 [[TMP0]]
+; CHECK-NEXT:    br label %[[OUTER_HEADER]]
+; CHECK:       [[EXIT]]:
+; CHECK-NEXT:    ret void
+;
+entry:
+  %stride20 = trunc i32 %stride to i20
+  br label %outer_header
+
+outer_header:
+  %outer_i = phi i32 [ 0, %entry ], [ %outer_next, %outer_latch ]
+  %outer_cmp = icmp slt i32 %outer_i, %outer_n
+  br i1 %outer_cmp, label %inner_preheader, label %exit
+
+inner_preheader:
+  ; Byte offset for this outer iteration: trunc(outer_i * stride) to i20
+  %outer_offset_32 = mul i32 %outer_i, %stride
+  %outer_offset = trunc i32 %outer_offset_32 to i20
+  %outer_ptr = getelementptr i8, ptr %base, i20 %outer_offset
+  br label %inner_header
+
+inner_header:
+  %inner_i = phi i32 [ 0, %inner_preheader ], [ %inner_next, %inner_header ]
+  ; This GEP is only used inside the inner loop - goal: no pointer PHI in outer_header
+  %inner_offset_32 = mul i32 %inner_i, 64
+  %inner_offset = trunc i32 %inner_offset_32 to i20
+  %inner_ptr = getelementptr i8, ptr %outer_ptr, i20 %inner_offset
+
+  %val = load <32 x i16>, ptr %inner_ptr, align 64
+  call void @consume(<32 x i16> %val)
+
+  %inner_next = add i32 %inner_i, 1
+  %inner_cmp = icmp slt i32 %inner_next, %inner_n
+  br i1 %inner_cmp, label %inner_header, label %outer_latch
+
+outer_latch:
+  ; Goal: only the scalar add here (CHECK currently also has a pointer scevgep)
+  %outer_next = add i32 %outer_i, 1
+  br label %outer_header
+
+exit:
+  ret void
+}
+
+; Test: Multiple arrays in nested loop - even worse regression
+; Each array creates its own pointer PHI, multiplying outer latch cost
+;
+; Goal: outer loop has a scalar i20 PHI (no pointer PHIs), inner loop uses a
+; scalar i20 recurrence, and the outer latch has a scalar add only.
+; NOTE(review): CHECK lines below still have 3 ptr PHIs and 3 latch scevgeps.
+define void @nested_loop_multiple_arrays(ptr %a, ptr %b, ptr %c, i32 %outer_n, i32 %inner_n, i32 %stride) {
+; CHECK-LABEL: define void @nested_loop_multiple_arrays(
+; CHECK-SAME: ptr [[A:%.*]], ptr [[B:%.*]], ptr [[C:%.*]], i32 [[OUTER_N:%.*]], i32 [[INNER_N:%.*]], i32 [[STRIDE:%.*]]) {
+; CHECK-NEXT:  [[ENTRY:.*]]:
+; CHECK-NEXT:    [[STRIDE20:%.*]] = trunc i32 [[STRIDE]] to i20
+; CHECK-NEXT:    [[TMP0:%.*]] = trunc i32 [[STRIDE]] to i20
+; CHECK-NEXT:    br label %[[OUTER_HEADER:.*]]
+; CHECK:       [[OUTER_HEADER]]:
+; CHECK-NEXT:    [[LSR_IV7:%.*]] = phi ptr [ [[SCEVGEP8:%.*]], %[[OUTER_LATCH:.*]] ], [ [[A]], %[[ENTRY]] ]
+; CHECK-NEXT:    [[LSR_IV3:%.*]] = phi ptr [ [[SCEVGEP4:%.*]], %[[OUTER_LATCH]] ], [ [[B]], %[[ENTRY]] ]
+; CHECK-NEXT:    [[LSR_IV:%.*]] = phi ptr [ [[SCEVGEP:%.*]], %[[OUTER_LATCH]] ], [ [[C]], %[[ENTRY]] ]
+; CHECK-NEXT:    [[OUTER_I:%.*]] = phi i32 [ 0, %[[ENTRY]] ], [ [[OUTER_NEXT:%.*]], %[[OUTER_LATCH]] ]
+; CHECK-NEXT:    [[OUTER_CMP:%.*]] = icmp slt i32 [[OUTER_I]], [[OUTER_N]]
+; CHECK-NEXT:    br i1 [[OUTER_CMP]], label %[[INNER_PREHEADER:.*]], label %[[EXIT:.*]]
+; CHECK:       [[INNER_PREHEADER]]:
+; CHECK-NEXT:    br label %[[INNER_HEADER:.*]]
+; CHECK:       [[INNER_HEADER]]:
+; CHECK-NEXT:    [[INNER_A:%.*]] = phi ptr [ [[SCEVGEP10:%.*]], %[[INNER_HEADER]] ], [ [[LSR_IV7]], %[[INNER_PREHEADER]] ]
+; CHECK-NEXT:    [[INNER_B:%.*]] = phi ptr [ [[SCEVGEP6:%.*]], %[[INNER_HEADER]] ], [ [[LSR_IV3]], %[[INNER_PREHEADER]] ]
+; CHECK-NEXT:    [[INNER_C:%.*]] = phi ptr [ [[SCEVGEP2:%.*]], %[[INNER_HEADER]] ], [ [[LSR_IV]], %[[INNER_PREHEADER]] ]
+; CHECK-NEXT:    [[INNER_I:%.*]] = phi i32 [ 0, %[[INNER_PREHEADER]] ], [ [[INNER_NEXT:%.*]], %[[INNER_HEADER]] ]
+; CHECK-NEXT:    [[VAL_A:%.*]] = load <32 x i16>, ptr [[INNER_A]], align 64
+; CHECK-NEXT:    [[VAL_B:%.*]] = load <32 x i16>, ptr [[INNER_B]], align 64
+; CHECK-NEXT:    [[SUM:%.*]] = add <32 x i16> [[VAL_A]], [[VAL_B]]
+; CHECK-NEXT:    store <32 x i16> [[SUM]], ptr [[INNER_C]], align 64
+; CHECK-NEXT:    [[INNER_NEXT]] = add i32 [[INNER_I]], 1
+; CHECK-NEXT:    [[SCEVGEP2]] = getelementptr i8, ptr [[INNER_C]], i20 64
+; CHECK-NEXT:    [[SCEVGEP6]] = getelementptr i8, ptr [[INNER_B]], i20 64
+; CHECK-NEXT:    [[SCEVGEP10]] = getelementptr i8, ptr [[INNER_A]], i20 64
+; CHECK-NEXT:    [[INNER_CMP:%.*]] = icmp slt i32 [[INNER_NEXT]], [[INNER_N]]
+; CHECK-NEXT:    br i1 [[INNER_CMP]], label %[[INNER_HEADER]], label %[[OUTER_LATCH]]
+; CHECK:       [[OUTER_LATCH]]:
+; CHECK-NEXT:    [[OUTER_NEXT]] = add i32 [[OUTER_I]], 1
+; CHECK-NEXT:    [[SCEVGEP]] = getelementptr i8, ptr [[LSR_IV]], i20 [[TMP0]]
+; CHECK-NEXT:    [[SCEVGEP4]] = getelementptr i8, ptr [[LSR_IV3]], i20 [[TMP0]]
+; CHECK-NEXT:    [[SCEVGEP8]] = getelementptr i8, ptr [[LSR_IV7]], i20 [[TMP0]]
+; CHECK-NEXT:    br label %[[OUTER_HEADER]]
+; CHECK:       [[EXIT]]:
+; CHECK-NEXT:    ret void
+;
+entry:
+  %stride20 = trunc i32 %stride to i20
+  br label %outer_header
+
+outer_header:
+  %outer_i = phi i32 [ 0, %entry ], [ %outer_next, %outer_latch ]
+  %outer_cmp = icmp slt i32 %outer_i, %outer_n
+  br i1 %outer_cmp, label %inner_preheader, label %exit
+
+inner_preheader:
+  %outer_offset_32 = mul i32 %outer_i, %stride
+  %outer_offset = trunc i32 %outer_offset_32 to i20
+  %ptr_a = getelementptr i8, ptr %a, i20 %outer_offset
+  %ptr_b = getelementptr i8, ptr %b, i20 %outer_offset
+  %ptr_c = getelementptr i8, ptr %c, i20 %outer_offset
+  br label %inner_header
+
+inner_header:
+  %inner_i = phi i32 [ 0, %inner_preheader ], [ %inner_next, %inner_header ]
+  %inner_offset_32 = mul i32 %inner_i, 64
+  %inner_offset = trunc i32 %inner_offset_32 to i20
+
+  ; Three GEPs sharing %inner_offset - goal: no per-array pointer PHIs in outer_header
+  %inner_a = getelementptr i8, ptr %ptr_a, i20 %inner_offset
+  %inner_b = getelementptr i8, ptr %ptr_b, i20 %inner_offset
+  %inner_c = getelementptr i8, ptr %ptr_c, i20 %inner_offset
+
+  %val_a = load <32 x i16>, ptr %inner_a, align 64
+  %val_b = load <32 x i16>, ptr %inner_b, align 64
+  %sum = add <32 x i16> %val_a, %val_b
+  store <32 x i16> %sum, ptr %inner_c, align 64
+
+  %inner_next = add i32 %inner_i, 1
+  %inner_cmp = icmp slt i32 %inner_next, %inner_n
+  br i1 %inner_cmp, label %inner_header, label %outer_latch
+
+outer_latch:
+  %outer_next = add i32 %outer_i, 1
+  br label %outer_header
+
+exit:
+  ret void
+}
+
+declare void @consume(<32 x i16>)
diff --git a/llvm/test/CodeGen/AIE/opt/lsr-preserve-pointer-recurrence.ll b/llvm/test/CodeGen/AIE/opt/lsr-preserve-pointer-recurrence.ll
new file mode 100644
index 000000000000..c907155256a7
--- /dev/null
+++ b/llvm/test/CodeGen/AIE/opt/lsr-preserve-pointer-recurrence.ll
@@ -0,0 +1,171 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
+;
+; This file is licensed under the Apache License v2.0 with LLVM Exceptions.
+; See https://llvm.org/LICENSE.txt for license information.
+; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+;
+; (c) Copyright 2026 Advanced Micro Devices, Inc. or its affiliates
+
+; RUN: opt -mtriple=aie2p -passes=loop-reduce -S %s | FileCheck %s
+
+; This test verifies that LSR preserves pointer recurrences on AIE targets.
+; AIE processors support post-increment addressing modes (VLD_pstm, VST_pstm)
+; that fold pointer updates into memory operations for free. LSR should NOT
+; rewrite pointer PHIs to scalar offset + base formulas, as this would
+; prevent post-increment combining and introduce extra PADD instructions.
+;
+; Specifically, this test checks that:
+; 1. Pointer PHIs are preserved (not rewritten to %scevgep or similar)
+; 2. GEP chains retain their original structure with inbounds
+; 3. addrspacecast operations don't trigger unwanted IV chain processing
+
+target datalayout = "e-m:e-p:20:32-i1:8:32-i8:8:32-i16:16:32-i32:32:32-f32:32:32-i64:32-f64:32-a:0:32-n32"
+target triple = "aie2p"
+
+; Test: Two pointer recurrences (%p_ifm, %p_ofm) advanced by a variable i20
+; stride and accessed through addrspacecast. The pointer PHIs and their
+; inbounds GEP updates should be preserved as-is, not rewritten by LSR.
+define void @multi_pointer_addrspacecast(ptr %ifm, ptr %ofm, i20 %stride, i32 %n) {
+; CHECK-LABEL: define void @multi_pointer_addrspacecast(
+; CHECK-SAME: ptr [[IFM:%.*]], ptr [[OFM:%.*]], i20 [[STRIDE:%.*]], i32 [[N:%.*]]) {
+; CHECK-NEXT:  [[ENTRY:.*]]:
+; CHECK-NEXT:    br label %[[LOOP:.*]]
+; CHECK:       [[LOOP]]:
+; CHECK-NEXT:    [[P_IFM:%.*]] = phi ptr [ [[IFM]], %[[ENTRY]] ], [ [[NEXT_IFM:%.*]], %[[LOOP]] ]
+; CHECK-NEXT:    [[P_OFM:%.*]] = phi ptr [ [[OFM]], %[[ENTRY]] ], [ [[NEXT_OFM:%.*]], %[[LOOP]] ]
+; CHECK-NEXT:    [[I:%.*]] = phi i32 [ 0, %[[ENTRY]] ], [ [[I_NEXT:%.*]], %[[LOOP]] ]
+; CHECK-NEXT:    [[IFM_AS:%.*]] = addrspacecast ptr [[P_IFM]] to ptr addrspace(5)
+; CHECK-NEXT:    [[OFM_AS:%.*]] = addrspacecast ptr [[P_OFM]] to ptr addrspace(7)
+; CHECK-NEXT:    [[VAL:%.*]] = load <16 x i32>, ptr addrspace(5) [[IFM_AS]], align 64
+; CHECK-NEXT:    store <16 x i32> [[VAL]], ptr addrspace(7) [[OFM_AS]], align 64
+; CHECK-NEXT:    [[NEXT_IFM]] = getelementptr inbounds i8, ptr [[P_IFM]], i20 [[STRIDE]]
+; CHECK-NEXT:    [[NEXT_OFM]] = getelementptr inbounds i8, ptr [[P_OFM]], i20 [[STRIDE]]
+; CHECK-NEXT:    [[I_NEXT]] = add i32 [[I]], 1
+; CHECK-NEXT:    [[COND:%.*]] = icmp slt i32 [[I_NEXT]], [[N]]
+; CHECK-NEXT:    br i1 [[COND]], label %[[LOOP]], label %[[EXIT:.*]]
+; CHECK:       [[EXIT]]:
+; CHECK-NEXT:    ret void
+;
+entry:
+  br label %loop
+
+loop:
+  %p_ifm = phi ptr [ %ifm, %entry ], [ %next_ifm, %loop ]
+  %p_ofm = phi ptr [ %ofm, %entry ], [ %next_ofm, %loop ]
+  %i = phi i32 [ 0, %entry ], [ %i.next, %loop ]
+
+  %ifm_as = addrspacecast ptr %p_ifm to ptr addrspace(5)
+  %ofm_as = addrspacecast ptr %p_ofm to ptr addrspace(7)
+
+  %val = load <16 x i32>, ptr addrspace(5) %ifm_as, align 64
+  store <16 x i32> %val, ptr addrspace(7) %ofm_as, align 64
+
+  %next_ifm = getelementptr inbounds i8, ptr %p_ifm, i20 %stride
+  %next_ofm = getelementptr inbounds i8, ptr %p_ofm, i20 %stride
+
+  %i.next = add i32 %i, 1
+  %cond = icmp slt i32 %i.next, %n
+  br i1 %cond, label %loop, label %exit
+
+exit:
+  ret void
+}
+
+; Test: GEP chain within a loop: three loads at %p, %p + stride and
+; %p + 2*stride, with %p advanced by 3*stride per iteration. This pattern can
+; form IV chains in LSR. LSR should NOT rewrite these.
+define void @gep_chain_pattern(ptr %base, i20 %stride, i32 %n) {
+; CHECK-LABEL: define void @gep_chain_pattern(
+; CHECK-SAME: ptr [[BASE:%.*]], i20 [[STRIDE:%.*]], i32 [[N:%.*]]) {
+; CHECK-NEXT:  [[ENTRY:.*]]:
+; CHECK-NEXT:    br label %[[LOOP:.*]]
+; CHECK:       [[LOOP]]:
+; CHECK-NEXT:    [[P:%.*]] = phi ptr [ [[BASE]], %[[ENTRY]] ], [ [[P3:%.*]], %[[LOOP]] ]
+; CHECK-NEXT:    [[I:%.*]] = phi i32 [ 0, %[[ENTRY]] ], [ [[I_NEXT:%.*]], %[[LOOP]] ]
+; CHECK-NEXT:    [[AS0:%.*]] = addrspacecast ptr [[P]] to ptr addrspace(5)
+; CHECK-NEXT:    [[V0:%.*]] = load <16 x i32>, ptr addrspace(5) [[AS0]], align 64
+; CHECK-NEXT:    [[P1:%.*]] = getelementptr inbounds i8, ptr [[P]], i20 [[STRIDE]]
+; CHECK-NEXT:    [[AS1:%.*]] = addrspacecast ptr [[P1]] to ptr addrspace(5)
+; CHECK-NEXT:    [[V1:%.*]] = load <16 x i32>, ptr addrspace(5) [[AS1]], align 64
+; CHECK-NEXT:    [[P2:%.*]] = getelementptr inbounds i8, ptr [[P1]], i20 [[STRIDE]]
+; CHECK-NEXT:    [[AS2:%.*]] = addrspacecast ptr [[P2]] to ptr addrspace(5)
+; CHECK-NEXT:    [[V2:%.*]] = load <16 x i32>, ptr addrspace(5) [[AS2]], align 64
+; CHECK-NEXT:    [[P3]] = getelementptr inbounds i8, ptr [[P2]], i20 [[STRIDE]]
+; CHECK-NEXT:    call void @consume(<16 x i32> [[V0]])
+; CHECK-NEXT:    call void @consume(<16 x i32> [[V1]])
+; CHECK-NEXT:    call void @consume(<16 x i32> [[V2]])
+; CHECK-NEXT:    [[I_NEXT]] = add i32 [[I]], 1
+; CHECK-NEXT:    [[COND:%.*]] = icmp slt i32 [[I_NEXT]], [[N]]
+; CHECK-NEXT:    br i1 [[COND]], label %[[LOOP]], label %[[EXIT:.*]]
+; CHECK:       [[EXIT]]:
+; CHECK-NEXT:    ret void
+;
+entry:
+  br label %loop
+
+loop:
+  %p = phi ptr [ %base, %entry ], [ %p3, %loop ]
+  %i = phi i32 [ 0, %entry ], [ %i.next, %loop ]
+
+  %as0 = addrspacecast ptr %p to ptr addrspace(5)
+  %v0 = load <16 x i32>, ptr addrspace(5) %as0, align 64
+
+  %p1 = getelementptr inbounds i8, ptr %p, i20 %stride
+  %as1 = addrspacecast ptr %p1 to ptr addrspace(5)
+  %v1 = load <16 x i32>, ptr addrspace(5) %as1, align 64
+
+  %p2 = getelementptr inbounds i8, ptr %p1, i20 %stride
+  %as2 = addrspacecast ptr %p2 to ptr addrspace(5)
+  %v2 = load <16 x i32>, ptr addrspace(5) %as2, align 64
+
+  %p3 = getelementptr inbounds i8, ptr %p2, i20 %stride
+
+  call void @consume(<16 x i32> %v0)
+  call void @consume(<16 x i32> %v1)
+  call void @consume(<16 x i32> %v2)
+
+  %i.next = add i32 %i, 1
+  %cond = icmp slt i32 %i.next, %n
+  br i1 %cond, label %loop, label %exit
+
+exit:
+  ret void
+}
+
+declare void @consume(<16 x i32>)
+
+; Test: i20 scalar recurrences (%i stepped by %step, summed into %sum) should
+; still be handled by LSR. This ensures the IVUsers change doesn't break i20
+; integer optimization.
+define i20 @i20_scalar_recurrence(i20 %n, i20 %step) {
+; CHECK-LABEL: define i20 @i20_scalar_recurrence(
+; CHECK-SAME: i20 [[N:%.*]], i20 [[STEP:%.*]]) {
+; CHECK-NEXT:  [[ENTRY:.*]]:
+; CHECK-NEXT:    br label %[[LOOP:.*]]
+; CHECK:       [[LOOP]]:
+; CHECK-NEXT:    [[I:%.*]] = phi i20 [ 0, %[[ENTRY]] ], [ [[I_NEXT:%.*]], %[[LOOP]] ]
+; CHECK-NEXT:    [[SUM:%.*]] = phi i20 [ 0, %[[ENTRY]] ], [ [[SUM_NEXT:%.*]], %[[LOOP]] ]
+; CHECK-NEXT:    [[SUM_NEXT]] = add i20 [[SUM]], [[I]]
+; CHECK-NEXT:    [[I_NEXT]] = add i20 [[I]], [[STEP]]
+; CHECK-NEXT:    [[COND:%.*]] = icmp slt i20 [[I_NEXT]], [[N]]
+; CHECK-NEXT:    br i1 [[COND]], label %[[LOOP]], label %[[EXIT:.*]]
+; CHECK:       [[EXIT]]:
+; CHECK-NEXT:    [[SUM_NEXT_LCSSA:%.*]] = phi i20 [ [[SUM_NEXT]], %[[LOOP]] ]
+; CHECK-NEXT:    ret i20 [[SUM_NEXT_LCSSA]]
+;
+entry:
+  br label %loop
+
+loop:
+  %i = phi i20 [ 0, %entry ], [ %i.next, %loop ]
+  %sum = phi i20 [ 0, %entry ], [ %sum.next, %loop ]
+
+  %sum.next = add i20 %sum, %i
+  %i.next = add i20 %i, %step
+
+  %cond = icmp slt i20 %i.next, %n
+  br i1 %cond, label %loop, label %exit
+
+exit:
+  ret i20 %sum.next
+}