diff --git a/llvm/include/llvm/Analysis/IVUsers.h b/llvm/include/llvm/Analysis/IVUsers.h
index 2af3e389446c..8ff708b3a8ef 100644
--- a/llvm/include/llvm/Analysis/IVUsers.h
+++ b/llvm/include/llvm/Analysis/IVUsers.h
@@ -4,6 +4,9 @@
 // See https://llvm.org/LICENSE.txt for license information.
 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 //
+// Modifications (c) Copyright 2026 Advanced Micro Devices, Inc. or its
+// affiliates
+//
 //===----------------------------------------------------------------------===//
 //
 // This file implements bookkeeping for "interesting" users of expressions
@@ -26,6 +29,7 @@ class AssumptionCache;
 class DominatorTree;
 class ScalarEvolution;
 class SCEV;
+class TargetTransformInfo;
 class IVUsers;
 
 /// IVStrideUse - Keep track of one use of a strided induction variable.
@@ -95,6 +99,7 @@ class IVUsers {
   LoopInfo *LI;
   DominatorTree *DT;
   ScalarEvolution *SE;
+  const TargetTransformInfo *TTI;
   SmallPtrSet Processed;
 
   /// IVUses - A list of all tracked IV uses of induction variable expressions
@@ -106,12 +111,15 @@ class IVUsers {
 
 public:
   IVUsers(Loop *L, AssumptionCache *AC, LoopInfo *LI, DominatorTree *DT,
-          ScalarEvolution *SE);
+          ScalarEvolution *SE, const TargetTransformInfo *TTI = nullptr);
 
+  /// Move constructor: transfers every analysis pointer (LI included) and
+  /// re-parents the moved IVStrideUse entries to this object.
   IVUsers(IVUsers &&X)
-      : L(std::move(X.L)), AC(std::move(X.AC)), DT(std::move(X.DT)),
-        SE(std::move(X.SE)), Processed(std::move(X.Processed)),
-        IVUses(std::move(X.IVUses)), EphValues(std::move(X.EphValues)) {
+      : L(std::move(X.L)), AC(std::move(X.AC)), LI(std::move(X.LI)),
+        DT(std::move(X.DT)), SE(std::move(X.SE)), TTI(std::move(X.TTI)),
+        Processed(std::move(X.Processed)), IVUses(std::move(X.IVUses)),
+        EphValues(std::move(X.EphValues)) {
     for (IVStrideUse &U : IVUses)
       U.Parent = this;
   }
@@ -121,10 +129,11 @@ class IVUsers {
 
   Loop *getLoop() const { return L; }
 
-  /// AddUsersIfInteresting - Inspect the specified Instruction. If it is a
-  /// reducible SCEV, recursively add its users to the IVUsesByStride set and
-  /// return true. Otherwise, return false.
- bool AddUsersIfInteresting(Instruction *I); + /// Inspect the specified Instruction. If it is a reducible SCEV, recursively + /// add its users to the IVUsesByStride set and return true. Otherwise, + /// return false. If \p BypassWidthCheck is true, skip the type width + /// validation (used when the caller has already verified the type via TTI). + bool AddUsersIfInteresting(Instruction *I, bool BypassWidthCheck = false); IVStrideUse &AddUser(Instruction *User, Value *Operand); diff --git a/llvm/include/llvm/Analysis/TargetTransformInfo.h b/llvm/include/llvm/Analysis/TargetTransformInfo.h index 8b69a8c16287..8c6d1b7e5875 100644 --- a/llvm/include/llvm/Analysis/TargetTransformInfo.h +++ b/llvm/include/llvm/Analysis/TargetTransformInfo.h @@ -4,7 +4,7 @@ // See https://llvm.org/LICENSE.txt for license information. // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // -// Modifications (c) Copyright 2023-2024 Advanced Micro Devices, Inc. or its +// Modifications (c) Copyright 2023-2026 Advanced Micro Devices, Inc. or its // affiliates // //===----------------------------------------------------------------------===// @@ -66,6 +66,8 @@ class SmallBitVector; class StoreInst; class SwitchInst; class TargetLibraryInfo; +class TruncInst; +class GetElementPtrInst; class Type; class VPIntrinsic; struct KnownBits; @@ -798,6 +800,22 @@ class TargetTransformInfo { AddressingModeKind getPreferredAddressingMode(const Loop *L, ScalarEvolution *SE) const; + /// Return true if IVUsers() should look through the instruction to collect + /// its users instead. If true, populates GEPsToProcess with the GEP + /// instructions to process as IV users. + /// This is useful for targets where pointer and integer bit sizes differ + /// (e.g., 20-bit pointers with 32-bit integers), causing truncs to index + /// size that feed GEP indices. 
+ bool shouldIVUsersLookThroughInst( + Instruction *I, + SmallVectorImpl &GEPsToProcess) const; + + /// Return true if the given type is valid for IV user collection. + /// By default, only legal integer widths up to 64 bits are allowed. + /// Targets where pointer and integer bit sizes differ may override this + /// to allow index-sized integers or pointers. + bool isValidIVUserType(Type *Ty) const; + /// Return true if the target supports masked store. bool isLegalMaskedStore(Type *DataType, Align Alignment) const; /// Return true if the target supports masked load. @@ -2009,6 +2027,10 @@ class TargetTransformInfo::Concept { TargetLibraryInfo *LibInfo) = 0; virtual AddressingModeKind getPreferredAddressingMode(const Loop *L, ScalarEvolution *SE) const = 0; + virtual bool shouldIVUsersLookThroughInst( + Instruction *I, + SmallVectorImpl &GEPsToProcess) const = 0; + virtual bool isValidIVUserType(Type *Ty) const = 0; virtual bool isLegalMaskedStore(Type *DataType, Align Alignment) = 0; virtual bool isLegalMaskedLoad(Type *DataType, Align Alignment) = 0; virtual bool isLegalNTStore(Type *DataType, Align Alignment) = 0; @@ -2553,6 +2575,14 @@ class TargetTransformInfo::Model final : public TargetTransformInfo::Concept { ScalarEvolution *SE) const override { return Impl.getPreferredAddressingMode(L, SE); } + bool shouldIVUsersLookThroughInst( + Instruction *I, + SmallVectorImpl &GEPsToProcess) const override { + return Impl.shouldIVUsersLookThroughInst(I, GEPsToProcess); + } + bool isValidIVUserType(Type *Ty) const override { + return Impl.isValidIVUserType(Ty); + } bool isLegalMaskedStore(Type *DataType, Align Alignment) override { return Impl.isLegalMaskedStore(DataType, Alignment); } diff --git a/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h b/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h index 5d3e83ed537e..74c14a0e610d 100644 --- a/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h +++ b/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h @@ -4,7 
+4,7 @@
 // See https://llvm.org/LICENSE.txt for license information.
 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 //
-// Modifications (c) Copyright 2023-2024 Advanced Micro Devices, Inc. or its
+// Modifications (c) Copyright 2023-2026 Advanced Micro Devices, Inc. or its
 // affiliates
 //
 //===----------------------------------------------------------------------===//
@@ -283,6 +283,26 @@ class TargetTransformInfoImplBase {
     return TTI::AMK_None;
   }
 
+  /// By default, do not look through instructions in IVUsers.
+  bool shouldIVUsersLookThroughInst(
+      Instruction *I,
+      SmallVectorImpl &GEPsToProcess) const {
+    return false;
+  }
+
+  /// By default, only legal integer widths up to 64 bits are valid for IV
+  /// users. Pointers are measured by their index width so that the result
+  /// agrees with the SE->getTypeSizeInBits() check IVUsers performs when no
+  /// TTI is available.
+  bool isValidIVUserType(Type *Ty) const {
+    if (!Ty->isIntegerTy() && !Ty->isPointerTy())
+      return false;
+    unsigned Width = DL.getTypeSizeInBits(Ty);
+    if (Ty->isPointerTy())
+      Width = DL.getIndexTypeSizeInBits(Ty);
+    return Width <= 64 && DL.isLegalInteger(Width);
+  }
+
   bool isLegalMaskedStore(Type *DataType, Align Alignment) const {
     return false;
   }
diff --git a/llvm/lib/Analysis/IVUsers.cpp b/llvm/lib/Analysis/IVUsers.cpp
index 0880701d8308..1da358753295 100644
--- a/llvm/lib/Analysis/IVUsers.cpp
+++ b/llvm/lib/Analysis/IVUsers.cpp
@@ -4,6 +4,9 @@
 // See https://llvm.org/LICENSE.txt for license information.
 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 //
+// Modifications (c) Copyright 2026 Advanced Micro Devices, Inc.
or its +// affiliates +// //===----------------------------------------------------------------------===// // // This file implements bookkeeping for "interesting" users of expressions @@ -18,6 +21,7 @@ #include "llvm/Analysis/LoopInfo.h" #include "llvm/Analysis/LoopPass.h" #include "llvm/Analysis/ScalarEvolutionExpressions.h" +#include "llvm/Analysis/TargetTransformInfo.h" #include "llvm/Analysis/ValueTracking.h" #include "llvm/Config/llvm-config.h" #include "llvm/IR/DataLayout.h" @@ -35,7 +39,7 @@ AnalysisKey IVUsersAnalysis::Key; IVUsers IVUsersAnalysis::run(Loop &L, LoopAnalysisManager &AM, LoopStandardAnalysisResults &AR) { - return IVUsers(&L, &AR.AC, &AR.LI, &AR.DT, &AR.SE); + return IVUsers(&L, &AR.AC, &AR.LI, &AR.DT, &AR.SE, &AR.TTI); } char IVUsersWrapperPass::ID = 0; @@ -133,7 +137,7 @@ static bool IVUseShouldUsePostIncValue(Instruction *User, Value *Operand, /// Inspect the specified instruction. If it is a reducible SCEV, recursively /// add its users to the IVUsesByStride set and return true. Otherwise, return /// false. -bool IVUsers::AddUsersIfInteresting(Instruction *I) { +bool IVUsers::AddUsersIfInteresting(Instruction *I, bool BypassWidthCheck) { const DataLayout &DL = I->getDataLayout(); // Add this IV user to the Processed set before returning false to ensure that @@ -153,9 +157,16 @@ bool IVUsers::AddUsersIfInteresting(Instruction *I) { // LSR is not APInt clean, do not touch integers bigger than 64-bits. // Also avoid creating IVs of non-native types. For example, we don't want a // 64-bit IV in 32-bit code just because the loop has one 64-bit cast. - uint64_t Width = SE->getTypeSizeInBits(I->getType()); - if (Width > 64 || !DL.isLegalInteger(Width)) - return false; + // Use TTI hook if available to allow targets where pointer and integer bit + // sizes differ (e.g., 20-bit pointers with 32-bit integers) to enable IV + // user collection for index-sized types. 
+ if (!BypassWidthCheck) { + const uint64_t Width = SE->getTypeSizeInBits(I->getType()); + const bool IsValidType = TTI ? TTI->isValidIVUserType(I->getType()) + : (Width <= 64 && DL.isLegalInteger(Width)); + if (!IsValidType) + return false; + } // Don't attempt to promote ephemeral values to indvars. They will be removed // later anyway. @@ -170,6 +181,18 @@ bool IVUsers::AddUsersIfInteresting(Instruction *I) { if (!isInteresting(ISE, I, L, SE, LI)) return false; + // Allow targets to look through certain instructions (e.g., truncs to index + // size on targets where pointer and integer bit sizes differ) to collect + // their users instead. This enables LSR to create pointer PHIs. + SmallVector GEPsToProcess; + if (TTI && TTI->shouldIVUsersLookThroughInst(I, GEPsToProcess)) { + LLVM_DEBUG(dbgs() << "Looking through instruction: " << *I << '\n'); + bool AnyInteresting = false; + for (GetElementPtrInst *GEP : GEPsToProcess) + AnyInteresting |= AddUsersIfInteresting(GEP, /*BypassWidthCheck=*/true); + return AnyInteresting; + } + SmallPtrSet UniqueUsers; for (Use &U : I->uses()) { Instruction *User = cast(U.getUser()); @@ -249,8 +272,8 @@ IVStrideUse &IVUsers::AddUser(Instruction *User, Value *Operand) { } IVUsers::IVUsers(Loop *L, AssumptionCache *AC, LoopInfo *LI, DominatorTree *DT, - ScalarEvolution *SE) - : L(L), AC(AC), LI(LI), DT(DT), SE(SE) { + ScalarEvolution *SE, const TargetTransformInfo *TTI) + : L(L), AC(AC), LI(LI), DT(DT), SE(SE), TTI(TTI) { // Collect ephemeral values so that AddUsersIfInteresting skips them. 
EphValues.clear(); CodeMetrics::collectEphemeralValues(L, AC, EphValues); @@ -306,6 +329,7 @@ void IVUsersWrapperPass::getAnalysisUsage(AnalysisUsage &AU) const { AU.addRequired(); AU.addRequired(); AU.addRequired(); + AU.addRequired(); AU.setPreservesAll(); } @@ -315,8 +339,10 @@ bool IVUsersWrapperPass::runOnLoop(Loop *L, LPPassManager &LPM) { auto *LI = &getAnalysis().getLoopInfo(); auto *DT = &getAnalysis().getDomTree(); auto *SE = &getAnalysis().getSE(); + auto *TTI = &getAnalysis().getTTI( + *L->getHeader()->getParent()); - IU.reset(new IVUsers(L, AC, LI, DT, SE)); + IU.reset(new IVUsers(L, AC, LI, DT, SE, TTI)); return false; } diff --git a/llvm/lib/Analysis/TargetTransformInfo.cpp b/llvm/lib/Analysis/TargetTransformInfo.cpp index 7f770fd1efce..e44a493b9298 100644 --- a/llvm/lib/Analysis/TargetTransformInfo.cpp +++ b/llvm/lib/Analysis/TargetTransformInfo.cpp @@ -4,7 +4,7 @@ // See https://llvm.org/LICENSE.txt for license information. // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // -// Modifications (c) Copyright 2023-2024 Advanced Micro Devices, Inc. or its +// Modifications (c) Copyright 2023-2026 Advanced Micro Devices, Inc. 
or its // affiliates // //===----------------------------------------------------------------------===// @@ -474,6 +474,15 @@ TargetTransformInfo::getPreferredAddressingMode(const Loop *L, return TTIImpl->getPreferredAddressingMode(L, SE); } +bool TargetTransformInfo::shouldIVUsersLookThroughInst( + Instruction *I, SmallVectorImpl &GEPsToProcess) const { + return TTIImpl->shouldIVUsersLookThroughInst(I, GEPsToProcess); +} + +bool TargetTransformInfo::isValidIVUserType(Type *Ty) const { + return TTIImpl->isValidIVUserType(Ty); +} + bool TargetTransformInfo::isLegalMaskedStore(Type *DataType, Align Alignment) const { return TTIImpl->isLegalMaskedStore(DataType, Alignment); diff --git a/llvm/lib/Target/AIE/AIEBaseTargetTransformInfo.h b/llvm/lib/Target/AIE/AIEBaseTargetTransformInfo.h index 4bad608ecae9..b37b38985074 100644 --- a/llvm/lib/Target/AIE/AIEBaseTargetTransformInfo.h +++ b/llvm/lib/Target/AIE/AIEBaseTargetTransformInfo.h @@ -4,7 +4,7 @@ // See https://llvm.org/LICENSE.txt for license information. // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // -// (c) Copyright 2024-2025 Advanced Micro Devices, Inc. or its affiliates +// (c) Copyright 2024-2026 Advanced Micro Devices, Inc. or its affiliates // //===----------------------------------------------------------------------===// // @@ -64,6 +64,10 @@ template class AIEBaseTTIImpl : public BasicTTIImplBase { virtual ~AIEBaseTTIImpl() = default; public: + //===--------------------------------------------------------------------===// + // Cost Model + //===--------------------------------------------------------------------===// + int getIntImmCost(const APInt &Imm, Type *Ty, TTI::TargetCostKind CostKind) { // TODO Handle Target Specific constant cost // Larger constants require an add. @@ -76,6 +80,11 @@ template class AIEBaseTTIImpl : public BasicTTIImplBase { // cost? 
return TTI::TCC_Basic;
   }
+
+  //===--------------------------------------------------------------------===//
+  // Loop Optimization
+  //===--------------------------------------------------------------------===//
+
   void adjustUnrollingPreferences(Loop *L, ScalarEvolution &SE,
                                   TTI::UnrollingPreferences &UP,
                                   OptimizationRemarkEmitter *ORE);
@@ -83,6 +92,10 @@ template class AIEBaseTTIImpl : public BasicTTIImplBase {
                                 AssumptionCache &AC, TargetLibraryInfo *LibInfo,
                                 HardwareLoopInfo &HWLoopInfo);
 
+  //===--------------------------------------------------------------------===//
+  // Vectorization
+  //===--------------------------------------------------------------------===//
+
   // We define a store vector factor of 4 for 8-bit and 2 for 16-bit. This
   // allows combining 2 16-bit stores or 4 8-bit stores into a single 32-bit
   // vector store. This is deemed beneficial because of the LMS nature of
@@ -114,6 +127,97 @@ template class AIEBaseTTIImpl : public BasicTTIImplBase {
     // Default return of allowsMisalignedMemoryAccesses is false.
     return ChainSizeInBytes >= 4;
   }
+
+  //===--------------------------------------------------------------------===//
+  // Loop Strength Reduction (LSR)
+  //
+  // AIE has 20-bit pointers but 32-bit integers, and post-increment load/store
+  // instructions (VLD_pstm, VST_pstm). These hooks enable LSR to generate
+  // pointer recurrences that the backend can combine with post-increment ops.
+  //===--------------------------------------------------------------------===//
+
+  /// Check if type is an integer matching the target's index size (e.g., i20).
+  /// Note: uses address space 0; all AIE address spaces share the same index
+  /// width.
+  static bool isIndexSizedInteger(Type *Ty, const DataLayout &DL) {
+    return Ty->isIntegerTy() &&
+           Ty->getIntegerBitWidth() == DL.getIndexSizeInBits(0);
+  }
+
+  /// Collect all GEP users of \p Trunc that use it as an index operand (not
+  /// the pointer operand). Returns false if any use is not a GEP index; in
+  /// that case \p GEPs is left exactly as the caller passed it.
+  static bool collectGEPIndices(const TruncInst *Trunc,
+                                SmallVectorImpl<GetElementPtrInst *> &GEPs) {
+    // Gather into a scratch list so a late failure cannot leave a partial
+    // result in the caller's vector.
+    SmallVector<GetElementPtrInst *, 4> Found;
+    for (const Use &U : Trunc->uses()) {
+      auto *GEP = dyn_cast<GetElementPtrInst>(U.getUser());
+      if (!GEP || U.getOperandNo() == 0)
+        return false;
+      Found.push_back(GEP);
+    }
+    GEPs.append(Found.begin(), Found.end());
+    return true;
+  }
+
+  /// Prefer pointer-based recurrences over scalar offset + base formulations.
+  TTI::AddressingModeKind
+  getPreferredAddressingMode(const Loop *L, ScalarEvolution *SE) const {
+    return TTI::AMK_PostIndexed;
+  }
+
+  /// Enable post-increment addressing for index-sized integers (i20).
+  bool isIndexedLoadLegal(TTI::MemIndexedMode Mode, Type *Ty,
+                          const DataLayout &DL) const {
+    return Mode == TTI::MIM_PostInc && isIndexSizedInteger(Ty, DL);
+  }
+
+  /// Store counterpart of isIndexedLoadLegal: post-increment only, and only
+  /// for index-sized integers.
+  bool isIndexedStoreLegal(TTI::MemIndexedMode Mode, Type *Ty,
+                           const DataLayout &DL) const {
+    return Mode == TTI::MIM_PostInc && isIndexSizedInteger(Ty, DL);
+  }
+
+  /// Look through truncs to index size that feed GEP indices.
+  ///
+  /// Array indexing generates: %trunc = trunc i32 %idx to i20
+  /// Without this hook, IVUsers() stops at the trunc (i20 not legal).
+  /// With this hook, IVUsers() continues to the GEP, collecting pointer SCEVs.
+  bool shouldIVUsersLookThroughInst(
+      Instruction *I,
+      SmallVectorImpl<GetElementPtrInst *> &GEPsToProcess) const {
+    auto *Trunc = dyn_cast<TruncInst>(I);
+    if (!Trunc)
+      return false;
+
+    // Only truncs to the index width qualify, and only when that width is
+    // not a legal integer for the target.
+    const DataLayout &DL = Trunc->getModule()->getDataLayout();
+    Type *TruncTy = Trunc->getType();
+    if (!isIndexSizedInteger(TruncTy, DL) ||
+        DL.isLegalInteger(TruncTy->getIntegerBitWidth()))
+      return false;
+
+    return collectGEPIndices(Trunc, GEPsToProcess);
+  }
+
+  /// Prioritize fewer loop-body adds over fewer recurrences.
+  /// For VLIW, extra adds hurt II while extra PHIs execute in parallel.
+  bool isLSRCostLess(const TTI::LSRCost &C1, const TTI::LSRCost &C2) const {
+    return std::tie(C1.NumRegs, C1.Insns, C1.NumBaseAdds, C1.AddRecCost,
+                    C1.NumIVMuls, C1.ScaleCost, C1.ImmCost, C1.SetupCost) <
+           std::tie(C2.NumRegs, C2.Insns, C2.NumBaseAdds, C2.AddRecCost,
+                    C2.NumIVMuls, C2.ScaleCost, C2.ImmCost, C2.SetupCost);
+  }
+
+  /// Extend valid IV user types to include index-sized integers (i20).
+  bool isValidIVUserType(Type *Ty) const {
+    return BaseT::isValidIVUserType(Ty) ||
+           isIndexSizedInteger(Ty, BaseT::getDataLayout());
+  }
 };
 
 } // end namespace llvm
diff --git a/llvm/lib/Target/AIE/AIEClusterBaseAddress.cpp b/llvm/lib/Target/AIE/AIEClusterBaseAddress.cpp
index b5e0f70b0e74..4abc4ac49765 100644
--- a/llvm/lib/Target/AIE/AIEClusterBaseAddress.cpp
+++ b/llvm/lib/Target/AIE/AIEClusterBaseAddress.cpp
@@ -4,7 +4,7 @@
 // See https://llvm.org/LICENSE.txt for license information.
 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 //
-// (c) Copyright 2023-2025 Advanced Micro Devices, Inc. or its affiliates
+// (c) Copyright 2023-2026 Advanced Micro Devices, Inc. or its affiliates
 //
 //===----------------------------------------------------------------------===//
 //
@@ -198,6 +198,13 @@ class AIEClusterBaseAddress : public MachineFunctionPass {
                    std::optional OffsetA,
                    std::optional OffsetB);
 
+  // Insert G_PTR_ADD with zero offset for load/store instructions that
+  // directly use a pointer register which also has G_PTR_ADD users.
+  // This ensures bare offset-0 accesses participate in post-increment
+  // chaining built by buildChain.
+  bool insertPtrAddForBareMemOps(MachineBasicBlock &MBB, MachineIRBuilder &MIB,
+                                 GISelObserverWrapper &Observer);
+
   // Return true if the instructions are used by both loads and stores.
bool hasMixedLoadStoreUse(SmallVector Instrs);
 
@@ -263,6 +270,8 @@ bool AIEClusterBaseAddress::processBasicBlock(MachineBasicBlock &MBB,
 
   bool Changed = false;
 
+  Changed |= insertPtrAddForBareMemOps(MBB, MIB, Observer);
+
   // Get all G_PTR_ADDs that use the same pointer.
   RegUseMap RegAndUses = collectPtrUses(MBB);
 
@@ -291,6 +300,96 @@ bool AIEClusterBaseAddress::processBasicBlock(MachineBasicBlock &MBB,
   return Changed;
 }
 
+/// Give bare loads/stores of a pointer PHI an explicit zero-offset G_PTR_ADD.
+///
+/// For each pointer-typed PHI of \p MBB, a memory instruction whose operand 1
+/// is the PHI register is rewritten to address through a new
+/// `G_PTR_ADD PHI, 0`, provided at least two G_PTR_ADDs of that PHI feed
+/// memory instructions and one of them appears before the bare access.
+/// \returns true if any instruction was rewritten.
+bool AIEClusterBaseAddress::insertPtrAddForBareMemOps(
+    MachineBasicBlock &MBB, MachineIRBuilder &MIB,
+    GISelObserverWrapper &Observer) {
+  bool Changed = false;
+
+  for (MachineInstr &PHI : MBB) {
+    // Only the leading PHIs are inspected; stop at the first non-PHI.
+    if (!PHI.isPHI())
+      break;
+
+    const Register PhiReg = PHI.getOperand(0).getReg();
+    const LLT PtrTy = MRI->getType(PhiReg);
+    if (!PtrTy.isPointer())
+      continue;
+
+    // Single walk over the MBB in program order to collect:
+    // - PtrAddFeedingMemOpCount: G_PTR_ADDs from this PHI that feed mem ops
+    // - BareMemOps: load/store instructions that use the PHI directly
+    // - Whether each bare mem op precedes the first such G_PTR_ADD
+    unsigned PtrAddFeedingMemOpCount = 0;
+    SmallVector<std::pair<MachineInstr *, bool>, 4> BareMemOps;
+
+    for (MachineInstr &MI : MBB) {
+      if (MI.isPHI())
+        continue;
+
+      const bool UsesPhiAsBase = MI.getNumOperands() > 1 &&
+                                 MI.getOperand(1).isReg() &&
+                                 MI.getOperand(1).getReg() == PhiReg;
+      if (!UsesPhiAsBase)
+        continue;
+
+      const bool IsPtrAdd = MI.getOpcode() == TargetOpcode::G_PTR_ADD;
+      if (IsPtrAdd) {
+        const bool FeedsMemOp =
+            any_of(MRI->use_nodbg_instructions(MI.getOperand(0).getReg()),
+                   [](const MachineInstr &U) { return U.mayLoadOrStore(); });
+        if (FeedsMemOp)
+          PtrAddFeedingMemOpCount++;
+        continue;
+      }
+
+      const bool IsBareMemOp =
+          MI.mayLoadOrStore() && MI.getNumMemOperands() > 0;
+      if (IsBareMemOp) {
+        const bool PrecedesChain = PtrAddFeedingMemOpCount == 0;
+        BareMemOps.push_back({&MI, PrecedesChain});
+      }
+    }
+
+    // We need at least 2 PTR_ADDs feeding memory ops to form a chain.
+    // With only 1, the inserted PTR_ADD +0 has no chain partner and
+    // survives as a redundant padda #0.
+    if (PtrAddFeedingMemOpCount < 2 || BareMemOps.empty())
+      continue;
+
+    for (auto &[MemOp, PrecedesChain] : BareMemOps) {
+      // Skip insertion if the bare mem op precedes all G_PTR_ADDs from
+      // this PHI that feed memory ops. Such a mem op is the "first" user
+      // of the PHI pointer and can be combined with a post-increment
+      // update (e.g., add.2d/add.3d) into a single post-increment load.
+      // Inserting G_PTR_ADD +0 would break this combination.
+      if (PrecedesChain)
+        continue;
+
+      const unsigned AddrIdx = 1;
+      MIB.setInsertPt(MBB, MemOp->getIterator());
+      // Size the zero offset from the PHI's pointer type rather than
+      // hard-coding the 20-bit width.
+      const auto ZeroOffset =
+          MIB.buildConstant(LLT::scalar(PtrTy.getSizeInBits()), 0);
+      const auto NewPtr = MIB.buildInstr(TargetOpcode::G_PTR_ADD, {PtrTy},
+                                         {PhiReg, ZeroOffset.getReg(0)});
+      Observer.changingInstr(*MemOp);
+      MemOp->getOperand(AddrIdx).setReg(NewPtr.getReg(0));
+      Observer.changedInstr(*MemOp);
+      Changed = true;
+    }
+  }
+
+  return Changed;
+}
+
 /// Recursively search bottom up for Load instrs in the use chain of \p MI .
 /// Stop the search when Exiting \p MBB . Return all found Load MachineInstr in
 /// \p LoadsFeedingInstrs .
diff --git a/llvm/test/CodeGen/AIE/GlobalISel/cluster-base-address-bare-memop-e2e.mir b/llvm/test/CodeGen/AIE/GlobalISel/cluster-base-address-bare-memop-e2e.mir
new file mode 100644
index 000000000000..fcbe4e1f0d0c
--- /dev/null
+++ b/llvm/test/CodeGen/AIE/GlobalISel/cluster-base-address-bare-memop-e2e.mir
@@ -0,0 +1,147 @@
+# This file is licensed under the Apache License v2.0 with LLVM Exceptions.
+# See https://llvm.org/LICENSE.txt for license information.
+# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+#
+# (c) Copyright 2026 Advanced Micro Devices, Inc.
or its affiliates +# +# End-to-end test: run from legalizer through custom combiner to verify that +# PTR_ADD +0 inserted by AIEClusterBaseAddress gets fully absorbed into +# post-increment chains with no surviving padda #0. +# +# RUN: llc -mtriple aie2 -start-after=legalizer \ +# RUN: -stop-after=aie2-postlegalizer-custom-combiner \ +# RUN: %s -verify-machineinstrs -o - | FileCheck %s + +# Bare load at offset 0 is in the MIDDLE of the access pattern (-128, 0, +128). +# The PTR_ADD +0 should be chained and combined into post-increment loads. +# No G_PTR_ADD with i20 0 should remain. +--- +name: bare_load_middle +legalized: true +tracksRegLiveness: true +body: | + ; CHECK-LABEL: name: bare_load_middle + ; CHECK-NOT: G_PTR_ADD {{.*}}, {{.*}}i20 0 + bb.0: + successors: %bb.1 + liveins: $p0, $r0 + %0:_(p0) = COPY $p0 + %1:_(s32) = COPY $r0 + %100:_(s20) = G_CONSTANT i20 256 + %2:_(p0) = G_PTR_ADD %0, %100(s20) + G_BR %bb.1 + + bb.1: + successors: %bb.2, %bb.1 + %3:_(p0) = G_PHI %10(p0), %bb.1, %2(p0), %bb.0 + %13:_(s32) = G_PHI %15(s32), %bb.1, %1(s32), %bb.0 + %4:_(s20) = G_CONSTANT i20 -128 + %5:_(p0) = G_PTR_ADD %3, %4(s20) + %6:_(<16 x s32>) = G_LOAD %5(p0) :: (load (<16 x s32>), align 64) + %7:_(<16 x s32>) = G_LOAD %3(p0) :: (load (<16 x s32>), align 64) + %8:_(s20) = G_CONSTANT i20 128 + %9:_(p0) = G_PTR_ADD %3, %8(s20) + %11:_(<16 x s32>) = G_LOAD %9(p0) :: (load (<16 x s32>), align 64) + %12:_(s20) = G_CONSTANT i20 512 + %10:_(p0) = G_PTR_ADD %3, %12(s20) + %14:_(s32) = G_CONSTANT i32 -1 + %15:_(s32) = G_ADD %13, %14 + %16:_(s32) = G_CONSTANT i32 0 + %17:_(s1) = G_ICMP intpred(eq), %15(s32), %16 + G_BRCOND %17(s1), %bb.2 + G_BR %bb.1 + + bb.2: + $x0 = COPY %6 + $x2 = COPY %7 + $x4 = COPY %11 + PseudoRET implicit $lr, implicit $x0, implicit $x2, implicit $x4 +... + +# Bare load at offset 0 is at the START of the access pattern (0, +128, +256). +# This load precedes every G_PTR_ADD of the PHI, so no PTR_ADD +0 is inserted. 
+--- +name: bare_load_start +legalized: true +tracksRegLiveness: true +body: | + ; CHECK-LABEL: name: bare_load_start + ; CHECK-NOT: G_PTR_ADD {{.*}}, {{.*}}i20 0 + bb.0: + successors: %bb.1 + liveins: $p0, $r0 + %0:_(p0) = COPY $p0 + %1:_(s32) = COPY $r0 + G_BR %bb.1 + + bb.1: + successors: %bb.2, %bb.1 + %3:_(p0) = G_PHI %10(p0), %bb.1, %0(p0), %bb.0 + %13:_(s32) = G_PHI %15(s32), %bb.1, %1(s32), %bb.0 + %7:_(<16 x s32>) = G_LOAD %3(p0) :: (load (<16 x s32>), align 64) + %8:_(s20) = G_CONSTANT i20 128 + %5:_(p0) = G_PTR_ADD %3, %8(s20) + %6:_(<16 x s32>) = G_LOAD %5(p0) :: (load (<16 x s32>), align 64) + %18:_(s20) = G_CONSTANT i20 256 + %9:_(p0) = G_PTR_ADD %3, %18(s20) + %11:_(<16 x s32>) = G_LOAD %9(p0) :: (load (<16 x s32>), align 64) + %12:_(s20) = G_CONSTANT i20 512 + %10:_(p0) = G_PTR_ADD %3, %12(s20) + %14:_(s32) = G_CONSTANT i32 -1 + %15:_(s32) = G_ADD %13, %14 + %16:_(s32) = G_CONSTANT i32 0 + %17:_(s1) = G_ICMP intpred(eq), %15(s32), %16 + G_BRCOND %17(s1), %bb.2 + G_BR %bb.1 + + bb.2: + $x0 = COPY %7 + $x2 = COPY %6 + $x4 = COPY %11 + PseudoRET implicit $lr, implicit $x0, implicit $x2, implicit $x4 +... + +# Bare load at offset 0 is at the END of the access pattern (-256, -128, 0). +# The PTR_ADD +0 should be chained and combined into post-increment loads. 
+--- +name: bare_load_end +legalized: true +tracksRegLiveness: true +body: | + ; CHECK-LABEL: name: bare_load_end + ; CHECK-NOT: G_PTR_ADD {{.*}}, {{.*}}i20 0 + bb.0: + successors: %bb.1 + liveins: $p0, $r0 + %0:_(p0) = COPY $p0 + %1:_(s32) = COPY $r0 + %100:_(s20) = G_CONSTANT i20 512 + %2:_(p0) = G_PTR_ADD %0, %100(s20) + G_BR %bb.1 + + bb.1: + successors: %bb.2, %bb.1 + %3:_(p0) = G_PHI %10(p0), %bb.1, %2(p0), %bb.0 + %13:_(s32) = G_PHI %15(s32), %bb.1, %1(s32), %bb.0 + %4:_(s20) = G_CONSTANT i20 -256 + %5:_(p0) = G_PTR_ADD %3, %4(s20) + %6:_(<16 x s32>) = G_LOAD %5(p0) :: (load (<16 x s32>), align 64) + %18:_(s20) = G_CONSTANT i20 -128 + %19:_(p0) = G_PTR_ADD %3, %18(s20) + %20:_(<16 x s32>) = G_LOAD %19(p0) :: (load (<16 x s32>), align 64) + %7:_(<16 x s32>) = G_LOAD %3(p0) :: (load (<16 x s32>), align 64) + %12:_(s20) = G_CONSTANT i20 512 + %10:_(p0) = G_PTR_ADD %3, %12(s20) + %14:_(s32) = G_CONSTANT i32 -1 + %15:_(s32) = G_ADD %13, %14 + %16:_(s32) = G_CONSTANT i32 0 + %17:_(s1) = G_ICMP intpred(eq), %15(s32), %16 + G_BRCOND %17(s1), %bb.2 + G_BR %bb.1 + + bb.2: + $x0 = COPY %6 + $x2 = COPY %20 + $x4 = COPY %7 + PseudoRET implicit $lr, implicit $x0, implicit $x2, implicit $x4 +... diff --git a/llvm/test/CodeGen/AIE/GlobalISel/cluster-base-address-bare-memop.mir b/llvm/test/CodeGen/AIE/GlobalISel/cluster-base-address-bare-memop.mir new file mode 100644 index 000000000000..f531ef2ebe57 --- /dev/null +++ b/llvm/test/CodeGen/AIE/GlobalISel/cluster-base-address-bare-memop.mir @@ -0,0 +1,363 @@ +# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 5 +# This file is licensed under the Apache License v2.0 with LLVM Exceptions. +# See https://llvm.org/LICENSE.txt for license information. +# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +# +# (c) Copyright 2026 Advanced Micro Devices, Inc. 
or its affiliates +# RUN: llc -mtriple aie2 -run-pass=aie-cluster-base-address %s -verify-machineinstrs -o - | FileCheck %s + +# Test that bare load/store instructions using a PHI-defined pointer (offset 0) +# get a G_PTR_ADD +0 inserted when the PHI also has G_PTR_ADD users whose +# outputs feed load/store instructions, enabling post-increment chaining. + +# Positive test: PHI with 3 PTR_ADD users feeding loads (-128, +128, +512), +# plus a bare load at offset 0. The bare load gets PTR_ADD +0 and all loads +# end up chained. +--- +name: phi_load_bare_offset_zero +legalized: true +tracksRegLiveness: true +body: | + ; CHECK-LABEL: name: phi_load_bare_offset_zero + ; CHECK: bb.0: + ; CHECK-NEXT: successors: %bb.1(0x80000000) + ; CHECK-NEXT: liveins: $p0, $r0 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $p0 + ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $r0 + ; CHECK-NEXT: [[C:%[0-9]+]]:_(s20) = G_CONSTANT i20 256 + ; CHECK-NEXT: [[PTR_ADD:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C]](s20) + ; CHECK-NEXT: G_BR %bb.1 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: bb.1: + ; CHECK-NEXT: successors: %bb.2(0x40000000), %bb.1(0x40000000) + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[PHI:%[0-9]+]]:_(p0) = G_PHI %5(p0), %bb.1, [[PTR_ADD]](p0), %bb.0 + ; CHECK-NEXT: [[PHI1:%[0-9]+]]:_(s32) = G_PHI %7(s32), %bb.1, [[COPY1]](s32), %bb.0 + ; CHECK-NEXT: [[C1:%[0-9]+]]:_(s20) = G_CONSTANT i20 -128 + ; CHECK-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p0) = G_PTR_ADD [[PHI]], [[C1]](s20) + ; CHECK-NEXT: [[LOAD:%[0-9]+]]:_(<16 x s32>) = G_LOAD [[PTR_ADD1]](p0) :: (load (<16 x s32>)) + ; CHECK-NEXT: [[C2:%[0-9]+]]:_(s20) = G_CONSTANT i20 0 + ; CHECK-NEXT: [[C3:%[0-9]+]]:_(s20) = G_CONSTANT i20 128 + ; CHECK-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD1]], [[C3]](s20) + ; CHECK-NEXT: [[LOAD1:%[0-9]+]]:_(<16 x s32>) = G_LOAD [[PTR_ADD2]](p0) :: (load (<16 x s32>)) + ; CHECK-NEXT: [[C4:%[0-9]+]]:_(s20) = G_CONSTANT i20 128 + ; CHECK-NEXT: [[C5:%[0-9]+]]:_(s20) = G_CONSTANT i20 128 
+ ; CHECK-NEXT: [[PTR_ADD3:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD2]], [[C5]](s20) + ; CHECK-NEXT: [[LOAD2:%[0-9]+]]:_(<16 x s32>) = G_LOAD [[PTR_ADD3]](p0) :: (load (<16 x s32>)) + ; CHECK-NEXT: [[C6:%[0-9]+]]:_(s20) = G_CONSTANT i20 512 + ; CHECK-NEXT: [[C7:%[0-9]+]]:_(s20) = G_CONSTANT i20 384 + ; CHECK-NEXT: [[PTR_ADD4:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD3]], [[C7]](s20) + ; CHECK-NEXT: [[C8:%[0-9]+]]:_(s32) = G_CONSTANT i32 -1 + ; CHECK-NEXT: [[ADD:%[0-9]+]]:_(s32) = G_ADD [[PHI1]], [[C8]] + ; CHECK-NEXT: [[C9:%[0-9]+]]:_(s32) = G_CONSTANT i32 0 + ; CHECK-NEXT: [[ICMP:%[0-9]+]]:_(s1) = G_ICMP intpred(eq), [[ADD]](s32), [[C9]] + ; CHECK-NEXT: G_BRCOND [[ICMP]](s1), %bb.2 + ; CHECK-NEXT: G_BR %bb.1 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: bb.2: + ; CHECK-NEXT: $x0 = COPY [[LOAD]](<16 x s32>) + ; CHECK-NEXT: $x2 = COPY [[LOAD1]](<16 x s32>) + ; CHECK-NEXT: $x4 = COPY [[LOAD2]](<16 x s32>) + ; CHECK-NEXT: PseudoRET implicit $lr, implicit $x0, implicit $x2, implicit $x4 + bb.0: + successors: %bb.1 + liveins: $p0, $r0 + %0:_(p0) = COPY $p0 + %1:_(s32) = COPY $r0 + %100:_(s20) = G_CONSTANT i20 256 + %2:_(p0) = G_PTR_ADD %0, %100(s20) + G_BR %bb.1 + + bb.1: + successors: %bb.2, %bb.1 + %3:_(p0) = G_PHI %10(p0), %bb.1, %2(p0), %bb.0 + %13:_(s32) = G_PHI %15(s32), %bb.1, %1(s32), %bb.0 + %4:_(s20) = G_CONSTANT i20 -128 + %5:_(p0) = G_PTR_ADD %3, %4(s20) + %6:_(<16 x s32>) = G_LOAD %5(p0) :: (load (<16 x s32>), align 64) + %7:_(<16 x s32>) = G_LOAD %3(p0) :: (load (<16 x s32>), align 64) + %8:_(s20) = G_CONSTANT i20 128 + %9:_(p0) = G_PTR_ADD %3, %8(s20) + %11:_(<16 x s32>) = G_LOAD %9(p0) :: (load (<16 x s32>), align 64) + %12:_(s20) = G_CONSTANT i20 512 + %10:_(p0) = G_PTR_ADD %3, %12(s20) + %14:_(s32) = G_CONSTANT i32 -1 + %15:_(s32) = G_ADD %13, %14 + %16:_(s32) = G_CONSTANT i32 0 + %17:_(s1) = G_ICMP intpred(eq), %15(s32), %16 + G_BRCOND %17(s1), %bb.2 + G_BR %bb.1 + + bb.2: + $x0 = COPY %6 + $x2 = COPY %7 + $x4 = COPY %11 + PseudoRET implicit $lr, implicit $x0, 
implicit $x2, implicit $x4 +... + +# Negative test: PHI has only 1 PTR_ADD feeding a load (the other is the +# loop increment). No PTR_ADD +0 should be inserted. +--- +name: phi_load_single_ptr_add_feeding_load +legalized: true +tracksRegLiveness: true +body: | + ; CHECK-LABEL: name: phi_load_single_ptr_add_feeding_load + ; CHECK: bb.0: + ; CHECK-NEXT: successors: %bb.1(0x80000000) + ; CHECK-NEXT: liveins: $p0, $r0 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $p0 + ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $r0 + ; CHECK-NEXT: [[C:%[0-9]+]]:_(s20) = G_CONSTANT i20 256 + ; CHECK-NEXT: [[PTR_ADD:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C]](s20) + ; CHECK-NEXT: G_BR %bb.1 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: bb.1: + ; CHECK-NEXT: successors: %bb.2(0x40000000), %bb.1(0x40000000) + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[PHI:%[0-9]+]]:_(p0) = G_PHI %5(p0), %bb.1, [[PTR_ADD]](p0), %bb.0 + ; CHECK-NEXT: [[PHI1:%[0-9]+]]:_(s32) = G_PHI %7(s32), %bb.1, [[COPY1]](s32), %bb.0 + ; CHECK-NEXT: [[LOAD:%[0-9]+]]:_(<16 x s32>) = G_LOAD [[PHI]](p0) :: (load (<16 x s32>)) + ; CHECK-NEXT: [[C1:%[0-9]+]]:_(s20) = G_CONSTANT i20 128 + ; CHECK-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p0) = G_PTR_ADD [[PHI]], [[C1]](s20) + ; CHECK-NEXT: [[LOAD1:%[0-9]+]]:_(<16 x s32>) = G_LOAD [[PTR_ADD1]](p0) :: (load (<16 x s32>)) + ; CHECK-NEXT: [[C2:%[0-9]+]]:_(s20) = G_CONSTANT i20 256 + ; CHECK-NEXT: [[C3:%[0-9]+]]:_(s20) = G_CONSTANT i20 128 + ; CHECK-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD1]], [[C3]](s20) + ; CHECK-NEXT: [[C4:%[0-9]+]]:_(s32) = G_CONSTANT i32 -1 + ; CHECK-NEXT: [[ADD:%[0-9]+]]:_(s32) = G_ADD [[PHI1]], [[C4]] + ; CHECK-NEXT: [[C5:%[0-9]+]]:_(s32) = G_CONSTANT i32 0 + ; CHECK-NEXT: [[ICMP:%[0-9]+]]:_(s1) = G_ICMP intpred(eq), [[ADD]](s32), [[C5]] + ; CHECK-NEXT: G_BRCOND [[ICMP]](s1), %bb.2 + ; CHECK-NEXT: G_BR %bb.1 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: bb.2: + ; CHECK-NEXT: $x0 = COPY [[LOAD]](<16 x s32>) + ; CHECK-NEXT: $x2 = COPY [[LOAD1]](<16 x 
s32>) + ; CHECK-NEXT: PseudoRET implicit $lr, implicit $x0, implicit $x2 + bb.0: + successors: %bb.1 + liveins: $p0, $r0 + %0:_(p0) = COPY $p0 + %1:_(s32) = COPY $r0 + %100:_(s20) = G_CONSTANT i20 256 + %2:_(p0) = G_PTR_ADD %0, %100(s20) + G_BR %bb.1 + + bb.1: + successors: %bb.2, %bb.1 + %3:_(p0) = G_PHI %10(p0), %bb.1, %2(p0), %bb.0 + %13:_(s32) = G_PHI %15(s32), %bb.1, %1(s32), %bb.0 + %7:_(<16 x s32>) = G_LOAD %3(p0) :: (load (<16 x s32>), align 64) + %8:_(s20) = G_CONSTANT i20 128 + %9:_(p0) = G_PTR_ADD %3, %8(s20) + %11:_(<16 x s32>) = G_LOAD %9(p0) :: (load (<16 x s32>), align 64) + %12:_(s20) = G_CONSTANT i20 256 + %10:_(p0) = G_PTR_ADD %3, %12(s20) + %14:_(s32) = G_CONSTANT i32 -1 + %15:_(s32) = G_ADD %13, %14 + %16:_(s32) = G_CONSTANT i32 0 + %17:_(s1) = G_ICMP intpred(eq), %15(s32), %16 + G_BRCOND %17(s1), %bb.2 + G_BR %bb.1 + + bb.2: + $x0 = COPY %7 + $x2 = COPY %11 + PseudoRET implicit $lr, implicit $x0, implicit $x2 +... + +# Negative test: no PTR_ADD +0 when address is not PHI-defined. 
+--- +name: non_phi_load_no_insert +legalized: true +tracksRegLiveness: true +body: | + bb.0: + liveins: $p0 + ; CHECK-LABEL: name: non_phi_load_no_insert + ; CHECK: liveins: $p0 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $p0 + ; CHECK-NEXT: [[C:%[0-9]+]]:_(s20) = G_CONSTANT i20 -64 + ; CHECK-NEXT: [[PTR_ADD:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C]](s20) + ; CHECK-NEXT: [[LOAD:%[0-9]+]]:_(<16 x s32>) = G_LOAD [[PTR_ADD]](p0) :: (load (<16 x s32>)) + ; CHECK-NEXT: [[LOAD1:%[0-9]+]]:_(<16 x s32>) = G_LOAD [[COPY]](p0) :: (load (<16 x s32>)) + ; CHECK-NEXT: [[C1:%[0-9]+]]:_(s20) = G_CONSTANT i20 64 + ; CHECK-NEXT: [[C2:%[0-9]+]]:_(s20) = G_CONSTANT i20 128 + ; CHECK-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD]], [[C2]](s20) + ; CHECK-NEXT: [[LOAD2:%[0-9]+]]:_(<16 x s32>) = G_LOAD [[PTR_ADD1]](p0) :: (load (<16 x s32>)) + ; CHECK-NEXT: $x0 = COPY [[LOAD]](<16 x s32>) + ; CHECK-NEXT: $x2 = COPY [[LOAD1]](<16 x s32>) + ; CHECK-NEXT: $x4 = COPY [[LOAD2]](<16 x s32>) + ; CHECK-NEXT: PseudoRET implicit $lr, implicit $x0, implicit $x2, implicit $x4 + %0:_(p0) = COPY $p0 + %1:_(s20) = G_CONSTANT i20 -64 + %2:_(p0) = G_PTR_ADD %0, %1(s20) + %3:_(<16 x s32>) = G_LOAD %2(p0) :: (load (<16 x s32>), align 64) + %4:_(<16 x s32>) = G_LOAD %0(p0) :: (load (<16 x s32>), align 64) + %5:_(s20) = G_CONSTANT i20 64 + %6:_(p0) = G_PTR_ADD %0, %5(s20) + %7:_(<16 x s32>) = G_LOAD %6(p0) :: (load (<16 x s32>), align 64) + $x0 = COPY %3 + $x2 = COPY %4 + $x4 = COPY %7 + PseudoRET implicit $lr, implicit $x0, implicit $x2, implicit $x4 +... + +# Positive test: store at offset 0 from PHI also gets PTR_ADDs feeding stores. 
+--- +name: phi_store_bare_offset_zero +legalized: true +tracksRegLiveness: true +body: | + ; CHECK-LABEL: name: phi_store_bare_offset_zero + ; CHECK: bb.0: + ; CHECK-NEXT: successors: %bb.1(0x80000000) + ; CHECK-NEXT: liveins: $p0, $r0, $x0 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $p0 + ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $r0 + ; CHECK-NEXT: [[COPY2:%[0-9]+]]:_(<8 x s64>) = COPY $x0 + ; CHECK-NEXT: [[C:%[0-9]+]]:_(s20) = G_CONSTANT i20 128 + ; CHECK-NEXT: [[PTR_ADD:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C]](s20) + ; CHECK-NEXT: G_BR %bb.1 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: bb.1: + ; CHECK-NEXT: successors: %bb.2(0x40000000), %bb.1(0x40000000) + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[PHI:%[0-9]+]]:_(p0) = G_PHI %6(p0), %bb.1, [[PTR_ADD]](p0), %bb.0 + ; CHECK-NEXT: [[PHI1:%[0-9]+]]:_(s32) = G_PHI %8(s32), %bb.1, [[COPY1]](s32), %bb.0 + ; CHECK-NEXT: [[C1:%[0-9]+]]:_(s20) = G_CONSTANT i20 -64 + ; CHECK-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p0) = G_PTR_ADD [[PHI]], [[C1]](s20) + ; CHECK-NEXT: G_STORE [[COPY2]](<8 x s64>), [[PTR_ADD1]](p0) :: (store (<8 x s64>)) + ; CHECK-NEXT: [[C2:%[0-9]+]]:_(s20) = G_CONSTANT i20 0 + ; CHECK-NEXT: [[C3:%[0-9]+]]:_(s20) = G_CONSTANT i20 64 + ; CHECK-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD1]], [[C3]](s20) + ; CHECK-NEXT: G_STORE [[COPY2]](<8 x s64>), [[PTR_ADD2]](p0) :: (store (<8 x s64>)) + ; CHECK-NEXT: [[C4:%[0-9]+]]:_(s20) = G_CONSTANT i20 64 + ; CHECK-NEXT: [[C5:%[0-9]+]]:_(s20) = G_CONSTANT i20 64 + ; CHECK-NEXT: [[PTR_ADD3:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD2]], [[C5]](s20) + ; CHECK-NEXT: G_STORE [[COPY2]](<8 x s64>), [[PTR_ADD3]](p0) :: (store (<8 x s64>)) + ; CHECK-NEXT: [[C6:%[0-9]+]]:_(s20) = G_CONSTANT i20 256 + ; CHECK-NEXT: [[C7:%[0-9]+]]:_(s20) = G_CONSTANT i20 192 + ; CHECK-NEXT: [[PTR_ADD4:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD3]], [[C7]](s20) + ; CHECK-NEXT: [[C8:%[0-9]+]]:_(s32) = G_CONSTANT i32 -1 + ; CHECK-NEXT: [[ADD:%[0-9]+]]:_(s32) = G_ADD [[PHI1]], [[C8]] + ; 
CHECK-NEXT: [[C9:%[0-9]+]]:_(s32) = G_CONSTANT i32 0 + ; CHECK-NEXT: [[ICMP:%[0-9]+]]:_(s1) = G_ICMP intpred(eq), [[ADD]](s32), [[C9]] + ; CHECK-NEXT: G_BRCOND [[ICMP]](s1), %bb.2 + ; CHECK-NEXT: G_BR %bb.1 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: bb.2: + ; CHECK-NEXT: PseudoRET implicit $lr + bb.0: + successors: %bb.1 + liveins: $p0, $r0, $x0 + %0:_(p0) = COPY $p0 + %1:_(s32) = COPY $r0 + %20:_(<8 x s64>) = COPY $x0 + %100:_(s20) = G_CONSTANT i20 128 + %2:_(p0) = G_PTR_ADD %0, %100(s20) + G_BR %bb.1 + + bb.1: + successors: %bb.2, %bb.1 + %3:_(p0) = G_PHI %10(p0), %bb.1, %2(p0), %bb.0 + %13:_(s32) = G_PHI %15(s32), %bb.1, %1(s32), %bb.0 + %4:_(s20) = G_CONSTANT i20 -64 + %5:_(p0) = G_PTR_ADD %3, %4(s20) + G_STORE %20(<8 x s64>), %5(p0) :: (store (<8 x s64>), align 64) + G_STORE %20(<8 x s64>), %3(p0) :: (store (<8 x s64>), align 64) + %8:_(s20) = G_CONSTANT i20 64 + %9:_(p0) = G_PTR_ADD %3, %8(s20) + G_STORE %20(<8 x s64>), %9(p0) :: (store (<8 x s64>), align 64) + %12:_(s20) = G_CONSTANT i20 256 + %10:_(p0) = G_PTR_ADD %3, %12(s20) + %14:_(s32) = G_CONSTANT i32 -1 + %15:_(s32) = G_ADD %13, %14 + %16:_(s32) = G_CONSTANT i32 0 + %17:_(s1) = G_ICMP intpred(eq), %15(s32), %16 + G_BRCOND %17(s1), %bb.2 + G_BR %bb.1 + + bb.2: + PseudoRET implicit $lr +... + +# Negative test: bare load from PHI appears BEFORE all G_PTR_ADDs that feed +# loads. The bare load is the first user of the PHI pointer and should NOT +# get G_PTR_ADD +0 inserted, because it can be combined with a post-increment +# update (e.g., add.2d/add.3d) into a single post-increment load instruction. 
+--- +name: phi_bare_load_before_ptr_adds_no_insert +legalized: true +tracksRegLiveness: true +body: | + ; CHECK-LABEL: name: phi_bare_load_before_ptr_adds_no_insert + ; CHECK: bb.0: + ; CHECK-NEXT: successors: %bb.1(0x80000000) + ; CHECK-NEXT: liveins: $p0, $r0 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $p0 + ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $r0 + ; CHECK-NEXT: [[C:%[0-9]+]]:_(s20) = G_CONSTANT i20 256 + ; CHECK-NEXT: [[PTR_ADD:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C]](s20) + ; CHECK-NEXT: G_BR %bb.1 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: bb.1: + ; CHECK-NEXT: successors: %bb.2(0x40000000), %bb.1(0x40000000) + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[PHI:%[0-9]+]]:_(p0) = G_PHI %5(p0), %bb.1, [[PTR_ADD]](p0), %bb.0 + ; CHECK-NEXT: [[PHI1:%[0-9]+]]:_(s32) = G_PHI %7(s32), %bb.1, [[COPY1]](s32), %bb.0 + ; CHECK-NEXT: [[LOAD:%[0-9]+]]:_(<16 x s32>) = G_LOAD [[PHI]](p0) :: (load (<16 x s32>)) + ; CHECK-NEXT: [[C1:%[0-9]+]]:_(s20) = G_CONSTANT i20 128 + ; CHECK-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p0) = G_PTR_ADD [[PHI]], [[C1]](s20) + ; CHECK-NEXT: [[LOAD1:%[0-9]+]]:_(<16 x s32>) = G_LOAD [[PTR_ADD1]](p0) :: (load (<16 x s32>)) + ; CHECK-NEXT: [[C2:%[0-9]+]]:_(s20) = G_CONSTANT i20 256 + ; CHECK-NEXT: [[C3:%[0-9]+]]:_(s20) = G_CONSTANT i20 128 + ; CHECK-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD1]], [[C3]](s20) + ; CHECK-NEXT: [[C4:%[0-9]+]]:_(s32) = G_CONSTANT i32 -1 + ; CHECK-NEXT: [[ADD:%[0-9]+]]:_(s32) = G_ADD [[PHI1]], [[C4]] + ; CHECK-NEXT: [[C5:%[0-9]+]]:_(s32) = G_CONSTANT i32 0 + ; CHECK-NEXT: [[ICMP:%[0-9]+]]:_(s1) = G_ICMP intpred(eq), [[ADD]](s32), [[C5]] + ; CHECK-NEXT: G_BRCOND [[ICMP]](s1), %bb.2 + ; CHECK-NEXT: G_BR %bb.1 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: bb.2: + ; CHECK-NEXT: $x0 = COPY [[LOAD]](<16 x s32>) + ; CHECK-NEXT: $x2 = COPY [[LOAD1]](<16 x s32>) + ; CHECK-NEXT: PseudoRET implicit $lr, implicit $x0, implicit $x2 + bb.0: + successors: %bb.1 + liveins: $p0, $r0 + %0:_(p0) = COPY $p0 + %1:_(s32) = 
COPY $r0 + %100:_(s20) = G_CONSTANT i20 256 + %2:_(p0) = G_PTR_ADD %0, %100(s20) + G_BR %bb.1 + + bb.1: + successors: %bb.2, %bb.1 + %3:_(p0) = G_PHI %10(p0), %bb.1, %2(p0), %bb.0 + %13:_(s32) = G_PHI %15(s32), %bb.1, %1(s32), %bb.0 + %7:_(<16 x s32>) = G_LOAD %3(p0) :: (load (<16 x s32>), align 64) + %8:_(s20) = G_CONSTANT i20 128 + %9:_(p0) = G_PTR_ADD %3, %8(s20) + %11:_(<16 x s32>) = G_LOAD %9(p0) :: (load (<16 x s32>), align 64) + %12:_(s20) = G_CONSTANT i20 256 + %10:_(p0) = G_PTR_ADD %3, %12(s20) + %14:_(s32) = G_CONSTANT i32 -1 + %15:_(s32) = G_ADD %13, %14 + %16:_(s32) = G_CONSTANT i32 0 + %17:_(s1) = G_ICMP intpred(eq), %15(s32), %16 + G_BRCOND %17(s1), %bb.2 + G_BR %bb.1 + + bb.2: + $x0 = COPY %7 + $x2 = COPY %11 + PseudoRET implicit $lr, implicit $x0, implicit $x2 +... diff --git a/llvm/test/CodeGen/AIE/hardware-loops/irtranslator-zol.ll b/llvm/test/CodeGen/AIE/hardware-loops/irtranslator-zol.ll index 0dfde4656137..bee67bf03f78 100644 --- a/llvm/test/CodeGen/AIE/hardware-loops/irtranslator-zol.ll +++ b/llvm/test/CodeGen/AIE/hardware-loops/irtranslator-zol.ll @@ -4,7 +4,7 @@ ; See https://llvm.org/LICENSE.txt for license information. ; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception ; -; (c) Copyright 2024-2025 Advanced Micro Devices, Inc. or its affiliates +; (c) Copyright 2024-2026 Advanced Micro Devices, Inc. 
or its affiliates ; RUN: llc -O2 -mtriple=aie2 -stop-after=irtranslator --enable-aie-hardware-loops --enable-aie-zero-overhead-loops \ ; RUN: --aie-force-hl-gen=true %s -o - | FileCheck %s --check-prefix=AIE2 @@ -25,7 +25,6 @@ define void @simple(ptr nocapture %out, ptr nocapture readonly %in, i32 noundef ; AIE2-NEXT: [[COPY1:%[0-9]+]]:_(p0) = COPY $p1 ; AIE2-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY $r0 ; AIE2-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 1 - ; AIE2-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 0 ; AIE2-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p0) :: (load (s32) from %ir.out) ; AIE2-NEXT: G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.set.loop.iterations), [[COPY2]](s32) ; AIE2-NEXT: G_BR %bb.3 @@ -36,17 +35,13 @@ define void @simple(ptr nocapture %out, ptr nocapture readonly %in, i32 noundef ; AIE2-NEXT: bb.3.for.body: ; AIE2-NEXT: successors: %bb.3(0x7c000000), %bb.2(0x04000000) ; AIE2-NEXT: {{ $}} - ; AIE2-NEXT: [[PHI:%[0-9]+]]:_(s32) = G_PHI [[LOAD]](s32), %bb.1, %12(s32), %bb.3 - ; AIE2-NEXT: [[PHI1:%[0-9]+]]:_(s32) = G_PHI [[C1]](s32), %bb.1, %14(s32), %bb.3 - ; AIE2-NEXT: [[TRUNC:%[0-9]+]]:_(s20) = G_TRUNC [[PHI1]](s32) - ; AIE2-NEXT: [[C2:%[0-9]+]]:_(s20) = G_CONSTANT i20 4 - ; AIE2-NEXT: [[MUL:%[0-9]+]]:_(s20) = G_MUL [[TRUNC]], [[C2]] - ; AIE2-NEXT: [[PTR_ADD:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY1]], [[MUL]](s20) - ; AIE2-NEXT: [[COPY3:%[0-9]+]]:_(p0) = COPY [[PTR_ADD]](p0) - ; AIE2-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[COPY3]](p0) :: (load (s32) from %ir.arrayidx) - ; AIE2-NEXT: [[ADD:%[0-9]+]]:_(s32) = nsw G_ADD [[PHI]], [[LOAD1]] + ; AIE2-NEXT: [[PHI:%[0-9]+]]:_(p0) = G_PHI %9(p0), %bb.3, [[COPY1]](p0), %bb.1 + ; AIE2-NEXT: [[PHI1:%[0-9]+]]:_(s32) = G_PHI [[LOAD]](s32), %bb.1, %7(s32), %bb.3 + ; AIE2-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PHI]](p0) :: (load (s32) from %ir.lsr.iv1) + ; AIE2-NEXT: [[ADD:%[0-9]+]]:_(s32) = nsw G_ADD [[PHI1]], [[LOAD1]] ; AIE2-NEXT: G_STORE [[ADD]](s32), [[COPY]](p0) :: (store (s32) into %ir.out) - ; 
AIE2-NEXT: [[ADD1:%[0-9]+]]:_(s32) = nuw nsw G_ADD [[PHI1]], [[C]] + ; AIE2-NEXT: [[C1:%[0-9]+]]:_(s20) = G_CONSTANT i20 4 + ; AIE2-NEXT: [[PTR_ADD:%[0-9]+]]:_(p0) = G_PTR_ADD [[PHI]], [[C1]](s20) ; AIE2-NEXT: [[INT:%[0-9]+]]:_(s1) = G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.loop.decrement), [[C]](s32) ; AIE2-NEXT: G_BRCOND [[INT]](s1), %bb.3 ; AIE2-NEXT: G_BR %bb.2 @@ -60,7 +55,6 @@ define void @simple(ptr nocapture %out, ptr nocapture readonly %in, i32 noundef ; AIE2p-NEXT: [[COPY1:%[0-9]+]]:_(p0) = COPY $p1 ; AIE2p-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY $r0 ; AIE2p-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 1 - ; AIE2p-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 0 ; AIE2p-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p0) :: (load (s32) from %ir.out) ; AIE2p-NEXT: G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.set.loop.iterations), [[COPY2]](s32) ; AIE2p-NEXT: G_BR %bb.3 @@ -71,17 +65,13 @@ define void @simple(ptr nocapture %out, ptr nocapture readonly %in, i32 noundef ; AIE2p-NEXT: bb.3.for.body: ; AIE2p-NEXT: successors: %bb.3(0x7c000000), %bb.2(0x04000000) ; AIE2p-NEXT: {{ $}} - ; AIE2p-NEXT: [[PHI:%[0-9]+]]:_(s32) = G_PHI [[LOAD]](s32), %bb.1, %12(s32), %bb.3 - ; AIE2p-NEXT: [[PHI1:%[0-9]+]]:_(s32) = G_PHI [[C1]](s32), %bb.1, %14(s32), %bb.3 - ; AIE2p-NEXT: [[TRUNC:%[0-9]+]]:_(s20) = G_TRUNC [[PHI1]](s32) - ; AIE2p-NEXT: [[C2:%[0-9]+]]:_(s20) = G_CONSTANT i20 4 - ; AIE2p-NEXT: [[MUL:%[0-9]+]]:_(s20) = G_MUL [[TRUNC]], [[C2]] - ; AIE2p-NEXT: [[PTR_ADD:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY1]], [[MUL]](s20) - ; AIE2p-NEXT: [[COPY3:%[0-9]+]]:_(p0) = COPY [[PTR_ADD]](p0) - ; AIE2p-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[COPY3]](p0) :: (load (s32) from %ir.arrayidx) - ; AIE2p-NEXT: [[ADD:%[0-9]+]]:_(s32) = nsw G_ADD [[PHI]], [[LOAD1]] + ; AIE2p-NEXT: [[PHI:%[0-9]+]]:_(p0) = G_PHI %9(p0), %bb.3, [[COPY1]](p0), %bb.1 + ; AIE2p-NEXT: [[PHI1:%[0-9]+]]:_(s32) = G_PHI [[LOAD]](s32), %bb.1, %7(s32), %bb.3 + ; AIE2p-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PHI]](p0) :: 
(load (s32) from %ir.lsr.iv1) + ; AIE2p-NEXT: [[ADD:%[0-9]+]]:_(s32) = nsw G_ADD [[PHI1]], [[LOAD1]] ; AIE2p-NEXT: G_STORE [[ADD]](s32), [[COPY]](p0) :: (store (s32) into %ir.out) - ; AIE2p-NEXT: [[ADD1:%[0-9]+]]:_(s32) = nuw nsw G_ADD [[PHI1]], [[C]] + ; AIE2p-NEXT: [[C1:%[0-9]+]]:_(s20) = G_CONSTANT i20 4 + ; AIE2p-NEXT: [[PTR_ADD:%[0-9]+]]:_(p0) = G_PTR_ADD [[PHI]], [[C1]](s20) ; AIE2p-NEXT: [[INT:%[0-9]+]]:_(s1) = G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.loop.decrement), [[C]](s32) ; AIE2p-NEXT: G_BRCOND [[INT]](s1), %bb.3 ; AIE2p-NEXT: G_BR %bb.2 diff --git a/llvm/test/CodeGen/AIE/hardware-loops/nested.ll b/llvm/test/CodeGen/AIE/hardware-loops/nested.ll index 39aa8210245f..0a9498dd5504 100644 --- a/llvm/test/CodeGen/AIE/hardware-loops/nested.ll +++ b/llvm/test/CodeGen/AIE/hardware-loops/nested.ll @@ -22,45 +22,39 @@ define void @nested(ptr nocapture %out, ptr nocapture readonly %in, i32 noundef %size, i32 noundef %size2) { ; AIE2-LABEL: nested: ; AIE2: // %bb.0: // %for.cond3.preheader.lr.ph -; AIE2-NEXT: nopb ; mova r3, #0; nops ; nopxm ; nopv -; AIE2-NEXT: mova r4, #2; nopx -; AIE2-NEXT: movxm p2, #.LBB0_2 +; AIE2-NEXT: nopa ; nopb ; movxm p2, #.LBB0_2 ; AIE2-NEXT: lda r2, [p0, #0] ; AIE2-NEXT: .LBB0_1: // %for.cond3.preheader ; AIE2-NEXT: // =>This Loop Header: Depth=1 ; AIE2-NEXT: // Child Loop BB0_2 Depth 2 -; AIE2-NEXT: nopa ; lshl r5, r3, r4; nopm -; AIE2-NEXT: mov dj0, r5 -; AIE2-NEXT: lda p3, [p1, dj0] +; AIE2-NEXT: nopb ; lda p3, [p1, #0]; nops ; nopxm ; nopv +; AIE2-NEXT: nopx ; AIE2-NEXT: nop ; AIE2-NEXT: nop ; AIE2-NEXT: nop -; AIE2-NEXT: mova r6, #0 -; AIE2-NEXT: add.nc r5, r1, #-1 +; AIE2-NEXT: nop +; AIE2-NEXT: add.nc r3, r1, #-1 ; AIE2-NEXT: .LBB0_2: // %for.body6 ; AIE2-NEXT: // Parent Loop BB0_1 Depth=1 ; AIE2-NEXT: // => This Inner Loop Header: Depth=2 -; AIE2-NEXT: nopa ; lshl r7, r6, r4; nopm -; AIE2-NEXT: mov dj0, r7 -; AIE2-NEXT: lda r7, [p3, dj0] +; AIE2-NEXT: lda r4, [p3], #4; nopx ; AIE2-NEXT: nop ; AIE2-NEXT: nop -; AIE2-NEXT: 
jnzd r5, r5, p2 +; AIE2-NEXT: jnzd r3, r3, p2 ; AIE2-NEXT: nop // Delay Slot 5 ; AIE2-NEXT: nop // Delay Slot 4 -; AIE2-NEXT: add r6, r6, #1 // Delay Slot 3 -; AIE2-NEXT: add r2, r2, r7 // Delay Slot 2 +; AIE2-NEXT: nop // Delay Slot 3 +; AIE2-NEXT: add r2, r2, r4 // Delay Slot 2 ; AIE2-NEXT: st r2, [p0, #0] // Delay Slot 1 ; AIE2-NEXT: // %bb.3: // %for.cond3.for.cond.cleanup5_crit_edge ; AIE2-NEXT: // in Loop: Header=BB0_1 Depth=1 -; AIE2-NEXT: add r3, r3, #1 -; AIE2-NEXT: eq r5, r0, r3 -; AIE2-NEXT: jz r5, #.LBB0_1 +; AIE2-NEXT: add r0, r0, #-1 +; AIE2-NEXT: jnz r0, #.LBB0_1 ; AIE2-NEXT: nop // Delay Slot 5 ; AIE2-NEXT: nop // Delay Slot 4 ; AIE2-NEXT: nop // Delay Slot 3 ; AIE2-NEXT: nop // Delay Slot 2 -; AIE2-NEXT: nop // Delay Slot 1 +; AIE2-NEXT: paddb [p1], #4 // Delay Slot 1 ; AIE2-NEXT: // %bb.4: // %for.cond.cleanup ; AIE2-NEXT: ret lr ; AIE2-NEXT: nop // Delay Slot 5 @@ -71,44 +65,39 @@ define void @nested(ptr nocapture %out, ptr nocapture readonly %in, i32 noundef ; ; AIE2P-LABEL: nested: ; AIE2P: // %bb.0: // %for.cond3.preheader.lr.ph -; AIE2P-NEXT: mova r3, #0; nopb ; nops ; nopxm ; nopv -; AIE2P-NEXT: mova r4, #2; nopx ; AIE2P-NEXT: movxm p2, #.LBB0_2 +; AIE2P-NEXT: mova m0, #4; nopx ; AIE2P-NEXT: lda r2, [p0, #0] ; AIE2P-NEXT: .LBB0_1: // %for.cond3.preheader ; AIE2P-NEXT: // =>This Loop Header: Depth=1 ; AIE2P-NEXT: // Child Loop BB0_2 Depth 2 -; AIE2P-NEXT: nopa ; lshl r5, r3, r4; nopm -; AIE2P-NEXT: mov dj0, r5 -; AIE2P-NEXT: lda p3, [p1, dj0] +; AIE2P-NEXT: lda p3, [p1, #0]; nopb ; nops ; nopxm ; nopv +; AIE2P-NEXT: nopx ; AIE2P-NEXT: nop ; AIE2P-NEXT: nop ; AIE2P-NEXT: nop -; AIE2P-NEXT: mova r6, #0 -; AIE2P-NEXT: add.nc r5, r1, #-1 +; AIE2P-NEXT: nop +; AIE2P-NEXT: add.nc r3, r1, #-1 ; AIE2P-NEXT: .LBB0_2: // %for.body6 ; AIE2P-NEXT: // Parent Loop BB0_1 Depth=1 ; AIE2P-NEXT: // => This Inner Loop Header: Depth=2 -; AIE2P-NEXT: nopa ; lshl r7, r6, r4; nopm -; AIE2P-NEXT: mov dj0, r7 -; AIE2P-NEXT: lda r7, [p3, dj0] +; AIE2P-NEXT: lda r4, 
[p3], #4; nopx ; AIE2P-NEXT: nop ; AIE2P-NEXT: nop -; AIE2P-NEXT: jnzd r5, r5, p2 +; AIE2P-NEXT: jnzd r3, r3, p2 ; AIE2P-NEXT: nop // Delay Slot 5 ; AIE2P-NEXT: nop // Delay Slot 4 -; AIE2P-NEXT: add r6, r6, #1 // Delay Slot 3 -; AIE2P-NEXT: add r2, r2, r7 // Delay Slot 2 +; AIE2P-NEXT: nop // Delay Slot 3 +; AIE2P-NEXT: add r2, r2, r4 // Delay Slot 2 ; AIE2P-NEXT: st r2, [p0, #0] // Delay Slot 1 ; AIE2P-NEXT: // %bb.3: // %for.cond3.for.cond.cleanup5_crit_edge ; AIE2P-NEXT: // in Loop: Header=BB0_1 Depth=1 -; AIE2P-NEXT: add r3, r3, #1 -; AIE2P-NEXT: eq r5, r0, r3 -; AIE2P-NEXT: jz r5, #.LBB0_1 +; AIE2P-NEXT: add r0, r0, #-1 +; AIE2P-NEXT: jnz r0, #.LBB0_1 ; AIE2P-NEXT: nop // Delay Slot 5 ; AIE2P-NEXT: nop // Delay Slot 4 ; AIE2P-NEXT: nop // Delay Slot 3 -; AIE2P-NEXT: nop // Delay Slot 2 +; AIE2P-NEXT: padda [p1], m0 // Delay Slot 2 ; AIE2P-NEXT: nop // Delay Slot 1 ; AIE2P-NEXT: // %bb.4: // %for.cond.cleanup ; AIE2P-NEXT: ret lr @@ -120,45 +109,40 @@ define void @nested(ptr nocapture %out, ptr nocapture readonly %in, i32 noundef ; ; AIE2PS-LABEL: nested: ; AIE2PS: // %bb.0: // %for.cond3.preheader.lr.ph -; AIE2PS-NEXT: mova r4, #0; nopb ; nops ; nopxm ; nopv -; AIE2PS-NEXT: mova r6, #2; nopx ; AIE2PS-NEXT: movxm p2, #.LBB0_2 +; AIE2PS-NEXT: mova m0, #4; nopx ; AIE2PS-NEXT: lda r2, [p0, #0] ; AIE2PS-NEXT: .LBB0_1: // %for.cond3.preheader ; AIE2PS-NEXT: // =>This Loop Header: Depth=1 ; AIE2PS-NEXT: // Child Loop BB0_2 Depth 2 -; AIE2PS-NEXT: nopa ; lshl r16, r4, r6; nopm -; AIE2PS-NEXT: mov dj0, r16 -; AIE2PS-NEXT: lda p3, [p1, dj0] +; AIE2PS-NEXT: lda p3, [p1, #0]; nopb ; nops ; nopxm ; nopv +; AIE2PS-NEXT: nopx +; AIE2PS-NEXT: nop ; AIE2PS-NEXT: nop ; AIE2PS-NEXT: nop ; AIE2PS-NEXT: nop ; AIE2PS-NEXT: addm.nc r3, r1, #-1 -; AIE2PS-NEXT: mova r16, #0 ; AIE2PS-NEXT: .LBB0_2: // %for.body6 ; AIE2PS-NEXT: // Parent Loop BB0_1 Depth=1 ; AIE2PS-NEXT: // => This Inner Loop Header: Depth=2 -; AIE2PS-NEXT: nopa ; lshl r18, r16, r6; nopm -; AIE2PS-NEXT: mov dj0, r18 -; 
AIE2PS-NEXT: lda r18, [p3, dj0] +; AIE2PS-NEXT: lda r4, [p3], #4; nopx ; AIE2PS-NEXT: nop ; AIE2PS-NEXT: nop ; AIE2PS-NEXT: jnzd r3, r3, p2 ; AIE2PS-NEXT: nop // Delay Slot 5 ; AIE2PS-NEXT: nop // Delay Slot 4 -; AIE2PS-NEXT: add r16, r16, #1 // Delay Slot 3 -; AIE2PS-NEXT: add r2, r2, r18 // Delay Slot 2 +; AIE2PS-NEXT: nop // Delay Slot 3 +; AIE2PS-NEXT: add r2, r2, r4 // Delay Slot 2 ; AIE2PS-NEXT: st r2, [p0, #0] // Delay Slot 1 ; AIE2PS-NEXT: // %bb.3: // %for.cond3.for.cond.cleanup5_crit_edge ; AIE2PS-NEXT: // in Loop: Header=BB0_1 Depth=1 -; AIE2PS-NEXT: add r4, r4, #1 -; AIE2PS-NEXT: eq r16, r0, r4 -; AIE2PS-NEXT: jz r16, #.LBB0_1 +; AIE2PS-NEXT: add r0, r0, #-1 +; AIE2PS-NEXT: jnz r0, #.LBB0_1 ; AIE2PS-NEXT: nop // Delay Slot 5 ; AIE2PS-NEXT: nop // Delay Slot 4 ; AIE2PS-NEXT: nop // Delay Slot 3 ; AIE2PS-NEXT: nop // Delay Slot 2 -; AIE2PS-NEXT: nop // Delay Slot 1 +; AIE2PS-NEXT: padda [p1], m0 // Delay Slot 1 ; AIE2PS-NEXT: // %bb.4: // %for.cond.cleanup ; AIE2PS-NEXT: ret lr ; AIE2PS-NEXT: nop // Delay Slot 5 diff --git a/llvm/test/CodeGen/AIE/hardware-loops/sibling.ll b/llvm/test/CodeGen/AIE/hardware-loops/sibling.ll index 3a5ea5cf9763..c1bff8247501 100644 --- a/llvm/test/CodeGen/AIE/hardware-loops/sibling.ll +++ b/llvm/test/CodeGen/AIE/hardware-loops/sibling.ll @@ -16,42 +16,35 @@ define void @sibling(ptr nocapture %out, ptr nocapture readonly %in, i32 noundef %size, i32 noundef %size2) { ; AIE2-LABEL: sibling: ; AIE2: // %bb.0: // %for.body.lr.ph -; AIE2-NEXT: mova r2, #0; nopxm -; AIE2-NEXT: add.nc r0, r0, #-1 -; AIE2-NEXT: mova r4, #2 -; AIE2-NEXT: movxm p2, #.LBB0_1 -; AIE2-NEXT: mova r5, #0 -; AIE2-NEXT: lda r3, [p0, #0] +; AIE2-NEXT: nopb ; nopa ; nops ; nopx ; add.nc r0, r0, #-1; nopv +; AIE2-NEXT: nop ; movxm p2, #.LBB0_1 +; AIE2-NEXT: mov p3, p1 +; AIE2-NEXT: lda r2, [p0, #0] ; AIE2-NEXT: .LBB0_1: // %for.body ; AIE2-NEXT: // =>This Inner Loop Header: Depth=1 -; AIE2-NEXT: nopa ; nopb ; lshl r6, r5, r4; nopm ; nops -; AIE2-NEXT: mov dj0, r6 
-; AIE2-NEXT: lda r6, [p1, dj0] +; AIE2-NEXT: lda r3, [p3], #4; nopb ; nopxm ; AIE2-NEXT: nop ; AIE2-NEXT: nop ; AIE2-NEXT: jnzd r0, r0, p2 ; AIE2-NEXT: nop // Delay Slot 5 ; AIE2-NEXT: nop // Delay Slot 4 -; AIE2-NEXT: add r5, r5, #1 // Delay Slot 3 -; AIE2-NEXT: add r3, r3, r6 // Delay Slot 2 -; AIE2-NEXT: st r3, [p0, #0] // Delay Slot 1 +; AIE2-NEXT: nop // Delay Slot 3 +; AIE2-NEXT: add r2, r2, r3 // Delay Slot 2 +; AIE2-NEXT: st r2, [p0, #0] // Delay Slot 1 ; AIE2-NEXT: // %bb.2: // %for.body6.lr.ph ; AIE2-NEXT: add.nc r1, r1, #-1 -; AIE2-NEXT: mova r3, #2 ; AIE2-NEXT: movxm p2, #.LBB0_3 ; AIE2-NEXT: lda r0, [p0, #0] ; AIE2-NEXT: .LBB0_3: // %for.body6 ; AIE2-NEXT: // =>This Inner Loop Header: Depth=1 -; AIE2-NEXT: nopb ; nopa ; nops ; lshl r4, r2, r3; nopm ; nopv -; AIE2-NEXT: nopa ; mov dj0, r4 -; AIE2-NEXT: lda r4, [p1, dj0] +; AIE2-NEXT: lda r2, [p1], #4; nopb ; nopxm ; AIE2-NEXT: nop ; AIE2-NEXT: nop ; AIE2-NEXT: jnzd r1, r1, p2 ; AIE2-NEXT: nop // Delay Slot 5 ; AIE2-NEXT: nop // Delay Slot 4 -; AIE2-NEXT: add r2, r2, #1 // Delay Slot 3 -; AIE2-NEXT: add r0, r0, r4 // Delay Slot 2 +; AIE2-NEXT: nop // Delay Slot 3 +; AIE2-NEXT: add r0, r0, r2 // Delay Slot 2 ; AIE2-NEXT: st r0, [p0, #0] // Delay Slot 1 ; AIE2-NEXT: // %bb.4: // %for.cond.cleanup5 ; AIE2-NEXT: ret lr @@ -63,42 +56,35 @@ define void @sibling(ptr nocapture %out, ptr nocapture readonly %in, i32 noundef ; ; AIE2P-LABEL: sibling: ; AIE2P: // %bb.0: // %for.body.lr.ph -; AIE2P-NEXT: mova r2, #0; nopxm -; AIE2P-NEXT: add.nc r0, r0, #-1 -; AIE2P-NEXT: mova r4, #2 +; AIE2P-NEXT: nopa ; nopb ; nops ; nopx ; add.nc r0, r0, #-1; nopv ; AIE2P-NEXT: movxm p2, #.LBB0_1 -; AIE2P-NEXT: mova r5, #0 -; AIE2P-NEXT: lda r3, [p0, #0] +; AIE2P-NEXT: nopx ; mov p3, p1 +; AIE2P-NEXT: lda r2, [p0, #0] ; AIE2P-NEXT: .LBB0_1: // %for.body ; AIE2P-NEXT: // =>This Inner Loop Header: Depth=1 -; AIE2P-NEXT: nopa ; nopb ; lshl r6, r5, r4; nopm ; nops -; AIE2P-NEXT: mov dj0, r6 -; AIE2P-NEXT: lda r6, [p1, dj0] +; 
AIE2P-NEXT: lda r3, [p3], #4; nopb ; nopxm ; AIE2P-NEXT: nop ; AIE2P-NEXT: nop ; AIE2P-NEXT: jnzd r0, r0, p2 ; AIE2P-NEXT: nop // Delay Slot 5 ; AIE2P-NEXT: nop // Delay Slot 4 -; AIE2P-NEXT: add r5, r5, #1 // Delay Slot 3 -; AIE2P-NEXT: add r3, r3, r6 // Delay Slot 2 -; AIE2P-NEXT: st r3, [p0, #0] // Delay Slot 1 +; AIE2P-NEXT: nop // Delay Slot 3 +; AIE2P-NEXT: add r2, r2, r3 // Delay Slot 2 +; AIE2P-NEXT: st r2, [p0, #0] // Delay Slot 1 ; AIE2P-NEXT: // %bb.2: // %for.body6.lr.ph ; AIE2P-NEXT: add.nc r1, r1, #-1 -; AIE2P-NEXT: mova r3, #2 ; AIE2P-NEXT: movxm p2, #.LBB0_3 ; AIE2P-NEXT: lda r0, [p0, #0] ; AIE2P-NEXT: .LBB0_3: // %for.body6 ; AIE2P-NEXT: // =>This Inner Loop Header: Depth=1 -; AIE2P-NEXT: nopa ; nopb ; nops ; lshl r4, r2, r3; nopm ; nopv -; AIE2P-NEXT: nopx ; mov dj0, r4 -; AIE2P-NEXT: lda r4, [p1, dj0] +; AIE2P-NEXT: lda r2, [p1], #4; nopb ; nopxm ; AIE2P-NEXT: nop ; AIE2P-NEXT: nop ; AIE2P-NEXT: jnzd r1, r1, p2 ; AIE2P-NEXT: nop // Delay Slot 5 ; AIE2P-NEXT: nop // Delay Slot 4 -; AIE2P-NEXT: add r2, r2, #1 // Delay Slot 3 -; AIE2P-NEXT: add r0, r0, r4 // Delay Slot 2 +; AIE2P-NEXT: nop // Delay Slot 3 +; AIE2P-NEXT: add r0, r0, r2 // Delay Slot 2 ; AIE2P-NEXT: st r0, [p0, #0] // Delay Slot 1 ; AIE2P-NEXT: // %bb.4: // %for.cond.cleanup5 ; AIE2P-NEXT: ret lr @@ -110,42 +96,35 @@ define void @sibling(ptr nocapture %out, ptr nocapture readonly %in, i32 noundef ; ; AIE2PS-LABEL: sibling: ; AIE2PS: // %bb.0: // %for.body.lr.ph -; AIE2PS-NEXT: mova r2, #0; nopxm -; AIE2PS-NEXT: addm.nc r3, r0, #-1 -; AIE2PS-NEXT: mova r0, #2 +; AIE2PS-NEXT: nopa ; nopb ; nops ; nopx ; addm.nc r3, r0, #-1; nopv ; AIE2PS-NEXT: movxm p2, #.LBB0_1 -; AIE2PS-NEXT: mova r6, #0 -; AIE2PS-NEXT: lda r4, [p0, #0] +; AIE2PS-NEXT: nopx ; mov p3, p1 +; AIE2PS-NEXT: lda r2, [p0, #0] ; AIE2PS-NEXT: .LBB0_1: // %for.body ; AIE2PS-NEXT: // =>This Inner Loop Header: Depth=1 -; AIE2PS-NEXT: nopa ; nopb ; lshl r16, r6, r0; nopm ; nops -; AIE2PS-NEXT: mov dj0, r16 -; AIE2PS-NEXT: lda r16, 
[p1, dj0] +; AIE2PS-NEXT: lda r0, [p3], #4; nopb ; nopxm ; AIE2PS-NEXT: nop ; AIE2PS-NEXT: nop ; AIE2PS-NEXT: jnzd r3, r3, p2 ; AIE2PS-NEXT: nop // Delay Slot 5 ; AIE2PS-NEXT: nop // Delay Slot 4 -; AIE2PS-NEXT: add r6, r6, #1 // Delay Slot 3 -; AIE2PS-NEXT: add r4, r4, r16 // Delay Slot 2 -; AIE2PS-NEXT: st r4, [p0, #0] // Delay Slot 1 +; AIE2PS-NEXT: nop // Delay Slot 3 +; AIE2PS-NEXT: add r2, r2, r0 // Delay Slot 2 +; AIE2PS-NEXT: st r2, [p0, #0] // Delay Slot 1 ; AIE2PS-NEXT: // %bb.2: // %for.body6.lr.ph ; AIE2PS-NEXT: addm.nc r1, r1, #-1 -; AIE2PS-NEXT: mova r4, #2 ; AIE2PS-NEXT: movxm p2, #.LBB0_3 ; AIE2PS-NEXT: lda r0, [p0, #0] ; AIE2PS-NEXT: .LBB0_3: // %for.body6 ; AIE2PS-NEXT: // =>This Inner Loop Header: Depth=1 -; AIE2PS-NEXT: nopa ; nopb ; nops ; lshl r6, r2, r4; nopm ; nopv -; AIE2PS-NEXT: nopx ; mov dj0, r6 -; AIE2PS-NEXT: lda r6, [p1, dj0] +; AIE2PS-NEXT: lda r2, [p1], #4; nopb ; nopxm ; AIE2PS-NEXT: nop ; AIE2PS-NEXT: nop ; AIE2PS-NEXT: jnzd r1, r1, p2 ; AIE2PS-NEXT: nop // Delay Slot 5 ; AIE2PS-NEXT: nop // Delay Slot 4 -; AIE2PS-NEXT: add r2, r2, #1 // Delay Slot 3 -; AIE2PS-NEXT: add r0, r0, r6 // Delay Slot 2 +; AIE2PS-NEXT: nop // Delay Slot 3 +; AIE2PS-NEXT: add r0, r0, r2 // Delay Slot 2 ; AIE2PS-NEXT: st r0, [p0, #0] // Delay Slot 1 ; AIE2PS-NEXT: // %bb.4: // %for.cond.cleanup5 ; AIE2PS-NEXT: ret lr diff --git a/llvm/test/CodeGen/AIE/hardware-loops/simple.ll b/llvm/test/CodeGen/AIE/hardware-loops/simple.ll index 19be9728fcea..900a895568cb 100644 --- a/llvm/test/CodeGen/AIE/hardware-loops/simple.ll +++ b/llvm/test/CodeGen/AIE/hardware-loops/simple.ll @@ -16,23 +16,19 @@ define void @simple(ptr nocapture %out, ptr nocapture readonly %in, i32 noundef %size) { ; AIE2-LABEL: simple: ; AIE2: // %bb.0: // %for.body.lr.ph -; AIE2-NEXT: mova r2, #0; nopb ; nopxm ; nops -; AIE2-NEXT: add.nc r0, r0, #-1 -; AIE2-NEXT: mova r3, #2 +; AIE2-NEXT: nopa ; add.nc r0, r0, #-1 ; AIE2-NEXT: movxm p2, #.LBB0_1 ; AIE2-NEXT: lda r1, [p0, #0] ; AIE2-NEXT: 
.LBB0_1: // %for.body ; AIE2-NEXT: // =>This Inner Loop Header: Depth=1 -; AIE2-NEXT: nopb ; nopa ; nops ; lshl r4, r2, r3; nopm ; nopv -; AIE2-NEXT: nopa ; mov dj0, r4 -; AIE2-NEXT: lda r4, [p1, dj0] +; AIE2-NEXT: lda r2, [p1], #4; nopb ; nopxm ; AIE2-NEXT: nop ; AIE2-NEXT: nop ; AIE2-NEXT: jnzd r0, r0, p2 ; AIE2-NEXT: nop // Delay Slot 5 ; AIE2-NEXT: nop // Delay Slot 4 -; AIE2-NEXT: add r2, r2, #1 // Delay Slot 3 -; AIE2-NEXT: add r1, r1, r4 // Delay Slot 2 +; AIE2-NEXT: nop // Delay Slot 3 +; AIE2-NEXT: add r1, r1, r2 // Delay Slot 2 ; AIE2-NEXT: st r1, [p0, #0] // Delay Slot 1 ; AIE2-NEXT: // %bb.2: // %for.cond.cleanup ; AIE2-NEXT: ret lr @@ -44,23 +40,19 @@ define void @simple(ptr nocapture %out, ptr nocapture readonly %in, i32 noundef ; ; AIE2P-LABEL: simple: ; AIE2P: // %bb.0: // %for.body.lr.ph -; AIE2P-NEXT: mova r2, #0; nopb ; nopxm ; nops -; AIE2P-NEXT: add.nc r0, r0, #-1 -; AIE2P-NEXT: mova r3, #2 +; AIE2P-NEXT: nopx ; add.nc r0, r0, #-1 ; AIE2P-NEXT: movxm p2, #.LBB0_1 ; AIE2P-NEXT: lda r1, [p0, #0] ; AIE2P-NEXT: .LBB0_1: // %for.body ; AIE2P-NEXT: // =>This Inner Loop Header: Depth=1 -; AIE2P-NEXT: nopa ; nopb ; nops ; lshl r4, r2, r3; nopm ; nopv -; AIE2P-NEXT: nopx ; mov dj0, r4 -; AIE2P-NEXT: lda r4, [p1, dj0] +; AIE2P-NEXT: lda r2, [p1], #4; nopb ; nopxm ; AIE2P-NEXT: nop ; AIE2P-NEXT: nop ; AIE2P-NEXT: jnzd r0, r0, p2 ; AIE2P-NEXT: nop // Delay Slot 5 ; AIE2P-NEXT: nop // Delay Slot 4 -; AIE2P-NEXT: add r2, r2, #1 // Delay Slot 3 -; AIE2P-NEXT: add r1, r1, r4 // Delay Slot 2 +; AIE2P-NEXT: nop // Delay Slot 3 +; AIE2P-NEXT: add r1, r1, r2 // Delay Slot 2 ; AIE2P-NEXT: st r1, [p0, #0] // Delay Slot 1 ; AIE2P-NEXT: // %bb.2: // %for.cond.cleanup ; AIE2P-NEXT: ret lr @@ -72,23 +64,19 @@ define void @simple(ptr nocapture %out, ptr nocapture readonly %in, i32 noundef ; ; AIE2PS-LABEL: simple: ; AIE2PS: // %bb.0: // %for.body.lr.ph -; AIE2PS-NEXT: mova r4, #0; nopb ; nopxm ; nops -; AIE2PS-NEXT: addm.nc r1, r0, #-1 -; AIE2PS-NEXT: mova r0, #2 +; 
AIE2PS-NEXT: nopx ; addm.nc r1, r0, #-1 ; AIE2PS-NEXT: movxm p2, #.LBB0_1 ; AIE2PS-NEXT: lda r2, [p0, #0] ; AIE2PS-NEXT: .LBB0_1: // %for.body ; AIE2PS-NEXT: // =>This Inner Loop Header: Depth=1 -; AIE2PS-NEXT: nopa ; nopb ; nops ; lshl r6, r4, r0; nopm ; nopv -; AIE2PS-NEXT: nopx ; mov dj0, r6 -; AIE2PS-NEXT: lda r6, [p1, dj0] +; AIE2PS-NEXT: lda r0, [p1], #4; nopb ; nopxm ; AIE2PS-NEXT: nop ; AIE2PS-NEXT: nop ; AIE2PS-NEXT: jnzd r1, r1, p2 ; AIE2PS-NEXT: nop // Delay Slot 5 ; AIE2PS-NEXT: nop // Delay Slot 4 -; AIE2PS-NEXT: add r4, r4, #1 // Delay Slot 3 -; AIE2PS-NEXT: add r2, r2, r6 // Delay Slot 2 +; AIE2PS-NEXT: nop // Delay Slot 3 +; AIE2PS-NEXT: add r2, r2, r0 // Delay Slot 2 ; AIE2PS-NEXT: st r2, [p0, #0] // Delay Slot 1 ; AIE2PS-NEXT: // %bb.2: // %for.cond.cleanup ; AIE2PS-NEXT: ret lr diff --git a/llvm/test/CodeGen/AIE/hardware-loops/unknown-tc.ll b/llvm/test/CodeGen/AIE/hardware-loops/unknown-tc.ll index 7f8c966db823..c37dc3e5e401 100644 --- a/llvm/test/CodeGen/AIE/hardware-loops/unknown-tc.ll +++ b/llvm/test/CodeGen/AIE/hardware-loops/unknown-tc.ll @@ -4,41 +4,71 @@ ; See https://llvm.org/LICENSE.txt for license information. ; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception ; -; (c) Copyright 2023-2025 Advanced Micro Devices, Inc. or its affiliates +; (c) Copyright 2023-2026 Advanced Micro Devices, Inc. 
or its affiliates -; RUN: llc -O2 -mtriple=aie2 --issue-limit=1 %s -o - | FileCheck %s -; RUN: llc -O2 -mtriple=aie2p --issue-limit=1 %s -o - | FileCheck %s +; RUN: llc -O2 -mtriple=aie2 --issue-limit=1 %s -o - | FileCheck %s --check-prefix=AIE2 +; RUN: llc -O2 -mtriple=aie2p --issue-limit=1 %s -o - | FileCheck %s --check-prefix=AIE2P define void @cbz_exit(ptr %in, ptr %res) { -; CHECK-LABEL: cbz_exit: -; CHECK: // %bb.0: // %entry -; CHECK-NEXT: mova r0, #-1; nopb ; nopxm -; CHECK-NEXT: mova r1, #2 -; CHECK-NEXT: .LBB0_1: // %loop -; CHECK-NEXT: // =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: add r0, r0, #1 -; CHECK-NEXT: lshl r2, r0, r1 -; CHECK-NEXT: mov dj0, r2 -; CHECK-NEXT: lda r2, [p0, dj0] -; CHECK-NEXT: nop -; CHECK-NEXT: nop -; CHECK-NEXT: nop -; CHECK-NEXT: nop -; CHECK-NEXT: nop -; CHECK-NEXT: nop -; CHECK-NEXT: jnz r2, #.LBB0_1 -; CHECK-NEXT: nop // Delay Slot 5 -; CHECK-NEXT: nop // Delay Slot 4 -; CHECK-NEXT: nop // Delay Slot 3 -; CHECK-NEXT: nop // Delay Slot 2 -; CHECK-NEXT: nop // Delay Slot 1 -; CHECK-NEXT: // %bb.2: // %exit -; CHECK-NEXT: ret lr -; CHECK-NEXT: nop // Delay Slot 5 -; CHECK-NEXT: nop // Delay Slot 4 -; CHECK-NEXT: nop // Delay Slot 3 -; CHECK-NEXT: st r0, [p1, #0] // Delay Slot 2 -; CHECK-NEXT: nop // Delay Slot 1 +; AIE2-LABEL: cbz_exit: +; AIE2: // %bb.0: // %entry +; AIE2-NEXT: mova r0, #-1; nopb ; nopxm +; AIE2-NEXT: mova r1, #2 +; AIE2-NEXT: .LBB0_1: // %loop +; AIE2-NEXT: // =>This Inner Loop Header: Depth=1 +; AIE2-NEXT: nopa ; nopb ; add r0, r0, #1 +; AIE2-NEXT: lshl r2, r0, r1 +; AIE2-NEXT: mov dj0, r2 +; AIE2-NEXT: lda r2, [p0, dj0] +; AIE2-NEXT: nop +; AIE2-NEXT: nop +; AIE2-NEXT: nop +; AIE2-NEXT: nop +; AIE2-NEXT: nop +; AIE2-NEXT: nop +; AIE2-NEXT: jnz r2, #.LBB0_1 +; AIE2-NEXT: nop // Delay Slot 5 +; AIE2-NEXT: nop // Delay Slot 4 +; AIE2-NEXT: nop // Delay Slot 3 +; AIE2-NEXT: nop // Delay Slot 2 +; AIE2-NEXT: nop // Delay Slot 1 +; AIE2-NEXT: // %bb.2: // %exit +; AIE2-NEXT: ret lr +; AIE2-NEXT: nop // Delay 
Slot 5 +; AIE2-NEXT: nop // Delay Slot 4 +; AIE2-NEXT: nop // Delay Slot 3 +; AIE2-NEXT: st r0, [p1, #0] // Delay Slot 2 +; AIE2-NEXT: nop // Delay Slot 1 +; +; AIE2P-LABEL: cbz_exit: +; AIE2P: // %bb.0: // %entry +; AIE2P-NEXT: mova r0, #-1; nopb ; nopxm +; AIE2P-NEXT: mova r1, #2 +; AIE2P-NEXT: .LBB0_1: // %loop +; AIE2P-NEXT: // =>This Inner Loop Header: Depth=1 +; AIE2P-NEXT: nopa ; nopb ; add r0, r0, #1 +; AIE2P-NEXT: lshl r2, r0, r1 +; AIE2P-NEXT: mov dj0, r2 +; AIE2P-NEXT: lda r2, [p0, dj0] +; AIE2P-NEXT: nop +; AIE2P-NEXT: nop +; AIE2P-NEXT: nop +; AIE2P-NEXT: nop +; AIE2P-NEXT: nop +; AIE2P-NEXT: nop +; AIE2P-NEXT: jnz r2, #.LBB0_1 +; AIE2P-NEXT: nop // Delay Slot 5 +; AIE2P-NEXT: nop // Delay Slot 4 +; AIE2P-NEXT: nop // Delay Slot 3 +; AIE2P-NEXT: nop // Delay Slot 2 +; AIE2P-NEXT: nop // Delay Slot 1 +; AIE2P-NEXT: // %bb.2: // %exit +; AIE2P-NEXT: ret lr +; AIE2P-NEXT: nop // Delay Slot 5 +; AIE2P-NEXT: nop // Delay Slot 4 +; AIE2P-NEXT: nop // Delay Slot 3 +; AIE2P-NEXT: st r0, [p1, #0] // Delay Slot 2 +; AIE2P-NEXT: nop // Delay Slot 1 entry: br label %loop @@ -56,35 +86,65 @@ exit: } define void @cbnz_exit(ptr %in, ptr %res) { -; CHECK-LABEL: cbnz_exit: -; CHECK: // %bb.0: // %entry -; CHECK-NEXT: mova r0, #-1; nopb ; nopxm -; CHECK-NEXT: mova r1, #2 -; CHECK-NEXT: .LBB1_1: // %loop -; CHECK-NEXT: // =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: add r0, r0, #1 -; CHECK-NEXT: lshl r2, r0, r1 -; CHECK-NEXT: mov dj0, r2 -; CHECK-NEXT: lda r2, [p0, dj0] -; CHECK-NEXT: nop -; CHECK-NEXT: nop -; CHECK-NEXT: nop -; CHECK-NEXT: nop -; CHECK-NEXT: nop -; CHECK-NEXT: nop -; CHECK-NEXT: jz r2, #.LBB1_1 -; CHECK-NEXT: nop // Delay Slot 5 -; CHECK-NEXT: nop // Delay Slot 4 -; CHECK-NEXT: nop // Delay Slot 3 -; CHECK-NEXT: nop // Delay Slot 2 -; CHECK-NEXT: nop // Delay Slot 1 -; CHECK-NEXT: // %bb.2: // %exit -; CHECK-NEXT: ret lr -; CHECK-NEXT: nop // Delay Slot 5 -; CHECK-NEXT: nop // Delay Slot 4 -; CHECK-NEXT: nop // Delay Slot 3 -; CHECK-NEXT: st r0, 
[p1, #0] // Delay Slot 2 -; CHECK-NEXT: nop // Delay Slot 1 +; AIE2-LABEL: cbnz_exit: +; AIE2: // %bb.0: // %entry +; AIE2-NEXT: mova r0, #-1; nopb ; nopxm +; AIE2-NEXT: mova r1, #2 +; AIE2-NEXT: .LBB1_1: // %loop +; AIE2-NEXT: // =>This Inner Loop Header: Depth=1 +; AIE2-NEXT: nopa ; nopb ; add r0, r0, #1 +; AIE2-NEXT: lshl r2, r0, r1 +; AIE2-NEXT: mov dj0, r2 +; AIE2-NEXT: lda r2, [p0, dj0] +; AIE2-NEXT: nop +; AIE2-NEXT: nop +; AIE2-NEXT: nop +; AIE2-NEXT: nop +; AIE2-NEXT: nop +; AIE2-NEXT: nop +; AIE2-NEXT: jz r2, #.LBB1_1 +; AIE2-NEXT: nop // Delay Slot 5 +; AIE2-NEXT: nop // Delay Slot 4 +; AIE2-NEXT: nop // Delay Slot 3 +; AIE2-NEXT: nop // Delay Slot 2 +; AIE2-NEXT: nop // Delay Slot 1 +; AIE2-NEXT: // %bb.2: // %exit +; AIE2-NEXT: ret lr +; AIE2-NEXT: nop // Delay Slot 5 +; AIE2-NEXT: nop // Delay Slot 4 +; AIE2-NEXT: nop // Delay Slot 3 +; AIE2-NEXT: st r0, [p1, #0] // Delay Slot 2 +; AIE2-NEXT: nop // Delay Slot 1 +; +; AIE2P-LABEL: cbnz_exit: +; AIE2P: // %bb.0: // %entry +; AIE2P-NEXT: mova r0, #-1; nopb ; nopxm +; AIE2P-NEXT: mova r1, #2 +; AIE2P-NEXT: .LBB1_1: // %loop +; AIE2P-NEXT: // =>This Inner Loop Header: Depth=1 +; AIE2P-NEXT: nopa ; nopb ; add r0, r0, #1 +; AIE2P-NEXT: lshl r2, r0, r1 +; AIE2P-NEXT: mov dj0, r2 +; AIE2P-NEXT: lda r2, [p0, dj0] +; AIE2P-NEXT: nop +; AIE2P-NEXT: nop +; AIE2P-NEXT: nop +; AIE2P-NEXT: nop +; AIE2P-NEXT: nop +; AIE2P-NEXT: nop +; AIE2P-NEXT: jz r2, #.LBB1_1 +; AIE2P-NEXT: nop // Delay Slot 5 +; AIE2P-NEXT: nop // Delay Slot 4 +; AIE2P-NEXT: nop // Delay Slot 3 +; AIE2P-NEXT: nop // Delay Slot 2 +; AIE2P-NEXT: nop // Delay Slot 1 +; AIE2P-NEXT: // %bb.2: // %exit +; AIE2P-NEXT: ret lr +; AIE2P-NEXT: nop // Delay Slot 5 +; AIE2P-NEXT: nop // Delay Slot 4 +; AIE2P-NEXT: nop // Delay Slot 3 +; AIE2P-NEXT: st r0, [p1, #0] // Delay Slot 2 +; AIE2P-NEXT: nop // Delay Slot 1 entry: br label %loop diff --git a/llvm/test/CodeGen/AIE/hardware-loops/zol-loop.ll b/llvm/test/CodeGen/AIE/hardware-loops/zol-loop.ll index 
7c90718c95ac..6fe05c87c3ee 100644 --- a/llvm/test/CodeGen/AIE/hardware-loops/zol-loop.ll +++ b/llvm/test/CodeGen/AIE/hardware-loops/zol-loop.ll @@ -16,9 +16,9 @@ define void @simple_loop(i32 noundef %n, ptr nocapture readonly %in, ptr nocapture writeonly %out) { ; AIE2-LABEL: simple_loop: ; AIE2: // %bb.0: // %entry -; AIE2-NEXT: mova r1, #0 -; AIE2-NEXT: ge r2, r1, r0 -; AIE2-NEXT: jnz r2, #.LBB0_3 +; AIE2-NEXT: mova r1, #0; nopb ; nopx +; AIE2-NEXT: ge r1, r1, r0 +; AIE2-NEXT: jnz r1, #.LBB0_3 ; AIE2-NEXT: nop // Delay Slot 5 ; AIE2-NEXT: nop // Delay Slot 4 ; AIE2-NEXT: nop // Delay Slot 3 @@ -26,20 +26,20 @@ define void @simple_loop(i32 noundef %n, ptr nocapture readonly %in, ptr nocaptu ; AIE2-NEXT: nop // Delay Slot 1 ; AIE2-NEXT: // %bb.1: // %for.body.preheader ; AIE2-NEXT: add.nc lc, r0, #0 -; AIE2-NEXT: mova r2, #1; movxm ls, #.LBB0_2 -; AIE2-NEXT: mova r0, #2; movxm le, #.L_LEnd0 +; AIE2-NEXT: movxm ls, #.LBB0_2 +; AIE2-NEXT: mova r1, #1; movxm le, #.L_LEnd0 ; AIE2-NEXT: .LBB0_2: // %for.body ; AIE2-NEXT: // =>This Inner Loop Header: Depth=1 -; AIE2-NEXT: nopb ; lda r3, [p0, #0]; nops ; nopxm ; nopv +; AIE2-NEXT: nopb ; lda r0, [p0, #0]; nops ; nopxm ; nopv ; AIE2-NEXT: nopb ; nopa ; nops ; nopxm ; nopv ; AIE2-NEXT: nopb ; nopa ; nops ; nopxm ; nopv ; AIE2-NEXT: nopb ; nopa ; nops ; nopxm ; nopv ; AIE2-NEXT: nopb ; nopa ; nops ; nopxm ; nopv -; AIE2-NEXT: nopb ; nopa ; nops ; lshl r4, r1, r0; nopm ; nopv -; AIE2-NEXT: nopa ; nopb ; add r1, r1, #1 -; AIE2-NEXT: add r3, r2, r3; mov dj0, r4 +; AIE2-NEXT: nopb ; nopa ; nops ; nopxm ; nopv +; AIE2-NEXT: nopa ; nopb ; nopxm +; AIE2-NEXT: add r0, r1, r0 ; AIE2-NEXT: .L_LEnd0: -; AIE2-NEXT: nopb ; nopa ; st r3, [p1, dj0]; add r2, r2, #-1; nopm ; nopv +; AIE2-NEXT: nopb ; nopa ; st r0, [p1], #4; add r1, r1, #-1; nopm ; nopv ; AIE2-NEXT: .LBB0_3: // %for.cond.cleanup ; AIE2-NEXT: nopa ; ret lr ; AIE2-NEXT: nop // Delay Slot 5 @@ -50,9 +50,9 @@ define void @simple_loop(i32 noundef %n, ptr nocapture readonly %in, 
ptr nocaptu ; ; AIE2P-LABEL: simple_loop: ; AIE2P: // %bb.0: // %entry -; AIE2P-NEXT: mova r1, #0 -; AIE2P-NEXT: ge r2, r1, r0 -; AIE2P-NEXT: jnz r2, #.LBB0_3 +; AIE2P-NEXT: mova r1, #0; nopb ; nopx +; AIE2P-NEXT: ge r1, r1, r0 +; AIE2P-NEXT: jnz r1, #.LBB0_3 ; AIE2P-NEXT: nop // Delay Slot 5 ; AIE2P-NEXT: nop // Delay Slot 4 ; AIE2P-NEXT: nop // Delay Slot 3 @@ -60,20 +60,20 @@ define void @simple_loop(i32 noundef %n, ptr nocapture readonly %in, ptr nocaptu ; AIE2P-NEXT: nop // Delay Slot 1 ; AIE2P-NEXT: // %bb.1: // %for.body.preheader ; AIE2P-NEXT: add.nc lc, r0, #0 -; AIE2P-NEXT: mova r2, #1; movxm ls, #.LBB0_2 -; AIE2P-NEXT: mova r0, #2; movxm le, #.L_LEnd0 +; AIE2P-NEXT: movxm ls, #.LBB0_2 +; AIE2P-NEXT: mova r1, #1; movxm le, #.L_LEnd0 ; AIE2P-NEXT: .LBB0_2: // %for.body ; AIE2P-NEXT: // =>This Inner Loop Header: Depth=1 -; AIE2P-NEXT: lda r3, [p0, #0]; nopb ; nops ; nopxm ; nopv +; AIE2P-NEXT: lda r0, [p0, #0]; nopb ; nops ; nopxm ; nopv +; AIE2P-NEXT: nopa ; nopb ; nops ; nopxm ; nopv ; AIE2P-NEXT: nopa ; nopb ; nops ; nopxm ; nopv ; AIE2P-NEXT: nopa ; nopb ; nops ; nopxm ; nopv ; AIE2P-NEXT: nopa ; nopb ; nops ; nopxm ; nopv ; AIE2P-NEXT: nopa ; nopb ; nops ; nopxm ; nopv -; AIE2P-NEXT: nopa ; nopb ; nops ; lshl r4, r1, r0; nopm ; nopv -; AIE2P-NEXT: nopa ; add r1, r1, #1; nopm -; AIE2P-NEXT: add r3, r2, r3; mov dj0, r4 +; AIE2P-NEXT: nopa ; nopb ; nopxm +; AIE2P-NEXT: add r0, r1, r0 ; AIE2P-NEXT: .L_LEnd0: -; AIE2P-NEXT: nopa ; nopb ; st r3, [p1, dj0]; add r2, r2, #-1; nopm ; nopv +; AIE2P-NEXT: nopa ; nopb ; st r0, [p1], #4; add r1, r1, #-1; nopm ; nopv ; AIE2P-NEXT: .LBB0_3: // %for.cond.cleanup ; AIE2P-NEXT: nopa ; ret lr ; AIE2P-NEXT: nop // Delay Slot 5 @@ -84,9 +84,9 @@ define void @simple_loop(i32 noundef %n, ptr nocapture readonly %in, ptr nocaptu ; ; AIE2PS-LABEL: simple_loop: ; AIE2PS: // %bb.0: // %entry -; AIE2PS-NEXT: mova r2, #0 -; AIE2PS-NEXT: ge r4, r2, r0 -; AIE2PS-NEXT: jnz r4, #.LBB0_3 +; AIE2PS-NEXT: mova r2, #0; nopb ; nopx +; 
AIE2PS-NEXT: ge r2, r2, r0 +; AIE2PS-NEXT: jnz r2, #.LBB0_3 ; AIE2PS-NEXT: nop // Delay Slot 5 ; AIE2PS-NEXT: nop // Delay Slot 4 ; AIE2PS-NEXT: nop // Delay Slot 3 @@ -94,20 +94,20 @@ define void @simple_loop(i32 noundef %n, ptr nocapture readonly %in, ptr nocaptu ; AIE2PS-NEXT: nop // Delay Slot 1 ; AIE2PS-NEXT: // %bb.1: // %for.body.preheader ; AIE2PS-NEXT: add.nc lc, r0, #0 -; AIE2PS-NEXT: mova r4, #1; movxm ls, #.LBB0_2 -; AIE2PS-NEXT: mova r0, #2; movxm le, #.L_LEnd0 +; AIE2PS-NEXT: movxm ls, #.LBB0_2 +; AIE2PS-NEXT: mova r2, #1; movxm le, #.L_LEnd0 ; AIE2PS-NEXT: .LBB0_2: // %for.body ; AIE2PS-NEXT: // =>This Inner Loop Header: Depth=1 -; AIE2PS-NEXT: lda r6, [p0, #0]; nopb ; nops ; nopxm ; nopv +; AIE2PS-NEXT: lda r0, [p0, #0]; nopb ; nops ; nopxm ; nopv +; AIE2PS-NEXT: nopa ; nopb ; nops ; nopxm ; nopv ; AIE2PS-NEXT: nopa ; nopb ; nops ; nopxm ; nopv ; AIE2PS-NEXT: nopa ; nopb ; nops ; nopxm ; nopv ; AIE2PS-NEXT: nopa ; nopb ; nops ; nopxm ; nopv ; AIE2PS-NEXT: nopa ; nopb ; nops ; nopxm ; nopv -; AIE2PS-NEXT: nopa ; nopb ; nops ; lshl r16, r2, r0; nopm ; nopv -; AIE2PS-NEXT: nopa ; add r2, r2, #1; nopm -; AIE2PS-NEXT: add r6, r4, r6; mov dj0, r16 +; AIE2PS-NEXT: nopa ; nopb ; nopxm +; AIE2PS-NEXT: add r0, r2, r0 ; AIE2PS-NEXT: .L_LEnd0: -; AIE2PS-NEXT: nopa ; nopb ; st r6, [p1, dj0]; add r4, r4, #-1; nopm ; nopv +; AIE2PS-NEXT: nopa ; nopb ; st r0, [p1], #4; add r2, r2, #-1; nopm ; nopv ; AIE2PS-NEXT: .LBB0_3: // %for.cond.cleanup ; AIE2PS-NEXT: nopa ; ret lr ; AIE2PS-NEXT: nop // Delay Slot 5 diff --git a/llvm/test/CodeGen/AIE/opt/lsr-i20-scalar-recurrence.ll b/llvm/test/CodeGen/AIE/opt/lsr-i20-scalar-recurrence.ll new file mode 100644 index 000000000000..2c63728e3767 --- /dev/null +++ b/llvm/test/CodeGen/AIE/opt/lsr-i20-scalar-recurrence.ll @@ -0,0 +1,136 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5 +; +; This file is licensed under the Apache License v2.0 with LLVM Exceptions. 
+; See https://llvm.org/LICENSE.txt for license information. +; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +; +; (c) Copyright 2026 Advanced Micro Devices, Inc. or its affiliates + +; RUN: opt -mtriple=aie2p -passes='print<iv-users>' -disable-output %s 2>&1 | FileCheck %s --check-prefix=IVUSERS +; RUN: opt -mtriple=aie2p -passes=loop-reduce -S %s | FileCheck %s --check-prefix=LSR + +; This test verifies that LSR looks through truncs to collect GEP results as +; IV users on AIE. The pattern is derived from a post_process kernel where: +; - Array indices are computed as trunc(4 * i + offset) to i20 +; - GEPs use these i20 indices with large element types (<32 x float>) +; +; With the IVUsers fix to look through truncs, LSR now: +; 1. Collects GEP results (not trunc results) as IV users +; 2. Gets pointer-typed SCEVs like {%src,+,512} instead of i20 {0,+,4} +; 3. Creates pointer PHIs with byte-indexed GEPs for post-increment + +target datalayout = "e-m:e-p:20:32-i1:8:32-i8:8:32-i16:16:32-i32:32:32-f32:32:32-i64:32-f64:32-a:0:32-n32" +target triple = "aie2p" + +; Check that GEP results (pointers) are collected as IV users +; IVUSERS: IV Users for loop %for.body with backedge-taken count +; IVUSERS: %ptr0 = {%src,+,512}<%for.body> + +; LSR should create pointer PHIs with byte strides + +define void @post_process_pattern(ptr nocapture %src, i32 noundef %len) { +; LSR-LABEL: define void @post_process_pattern( +; LSR-SAME: ptr nocapture [[SRC:%.*]], i32 noundef [[LEN:%.*]]) { +; LSR-NEXT: [[ENTRY:.*]]: +; LSR-NEXT: [[DIV:%.*]] = lshr i32 [[LEN]], 7 +; LSR-NEXT: [[CMP:%.*]] = icmp sgt i32 [[LEN]], 511 +; LSR-NEXT: tail call void @llvm.assume(i1 [[CMP]]) +; LSR-NEXT: [[SCEVGEP:%.*]] = getelementptr i8, ptr [[SRC]], i20 128 +; LSR-NEXT: [[SCEVGEP6:%.*]] = getelementptr i8, ptr [[SRC]], i20 256 +; LSR-NEXT: br label %[[FOR_BODY:.*]] +; LSR: [[FOR_COND_CLEANUP:.*]]: +; LSR-NEXT: ret void +; LSR: [[FOR_BODY]]: +; LSR-NEXT: [[LSR_IV7:%.*]] = phi ptr [ [[SCEVGEP8:%.*]], 
%[[FOR_BODY]] ], [ [[SCEVGEP6]], %[[ENTRY]] ] +; LSR-NEXT: [[LSR_IV1:%.*]] = phi ptr [ [[SCEVGEP2:%.*]], %[[FOR_BODY]] ], [ [[SCEVGEP]], %[[ENTRY]] ] +; LSR-NEXT: [[LSR_IV:%.*]] = phi i32 [ [[LSR_IV_NEXT:%.*]], %[[FOR_BODY]] ], [ [[DIV]], %[[ENTRY]] ] +; LSR-NEXT: [[SCEVGEP10:%.*]] = getelementptr i8, ptr [[LSR_IV7]], i20 -256 +; LSR-NEXT: [[V0:%.*]] = load <32 x float>, ptr [[SCEVGEP10]], align 64 +; LSR-NEXT: [[R0:%.*]] = tail call <32 x bfloat> @llvm.aie2p.v32accfloat.to.v32bf16(<32 x float> [[V0]]) +; LSR-NEXT: [[SCEVGEP3:%.*]] = getelementptr i8, ptr [[LSR_IV1]], i20 -128 +; LSR-NEXT: store <32 x bfloat> [[R0]], ptr [[SCEVGEP3]], align 64 +; LSR-NEXT: [[SCEVGEP11:%.*]] = getelementptr i8, ptr [[LSR_IV7]], i20 -128 +; LSR-NEXT: [[V1:%.*]] = load <32 x float>, ptr [[SCEVGEP11]], align 64 +; LSR-NEXT: [[R1:%.*]] = tail call <32 x bfloat> @llvm.aie2p.v32accfloat.to.v32bf16(<32 x float> [[V1]]) +; LSR-NEXT: [[SCEVGEP5:%.*]] = getelementptr i8, ptr [[LSR_IV1]], i20 -64 +; LSR-NEXT: store <32 x bfloat> [[R1]], ptr [[SCEVGEP5]], align 64 +; LSR-NEXT: [[V2:%.*]] = load <32 x float>, ptr [[LSR_IV7]], align 64 +; LSR-NEXT: [[R2:%.*]] = tail call <32 x bfloat> @llvm.aie2p.v32accfloat.to.v32bf16(<32 x float> [[V2]]) +; LSR-NEXT: store <32 x bfloat> [[R2]], ptr [[LSR_IV1]], align 64 +; LSR-NEXT: [[SCEVGEP9:%.*]] = getelementptr i8, ptr [[LSR_IV7]], i20 128 +; LSR-NEXT: [[V3:%.*]] = load <32 x float>, ptr [[SCEVGEP9]], align 64 +; LSR-NEXT: [[R3:%.*]] = tail call <32 x bfloat> @llvm.aie2p.v32accfloat.to.v32bf16(<32 x float> [[V3]]) +; LSR-NEXT: [[SCEVGEP4:%.*]] = getelementptr i8, ptr [[LSR_IV1]], i20 64 +; LSR-NEXT: store <32 x bfloat> [[R3]], ptr [[SCEVGEP4]], align 64 +; LSR-NEXT: [[LSR_IV_NEXT]] = add nsw i32 [[LSR_IV]], -1 +; LSR-NEXT: [[SCEVGEP2]] = getelementptr i8, ptr [[LSR_IV1]], i20 256 +; LSR-NEXT: [[SCEVGEP8]] = getelementptr i8, ptr [[LSR_IV7]], i20 512 +; LSR-NEXT: [[EXITCOND:%.*]] = icmp eq i32 [[LSR_IV_NEXT]], 0 +; LSR-NEXT: br i1 [[EXITCOND]], label 
%[[FOR_COND_CLEANUP]], label %[[FOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] +; +entry: + %div = lshr i32 %len, 7 + %cmp = icmp sgt i32 %len, 511 + tail call void @llvm.assume(i1 %cmp) + br label %for.body + +for.cond.cleanup: + ret void + +for.body: + %i = phi i32 [ 0, %entry ], [ %inc, %for.body ] + + ; Compute base index: 4 * i + %mul = shl nsw i32 %i, 2 + %idx0 = trunc i32 %mul to i20 + + ; Load at offset 0 + %ptr0 = getelementptr inbounds <32 x float>, ptr %src, i20 %idx0 + %v0 = load <32 x float>, ptr %ptr0, align 64 + %r0 = tail call <32 x bfloat> @llvm.aie2p.v32accfloat.to.v32bf16(<32 x float> %v0) + %dst0 = getelementptr inbounds <32 x bfloat>, ptr %src, i20 %idx0 + store <32 x bfloat> %r0, ptr %dst0, align 64 + + ; Load at offset 1 + %add1 = or disjoint i32 %mul, 1 + %idx1 = trunc i32 %add1 to i20 + %ptr1 = getelementptr inbounds <32 x float>, ptr %src, i20 %idx1 + %v1 = load <32 x float>, ptr %ptr1, align 64 + %r1 = tail call <32 x bfloat> @llvm.aie2p.v32accfloat.to.v32bf16(<32 x float> %v1) + %dst1 = getelementptr inbounds <32 x bfloat>, ptr %src, i20 %idx1 + store <32 x bfloat> %r1, ptr %dst1, align 64 + + ; Load at offset 2 + %add2 = or disjoint i32 %mul, 2 + %idx2 = trunc i32 %add2 to i20 + %ptr2 = getelementptr inbounds <32 x float>, ptr %src, i20 %idx2 + %v2 = load <32 x float>, ptr %ptr2, align 64 + %r2 = tail call <32 x bfloat> @llvm.aie2p.v32accfloat.to.v32bf16(<32 x float> %v2) + %dst2 = getelementptr inbounds <32 x bfloat>, ptr %src, i20 %idx2 + store <32 x bfloat> %r2, ptr %dst2, align 64 + + ; Load at offset 3 + %add3 = or disjoint i32 %mul, 3 + %idx3 = trunc i32 %add3 to i20 + %ptr3 = getelementptr inbounds <32 x float>, ptr %src, i20 %idx3 + %v3 = load <32 x float>, ptr %ptr3, align 64 + %r3 = tail call <32 x bfloat> @llvm.aie2p.v32accfloat.to.v32bf16(<32 x float> %v3) + %dst3 = getelementptr inbounds <32 x bfloat>, ptr %src, i20 %idx3 + store <32 x bfloat> %r3, ptr %dst3, align 64 + + %inc = add nuw nsw i32 %i, 1 + %exitcond = icmp eq i32 
%inc, %div + br i1 %exitcond, label %for.cond.cleanup, label %for.body, !llvm.loop !0 +} + +declare <32 x bfloat> @llvm.aie2p.v32accfloat.to.v32bf16(<32 x float>) +declare void @llvm.assume(i1 noundef) + +!0 = distinct !{!0, !1} +!1 = !{!"llvm.loop.itercount.range", i64 4} +;. +; LSR: [[LOOP0]] = distinct !{[[LOOP0]], [[META1:![0-9]+]]} +; LSR: [[META1]] = !{!"llvm.loop.itercount.range", i64 4} +;. +;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line: +; IVUSERS: {{.*}} diff --git a/llvm/test/CodeGen/AIE/opt/lsr-nested-loop-non-dominating.ll b/llvm/test/CodeGen/AIE/opt/lsr-nested-loop-non-dominating.ll new file mode 100644 index 000000000000..6fef1abde0b9 --- /dev/null +++ b/llvm/test/CodeGen/AIE/opt/lsr-nested-loop-non-dominating.ll @@ -0,0 +1,193 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5 +; +; This file is licensed under the Apache License v2.0 with LLVM Exceptions. +; See https://llvm.org/LICENSE.txt for license information. +; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +; +; (c) Copyright 2026 Advanced Micro Devices, Inc. or its affiliates + +; RUN: opt -mtriple=aie2p -passes=loop-reduce -S %s | FileCheck %s + +; This test demonstrates a regression pattern where LSR creates pointer PHIs +; in the outer loop header for GEPs that are only used in the inner loop. +; +; Problem: The inner loop GEPs don't dominate the outer loop latch, but LSR +; creates pointer recurrences for them anyway. This causes: +; 1. Multiple pointer PHIs in outer loop header +; 2. Expensive padda/paddb/padds updates in outer loop latch +; 3. Unconditional pointer updates even when inner loop wasn't entered +; +; Expected: LSR should use scalar index recurrence + indexed addressing, +; keeping pointer computation where it's actually used. 
+; +; Reference (good): vldb x8, [p3, dj1] with add r27, r27, r24 in latch +; Regressed (bad): vldb x8, [p6], #64 with padda [p3], m1; padda [p7], m1 in latch + +target datalayout = "e-m:e-p:20:32-i1:8:32-i8:8:32-i16:16:32-i32:32:32-f32:32:32-i64:32-f64:32-a:0:32-n32" +target triple = "aie2p" + +; Test: Nested loop where inner loop GEPs should NOT create outer loop pointer PHIs +; +; The inner loop should use scalar i20 recurrence + indexed addressing, +; not pointer recurrence which would require expensive updates in outer latch. FIXME: the autogenerated CHECK lines below still record a pointer PHI in the outer header and a pointer update in the outer latch, i.e. the form described above as regressed. +; +define void @nested_loop_non_dominating(ptr %base, i32 %outer_n, i32 %inner_n, i32 %stride) { +; CHECK-LABEL: define void @nested_loop_non_dominating( +; CHECK-SAME: ptr [[BASE:%.*]], i32 [[OUTER_N:%.*]], i32 [[INNER_N:%.*]], i32 [[STRIDE:%.*]]) { +; CHECK-NEXT: [[ENTRY:.*]]: +; CHECK-NEXT: [[STRIDE20:%.*]] = trunc i32 [[STRIDE]] to i20 +; CHECK-NEXT: [[TMP0:%.*]] = trunc i32 [[STRIDE]] to i20 +; CHECK-NEXT: br label %[[OUTER_HEADER:.*]] +; CHECK: [[OUTER_HEADER]]: +; CHECK-NEXT: [[LSR_IV:%.*]] = phi ptr [ [[SCEVGEP:%.*]], %[[OUTER_LATCH:.*]] ], [ [[BASE]], %[[ENTRY]] ] +; CHECK-NEXT: [[OUTER_I:%.*]] = phi i32 [ 0, %[[ENTRY]] ], [ [[OUTER_NEXT:%.*]], %[[OUTER_LATCH]] ] +; CHECK-NEXT: [[OUTER_CMP:%.*]] = icmp slt i32 [[OUTER_I]], [[OUTER_N]] +; CHECK-NEXT: br i1 [[OUTER_CMP]], label %[[INNER_PREHEADER:.*]], label %[[EXIT:.*]] +; CHECK: [[INNER_PREHEADER]]: +; CHECK-NEXT: br label %[[INNER_HEADER:.*]] +; CHECK: [[INNER_HEADER]]: +; CHECK-NEXT: [[INNER_PTR:%.*]] = phi ptr [ [[SCEVGEP2:%.*]], %[[INNER_HEADER]] ], [ [[LSR_IV]], %[[INNER_PREHEADER]] ] +; CHECK-NEXT: [[INNER_I:%.*]] = phi i32 [ 0, %[[INNER_PREHEADER]] ], [ [[INNER_NEXT:%.*]], %[[INNER_HEADER]] ] +; CHECK-NEXT: [[VAL:%.*]] = load <32 x i16>, ptr [[INNER_PTR]], align 64 +; CHECK-NEXT: call void @consume(<32 x i16> [[VAL]]) +; CHECK-NEXT: [[INNER_NEXT]] = add i32 [[INNER_I]], 1 +; CHECK-NEXT: [[SCEVGEP2]] = getelementptr i8, ptr [[INNER_PTR]], i20 64 +; CHECK-NEXT: 
[[INNER_CMP:%.*]] = icmp slt i32 [[INNER_NEXT]], [[INNER_N]] +; CHECK-NEXT: br i1 [[INNER_CMP]], label %[[INNER_HEADER]], label %[[OUTER_LATCH]] +; CHECK: [[OUTER_LATCH]]: +; CHECK-NEXT: [[OUTER_NEXT]] = add i32 [[OUTER_I]], 1 +; CHECK-NEXT: [[SCEVGEP]] = getelementptr i8, ptr [[LSR_IV]], i20 [[TMP0]] +; CHECK-NEXT: br label %[[OUTER_HEADER]] +; CHECK: [[EXIT]]: +; CHECK-NEXT: ret void +; +entry: + %stride20 = trunc i32 %stride to i20 + br label %outer_header + +outer_header: + %outer_i = phi i32 [ 0, %entry ], [ %outer_next, %outer_latch ] + %outer_cmp = icmp slt i32 %outer_i, %outer_n + br i1 %outer_cmp, label %inner_preheader, label %exit + +inner_preheader: + ; Compute base offset for this outer iteration + %outer_offset_32 = mul i32 %outer_i, %stride + %outer_offset = trunc i32 %outer_offset_32 to i20 + %outer_ptr = getelementptr i8, ptr %base, i20 %outer_offset + br label %inner_header + +inner_header: + %inner_i = phi i32 [ 0, %inner_preheader ], [ %inner_next, %inner_header ] + ; This GEP is inside inner loop - should NOT create pointer PHI in outer_header + %inner_offset_32 = mul i32 %inner_i, 64 + %inner_offset = trunc i32 %inner_offset_32 to i20 + %inner_ptr = getelementptr i8, ptr %outer_ptr, i20 %inner_offset + + %val = load <32 x i16>, ptr %inner_ptr, align 64 + call void @consume(<32 x i16> %val) + + %inner_next = add i32 %inner_i, 1 + %inner_cmp = icmp slt i32 %inner_next, %inner_n + br i1 %inner_cmp, label %inner_header, label %outer_latch + +outer_latch: + ; Only scalar add should happen here, NOT pointer updates + %outer_next = add i32 %outer_i, 1 + br label %outer_header + +exit: + ret void +} + +; Test: Multiple arrays in nested loop - even worse regression +; Each array creates its own pointer PHI, multiplying outer latch cost +; +; Outer loop should have scalar i20 PHI, not pointer PHIs +; Inner loop should use scalar i20 recurrence +; Outer latch should have scalar add, not multiple pointer scevgeps +define void 
@nested_loop_multiple_arrays(ptr %a, ptr %b, ptr %c, i32 %outer_n, i32 %inner_n, i32 %stride) { +; CHECK-LABEL: define void @nested_loop_multiple_arrays( +; CHECK-SAME: ptr [[A:%.*]], ptr [[B:%.*]], ptr [[C:%.*]], i32 [[OUTER_N:%.*]], i32 [[INNER_N:%.*]], i32 [[STRIDE:%.*]]) { +; CHECK-NEXT: [[ENTRY:.*]]: +; CHECK-NEXT: [[STRIDE20:%.*]] = trunc i32 [[STRIDE]] to i20 +; CHECK-NEXT: [[TMP0:%.*]] = trunc i32 [[STRIDE]] to i20 +; CHECK-NEXT: br label %[[OUTER_HEADER:.*]] +; CHECK: [[OUTER_HEADER]]: +; CHECK-NEXT: [[LSR_IV7:%.*]] = phi ptr [ [[SCEVGEP8:%.*]], %[[OUTER_LATCH:.*]] ], [ [[A]], %[[ENTRY]] ] +; CHECK-NEXT: [[LSR_IV3:%.*]] = phi ptr [ [[SCEVGEP4:%.*]], %[[OUTER_LATCH]] ], [ [[B]], %[[ENTRY]] ] +; CHECK-NEXT: [[LSR_IV:%.*]] = phi ptr [ [[SCEVGEP:%.*]], %[[OUTER_LATCH]] ], [ [[C]], %[[ENTRY]] ] +; CHECK-NEXT: [[OUTER_I:%.*]] = phi i32 [ 0, %[[ENTRY]] ], [ [[OUTER_NEXT:%.*]], %[[OUTER_LATCH]] ] +; CHECK-NEXT: [[OUTER_CMP:%.*]] = icmp slt i32 [[OUTER_I]], [[OUTER_N]] +; CHECK-NEXT: br i1 [[OUTER_CMP]], label %[[INNER_PREHEADER:.*]], label %[[EXIT:.*]] +; CHECK: [[INNER_PREHEADER]]: +; CHECK-NEXT: br label %[[INNER_HEADER:.*]] +; CHECK: [[INNER_HEADER]]: +; CHECK-NEXT: [[INNER_A:%.*]] = phi ptr [ [[SCEVGEP10:%.*]], %[[INNER_HEADER]] ], [ [[LSR_IV7]], %[[INNER_PREHEADER]] ] +; CHECK-NEXT: [[INNER_B:%.*]] = phi ptr [ [[SCEVGEP6:%.*]], %[[INNER_HEADER]] ], [ [[LSR_IV3]], %[[INNER_PREHEADER]] ] +; CHECK-NEXT: [[INNER_C:%.*]] = phi ptr [ [[SCEVGEP2:%.*]], %[[INNER_HEADER]] ], [ [[LSR_IV]], %[[INNER_PREHEADER]] ] +; CHECK-NEXT: [[INNER_I:%.*]] = phi i32 [ 0, %[[INNER_PREHEADER]] ], [ [[INNER_NEXT:%.*]], %[[INNER_HEADER]] ] +; CHECK-NEXT: [[VAL_A:%.*]] = load <32 x i16>, ptr [[INNER_A]], align 64 +; CHECK-NEXT: [[VAL_B:%.*]] = load <32 x i16>, ptr [[INNER_B]], align 64 +; CHECK-NEXT: [[SUM:%.*]] = add <32 x i16> [[VAL_A]], [[VAL_B]] +; CHECK-NEXT: store <32 x i16> [[SUM]], ptr [[INNER_C]], align 64 +; CHECK-NEXT: [[INNER_NEXT]] = add i32 [[INNER_I]], 1 +; CHECK-NEXT: 
[[SCEVGEP2]] = getelementptr i8, ptr [[INNER_C]], i20 64 +; CHECK-NEXT: [[SCEVGEP6]] = getelementptr i8, ptr [[INNER_B]], i20 64 +; CHECK-NEXT: [[SCEVGEP10]] = getelementptr i8, ptr [[INNER_A]], i20 64 +; CHECK-NEXT: [[INNER_CMP:%.*]] = icmp slt i32 [[INNER_NEXT]], [[INNER_N]] +; CHECK-NEXT: br i1 [[INNER_CMP]], label %[[INNER_HEADER]], label %[[OUTER_LATCH]] +; CHECK: [[OUTER_LATCH]]: +; CHECK-NEXT: [[OUTER_NEXT]] = add i32 [[OUTER_I]], 1 +; CHECK-NEXT: [[SCEVGEP]] = getelementptr i8, ptr [[LSR_IV]], i20 [[TMP0]] +; CHECK-NEXT: [[SCEVGEP4]] = getelementptr i8, ptr [[LSR_IV3]], i20 [[TMP0]] +; CHECK-NEXT: [[SCEVGEP8]] = getelementptr i8, ptr [[LSR_IV7]], i20 [[TMP0]] +; CHECK-NEXT: br label %[[OUTER_HEADER]] +; CHECK: [[EXIT]]: +; CHECK-NEXT: ret void +; +entry: + %stride20 = trunc i32 %stride to i20 + br label %outer_header + +outer_header: + %outer_i = phi i32 [ 0, %entry ], [ %outer_next, %outer_latch ] + %outer_cmp = icmp slt i32 %outer_i, %outer_n + br i1 %outer_cmp, label %inner_preheader, label %exit + +inner_preheader: + %outer_offset_32 = mul i32 %outer_i, %stride + %outer_offset = trunc i32 %outer_offset_32 to i20 + %ptr_a = getelementptr i8, ptr %a, i20 %outer_offset + %ptr_b = getelementptr i8, ptr %b, i20 %outer_offset + %ptr_c = getelementptr i8, ptr %c, i20 %outer_offset + br label %inner_header + +inner_header: + %inner_i = phi i32 [ 0, %inner_preheader ], [ %inner_next, %inner_header ] + %inner_offset_32 = mul i32 %inner_i, 64 + %inner_offset = trunc i32 %inner_offset_32 to i20 + + ; Three GEPs - should NOT create 3 pointer PHIs in outer_header + %inner_a = getelementptr i8, ptr %ptr_a, i20 %inner_offset + %inner_b = getelementptr i8, ptr %ptr_b, i20 %inner_offset + %inner_c = getelementptr i8, ptr %ptr_c, i20 %inner_offset + + %val_a = load <32 x i16>, ptr %inner_a, align 64 + %val_b = load <32 x i16>, ptr %inner_b, align 64 + %sum = add <32 x i16> %val_a, %val_b + store <32 x i16> %sum, ptr %inner_c, align 64 + + %inner_next = add i32 %inner_i, 1 
+ %inner_cmp = icmp slt i32 %inner_next, %inner_n + br i1 %inner_cmp, label %inner_header, label %outer_latch + +outer_latch: + %outer_next = add i32 %outer_i, 1 + br label %outer_header + +exit: + ret void +} + +declare void @consume(<32 x i16>) diff --git a/llvm/test/CodeGen/AIE/opt/lsr-preserve-pointer-recurrence.ll b/llvm/test/CodeGen/AIE/opt/lsr-preserve-pointer-recurrence.ll new file mode 100644 index 000000000000..c907155256a7 --- /dev/null +++ b/llvm/test/CodeGen/AIE/opt/lsr-preserve-pointer-recurrence.ll @@ -0,0 +1,171 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5 +; +; This file is licensed under the Apache License v2.0 with LLVM Exceptions. +; See https://llvm.org/LICENSE.txt for license information. +; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +; +; (c) Copyright 2026 Advanced Micro Devices, Inc. or its affiliates + +; RUN: opt -mtriple=aie2p -passes=loop-reduce -S %s | FileCheck %s + +; This test verifies that LSR preserves pointer recurrences on AIE targets. +; AIE processors support post-increment addressing modes (VLD_pstm, VST_pstm) +; that fold pointer updates into memory operations for free. LSR should NOT +; rewrite pointer PHIs to scalar offset + base formulas, as this would +; prevent post-increment combining and introduce extra PADD instructions. +; +; Specifically, this test checks that: +; 1. Pointer PHIs are preserved (not rewritten to %scevgep or similar) +; 2. GEP chains retain their original structure with inbounds +; 3. addrspacecast operations don't trigger unwanted IV chain processing + +target datalayout = "e-m:e-p:20:32-i1:8:32-i8:8:32-i16:16:32-i32:32:32-f32:32:32-i64:32-f64:32-a:0:32-n32" +target triple = "aie2p" + +; Test: Multiple pointer recurrences with variable stride through addrspacecast +; The pointer PHIs should be preserved as-is, not rewritten by LSR. 
+; The CHECK lines expect the loop body below to be left exactly as written. +define void @multi_pointer_addrspacecast(ptr %ifm, ptr %ofm, i20 %stride, i32 %n) { +; CHECK-LABEL: define void @multi_pointer_addrspacecast( +; CHECK-SAME: ptr [[IFM:%.*]], ptr [[OFM:%.*]], i20 [[STRIDE:%.*]], i32 [[N:%.*]]) { +; CHECK-NEXT: [[ENTRY:.*]]: +; CHECK-NEXT: br label %[[LOOP:.*]] +; CHECK: [[LOOP]]: +; CHECK-NEXT: [[P_IFM:%.*]] = phi ptr [ [[IFM]], %[[ENTRY]] ], [ [[NEXT_IFM:%.*]], %[[LOOP]] ] +; CHECK-NEXT: [[P_OFM:%.*]] = phi ptr [ [[OFM]], %[[ENTRY]] ], [ [[NEXT_OFM:%.*]], %[[LOOP]] ] +; CHECK-NEXT: [[I:%.*]] = phi i32 [ 0, %[[ENTRY]] ], [ [[I_NEXT:%.*]], %[[LOOP]] ] +; CHECK-NEXT: [[IFM_AS:%.*]] = addrspacecast ptr [[P_IFM]] to ptr addrspace(5) +; CHECK-NEXT: [[OFM_AS:%.*]] = addrspacecast ptr [[P_OFM]] to ptr addrspace(7) +; CHECK-NEXT: [[VAL:%.*]] = load <16 x i32>, ptr addrspace(5) [[IFM_AS]], align 64 +; CHECK-NEXT: store <16 x i32> [[VAL]], ptr addrspace(7) [[OFM_AS]], align 64 +; CHECK-NEXT: [[NEXT_IFM]] = getelementptr inbounds i8, ptr [[P_IFM]], i20 [[STRIDE]] +; CHECK-NEXT: [[NEXT_OFM]] = getelementptr inbounds i8, ptr [[P_OFM]], i20 [[STRIDE]] +; CHECK-NEXT: [[I_NEXT]] = add i32 [[I]], 1 +; CHECK-NEXT: [[COND:%.*]] = icmp slt i32 [[I_NEXT]], [[N]] +; CHECK-NEXT: br i1 [[COND]], label %[[LOOP]], label %[[EXIT:.*]] +; CHECK: [[EXIT]]: +; CHECK-NEXT: ret void +; +entry: + br label %loop + +loop: + %p_ifm = phi ptr [ %ifm, %entry ], [ %next_ifm, %loop ] + %p_ofm = phi ptr [ %ofm, %entry ], [ %next_ofm, %loop ] + %i = phi i32 [ 0, %entry ], [ %i.next, %loop ] + + %ifm_as = addrspacecast ptr %p_ifm to ptr addrspace(5) + %ofm_as = addrspacecast ptr %p_ofm to ptr addrspace(7) + + %val = load <16 x i32>, ptr addrspace(5) %ifm_as, align 64 + store <16 x i32> %val, ptr addrspace(7) %ofm_as, align 64 + + %next_ifm = getelementptr inbounds i8, ptr %p_ifm, i20 %stride + %next_ofm = getelementptr inbounds i8, ptr %p_ofm, i20 %stride + + %i.next = add i32 %i, 1 + %cond = icmp slt i32 %i.next, %n + br i1 %cond, label %loop, label %exit + +exit: + ret
void +} + +; Test: GEP chain within a loop (three loads, each one %stride bytes past the previous). +; This pattern can form IV chains in LSR. LSR should NOT rewrite these GEPs. +; The CHECK lines expect the loop body to match the input instruction for instruction. +define void @gep_chain_pattern(ptr %base, i20 %stride, i32 %n) { +; CHECK-LABEL: define void @gep_chain_pattern( +; CHECK-SAME: ptr [[BASE:%.*]], i20 [[STRIDE:%.*]], i32 [[N:%.*]]) { +; CHECK-NEXT: [[ENTRY:.*]]: +; CHECK-NEXT: br label %[[LOOP:.*]] +; CHECK: [[LOOP]]: +; CHECK-NEXT: [[P:%.*]] = phi ptr [ [[BASE]], %[[ENTRY]] ], [ [[P3:%.*]], %[[LOOP]] ] +; CHECK-NEXT: [[I:%.*]] = phi i32 [ 0, %[[ENTRY]] ], [ [[I_NEXT:%.*]], %[[LOOP]] ] +; CHECK-NEXT: [[AS0:%.*]] = addrspacecast ptr [[P]] to ptr addrspace(5) +; CHECK-NEXT: [[V0:%.*]] = load <16 x i32>, ptr addrspace(5) [[AS0]], align 64 +; CHECK-NEXT: [[P1:%.*]] = getelementptr inbounds i8, ptr [[P]], i20 [[STRIDE]] +; CHECK-NEXT: [[AS1:%.*]] = addrspacecast ptr [[P1]] to ptr addrspace(5) +; CHECK-NEXT: [[V1:%.*]] = load <16 x i32>, ptr addrspace(5) [[AS1]], align 64 +; CHECK-NEXT: [[P2:%.*]] = getelementptr inbounds i8, ptr [[P1]], i20 [[STRIDE]] +; CHECK-NEXT: [[AS2:%.*]] = addrspacecast ptr [[P2]] to ptr addrspace(5) +; CHECK-NEXT: [[V2:%.*]] = load <16 x i32>, ptr addrspace(5) [[AS2]], align 64 +; CHECK-NEXT: [[P3]] = getelementptr inbounds i8, ptr [[P2]], i20 [[STRIDE]] +; CHECK-NEXT: call void @consume(<16 x i32> [[V0]]) +; CHECK-NEXT: call void @consume(<16 x i32> [[V1]]) +; CHECK-NEXT: call void @consume(<16 x i32> [[V2]]) +; CHECK-NEXT: [[I_NEXT]] = add i32 [[I]], 1 +; CHECK-NEXT: [[COND:%.*]] = icmp slt i32 [[I_NEXT]], [[N]] +; CHECK-NEXT: br i1 [[COND]], label %[[LOOP]], label %[[EXIT:.*]] +; CHECK: [[EXIT]]: +; CHECK-NEXT: ret void +; +entry: + br label %loop + +loop: + %p = phi ptr [ %base, %entry ], [ %p3, %loop ] + %i = phi i32 [ 0, %entry ], [ %i.next, %loop ] + + %as0 = addrspacecast ptr %p to ptr addrspace(5) + %v0 = load <16 x i32>, ptr addrspace(5) %as0, align 64 + + %p1 = getelementptr inbounds i8, ptr %p, i20 %stride + %as1 = addrspacecast ptr %p1
to ptr addrspace(5) + %v1 = load <16 x i32>, ptr addrspace(5) %as1, align 64 + + %p2 = getelementptr inbounds i8, ptr %p1, i20 %stride + %as2 = addrspacecast ptr %p2 to ptr addrspace(5) + %v2 = load <16 x i32>, ptr addrspace(5) %as2, align 64 + + %p3 = getelementptr inbounds i8, ptr %p2, i20 %stride + + call void @consume(<16 x i32> %v0) + call void @consume(<16 x i32> %v1) + call void @consume(<16 x i32> %v2) + + %i.next = add i32 %i, 1 + %cond = icmp slt i32 %i.next, %n + br i1 %cond, label %loop, label %exit + +exit: + ret void +} + +declare void @consume(<16 x i32>) + +; Test: i20 scalar recurrences should still be handled by LSR. +; This ensures the IVUsers change doesn't break i20 integer optimization. +; The CHECK lines expect both i20 recurrences (%i and %sum) to remain, with %sum.next returned through a phi in the exit block. +define i20 @i20_scalar_recurrence(i20 %n, i20 %step) { +; CHECK-LABEL: define i20 @i20_scalar_recurrence( +; CHECK-SAME: i20 [[N:%.*]], i20 [[STEP:%.*]]) { +; CHECK-NEXT: [[ENTRY:.*]]: +; CHECK-NEXT: br label %[[LOOP:.*]] +; CHECK: [[LOOP]]: +; CHECK-NEXT: [[I:%.*]] = phi i20 [ 0, %[[ENTRY]] ], [ [[I_NEXT:%.*]], %[[LOOP]] ] +; CHECK-NEXT: [[SUM:%.*]] = phi i20 [ 0, %[[ENTRY]] ], [ [[SUM_NEXT:%.*]], %[[LOOP]] ] +; CHECK-NEXT: [[SUM_NEXT]] = add i20 [[SUM]], [[I]] +; CHECK-NEXT: [[I_NEXT]] = add i20 [[I]], [[STEP]] +; CHECK-NEXT: [[COND:%.*]] = icmp slt i20 [[I_NEXT]], [[N]] +; CHECK-NEXT: br i1 [[COND]], label %[[LOOP]], label %[[EXIT:.*]] +; CHECK: [[EXIT]]: +; CHECK-NEXT: [[SUM_NEXT_LCSSA:%.*]] = phi i20 [ [[SUM_NEXT]], %[[LOOP]] ] +; CHECK-NEXT: ret i20 [[SUM_NEXT_LCSSA]] +; +entry: + br label %loop + +loop: + %i = phi i20 [ 0, %entry ], [ %i.next, %loop ] + %sum = phi i20 [ 0, %entry ], [ %sum.next, %loop ] + + %sum.next = add i20 %sum, %i + %i.next = add i20 %i, %step + + %cond = icmp slt i20 %i.next, %n + br i1 %cond, label %loop, label %exit + +exit: + ret i20 %sum.next +}