diff --git a/llvm/include/llvm/Analysis/TargetTransformInfo.h b/llvm/include/llvm/Analysis/TargetTransformInfo.h index 4835c66a7a3bc..6d355dee74bfb 100644 --- a/llvm/include/llvm/Analysis/TargetTransformInfo.h +++ b/llvm/include/llvm/Analysis/TargetTransformInfo.h @@ -1673,9 +1673,10 @@ class TargetTransformInfo { /// \returns A value which is the result of the given memory intrinsic. New /// instructions may be created to extract the result from the given intrinsic /// memory operation. Returns nullptr if the target cannot create a result - /// from the given intrinsic. - Value *getOrCreateResultFromMemIntrinsic(IntrinsicInst *Inst, - Type *ExpectedType) const; + /// from the given intrinsic. Adds newly created instructions to \p NewInsts. + Value *getOrCreateResultFromMemIntrinsic( + IntrinsicInst *Inst, Type *ExpectedType, + SmallVectorImpl &NewInsts) const; /// \returns The type to use in a loop expansion of a memcpy call. Type *getMemcpyLoopLoweringType( @@ -2290,8 +2291,9 @@ class TargetTransformInfo::Concept { virtual bool getTgtMemIntrinsic(IntrinsicInst *Inst, MemIntrinsicInfo &Info) = 0; virtual unsigned getAtomicMemIntrinsicMaxElementSize() const = 0; - virtual Value *getOrCreateResultFromMemIntrinsic(IntrinsicInst *Inst, - Type *ExpectedType) = 0; + virtual Value *getOrCreateResultFromMemIntrinsic( + IntrinsicInst *Inst, Type *ExpectedType, + SmallVectorImpl &NewInsts) = 0; virtual Type *getMemcpyLoopLoweringType( LLVMContext &Context, Value *Length, unsigned SrcAddrSpace, unsigned DestAddrSpace, Align SrcAlign, Align DestAlign, @@ -3065,9 +3067,10 @@ class TargetTransformInfo::Model final : public TargetTransformInfo::Concept { unsigned getAtomicMemIntrinsicMaxElementSize() const override { return Impl.getAtomicMemIntrinsicMaxElementSize(); } - Value *getOrCreateResultFromMemIntrinsic(IntrinsicInst *Inst, - Type *ExpectedType) override { - return Impl.getOrCreateResultFromMemIntrinsic(Inst, ExpectedType); + Value *getOrCreateResultFromMemIntrinsic( + IntrinsicInst *Inst, Type *ExpectedType, + SmallVectorImpl &NewInsts) override { + return Impl.getOrCreateResultFromMemIntrinsic(Inst, ExpectedType, NewInsts); } Type *getMemcpyLoopLoweringType( LLVMContext &Context, Value *Length, unsigned SrcAddrSpace, diff --git a/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h b/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h index 261d5eacc91b0..a111d6903b6eb 100644 --- a/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h +++ b/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h @@ -917,8 +917,9 @@ class TargetTransformInfoImplBase { return 0; } - Value *getOrCreateResultFromMemIntrinsic(IntrinsicInst *Inst, - Type *ExpectedType) const { + Value *getOrCreateResultFromMemIntrinsic( + IntrinsicInst *Inst, Type *ExpectedType, + SmallVectorImpl &NewInsts) const { return nullptr; } diff --git a/llvm/lib/Analysis/TargetTransformInfo.cpp b/llvm/lib/Analysis/TargetTransformInfo.cpp index e3212135e9b19..6e75b6831cd9a 100644 --- a/llvm/lib/Analysis/TargetTransformInfo.cpp +++ b/llvm/lib/Analysis/TargetTransformInfo.cpp @@ -1278,8 +1278,10 @@ unsigned TargetTransformInfo::getAtomicMemIntrinsicMaxElementSize() const { } Value *TargetTransformInfo::getOrCreateResultFromMemIntrinsic( - IntrinsicInst *Inst, Type *ExpectedType) const { - return TTIImpl->getOrCreateResultFromMemIntrinsic(Inst, ExpectedType); + IntrinsicInst *Inst, Type *ExpectedType, + SmallVectorImpl &NewInsts) const { + return TTIImpl->getOrCreateResultFromMemIntrinsic(Inst, ExpectedType, + NewInsts); } Type *TargetTransformInfo::getMemcpyLoopLoweringType( diff --git a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp index 77be41b78bc7f..d55aa9ba24718 100644 --- a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp +++ b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp @@ -4743,8 +4743,9 @@ void AArch64TTIImpl::getPeelingPreferences(Loop *L, ScalarEvolution &SE, BaseT::getPeelingPreferences(L, SE, PP); } -Value *AArch64TTIImpl::getOrCreateResultFromMemIntrinsic(IntrinsicInst *Inst, - Type *ExpectedType) { +Value *AArch64TTIImpl::getOrCreateResultFromMemIntrinsic( + IntrinsicInst *Inst, Type *ExpectedType, + SmallVectorImpl &NewInsts) { switch (Inst->getIntrinsicID()) { default: return nullptr; @@ -4763,7 +4764,11 @@ Value *AArch64TTIImpl::getOrCreateResultFromMemIntrinsic(IntrinsicInst *Inst, return nullptr; } Value *Res = PoisonValue::get(ExpectedType); - IRBuilder<> Builder(Inst); + IRBuilder Builder( + Inst->getContext(), ConstantFolder(), + IRBuilderCallbackInserter( + [&NewInsts](Instruction *I) { NewInsts.push_back(I); })); + Builder.SetInsertPoint(Inst); for (unsigned i = 0, e = NumElts; i != e; ++i) { Value *L = Inst->getArgOperand(i); Res = Builder.CreateInsertValue(Res, L, i); diff --git a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h index ae0df6b895ec8..85e0e6dbddbe5 100644 --- a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h +++ b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h @@ -256,8 +256,9 @@ class AArch64TTIImpl : public BasicTTIImplBase { void getPeelingPreferences(Loop *L, ScalarEvolution &SE, TTI::PeelingPreferences &PP); - Value *getOrCreateResultFromMemIntrinsic(IntrinsicInst *Inst, - Type *ExpectedType); + Value * + getOrCreateResultFromMemIntrinsic(IntrinsicInst *Inst, Type *ExpectedType, + SmallVectorImpl &NewInsts); bool getTgtMemIntrinsic(IntrinsicInst *Inst, MemIntrinsicInfo &Info); diff --git a/llvm/lib/Transforms/Scalar/EarlyCSE.cpp b/llvm/lib/Transforms/Scalar/EarlyCSE.cpp index 3a0ae6b01a114..7d8b3a7073da5 100644 --- a/llvm/lib/Transforms/Scalar/EarlyCSE.cpp +++ b/llvm/lib/Transforms/Scalar/EarlyCSE.cpp @@ -729,6 +729,8 @@ class EarlyCSE { /// This is the current generation of the memory value. unsigned CurrentGeneration = 0; + SmallVector TmpInstructions; + /// Set up the EarlyCSE runner for a particular function. EarlyCSE(const DataLayout &DL, const TargetLibraryInfo &TLI, const TargetTransformInfo &TTI, DominatorTree &DT, @@ -965,7 +967,8 @@ class EarlyCSE { bool overridingStores(const ParseMemoryInst &Earlier, const ParseMemoryInst &Later); - Value *getOrCreateResult(Instruction *Inst, Type *ExpectedType) const { + Value *getOrCreateResult(Instruction *Inst, Type *ExpectedType, + SmallVectorImpl &TmpInsts) const { // TODO: We could insert relevant casts on type mismatch. // The load or the store's first operand. Value *V; @@ -978,7 +981,8 @@ class EarlyCSE { V = II->getOperand(0); break; default: - return TTI.getOrCreateResultFromMemIntrinsic(II, ExpectedType); + return TTI.getOrCreateResultFromMemIntrinsic(II, ExpectedType, + TmpInsts); } } else { V = isa(Inst) ? Inst : cast(Inst)->getValueOperand(); @@ -1262,9 +1266,10 @@ Value *EarlyCSE::getMatchingValue(LoadValue &InVal, ParseMemoryInst &MemInst, // For stores check the result values before checking memory generation // (otherwise isSameMemGeneration may crash). - Value *Result = MemInst.isStore() - ? getOrCreateResult(Matching, Other->getType()) - : nullptr; + Value *Result = + MemInst.isStore() + ? getOrCreateResult(Matching, Other->getType(), TmpInstructions) + : nullptr; if (MemInst.isStore() && InVal.DefInst != Result) return nullptr; @@ -1285,7 +1290,7 @@ Value *EarlyCSE::getMatchingValue(LoadValue &InVal, ParseMemoryInst &MemInst, return nullptr; if (!Result) - Result = getOrCreateResult(Matching, Other->getType()); + Result = getOrCreateResult(Matching, Other->getType(), TmpInstructions); return Result; } @@ -1833,6 +1838,11 @@ bool EarlyCSE::run() { } } // while (!nodes...) + // Clean up temporary instructions. + for (Instruction *I : reverse(TmpInstructions)) + if (I->use_empty()) + I->eraseFromParent(); + return Changed; } diff --git a/llvm/test/Transforms/EarlyCSE/AArch64/intrinsics.ll b/llvm/test/Transforms/EarlyCSE/AArch64/intrinsics.ll index 94b17510bb95d..4744e3761fa6b 100644 --- a/llvm/test/Transforms/EarlyCSE/AArch64/intrinsics.ll +++ b/llvm/test/Transforms/EarlyCSE/AArch64/intrinsics.ll @@ -1,6 +1,6 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5 ; RUN: opt < %s -S -mtriple=aarch64-none-linux-gnu -mattr=+neon -passes=early-cse -earlycse-debug-hash | FileCheck %s -; RUN: opt < %s -S -mtriple=aarch64-none-linux-gnu -mattr=+neon -aa-pipeline=basic-aa -passes='early-cse' | FileCheck %s +; RUN: opt < %s -S -mtriple=aarch64-none-linux-gnu -mattr=+neon -aa-pipeline=basic-aa -passes='early-cse' -verify-analysis-invalidation | FileCheck %s define <4 x i32> @test_cse(ptr %a, [2 x <4 x i32>] %s.coerce, i32 %n) { ; CHECK-LABEL: define <4 x i32> @test_cse( @@ -17,8 +17,6 @@ define <4 x i32> @test_cse(ptr %a, [2 x <4 x i32>] %s.coerce, i32 %n) { ; CHECK: [[FOR_BODY]]: ; CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[S_COERCE_FCA_0_EXTRACT]] to <16 x i8> ; CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[S_COERCE_FCA_1_EXTRACT]] to <16 x i8> -; CHECK-NEXT: [[TMP2:%.*]] = insertvalue { <4 x i32>, <4 x i32> } poison, <4 x i32> [[S_COERCE_FCA_0_EXTRACT]], 0 -; CHECK-NEXT: [[TMP3:%.*]] = insertvalue { <4 x i32>, <4 x i32> } [[TMP2]], <4 x i32> [[S_COERCE_FCA_1_EXTRACT]], 1 ; CHECK-NEXT: call void @llvm.aarch64.neon.st2.v4i32.p0(<4 x i32> [[S_COERCE_FCA_0_EXTRACT]], <4 x i32> [[S_COERCE_FCA_1_EXTRACT]], ptr [[A]]) ; CHECK-NEXT: [[CALL]] = call <4 x i32> @vaddq_s32(<4 x i32> [[S_COERCE_FCA_0_EXTRACT]], <4 x i32> [[S_COERCE_FCA_0_EXTRACT]]) ; CHECK-NEXT: [[INC]] = add nsw i32 [[I_0]], 1 @@ -71,8 +69,6 @@ define <4 x i32> @test_cse2(ptr %a, [2 x <4 x i32>] %s.coerce, i32 %n) { ; CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[S_COERCE_FCA_0_EXTRACT]] to <16 x i8> ; CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[S_COERCE_FCA_1_EXTRACT]] to <16 x i8> ; CHECK-NEXT: call void @llvm.aarch64.neon.st2.v4i32.p0(<4 x i32> [[S_COERCE_FCA_0_EXTRACT]], <4 x i32> [[S_COERCE_FCA_0_EXTRACT]], ptr [[A]]) -; CHECK-NEXT: [[TMP2:%.*]] = insertvalue { <4 x i32>, <4 x i32> } poison, <4 x i32> [[S_COERCE_FCA_0_EXTRACT]], 0 -; CHECK-NEXT: [[TMP3:%.*]] = insertvalue { <4 x i32>, <4 x i32> } [[TMP2]], <4 x i32> [[S_COERCE_FCA_1_EXTRACT]], 1 ; CHECK-NEXT: call void @llvm.aarch64.neon.st2.v4i32.p0(<4 x i32> [[S_COERCE_FCA_0_EXTRACT]], <4 x i32> [[S_COERCE_FCA_1_EXTRACT]], ptr [[A]]) ; CHECK-NEXT: [[CALL]] = call <4 x i32> @vaddq_s32(<4 x i32> [[S_COERCE_FCA_0_EXTRACT]], <4 x i32> [[S_COERCE_FCA_0_EXTRACT]]) ; CHECK-NEXT: [[INC]] = add nsw i32 [[I_0]], 1 @@ -324,6 +320,22 @@ for.end: ; preds = %for.cond ret <4 x i32> %res.0 } +define void @test_ld4_st4_no_cse(ptr %p, <16 x i8> %A, <16 x i8> %B) { +; CHECK-LABEL: define void @test_ld4_st4_no_cse( +; CHECK-SAME: ptr [[P:%.*]], <16 x i8> [[A:%.*]], <16 x i8> [[B:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[LD:%.*]] = tail call { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } @llvm.aarch64.neon.ld4.v16i8.p0(ptr [[P]]) +; CHECK-NEXT: [[EXT:%.*]] = extractvalue { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } [[LD]], 0 +; CHECK-NEXT: tail call void @llvm.aarch64.neon.st4.v16i8.p0(<16 x i8> [[EXT]], <16 x i8> [[A]], <16 x i8> [[B]], <16 x i8> zeroinitializer, ptr [[P]]) +; CHECK-NEXT: ret void +; +entry: + %ld = tail call { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } @llvm.aarch64.neon.ld4.v16i8.p0(ptr %p) + %ext = extractvalue { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } %ld, 0 + tail call void @llvm.aarch64.neon.st4.v16i8.p0(<16 x i8> %ext, <16 x i8> %A, <16 x i8> %B, <16 x i8> zeroinitializer, ptr %p) + ret void +} + ; Function Attrs: nounwind declare void @llvm.aarch64.neon.st2.v4i32.p0(<4 x i32>, <4 x i32>, ptr nocapture)