diff --git a/clang/lib/CodeGen/CGOpenMPRuntime.cpp b/clang/lib/CodeGen/CGOpenMPRuntime.cpp
index 06a652c146fb9..fb865ac9f453c 100644
--- a/clang/lib/CodeGen/CGOpenMPRuntime.cpp
+++ b/clang/lib/CodeGen/CGOpenMPRuntime.cpp
@@ -4899,6 +4899,238 @@ void CGOpenMPRuntime::emitSingleReductionCombiner(CodeGenFunction &CGF,
}
}
+void CGOpenMPRuntime::emitPrivateReduction(
+ CodeGenFunction &CGF, SourceLocation Loc, ArrayRef<const Expr *> Privates,
+ ArrayRef<const Expr *> LHSExprs, ArrayRef<const Expr *> RHSExprs,
+ ArrayRef<const Expr *> ReductionOps) {
+ if (LHSExprs.empty() || Privates.empty() || ReductionOps.empty())
+ return;
+
+ if (LHSExprs.size() != Privates.size() ||
+ LHSExprs.size() != ReductionOps.size())
+ return;
+
+ QualType PrivateType = Privates[0]->getType();
+ llvm::Type *LLVMType = CGF.ConvertTypeForMem(PrivateType);
+
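+ // Derive the initial value: fold the private copy's scalar initializer to a
+ // constant when possible; otherwise fall back to zero-initialization.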
+ llvm::Constant *InitVal = nullptr;
+ const OMPDeclareReductionDecl *UDR = getReductionInit(ReductionOps[0]);
+
+ if (!UDR) {
+ InitVal = llvm::Constant::getNullValue(LLVMType);
+ if (const auto *DRE = dyn_cast<DeclRefExpr>(Privates[0])) {
+ if (const auto *VD = dyn_cast<VarDecl>(DRE->getDecl())) {
+ const Expr *InitExpr = VD->getInit();
+ if (InitExpr && !PrivateType->isAggregateType() &&
+ !PrivateType->isAnyComplexType()) {
+ Expr::EvalResult Result;
+ if (InitExpr->EvaluateAsRValue(Result, CGF.getContext())) {
+ APValue &InitValue = Result.Val;
+ if (InitValue.isInt()) {
+ InitVal = llvm::ConstantInt::get(LLVMType, InitValue.getInt());
+ }
+ }
+ }
+ }
+ }
+ } else {
+ InitVal = llvm::Constant::getNullValue(LLVMType);
+ }
+
+ // Create an internal shared variable to hold the combined result.
+ std::string SharedName = getName({"internal_private_var"});
+ llvm::GlobalVariable *SharedVar = new llvm::GlobalVariable(
+ CGM.getModule(), LLVMType, false, llvm::GlobalValue::CommonLinkage,
+ InitVal, ".omp.reduction." + SharedName, nullptr,
+ llvm::GlobalVariable::NotThreadLocal);
+
+ SharedVar->setAlignment(
+ llvm::MaybeAlign(CGF.getContext().getTypeAlign(PrivateType) / 8));
+
+ Address SharedResult(SharedVar, SharedVar->getValueType(),
+ CGF.getContext().getTypeAlignInChars(PrivateType));
+
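+ // All threads synchronize here so every private copy is ready before
+ // thread 0 initializes the shared result.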
+ llvm::Value *ThreadId = getThreadID(CGF, Loc);
+ llvm::Value *BarrierLoc = emitUpdateLocation(CGF, Loc, OMP_ATOMIC_REDUCE);
+ llvm::Value *BarrierArgs[] = {BarrierLoc, ThreadId};
+
+ CGF.EmitRuntimeCall(OMPBuilder.getOrCreateRuntimeFunction(
+ CGM.getModule(), OMPRTL___kmpc_barrier),
+ BarrierArgs);
+
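+ // Only thread 0 runs the initialization block; the others branch straight
+ // to init.end and wait at the following barrier.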
+ llvm::BasicBlock *InitBB = CGF.createBasicBlock("init");
+ llvm::BasicBlock *InitEndBB = CGF.createBasicBlock("init.end");
+
+ llvm::Value *IsWorker = CGF.Builder.CreateICmpEQ(
+ ThreadId, llvm::ConstantInt::get(ThreadId->getType(), 0));
+ CGF.Builder.CreateCondBr(IsWorker, InitBB, InitEndBB);
+
+ CGF.EmitBlock(InitBB);
+
+ if (const auto *DRE = dyn_cast<DeclRefExpr>(Privates[0])) {
+ if (const auto *VD = dyn_cast<VarDecl>(DRE->getDecl())) {
+ const Expr *InitExpr = VD->getInit();
+ if (InitExpr &&
+ (PrivateType->isAggregateType() || PrivateType->isAnyComplexType())) {
+ CGF.EmitAnyExprToMem(InitExpr, SharedResult,
+ PrivateType.getQualifiers(), true);
+ } else if (!InitVal->isNullValue()) {
+ CGF.EmitStoreOfScalar(InitVal,
+ CGF.MakeAddrLValue(SharedResult, PrivateType));
+ } else {
+ CGF.EmitNullInitialization(SharedResult, PrivateType);
+ }
+ } else {
+ CGF.EmitNullInitialization(SharedResult, PrivateType);
+ }
+ } else {
+ CGF.EmitNullInitialization(SharedResult, PrivateType);
+ }
+
+ CGF.Builder.CreateBr(InitEndBB);
+ CGF.EmitBlock(InitEndBB);
+ CGF.EmitRuntimeCall(OMPBuilder.getOrCreateRuntimeFunction(
+ CGM.getModule(), OMPRTL___kmpc_barrier),
+ BarrierArgs);
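+ // Each thread now folds its private partial value into the shared result.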
+ for (unsigned I :
+ llvm::seq<unsigned>(std::min(ReductionOps.size(), LHSExprs.size()))) {
+
+ const Expr *ReductionOp = ReductionOps[I];
+ LValue SharedLV = CGF.MakeAddrLValue(SharedResult, PrivateType);
+ LValue LHSLV = CGF.EmitLValue(LHSExprs[I]);
+ // User-defined reduction: remap omp_out/omp_in to the shared and private
+ // copies and invoke the combiner under a critical section.
+ const OMPDeclareReductionDecl *CurrentUDR =
+ getReductionInit(ReductionOps[I]);
+ if (CurrentUDR) {
+ auto ReductionGen = [&](CodeGenFunction &CGF, PrePostActionTy &Action) {
+ Action.Enter(CGF);
+ std::pair<llvm::Function *, llvm::Function *> ReductionFnPair =
+ getUserDefinedReduction(CurrentUDR);
+ llvm::Function *CombinerFn = ReductionFnPair.first;
+ if (const auto *CE = dyn_cast<CallExpr>(ReductionOp)) {
+ if (CombinerFn) {
+ const auto *OutDRE = cast<DeclRefExpr>(
+ cast<UnaryOperator>(CE->getArg(0)->IgnoreParenImpCasts())
+ ->getSubExpr());
+ const auto *InDRE = cast<DeclRefExpr>(
+ cast<UnaryOperator>(CE->getArg(1)->IgnoreParenImpCasts())
+ ->getSubExpr());
+ CodeGenFunction::OMPPrivateScope LocalScope(CGF);
+ LocalScope.addPrivate(cast<VarDecl>(OutDRE->getDecl()),
+ SharedLV.getAddress());
+ LocalScope.addPrivate(cast<VarDecl>(InDRE->getDecl()),
+ LHSLV.getAddress());
+ (void)LocalScope.Privatize();
+ emitReductionCombiner(CGF, ReductionOp);
+ }
+ }
+ };
+ std::string CriticalName = getName({"reduction_critical"});
+ emitCriticalRegion(CGF, CriticalName, ReductionGen, Loc);
+ } else {
+ // Built-in operator combination: extract the RHS of the implicit
+ // 'LHS = LHS op RHS' assignment.
+ const Expr *ReductionClauseExpr = ReductionOp->IgnoreParenCasts();
+ if (const auto *Cleanup = dyn_cast<ExprWithCleanups>(ReductionClauseExpr))
+ ReductionClauseExpr = Cleanup->getSubExpr()->IgnoreParenCasts();
+ const Expr *AssignRHS = nullptr;
+ if (const auto *BinOp = dyn_cast<BinaryOperator>(ReductionClauseExpr)) {
+ if (BinOp->getOpcode() == BO_Assign)
+ AssignRHS = BinOp->getRHS();
+ } else if (const auto *OpCall =
+ dyn_cast<CXXOperatorCallExpr>(ReductionClauseExpr)) {
+ if (OpCall->getOperator() == OO_Equal)
+ AssignRHS = OpCall->getArg(1);
+ }
+ if (!AssignRHS)
+ continue;
+ const Expr *ReductionCombinerExpr = AssignRHS->IgnoreParenImpCasts();
+ if (const auto *MTE =
+ dyn_cast<MaterializeTemporaryExpr>(ReductionCombinerExpr))
+ ReductionCombinerExpr = MTE->getSubExpr()->IgnoreParenImpCasts();
+
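+ // Load this thread's partial value and apply it to the shared result
+ // with the reduction operator.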
+ BinaryOperatorKind BO = BO_Assign;
+ RValue PrivateRV = CGF.EmitLoadOfLValue(LHSLV, Loc);
+ if (const auto *BinOp = dyn_cast<BinaryOperator>(ReductionCombinerExpr)) {
+ BO = BinOp->getOpcode();
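+ // UpdateOp receives the current shared value and returns
+ // 'OldVal binop PrivateRV'.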
+ auto UpdateOp = [&](RValue OldVal) {
+ if (BO == BO_Mul) {
+ llvm::Value *OldScalar = OldVal.getScalarVal();
+ llvm::Value *PrivateScalar = PrivateRV.getScalarVal();
+ llvm::Value *Result =
+ CGF.Builder.CreateMul(OldScalar, PrivateScalar);
+ return RValue::get(Result);
+ } else {
+ OpaqueValueExpr OVE(BinOp->getLHS()->getExprLoc(),
+ BinOp->getLHS()->getType(),
+ ExprValueKind::VK_PRValue);
+ CodeGenFunction::OpaqueValueMapping OldValMapping(CGF, &OVE,
+ OldVal);
+ return CGF.EmitAnyExpr(BinOp->getRHS());
+ }
+ };
+
+ (void)CGF.EmitOMPAtomicSimpleUpdateExpr(
+ SharedLV, PrivateRV, BO, true,
+ llvm::AtomicOrdering::SequentiallyConsistent, Loc, UpdateOp);
+ } else if (isa<CallExpr>(ReductionClauseExpr)) {
+ // Implicit reduction identifiers (OpenMP 6.0, sec. 7.6.5).
+ auto ReductionGen = [&](CodeGenFunction &CGF, PrePostActionTy &Action) {
+ Action.Enter(CGF);
+ const auto *OmpOutDRE =
+ dyn_cast<DeclRefExpr>(LHSExprs[I]->IgnoreParenImpCasts());
+ const auto *OmpInDRE =
+ dyn_cast<DeclRefExpr>(RHSExprs[I]->IgnoreParenImpCasts());
+
+ if (!OmpOutDRE || !OmpInDRE) {
+ return;
+ }
+ const VarDecl *OmpOutVD = cast<VarDecl>(OmpOutDRE->getDecl());
+ const VarDecl *OmpInVD = cast<VarDecl>(OmpInDRE->getDecl());
+ CodeGenFunction::OMPPrivateScope LocalScope(CGF);
+ LocalScope.addPrivate(OmpOutVD, SharedLV.getAddress());
+ LocalScope.addPrivate(OmpInVD, LHSLV.getAddress());
+ (void)LocalScope.Privatize();
+ CGF.EmitIgnoredExpr(ReductionOp);
+ };
+ std::string CriticalName = getName({"reduction_critical"});
+ emitCriticalRegion(CGF, CriticalName, ReductionGen, Loc);
+ }
+ }
+ }
+ // Final barrier
+ CGF.EmitRuntimeCall(OMPBuilder.getOrCreateRuntimeFunction(
+ CGM.getModule(), OMPRTL___kmpc_barrier),
+ BarrierArgs);
+
+ // Broadcast the final result: copy the shared value back into every
+ // thread's variable.
+ bool IsAggregate = PrivateType->isAggregateType();
+ llvm::Value *FinalResultVal = nullptr;
+ LValue SharedLV = CGF.MakeAddrLValue(SharedResult, PrivateType);
+ Address FinalResultAddr = Address::invalid();
+ if (IsAggregate) {
+ FinalResultAddr = SharedResult;
+ } else {
+ FinalResultVal = CGF.EmitLoadOfScalar(SharedLV, Loc);
+ }
+
+ for (unsigned I : llvm::seq<unsigned>(Privates.size())) {
+ LValue TargetLHSLV = CGF.EmitLValue(LHSExprs[I]);
+ if (IsAggregate) {
+ CGF.EmitAggregateCopy(TargetLHSLV,
+ CGF.MakeAddrLValue(FinalResultAddr, PrivateType),
+ PrivateType, AggValueSlot::DoesNotOverlap, false);
+ } else {
+ CGF.EmitStoreOfScalar(FinalResultVal, TargetLHSLV);
+ }
+ }
+
+ // Final synchronization
+ CGF.EmitRuntimeCall(OMPBuilder.getOrCreateRuntimeFunction(
+ CGM.getModule(), OMPRTL___kmpc_barrier),
+ BarrierArgs);
+}
+
void CGOpenMPRuntime::emitReduction(CodeGenFunction &CGF, SourceLocation Loc,
ArrayRef Privates,
ArrayRef LHSExprs,
@@ -5201,6 +5433,8 @@ void CGOpenMPRuntime::emitReduction(CodeGenFunction &CGF, SourceLocation Loc,
CGF.EmitBranch(DefaultBB);
CGF.EmitBlock(DefaultBB, /*IsFinished=*/true);
+ if (Options.IsPrivateVarReduction)
+ emitPrivateReduction(CGF, Loc, Privates, LHSExprs, RHSExprs, ReductionOps);
}
/// Generates unique name for artificial threadprivate variables.
diff --git a/clang/lib/CodeGen/CGOpenMPRuntime.h b/clang/lib/CodeGen/CGOpenMPRuntime.h
index 4321712e1521d..50ba28b565b6d 100644
--- a/clang/lib/CodeGen/CGOpenMPRuntime.h
+++ b/clang/lib/CodeGen/CGOpenMPRuntime.h
@@ -1201,8 +1201,22 @@ class CGOpenMPRuntime {
struct ReductionOptionsTy {
bool WithNowait;
bool SimpleReduction;
+ bool IsPrivateVarReduction;
OpenMPDirectiveKind ReductionKind;
};
+
+ /// Emits code for private variable reduction.
+ /// \param Privates List of private copies for original reduction arguments.
+ /// \param LHSExprs List of LHS in \a ReductionOps reduction operations.
+ /// \param RHSExprs List of RHS in \a ReductionOps reduction operations.
+ /// \param ReductionOps List of reduction operations in form 'LHS binop RHS'
+ /// or 'operator binop(LHS, RHS)'.
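+ /// For example, given
+ /// \code
+ /// #pragma omp for reduction(original(private), +: sum_v)
+ /// \endcode
+ /// each thread's private copy of \a sum_v is combined into an internal
+ /// shared variable and the result is broadcast back to every copy.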
+ void emitPrivateReduction(CodeGenFunction &CGF, SourceLocation Loc,
+ ArrayRef<const Expr *> Privates,
+ ArrayRef<const Expr *> LHSExprs,
+ ArrayRef<const Expr *> RHSExprs,
+ ArrayRef<const Expr *> ReductionOps);
+
/// Emit a code for reduction clause. Next code should be emitted for
/// reduction:
/// \code
diff --git a/clang/lib/CodeGen/CGStmtOpenMP.cpp b/clang/lib/CodeGen/CGStmtOpenMP.cpp
index e4d1db264aac9..720a88e075ddd 100644
--- a/clang/lib/CodeGen/CGStmtOpenMP.cpp
+++ b/clang/lib/CodeGen/CGStmtOpenMP.cpp
@@ -1470,6 +1470,7 @@ void CodeGenFunction::EmitOMPReductionClauseFinal(
llvm::SmallVector<const Expr *, 8> LHSExprs;
llvm::SmallVector<const Expr *, 8> RHSExprs;
llvm::SmallVector<const Expr *, 8> ReductionOps;
+ llvm::SmallVector<bool, 8> IsPrivate;
bool HasAtLeastOneReduction = false;
bool IsReductionWithTaskMod = false;
for (const auto *C : D.getClausesOfKind<OMPReductionClause>()) {
@@ -1480,6 +1481,8 @@ void CodeGenFunction::EmitOMPReductionClauseFinal(
Privates.append(C->privates().begin(), C->privates().end());
LHSExprs.append(C->lhs_exprs().begin(), C->lhs_exprs().end());
RHSExprs.append(C->rhs_exprs().begin(), C->rhs_exprs().end());
+ IsPrivate.append(C->private_var_reduction_flags().begin(),
+ C->private_var_reduction_flags().end());
ReductionOps.append(C->reduction_ops().begin(), C->reduction_ops().end());
IsReductionWithTaskMod =
IsReductionWithTaskMod || C->getModifier() == OMPC_REDUCTION_task;
@@ -1499,9 +1502,11 @@ void CodeGenFunction::EmitOMPReductionClauseFinal(
bool SimpleReduction = ReductionKind == OMPD_simd;
// Emit nowait reduction if nowait clause is present or directive is a
// parallel directive (it always has implicit barrier).
+ bool IsPrivateVarReduction =
+ llvm::any_of(IsPrivate, [](bool IsPriv) { return IsPriv; });
CGM.getOpenMPRuntime().emitReduction(
*this, D.getEndLoc(), Privates, LHSExprs, RHSExprs, ReductionOps,
- {WithNowait, SimpleReduction, ReductionKind});
+ {WithNowait, SimpleReduction, IsPrivateVarReduction, ReductionKind});
}
}
@@ -3943,7 +3948,8 @@ static void emitScanBasedDirective(
PrivScope.Privatize();
CGF.CGM.getOpenMPRuntime().emitReduction(
CGF, S.getEndLoc(), Privates, LHSs, RHSs, ReductionOps,
- {/*WithNowait=*/true, /*SimpleReduction=*/true, OMPD_unknown});
+ {/*WithNowait=*/true, /*SimpleReduction=*/true,
+ /*IsPrivateVarReduction=*/false, OMPD_unknown});
}
llvm::Value *NextIVal =
CGF.Builder.CreateNUWSub(IVal, llvm::ConstantInt::get(CGF.SizeTy, 1));
@@ -5747,7 +5753,7 @@ void CodeGenFunction::EmitOMPScanDirective(const OMPScanDirective &S) {
}
CGM.getOpenMPRuntime().emitReduction(
*this, ParentDir.getEndLoc(), Privates, LHSs, RHSs, ReductionOps,
- {/*WithNowait=*/true, /*SimpleReduction=*/true, OMPD_simd});
+ {/*WithNowait=*/true, /*SimpleReduction=*/true,
+ /*IsPrivateVarReduction=*/false, OMPD_simd});
for (unsigned I = 0, E = CopyArrayElems.size(); I < E; ++I) {
const Expr *PrivateExpr = Privates[I];
LValue DestLVal;
diff --git a/clang/test/OpenMP/for_private_reduction_codegen.cpp b/clang/test/OpenMP/for_private_reduction_codegen.cpp
new file mode 100644
index 0000000000000..313d0997f18e3
--- /dev/null
+++ b/clang/test/OpenMP/for_private_reduction_codegen.cpp
@@ -0,0 +1,555 @@
+// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --check-globals all --include-generated-funcs --prefix-filecheck-ir-name _ --version 5
+// RUN: %clang_cc1 -triple x86_64-unknown-unknown -fopenmp -fopenmp-version=60 -x c++ -std=c++17 -emit-llvm %s -o - | FileCheck %s
+// expected-no-diagnostics
+#define N 10
+class Sum {
+ int val;
+public:
+ Sum(int v = 0) : val(v) {}
+ Sum operator+(const Sum& rhs) const {
+ return Sum(val + rhs.val);
+ }
+ Sum& operator+=(const Sum& rhs) {
+ val += rhs.val;
+ return *this;
+ }
+};
+#pragma omp declare reduction(sum_reduction : Sum : omp_out += omp_in) initializer(omp_priv = Sum(0))
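+// The declare-reduction combiner above is invoked under a named critical
+// section when private copies are merged (see the
+// .gomp_critical_user_.reduction_critical.var checks below).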
+
+void func_red(){
+ Sum result(0);
+ Sum array[N];
+
+ for (int i = 0; i < N; i++) {
+ array[i] = Sum(i);
+ }
+
+ #pragma omp parallel private(result) num_threads(4)
+ {
+ #pragma omp for reduction(sum_reduction:result)
+ for (int i = 0; i < N; i++) {
+ result = result + array[i];
+ }
+ }
+}
+
+void do_red(int n, int *v, int &sum_v)
+ {
+ sum_v = 0;
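+ // 'original(private)' (OpenMP 6.0) reduces into each thread's private copy
+ // of sum_v; the partial sums are merged through an internal shared variable.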
+ #pragma omp for reduction(original(private),+: sum_v)
+ for (int i = 0; i < n; i++)
+ {
+ sum_v += v[i];
+ }
+ }
+ int main(void)
+ {
+ int v[N];
+ for (int i = 0; i < N; i++)
+ v[i] = i;
+ #pragma omp parallel num_threads(4)
+ {
+ int s_v;
+ do_red(N, v, s_v);
+ }
+ return 0;
+ }
+
+//.
+// CHECK: @[[GLOB0:[0-9]+]] = private unnamed_addr constant [23 x i8] c";unknown;unknown;0;0;;\00", align 1
+// CHECK: @[[GLOB1:[0-9]+]] = private unnamed_addr constant %struct.ident_t { i32 0, i32 514, i32 0, i32 22, ptr @[[GLOB0]] }, align 8
+// CHECK: @.gomp_critical_user_.reduction.var = common global [8 x i32] zeroinitializer, align 8
+// CHECK: @[[GLOB2:[0-9]+]] = private unnamed_addr constant %struct.ident_t { i32 0, i32 18, i32 0, i32 22, ptr @[[GLOB0]] }, align 8
+// CHECK: @[[GLOB3:[0-9]+]] = private unnamed_addr constant %struct.ident_t { i32 0, i32 2, i32 0, i32 22, ptr @[[GLOB0]] }, align 8
+// CHECK: @.gomp_critical_user_.atomic_reduction.var = common global [8 x i32] zeroinitializer, align 8
+// CHECK: @.omp.reduction..internal_private_var = common global %class.Sum zeroinitializer, align 4
+// CHECK: @.gomp_critical_user_.reduction_critical.var = common global [8 x i32] zeroinitializer, align 8
+// CHECK: @[[GLOB4:[0-9]+]] = private unnamed_addr constant %struct.ident_t { i32 0, i32 66, i32 0, i32 22, ptr @[[GLOB0]] }, align 8
+// CHECK: @.omp.reduction..internal_private_var.1 = common global i32 0, align 4
+//.
+// CHECK-LABEL: define dso_local void @_Z8func_redv(
+// CHECK-SAME: ) #[[ATTR0:[0-9]+]] {
+// CHECK-NEXT: [[ENTRY:.*]]:
+// CHECK-NEXT: [[RESULT:%.*]] = alloca [[CLASS_SUM:%.*]], align 4
+// CHECK-NEXT: [[ARRAY:%.*]] = alloca [10 x %class.Sum], align 16
+// CHECK-NEXT: [[I:%.*]] = alloca i32, align 4
+// CHECK-NEXT: [[REF_TMP:%.*]] = alloca [[CLASS_SUM]], align 4
+// CHECK-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_global_thread_num(ptr @[[GLOB3]])
+// CHECK-NEXT: call void @_ZN3SumC1Ei(ptr noundef nonnull align 4 dereferenceable(4) [[RESULT]], i32 noundef 0)
+// CHECK-NEXT: [[ARRAY_BEGIN:%.*]] = getelementptr inbounds [10 x %class.Sum], ptr [[ARRAY]], i32 0, i32 0
+// CHECK-NEXT: [[ARRAYCTOR_END:%.*]] = getelementptr inbounds [[CLASS_SUM]], ptr [[ARRAY_BEGIN]], i64 10
+// CHECK-NEXT: br label %[[ARRAYCTOR_LOOP:.*]]
+// CHECK: [[ARRAYCTOR_LOOP]]:
+// CHECK-NEXT: [[ARRAYCTOR_CUR:%.*]] = phi ptr [ [[ARRAY_BEGIN]], %[[ENTRY]] ], [ [[ARRAYCTOR_NEXT:%.*]], %[[ARRAYCTOR_LOOP]] ]
+// CHECK-NEXT: call void @_ZN3SumC1Ei(ptr noundef nonnull align 4 dereferenceable(4) [[ARRAYCTOR_CUR]], i32 noundef 0)
+// CHECK-NEXT: [[ARRAYCTOR_NEXT]] = getelementptr inbounds [[CLASS_SUM]], ptr [[ARRAYCTOR_CUR]], i64 1
+// CHECK-NEXT: [[ARRAYCTOR_DONE:%.*]] = icmp eq ptr [[ARRAYCTOR_NEXT]], [[ARRAYCTOR_END]]
+// CHECK-NEXT: br i1 [[ARRAYCTOR_DONE]], label %[[ARRAYCTOR_CONT:.*]], label %[[ARRAYCTOR_LOOP]]
+// CHECK: [[ARRAYCTOR_CONT]]:
+// CHECK-NEXT: store i32 0, ptr [[I]], align 4
+// CHECK-NEXT: br label %[[FOR_COND:.*]]
+// CHECK: [[FOR_COND]]:
+// CHECK-NEXT: [[TMP1:%.*]] = load i32, ptr [[I]], align 4
+// CHECK-NEXT: [[CMP:%.*]] = icmp slt i32 [[TMP1]], 10
+// CHECK-NEXT: br i1 [[CMP]], label %[[FOR_BODY:.*]], label %[[FOR_END:.*]]
+// CHECK: [[FOR_BODY]]:
+// CHECK-NEXT: [[TMP2:%.*]] = load i32, ptr [[I]], align 4
+// CHECK-NEXT: call void @_ZN3SumC1Ei(ptr noundef nonnull align 4 dereferenceable(4) [[REF_TMP]], i32 noundef [[TMP2]])
+// CHECK-NEXT: [[TMP3:%.*]] = load i32, ptr [[I]], align 4
+// CHECK-NEXT: [[IDXPROM:%.*]] = sext i32 [[TMP3]] to i64
+// CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [10 x %class.Sum], ptr [[ARRAY]], i64 0, i64 [[IDXPROM]]
+// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr align 4 [[ARRAYIDX]], ptr align 4 [[REF_TMP]], i64 4, i1 false)
+// CHECK-NEXT: br label %[[FOR_INC:.*]]
+// CHECK: [[FOR_INC]]:
+// CHECK-NEXT: [[TMP4:%.*]] = load i32, ptr [[I]], align 4
+// CHECK-NEXT: [[INC:%.*]] = add nsw i32 [[TMP4]], 1
+// CHECK-NEXT: store i32 [[INC]], ptr [[I]], align 4
+// CHECK-NEXT: br label %[[FOR_COND]], !llvm.loop [[LOOP3:![0-9]+]]
+// CHECK: [[FOR_END]]:
+// CHECK-NEXT: call void @__kmpc_push_num_threads(ptr @[[GLOB3]], i32 [[TMP0]], i32 4)
+// CHECK-NEXT: call void (ptr, i32, ptr, ...) @__kmpc_fork_call(ptr @[[GLOB3]], i32 1, ptr @_Z8func_redv.omp_outlined, ptr [[ARRAY]])
+// CHECK-NEXT: ret void
+//
+//
+// CHECK-LABEL: define linkonce_odr void @_ZN3SumC1Ei(
+// CHECK-SAME: ptr noundef nonnull align 4 dereferenceable(4) [[THIS:%.*]], i32 noundef [[V:%.*]]) unnamed_addr #[[ATTR0]] comdat align 2 {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[THIS_ADDR:%.*]] = alloca ptr, align 8
+// CHECK-NEXT: [[V_ADDR:%.*]] = alloca i32, align 4
+// CHECK-NEXT: store ptr [[THIS]], ptr [[THIS_ADDR]], align 8
+// CHECK-NEXT: store i32 [[V]], ptr [[V_ADDR]], align 4
+// CHECK-NEXT: [[THIS1:%.*]] = load ptr, ptr [[THIS_ADDR]], align 8
+// CHECK-NEXT: [[TMP0:%.*]] = load i32, ptr [[V_ADDR]], align 4
+// CHECK-NEXT: call void @_ZN3SumC2Ei(ptr noundef nonnull align 4 dereferenceable(4) [[THIS1]], i32 noundef [[TMP0]])
+// CHECK-NEXT: ret void
+//
+//
+// CHECK-LABEL: define internal void @_Z8func_redv.omp_outlined(
+// CHECK-SAME: ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], ptr noundef nonnull align 4 dereferenceable(40) [[ARRAY:%.*]]) #[[ATTR2:[0-9]+]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8
+// CHECK-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8
+// CHECK-NEXT: [[ARRAY_ADDR:%.*]] = alloca ptr, align 8
+// CHECK-NEXT: [[RESULT:%.*]] = alloca [[CLASS_SUM:%.*]], align 4
+// CHECK-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4
+// CHECK-NEXT: [[TMP:%.*]] = alloca i32, align 4
+// CHECK-NEXT: [[DOTOMP_LB:%.*]] = alloca i32, align 4
+// CHECK-NEXT: [[DOTOMP_UB:%.*]] = alloca i32, align 4
+// CHECK-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4
+// CHECK-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4
+// CHECK-NEXT: [[RESULT1:%.*]] = alloca [[CLASS_SUM]], align 4
+// CHECK-NEXT: [[I:%.*]] = alloca i32, align 4
+// CHECK-NEXT: [[REF_TMP:%.*]] = alloca [[CLASS_SUM]], align 4
+// CHECK-NEXT: [[DOTOMP_REDUCTION_RED_LIST:%.*]] = alloca [1 x ptr], align 8
+// CHECK-NEXT: store ptr [[DOTGLOBAL_TID_]], ptr [[DOTGLOBAL_TID__ADDR]], align 8
+// CHECK-NEXT: store ptr [[DOTBOUND_TID_]], ptr [[DOTBOUND_TID__ADDR]], align 8
+// CHECK-NEXT: store ptr [[ARRAY]], ptr [[ARRAY_ADDR]], align 8
+// CHECK-NEXT: [[TMP0:%.*]] = load ptr, ptr [[ARRAY_ADDR]], align 8
+// CHECK-NEXT: call void @_ZN3SumC1Ei(ptr noundef nonnull align 4 dereferenceable(4) [[RESULT]], i32 noundef 0)
+// CHECK-NEXT: store i32 0, ptr [[DOTOMP_LB]], align 4
+// CHECK-NEXT: store i32 9, ptr [[DOTOMP_UB]], align 4
+// CHECK-NEXT: store i32 1, ptr [[DOTOMP_STRIDE]], align 4
+// CHECK-NEXT: store i32 0, ptr [[DOTOMP_IS_LAST]], align 4
+// CHECK-NEXT: call void @.omp_initializer.(ptr noundef [[RESULT1]], ptr noundef [[RESULT]])
+// CHECK-NEXT: [[TMP1:%.*]] = load ptr, ptr [[DOTGLOBAL_TID__ADDR]], align 8
+// CHECK-NEXT: [[TMP2:%.*]] = load i32, ptr [[TMP1]], align 4
+// CHECK-NEXT: call void @__kmpc_for_static_init_4(ptr @[[GLOB1]], i32 [[TMP2]], i32 34, ptr [[DOTOMP_IS_LAST]], ptr [[DOTOMP_LB]], ptr [[DOTOMP_UB]], ptr [[DOTOMP_STRIDE]], i32 1, i32 1)
+// CHECK-NEXT: [[TMP3:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4
+// CHECK-NEXT: [[CMP:%.*]] = icmp sgt i32 [[TMP3]], 9
+// CHECK-NEXT: br i1 [[CMP]], label %[[COND_TRUE:.*]], label %[[COND_FALSE:.*]]
+// CHECK: [[COND_TRUE]]:
+// CHECK-NEXT: br label %[[COND_END:.*]]
+// CHECK: [[COND_FALSE]]:
+// CHECK-NEXT: [[TMP4:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4
+// CHECK-NEXT: br label %[[COND_END]]
+// CHECK: [[COND_END]]:
+// CHECK-NEXT: [[COND:%.*]] = phi i32 [ 9, %[[COND_TRUE]] ], [ [[TMP4]], %[[COND_FALSE]] ]
+// CHECK-NEXT: store i32 [[COND]], ptr [[DOTOMP_UB]], align 4
+// CHECK-NEXT: [[TMP5:%.*]] = load i32, ptr [[DOTOMP_LB]], align 4
+// CHECK-NEXT: store i32 [[TMP5]], ptr [[DOTOMP_IV]], align 4
+// CHECK-NEXT: br label %[[OMP_INNER_FOR_COND:.*]]
+// CHECK: [[OMP_INNER_FOR_COND]]:
+// CHECK-NEXT: [[TMP6:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4
+// CHECK-NEXT: [[TMP7:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4
+// CHECK-NEXT: [[CMP2:%.*]] = icmp sle i32 [[TMP6]], [[TMP7]]
+// CHECK-NEXT: br i1 [[CMP2]], label %[[OMP_INNER_FOR_BODY:.*]], label %[[OMP_INNER_FOR_END:.*]]
+// CHECK: [[OMP_INNER_FOR_BODY]]:
+// CHECK-NEXT: [[TMP8:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4
+// CHECK-NEXT: [[MUL:%.*]] = mul nsw i32 [[TMP8]], 1
+// CHECK-NEXT: [[ADD:%.*]] = add nsw i32 0, [[MUL]]
+// CHECK-NEXT: store i32 [[ADD]], ptr [[I]], align 4
+// CHECK-NEXT: [[TMP9:%.*]] = load i32, ptr [[I]], align 4
+// CHECK-NEXT: [[IDXPROM:%.*]] = sext i32 [[TMP9]] to i64
+// CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [10 x %class.Sum], ptr [[TMP0]], i64 0, i64 [[IDXPROM]]
+// CHECK-NEXT: [[CALL:%.*]] = call i32 @_ZNK3SumplERKS_(ptr noundef nonnull align 4 dereferenceable(4) [[RESULT1]], ptr noundef nonnull align 4 dereferenceable(4) [[ARRAYIDX]])
+// CHECK-NEXT: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw [[CLASS_SUM]], ptr [[REF_TMP]], i32 0, i32 0
+// CHECK-NEXT: store i32 [[CALL]], ptr [[COERCE_DIVE]], align 4
+// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr align 4 [[RESULT1]], ptr align 4 [[REF_TMP]], i64 4, i1 false)
+// CHECK-NEXT: br label %[[OMP_BODY_CONTINUE:.*]]
+// CHECK: [[OMP_BODY_CONTINUE]]:
+// CHECK-NEXT: br label %[[OMP_INNER_FOR_INC:.*]]
+// CHECK: [[OMP_INNER_FOR_INC]]:
+// CHECK-NEXT: [[TMP10:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4
+// CHECK-NEXT: [[ADD3:%.*]] = add nsw i32 [[TMP10]], 1
+// CHECK-NEXT: store i32 [[ADD3]], ptr [[DOTOMP_IV]], align 4
+// CHECK-NEXT: br label %[[OMP_INNER_FOR_COND]]
+// CHECK: [[OMP_INNER_FOR_END]]:
+// CHECK-NEXT: br label %[[OMP_LOOP_EXIT:.*]]
+// CHECK: [[OMP_LOOP_EXIT]]:
+// CHECK-NEXT: call void @__kmpc_for_static_fini(ptr @[[GLOB1]], i32 [[TMP2]])
+// CHECK-NEXT: [[TMP11:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOMP_REDUCTION_RED_LIST]], i64 0, i64 0
+// CHECK-NEXT: store ptr [[RESULT1]], ptr [[TMP11]], align 8
+// CHECK-NEXT: [[TMP12:%.*]] = call i32 @__kmpc_reduce(ptr @[[GLOB2]], i32 [[TMP2]], i32 1, i64 8, ptr [[DOTOMP_REDUCTION_RED_LIST]], ptr @_Z8func_redv.omp_outlined.omp.reduction.reduction_func, ptr @.gomp_critical_user_.reduction.var)
+// CHECK-NEXT: switch i32 [[TMP12]], [[DOTOMP_REDUCTION_DEFAULT:label %.*]] [
+// CHECK-NEXT: i32 1, [[DOTOMP_REDUCTION_CASE1:label %.*]]
+// CHECK-NEXT: i32 2, [[DOTOMP_REDUCTION_CASE2:label %.*]]
+// CHECK-NEXT: ]
+// CHECK: [[_OMP_REDUCTION_CASE1:.*:]]
+// CHECK-NEXT: call void @.omp_combiner.(ptr noundef [[RESULT]], ptr noundef [[RESULT1]])
+// CHECK-NEXT: call void @__kmpc_end_reduce(ptr @[[GLOB2]], i32 [[TMP2]], ptr @.gomp_critical_user_.reduction.var)
+// CHECK-NEXT: br [[DOTOMP_REDUCTION_DEFAULT]]
+// CHECK: [[_OMP_REDUCTION_CASE2:.*:]]
+// CHECK-NEXT: call void @__kmpc_critical(ptr @[[GLOB3]], i32 [[TMP2]], ptr @.gomp_critical_user_.atomic_reduction.var)
+// CHECK-NEXT: call void @.omp_combiner.(ptr noundef [[RESULT]], ptr noundef [[RESULT1]])
+// CHECK-NEXT: call void @__kmpc_end_critical(ptr @[[GLOB3]], i32 [[TMP2]], ptr @.gomp_critical_user_.atomic_reduction.var)
+// CHECK-NEXT: call void @__kmpc_end_reduce(ptr @[[GLOB2]], i32 [[TMP2]], ptr @.gomp_critical_user_.reduction.var)
+// CHECK-NEXT: br [[DOTOMP_REDUCTION_DEFAULT]]
+// CHECK: [[_OMP_REDUCTION_DEFAULT:.*:]]
+// CHECK-NEXT: call void @__kmpc_barrier(ptr @[[GLOB2]], i32 [[TMP2]])
+// CHECK-NEXT: [[TMP13:%.*]] = icmp eq i32 [[TMP2]], 0
+// CHECK-NEXT: br i1 [[TMP13]], label %[[INIT:.*]], label %[[INIT_END:.*]]
+// CHECK: [[INIT]]:
+// CHECK-NEXT: call void @llvm.memset.p0.i64(ptr align 4 @.omp.reduction..internal_private_var, i8 0, i64 4, i1 false)
+// CHECK-NEXT: br label %[[INIT_END]]
+// CHECK: [[INIT_END]]:
+// CHECK-NEXT: call void @__kmpc_barrier(ptr @[[GLOB2]], i32 [[TMP2]])
+// CHECK-NEXT: call void @__kmpc_critical(ptr @[[GLOB3]], i32 [[TMP2]], ptr @.gomp_critical_user_.reduction_critical.var)
+// CHECK-NEXT: call void @.omp_combiner.(ptr noundef @.omp.reduction..internal_private_var, ptr noundef [[RESULT]])
+// CHECK-NEXT: call void @__kmpc_end_critical(ptr @[[GLOB3]], i32 [[TMP2]], ptr @.gomp_critical_user_.reduction_critical.var)
+// CHECK-NEXT: call void @__kmpc_barrier(ptr @[[GLOB2]], i32 [[TMP2]])
+// CHECK-NEXT: [[TMP14:%.*]] = load [[CLASS_SUM]], ptr @.omp.reduction..internal_private_var, align 4
+// CHECK-NEXT: store [[CLASS_SUM]] [[TMP14]], ptr [[RESULT]], align 4
+// CHECK-NEXT: call void @__kmpc_barrier(ptr @[[GLOB2]], i32 [[TMP2]])
+// CHECK-NEXT: call void @__kmpc_barrier(ptr @[[GLOB4]], i32 [[TMP2]])
+// CHECK-NEXT: ret void
+//
+//
+// CHECK-LABEL: define internal void @.omp_combiner.(
+// CHECK-SAME: ptr noalias noundef [[TMP0:%.*]], ptr noalias noundef [[TMP1:%.*]]) #[[ATTR3:[0-9]+]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[DOTADDR:%.*]] = alloca ptr, align 8
+// CHECK-NEXT: [[DOTADDR1:%.*]] = alloca ptr, align 8
+// CHECK-NEXT: store ptr [[TMP0]], ptr [[DOTADDR]], align 8
+// CHECK-NEXT: store ptr [[TMP1]], ptr [[DOTADDR1]], align 8
+// CHECK-NEXT: [[TMP2:%.*]] = load ptr, ptr [[DOTADDR1]], align 8
+// CHECK-NEXT: [[TMP3:%.*]] = load ptr, ptr [[DOTADDR]], align 8
+// CHECK-NEXT: [[CALL:%.*]] = call noundef nonnull align 4 dereferenceable(4) ptr @_ZN3SumpLERKS_(ptr noundef nonnull align 4 dereferenceable(4) [[TMP3]], ptr noundef nonnull align 4 dereferenceable(4) [[TMP2]])
+// CHECK-NEXT: ret void
+//
+//
+// CHECK-LABEL: define linkonce_odr noundef nonnull align 4 dereferenceable(4) ptr @_ZN3SumpLERKS_(
+// CHECK-SAME: ptr noundef nonnull align 4 dereferenceable(4) [[THIS:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[RHS:%.*]]) #[[ATTR0]] comdat align 2 {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[THIS_ADDR:%.*]] = alloca ptr, align 8
+// CHECK-NEXT: [[RHS_ADDR:%.*]] = alloca ptr, align 8
+// CHECK-NEXT: store ptr [[THIS]], ptr [[THIS_ADDR]], align 8
+// CHECK-NEXT: store ptr [[RHS]], ptr [[RHS_ADDR]], align 8
+// CHECK-NEXT: [[THIS1:%.*]] = load ptr, ptr [[THIS_ADDR]], align 8
+// CHECK-NEXT: [[TMP0:%.*]] = load ptr, ptr [[RHS_ADDR]], align 8
+// CHECK-NEXT: [[VAL:%.*]] = getelementptr inbounds nuw [[CLASS_SUM:%.*]], ptr [[TMP0]], i32 0, i32 0
+// CHECK-NEXT: [[TMP1:%.*]] = load i32, ptr [[VAL]], align 4
+// CHECK-NEXT: [[VAL2:%.*]] = getelementptr inbounds nuw [[CLASS_SUM]], ptr [[THIS1]], i32 0, i32 0
+// CHECK-NEXT: [[TMP2:%.*]] = load i32, ptr [[VAL2]], align 4
+// CHECK-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP2]], [[TMP1]]
+// CHECK-NEXT: store i32 [[ADD]], ptr [[VAL2]], align 4
+// CHECK-NEXT: ret ptr [[THIS1]]
+//
+//
+// CHECK-LABEL: define internal void @.omp_initializer.(
+// CHECK-SAME: ptr noalias noundef [[TMP0:%.*]], ptr noalias noundef [[TMP1:%.*]]) #[[ATTR3]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[DOTADDR:%.*]] = alloca ptr, align 8
+// CHECK-NEXT: [[DOTADDR1:%.*]] = alloca ptr, align 8
+// CHECK-NEXT: store ptr [[TMP0]], ptr [[DOTADDR]], align 8
+// CHECK-NEXT: store ptr [[TMP1]], ptr [[DOTADDR1]], align 8
+// CHECK-NEXT: [[TMP2:%.*]] = load ptr, ptr [[DOTADDR1]], align 8
+// CHECK-NEXT: [[TMP3:%.*]] = load ptr, ptr [[DOTADDR]], align 8
+// CHECK-NEXT: call void @_ZN3SumC1Ei(ptr noundef nonnull align 4 dereferenceable(4) [[TMP3]], i32 noundef 0)
+// CHECK-NEXT: ret void
+//
+//
+// CHECK-LABEL: define linkonce_odr i32 @_ZNK3SumplERKS_(
+// CHECK-SAME: ptr noundef nonnull align 4 dereferenceable(4) [[THIS:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[RHS:%.*]]) #[[ATTR0]] comdat align 2 {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[RETVAL:%.*]] = alloca [[CLASS_SUM:%.*]], align 4
+// CHECK-NEXT: [[THIS_ADDR:%.*]] = alloca ptr, align 8
+// CHECK-NEXT: [[RHS_ADDR:%.*]] = alloca ptr, align 8
+// CHECK-NEXT: store ptr [[THIS]], ptr [[THIS_ADDR]], align 8
+// CHECK-NEXT: store ptr [[RHS]], ptr [[RHS_ADDR]], align 8
+// CHECK-NEXT: [[THIS1:%.*]] = load ptr, ptr [[THIS_ADDR]], align 8
+// CHECK-NEXT: [[VAL:%.*]] = getelementptr inbounds nuw [[CLASS_SUM]], ptr [[THIS1]], i32 0, i32 0
+// CHECK-NEXT: [[TMP0:%.*]] = load i32, ptr [[VAL]], align 4
+// CHECK-NEXT: [[TMP1:%.*]] = load ptr, ptr [[RHS_ADDR]], align 8
+// CHECK-NEXT: [[VAL2:%.*]] = getelementptr inbounds nuw [[CLASS_SUM]], ptr [[TMP1]], i32 0, i32 0
+// CHECK-NEXT: [[TMP2:%.*]] = load i32, ptr [[VAL2]], align 4
+// CHECK-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP0]], [[TMP2]]
+// CHECK-NEXT: call void @_ZN3SumC1Ei(ptr noundef nonnull align 4 dereferenceable(4) [[RETVAL]], i32 noundef [[ADD]])
+// CHECK-NEXT: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw [[CLASS_SUM]], ptr [[RETVAL]], i32 0, i32 0
+// CHECK-NEXT: [[TMP3:%.*]] = load i32, ptr [[COERCE_DIVE]], align 4
+// CHECK-NEXT: ret i32 [[TMP3]]
+//
+//
+// CHECK-LABEL: define internal void @_Z8func_redv.omp_outlined.omp.reduction.reduction_func(
+// CHECK-SAME: ptr noundef [[TMP0:%.*]], ptr noundef [[TMP1:%.*]]) #[[ATTR5:[0-9]+]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[DOTADDR:%.*]] = alloca ptr, align 8
+// CHECK-NEXT: [[DOTADDR1:%.*]] = alloca ptr, align 8
+// CHECK-NEXT: store ptr [[TMP0]], ptr [[DOTADDR]], align 8
+// CHECK-NEXT: store ptr [[TMP1]], ptr [[DOTADDR1]], align 8
+// CHECK-NEXT: [[TMP2:%.*]] = load ptr, ptr [[DOTADDR]], align 8
+// CHECK-NEXT: [[TMP3:%.*]] = load ptr, ptr [[DOTADDR1]], align 8
+// CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds [1 x ptr], ptr [[TMP3]], i64 0, i64 0
+// CHECK-NEXT: [[TMP5:%.*]] = load ptr, ptr [[TMP4]], align 8
+// CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds [1 x ptr], ptr [[TMP2]], i64 0, i64 0
+// CHECK-NEXT: [[TMP7:%.*]] = load ptr, ptr [[TMP6]], align 8
+// CHECK-NEXT: call void @.omp_combiner.(ptr noundef [[TMP7]], ptr noundef [[TMP5]])
+// CHECK-NEXT: ret void
+//
+//
+// CHECK-LABEL: define linkonce_odr void @_ZN3SumC2Ei(
+// CHECK-SAME: ptr noundef nonnull align 4 dereferenceable(4) [[THIS:%.*]], i32 noundef [[V:%.*]]) unnamed_addr #[[ATTR0]] comdat align 2 {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[THIS_ADDR:%.*]] = alloca ptr, align 8
+// CHECK-NEXT: [[V_ADDR:%.*]] = alloca i32, align 4
+// CHECK-NEXT: store ptr [[THIS]], ptr [[THIS_ADDR]], align 8
+// CHECK-NEXT: store i32 [[V]], ptr [[V_ADDR]], align 4
+// CHECK-NEXT: [[THIS1:%.*]] = load ptr, ptr [[THIS_ADDR]], align 8
+// CHECK-NEXT: [[VAL:%.*]] = getelementptr inbounds nuw [[CLASS_SUM:%.*]], ptr [[THIS1]], i32 0, i32 0
+// CHECK-NEXT: [[TMP0:%.*]] = load i32, ptr [[V_ADDR]], align 4
+// CHECK-NEXT: store i32 [[TMP0]], ptr [[VAL]], align 4
+// CHECK-NEXT: ret void
+//
+//
+// CHECK-LABEL: define dso_local void @_Z6do_rediPiRi(
+// CHECK-SAME: i32 noundef [[N:%.*]], ptr noundef [[V:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[SUM_V:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[N_ADDR:%.*]] = alloca i32, align 4
+// CHECK-NEXT: [[V_ADDR:%.*]] = alloca ptr, align 8
+// CHECK-NEXT: [[SUM_V_ADDR:%.*]] = alloca ptr, align 8
+// CHECK-NEXT: [[TMP:%.*]] = alloca ptr, align 8
+// CHECK-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4
+// CHECK-NEXT: [[_TMP1:%.*]] = alloca i32, align 4
+// CHECK-NEXT: [[DOTCAPTURE_EXPR_:%.*]] = alloca i32, align 4
+// CHECK-NEXT: [[DOTCAPTURE_EXPR_2:%.*]] = alloca i32, align 4
+// CHECK-NEXT: [[I:%.*]] = alloca i32, align 4
+// CHECK-NEXT: [[DOTOMP_LB:%.*]] = alloca i32, align 4
+// CHECK-NEXT: [[DOTOMP_UB:%.*]] = alloca i32, align 4
+// CHECK-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4
+// CHECK-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4
+// CHECK-NEXT: [[SUM_V4:%.*]] = alloca i32, align 4
+// CHECK-NEXT: [[_TMP5:%.*]] = alloca ptr, align 8
+// CHECK-NEXT: [[I6:%.*]] = alloca i32, align 4
+// CHECK-NEXT: [[DOTOMP_REDUCTION_RED_LIST:%.*]] = alloca [1 x ptr], align 8
+// CHECK-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_global_thread_num(ptr @[[GLOB3]])
+// CHECK-NEXT: store i32 [[N]], ptr [[N_ADDR]], align 4
+// CHECK-NEXT: store ptr [[V]], ptr [[V_ADDR]], align 8
+// CHECK-NEXT: store ptr [[SUM_V]], ptr [[SUM_V_ADDR]], align 8
+// CHECK-NEXT: [[TMP1:%.*]] = load ptr, ptr [[SUM_V_ADDR]], align 8
+// CHECK-NEXT: store i32 0, ptr [[TMP1]], align 4
+// CHECK-NEXT: [[TMP2:%.*]] = load ptr, ptr [[SUM_V_ADDR]], align 8
+// CHECK-NEXT: store ptr [[TMP2]], ptr [[TMP]], align 8
+// CHECK-NEXT: [[TMP3:%.*]] = load i32, ptr [[N_ADDR]], align 4
+// CHECK-NEXT: store i32 [[TMP3]], ptr [[DOTCAPTURE_EXPR_]], align 4
+// CHECK-NEXT: [[TMP4:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_]], align 4
+// CHECK-NEXT: [[SUB:%.*]] = sub nsw i32 [[TMP4]], 0
+// CHECK-NEXT: [[DIV:%.*]] = sdiv i32 [[SUB]], 1
+// CHECK-NEXT: [[SUB3:%.*]] = sub nsw i32 [[DIV]], 1
+// CHECK-NEXT: store i32 [[SUB3]], ptr [[DOTCAPTURE_EXPR_2]], align 4
+// CHECK-NEXT: store i32 0, ptr [[I]], align 4
+// CHECK-NEXT: [[TMP5:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_]], align 4
+// CHECK-NEXT: [[CMP:%.*]] = icmp slt i32 0, [[TMP5]]
+// CHECK-NEXT: br i1 [[CMP]], label %[[OMP_PRECOND_THEN:.*]], label %[[OMP_PRECOND_END:.*]]
+// CHECK: [[OMP_PRECOND_THEN]]:
+// CHECK-NEXT: store i32 0, ptr [[DOTOMP_LB]], align 4
+// CHECK-NEXT: [[TMP6:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_2]], align 4
+// CHECK-NEXT: store i32 [[TMP6]], ptr [[DOTOMP_UB]], align 4
+// CHECK-NEXT: store i32 1, ptr [[DOTOMP_STRIDE]], align 4
+// CHECK-NEXT: store i32 0, ptr [[DOTOMP_IS_LAST]], align 4
+// CHECK-NEXT: [[TMP7:%.*]] = load ptr, ptr [[TMP]], align 8
+// CHECK-NEXT: store i32 0, ptr [[SUM_V4]], align 4
+// CHECK-NEXT: store ptr [[SUM_V4]], ptr [[_TMP5]], align 8
+// CHECK-NEXT: call void @__kmpc_for_static_init_4(ptr @[[GLOB1]], i32 [[TMP0]], i32 34, ptr [[DOTOMP_IS_LAST]], ptr [[DOTOMP_LB]], ptr [[DOTOMP_UB]], ptr [[DOTOMP_STRIDE]], i32 1, i32 1)
+// CHECK-NEXT: [[TMP8:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4
+// CHECK-NEXT: [[TMP9:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_2]], align 4
+// CHECK-NEXT: [[CMP7:%.*]] = icmp sgt i32 [[TMP8]], [[TMP9]]
+// CHECK-NEXT: br i1 [[CMP7]], label %[[COND_TRUE:.*]], label %[[COND_FALSE:.*]]
+// CHECK: [[COND_TRUE]]:
+// CHECK-NEXT: [[TMP10:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_2]], align 4
+// CHECK-NEXT: br label %[[COND_END:.*]]
+// CHECK: [[COND_FALSE]]:
+// CHECK-NEXT: [[TMP11:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4
+// CHECK-NEXT: br label %[[COND_END]]
+// CHECK: [[COND_END]]:
+// CHECK-NEXT: [[COND:%.*]] = phi i32 [ [[TMP10]], %[[COND_TRUE]] ], [ [[TMP11]], %[[COND_FALSE]] ]
+// CHECK-NEXT: store i32 [[COND]], ptr [[DOTOMP_UB]], align 4
+// CHECK-NEXT: [[TMP12:%.*]] = load i32, ptr [[DOTOMP_LB]], align 4
+// CHECK-NEXT: store i32 [[TMP12]], ptr [[DOTOMP_IV]], align 4
+// CHECK-NEXT: br label %[[OMP_INNER_FOR_COND:.*]]
+// CHECK: [[OMP_INNER_FOR_COND]]:
+// CHECK-NEXT: [[TMP13:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4
+// CHECK-NEXT: [[TMP14:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4
+// CHECK-NEXT: [[CMP8:%.*]] = icmp sle i32 [[TMP13]], [[TMP14]]
+// CHECK-NEXT: br i1 [[CMP8]], label %[[OMP_INNER_FOR_BODY:.*]], label %[[OMP_INNER_FOR_END:.*]]
+// CHECK: [[OMP_INNER_FOR_BODY]]:
+// CHECK-NEXT: [[TMP15:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4
+// CHECK-NEXT: [[MUL:%.*]] = mul nsw i32 [[TMP15]], 1
+// CHECK-NEXT: [[ADD:%.*]] = add nsw i32 0, [[MUL]]
+// CHECK-NEXT: store i32 [[ADD]], ptr [[I6]], align 4
+// CHECK-NEXT: [[TMP16:%.*]] = load ptr, ptr [[V_ADDR]], align 8
+// CHECK-NEXT: [[TMP17:%.*]] = load i32, ptr [[I6]], align 4
+// CHECK-NEXT: [[IDXPROM:%.*]] = sext i32 [[TMP17]] to i64
+// CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[TMP16]], i64 [[IDXPROM]]
+// CHECK-NEXT: [[TMP18:%.*]] = load i32, ptr [[ARRAYIDX]], align 4
+// CHECK-NEXT: [[TMP19:%.*]] = load ptr, ptr [[_TMP5]], align 8
+// CHECK-NEXT: [[TMP20:%.*]] = load i32, ptr [[TMP19]], align 4
+// CHECK-NEXT: [[ADD9:%.*]] = add nsw i32 [[TMP20]], [[TMP18]]
+// CHECK-NEXT: store i32 [[ADD9]], ptr [[TMP19]], align 4
+// CHECK-NEXT: br label %[[OMP_BODY_CONTINUE:.*]]
+// CHECK: [[OMP_BODY_CONTINUE]]:
+// CHECK-NEXT: br label %[[OMP_INNER_FOR_INC:.*]]
+// CHECK: [[OMP_INNER_FOR_INC]]:
+// CHECK-NEXT: [[TMP21:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4
+// CHECK-NEXT: [[ADD10:%.*]] = add nsw i32 [[TMP21]], 1
+// CHECK-NEXT: store i32 [[ADD10]], ptr [[DOTOMP_IV]], align 4
+// CHECK-NEXT: br label %[[OMP_INNER_FOR_COND]]
+// CHECK: [[OMP_INNER_FOR_END]]:
+// CHECK-NEXT: br label %[[OMP_LOOP_EXIT:.*]]
+// CHECK: [[OMP_LOOP_EXIT]]:
+// CHECK-NEXT: call void @__kmpc_for_static_fini(ptr @[[GLOB1]], i32 [[TMP0]])
+// CHECK-NEXT: [[TMP22:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOMP_REDUCTION_RED_LIST]], i64 0, i64 0
+// CHECK-NEXT: store ptr [[SUM_V4]], ptr [[TMP22]], align 8
+// CHECK-NEXT: [[TMP23:%.*]] = call i32 @__kmpc_reduce(ptr @[[GLOB2]], i32 [[TMP0]], i32 1, i64 8, ptr [[DOTOMP_REDUCTION_RED_LIST]], ptr @_Z6do_rediPiRi.omp.reduction.reduction_func, ptr @.gomp_critical_user_.reduction.var)
+// CHECK-NEXT: switch i32 [[TMP23]], [[DOTOMP_REDUCTION_DEFAULT:label %.*]] [
+// CHECK-NEXT: i32 1, [[DOTOMP_REDUCTION_CASE1:label %.*]]
+// CHECK-NEXT: i32 2, [[DOTOMP_REDUCTION_CASE2:label %.*]]
+// CHECK-NEXT: ]
+// CHECK: [[_OMP_REDUCTION_CASE1:.*:]]
+// CHECK-NEXT: [[TMP24:%.*]] = load i32, ptr [[TMP7]], align 4
+// CHECK-NEXT: [[TMP25:%.*]] = load i32, ptr [[SUM_V4]], align 4
+// CHECK-NEXT: [[ADD11:%.*]] = add nsw i32 [[TMP24]], [[TMP25]]
+// CHECK-NEXT: store i32 [[ADD11]], ptr [[TMP7]], align 4
+// CHECK-NEXT: call void @__kmpc_end_reduce(ptr @[[GLOB2]], i32 [[TMP0]], ptr @.gomp_critical_user_.reduction.var)
+// CHECK-NEXT: br [[DOTOMP_REDUCTION_DEFAULT]]
+// CHECK: [[_OMP_REDUCTION_CASE2:.*:]]
+// CHECK-NEXT: [[TMP26:%.*]] = load i32, ptr [[SUM_V4]], align 4
+// CHECK-NEXT: [[TMP27:%.*]] = atomicrmw add ptr [[TMP7]], i32 [[TMP26]] monotonic, align 4
+// CHECK-NEXT: call void @__kmpc_end_reduce(ptr @[[GLOB2]], i32 [[TMP0]], ptr @.gomp_critical_user_.reduction.var)
+// CHECK-NEXT: br [[DOTOMP_REDUCTION_DEFAULT]]
+// CHECK: [[_OMP_REDUCTION_DEFAULT:.*:]]
+// CHECK-NEXT: call void @__kmpc_barrier(ptr @[[GLOB2]], i32 [[TMP0]])
+// CHECK-NEXT: [[TMP28:%.*]] = icmp eq i32 [[TMP0]], 0
+// CHECK-NEXT: br i1 [[TMP28]], label %[[INIT:.*]], label %[[INIT_END:.*]]
+// CHECK: [[INIT]]:
+// CHECK-NEXT: call void @llvm.memset.p0.i64(ptr align 4 @.omp.reduction..internal_private_var.1, i8 0, i64 4, i1 false)
+// CHECK-NEXT: br label %[[INIT_END]]
+// CHECK: [[INIT_END]]:
+// CHECK-NEXT: call void @__kmpc_barrier(ptr @[[GLOB2]], i32 [[TMP0]])
+// CHECK-NEXT: [[TMP29:%.*]] = load i32, ptr [[TMP7]], align 4
+// CHECK-NEXT: [[TMP30:%.*]] = atomicrmw add ptr @.omp.reduction..internal_private_var.1, i32 [[TMP29]] seq_cst, align 4
+// CHECK-NEXT: call void @__kmpc_barrier(ptr @[[GLOB2]], i32 [[TMP0]])
+// CHECK-NEXT: [[TMP31:%.*]] = load i32, ptr @.omp.reduction..internal_private_var.1, align 4
+// CHECK-NEXT: store i32 [[TMP31]], ptr [[TMP7]], align 4
+// CHECK-NEXT: call void @__kmpc_barrier(ptr @[[GLOB2]], i32 [[TMP0]])
+// CHECK-NEXT: br label %[[OMP_PRECOND_END]]
+// CHECK: [[OMP_PRECOND_END]]:
+// CHECK-NEXT: call void @__kmpc_barrier(ptr @[[GLOB4]], i32 [[TMP0]])
+// CHECK-NEXT: ret void
+//
+//
+// CHECK-LABEL: define internal void @_Z6do_rediPiRi.omp.reduction.reduction_func(
+// CHECK-SAME: ptr noundef [[TMP0:%.*]], ptr noundef [[TMP1:%.*]]) #[[ATTR5]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[DOTADDR:%.*]] = alloca ptr, align 8
+// CHECK-NEXT: [[DOTADDR1:%.*]] = alloca ptr, align 8
+// CHECK-NEXT: store ptr [[TMP0]], ptr [[DOTADDR]], align 8
+// CHECK-NEXT: store ptr [[TMP1]], ptr [[DOTADDR1]], align 8
+// CHECK-NEXT: [[TMP2:%.*]] = load ptr, ptr [[DOTADDR]], align 8
+// CHECK-NEXT: [[TMP3:%.*]] = load ptr, ptr [[DOTADDR1]], align 8
+// CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds [1 x ptr], ptr [[TMP3]], i64 0, i64 0
+// CHECK-NEXT: [[TMP5:%.*]] = load ptr, ptr [[TMP4]], align 8
+// CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds [1 x ptr], ptr [[TMP2]], i64 0, i64 0
+// CHECK-NEXT: [[TMP7:%.*]] = load ptr, ptr [[TMP6]], align 8
+// CHECK-NEXT: [[TMP8:%.*]] = load i32, ptr [[TMP7]], align 4
+// CHECK-NEXT: [[TMP9:%.*]] = load i32, ptr [[TMP5]], align 4
+// CHECK-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP8]], [[TMP9]]
+// CHECK-NEXT: store i32 [[ADD]], ptr [[TMP7]], align 4
+// CHECK-NEXT: ret void
+//
+//
+// CHECK-LABEL: define dso_local noundef i32 @main(
+// CHECK-SAME: ) #[[ATTR8:[0-9]+]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[RETVAL:%.*]] = alloca i32, align 4
+// CHECK-NEXT: [[V:%.*]] = alloca [10 x i32], align 16
+// CHECK-NEXT: [[I:%.*]] = alloca i32, align 4
+// CHECK-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_global_thread_num(ptr @[[GLOB3]])
+// CHECK-NEXT: store i32 0, ptr [[RETVAL]], align 4
+// CHECK-NEXT: store i32 0, ptr [[I]], align 4
+// CHECK-NEXT: br label %[[FOR_COND:.*]]
+// CHECK: [[FOR_COND]]:
+// CHECK-NEXT: [[TMP1:%.*]] = load i32, ptr [[I]], align 4
+// CHECK-NEXT: [[CMP:%.*]] = icmp slt i32 [[TMP1]], 10
+// CHECK-NEXT: br i1 [[CMP]], label %[[FOR_BODY:.*]], label %[[FOR_END:.*]]
+// CHECK: [[FOR_BODY]]:
+// CHECK-NEXT: [[TMP2:%.*]] = load i32, ptr [[I]], align 4
+// CHECK-NEXT: [[TMP3:%.*]] = load i32, ptr [[I]], align 4
+// CHECK-NEXT: [[IDXPROM:%.*]] = sext i32 [[TMP3]] to i64
+// CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [10 x i32], ptr [[V]], i64 0, i64 [[IDXPROM]]
+// CHECK-NEXT: store i32 [[TMP2]], ptr [[ARRAYIDX]], align 4
+// CHECK-NEXT: br label %[[FOR_INC:.*]]
+// CHECK: [[FOR_INC]]:
+// CHECK-NEXT: [[TMP4:%.*]] = load i32, ptr [[I]], align 4
+// CHECK-NEXT: [[INC:%.*]] = add nsw i32 [[TMP4]], 1
+// CHECK-NEXT: store i32 [[INC]], ptr [[I]], align 4
+// CHECK-NEXT: br label %[[FOR_COND]], !llvm.loop [[LOOP7:![0-9]+]]
+// CHECK: [[FOR_END]]:
+// CHECK-NEXT: call void @__kmpc_push_num_threads(ptr @[[GLOB3]], i32 [[TMP0]], i32 4)
+// CHECK-NEXT: call void (ptr, i32, ptr, ...) @__kmpc_fork_call(ptr @[[GLOB3]], i32 1, ptr @main.omp_outlined, ptr [[V]])
+// CHECK-NEXT: ret i32 0
+//
+//
+// CHECK-LABEL: define internal void @main.omp_outlined(
+// CHECK-SAME: ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], ptr noundef nonnull align 4 dereferenceable(40) [[V:%.*]]) #[[ATTR2]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8
+// CHECK-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8
+// CHECK-NEXT: [[V_ADDR:%.*]] = alloca ptr, align 8
+// CHECK-NEXT: [[S_V:%.*]] = alloca i32, align 4
+// CHECK-NEXT: store ptr [[DOTGLOBAL_TID_]], ptr [[DOTGLOBAL_TID__ADDR]], align 8
+// CHECK-NEXT: store ptr [[DOTBOUND_TID_]], ptr [[DOTBOUND_TID__ADDR]], align 8
+// CHECK-NEXT: store ptr [[V]], ptr [[V_ADDR]], align 8
+// CHECK-NEXT: [[TMP0:%.*]] = load ptr, ptr [[V_ADDR]], align 8
+// CHECK-NEXT: [[ARRAYDECAY:%.*]] = getelementptr inbounds [10 x i32], ptr [[TMP0]], i64 0, i64 0
+// CHECK-NEXT: call void @_Z6do_rediPiRi(i32 noundef 10, ptr noundef [[ARRAYDECAY]], ptr noundef nonnull align 4 dereferenceable(4) [[S_V]])
+// CHECK-NEXT: ret void
+//
+//.
+// CHECK: attributes #[[ATTR0]] = { mustprogress noinline nounwind optnone "min-legal-vector-width"="0" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+cx8,+mmx,+sse,+sse2,+x87" }
+// CHECK: attributes #[[ATTR1:[0-9]+]] = { nocallback nofree nounwind willreturn memory(argmem: readwrite) }
+// CHECK: attributes #[[ATTR2]] = { noinline norecurse nounwind optnone "min-legal-vector-width"="0" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+cx8,+mmx,+sse,+sse2,+x87" }
+// CHECK: attributes #[[ATTR3]] = { noinline nounwind "min-legal-vector-width"="0" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+cx8,+mmx,+sse,+sse2,+x87" }
+// CHECK: attributes #[[ATTR4:[0-9]+]] = { nounwind }
+// CHECK: attributes #[[ATTR5]] = { noinline norecurse nounwind "min-legal-vector-width"="0" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+cx8,+mmx,+sse,+sse2,+x87" }
+// CHECK: attributes #[[ATTR6:[0-9]+]] = { convergent nounwind }
+// CHECK: attributes #[[ATTR7:[0-9]+]] = { nocallback nofree nounwind willreturn memory(argmem: write) }
+// CHECK: attributes #[[ATTR8]] = { mustprogress noinline norecurse nounwind optnone "min-legal-vector-width"="0" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+cx8,+mmx,+sse,+sse2,+x87" }
+//.
+// CHECK: [[META0:![0-9]+]] = !{i32 1, !"wchar_size", i32 4}
+// CHECK: [[META1:![0-9]+]] = !{i32 7, !"openmp", i32 60}
+// CHECK: [[META2:![0-9]+]] = !{!"{{.*}}clang version {{.*}}"}
+// CHECK: [[LOOP3]] = distinct !{[[LOOP3]], [[META4:![0-9]+]]}
+// CHECK: [[META4]] = !{!"llvm.loop.mustprogress"}
+// CHECK: [[META5:![0-9]+]] = !{[[META6:![0-9]+]]}
+// CHECK: [[META6]] = !{i64 2, i64 -1, i64 -1, i1 true}
+// CHECK: [[LOOP7]] = distinct !{[[LOOP7]], [[META4]]}
+//.