Skip to content

Commit 42166b6

Browse files
authored
[LV] Update forced epilogue VF options to allow different VFs than main. (llvm#190393)
Previously, epilogue vector factors forced via the command line option were required to match the forced main VF (or, more generally, the VF being built). This led to a number of awkward tests where we ended up with dead epilogue vector loops. Update the logic to build an additional VPlan with the epilogue vector factor, and require the provided epilogue VF to be < IC * MainLoopVF. Otherwise, epilogue vectorization is skipped. This only impacts the forced epilogue VF option used for testing; it ensures that epilogue tests cover more realistic scenarios and makes them more robust w.r.t. additional VPlan-based folding. PR: llvm#190393
1 parent 93c349b commit 42166b6

16 files changed

Lines changed: 785 additions & 639 deletions

llvm/lib/Transforms/Vectorize/LoopVectorize.cpp

Lines changed: 35 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -4375,6 +4375,16 @@ std::unique_ptr<VPlan> LoopVectorizationPlanner::selectBestEpiloguePlan(
43754375
}
43764376

43774377
if (EpilogueVectorizationForceVF > 1) {
4378+
if (EpilogueVectorizationForceVF >=
4379+
IC * estimateElementCount(MainLoopVF, CM.getVScaleForTuning())) {
4380+
// Note that the main loop leaves IC * MainLoopVF iterations iff a scalar
4381+
// epilogue is required, but then the epilogue loop also requires a scalar
4382+
// epilogue.
4383+
LLVM_DEBUG(dbgs() << "LEV: Forced epilogue VF results in dead epilogue "
4384+
"vector loop, skipping vectorizing epilogue.\n");
4385+
return nullptr;
4386+
}
4387+
43784388
LLVM_DEBUG(dbgs() << "LEV: Epilogue vectorization factor is forced.\n");
43794389
ElementCount ForcedEC = ElementCount::getFixed(EpilogueVectorizationForceVF);
43804390
if (hasPlanWithVF(ForcedEC)) {
@@ -6792,6 +6802,14 @@ void LoopVectorizationPlanner::plan(ElementCount UserVF, unsigned UserIC) {
67926802
CM.collectInLoopReductions();
67936803
if (CM.selectUserVectorizationFactor(UserVF)) {
67946804
LLVM_DEBUG(dbgs() << "LV: Using user VF " << UserVF << ".\n");
6805+
ElementCount EpilogueUserVF =
6806+
ElementCount::getFixed(EpilogueVectorizationForceVF);
6807+
if (EpilogueUserVF.isVector() &&
6808+
ElementCount::isKnownLT(EpilogueUserVF, UserVF) &&
6809+
CM.selectUserVectorizationFactor(EpilogueUserVF)) {
6810+
// Build a separate plan for the forced epilogue VF.
6811+
buildVPlansWithVPRecipes(EpilogueUserVF, EpilogueUserVF);
6812+
}
67956813
buildVPlansWithVPRecipes(UserVF, UserVF);
67966814
LLVM_DEBUG(printPlans(dbgs()));
67976815
return;
@@ -7179,8 +7197,23 @@ LoopVectorizationPlanner::computeBestVF() {
71797197
return {VectorizationFactor::Disabled(), nullptr};
71807198
// If there is a single VPlan with a single VF, return it directly.
71817199
VPlan &FirstPlan = *VPlans[0];
7182-
if (VPlans.size() == 1 && size(FirstPlan.vectorFactors()) == 1)
7183-
return {{FirstPlan.getSingleVF(), 0, 0}, &FirstPlan};
7200+
ElementCount UserVF = Hints.getWidth();
7201+
if (hasPlanWithVF(UserVF)) {
7202+
if (VPlans.size() == 1) {
7203+
assert(FirstPlan.getSingleVF() == UserVF &&
7204+
"UserVF must match single VF");
7205+
return {VectorizationFactor(FirstPlan.getSingleVF(), 0, 0), &FirstPlan};
7206+
}
7207+
if (EpilogueVectorizationForceVF > 1) {
7208+
assert(VPlans.size() == 2 && "Must have exactly 2 VPlans built");
7209+
assert(VPlans[0]->getSingleVF() ==
7210+
ElementCount::getFixed(EpilogueVectorizationForceVF) &&
7211+
"expected first plan to be for the forced epilogue VF");
7212+
assert(VPlans[1]->getSingleVF() == UserVF &&
7213+
"expected second plan to be for the forced UserVF");
7214+
return {VectorizationFactor(UserVF, 0, 0), VPlans[1].get()};
7215+
}
7216+
}
71847217

71857218
LLVM_DEBUG(dbgs() << "LV: Computing best VF using cost kind: "
71867219
<< (CM.CostKind == TTI::TCK_RecipThroughput

llvm/test/Transforms/LoopVectorize/AArch64/check-prof-info.ll

Lines changed: 97 additions & 97 deletions
Large diffs are not rendered by default.

llvm/test/Transforms/LoopVectorize/AArch64/epilog-vectorization-widen-inductions.ll

Lines changed: 76 additions & 68 deletions
Large diffs are not rendered by default.

llvm/test/Transforms/LoopVectorize/branch-weights.ll

Lines changed: 12 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --filter "br " --filter "^.*:" --filter "icmp" --version 5
2-
; RUN: opt < %s -S -passes=loop-vectorize -force-vector-interleave=1 -force-vector-width=4 -enable-epilogue-vectorization \
2+
; RUN: opt < %s -S -passes=loop-vectorize -force-vector-interleave=1 -force-vector-width=8 -enable-epilogue-vectorization \
33
; RUN: -epilogue-vectorization-force-VF=4 | FileCheck %s --check-prefix=MAINVF4IC1_EPI4
44
; RUN: opt < %s -S -passes=loop-vectorize -force-vector-interleave=2 -force-vector-width=4 -enable-epilogue-vectorization \
55
; RUN: -epilogue-vectorization-force-VF=4 | FileCheck %s --check-prefix=MAINVF4IC2_EPI4
@@ -18,7 +18,7 @@ define void @f0(i8 %n, i32 %len, ptr %p) !prof !0 {
1818
; MAINVF4IC1_EPI4: [[TMP3:%.*]] = icmp ugt i32 [[LEN]], 255
1919
; MAINVF4IC1_EPI4: br i1 [[TMP4:%.*]], label %[[VEC_EPILOG_SCALAR_PH]], label %[[VECTOR_MAIN_LOOP_ITER_CHECK:.*]], !prof [[PROF2]]
2020
; MAINVF4IC1_EPI4: [[VECTOR_MAIN_LOOP_ITER_CHECK]]:
21-
; MAINVF4IC1_EPI4: [[MIN_ITERS_CHECK1:%.*]] = icmp ult i32 [[TMP0]], 4
21+
; MAINVF4IC1_EPI4: [[MIN_ITERS_CHECK1:%.*]] = icmp ult i32 [[TMP0]], 8
2222
; MAINVF4IC1_EPI4: br i1 [[MIN_ITERS_CHECK1]], label %[[VEC_EPILOG_PH:.*]], label %[[VECTOR_PH:.*]], !prof [[PROF2]]
2323
; MAINVF4IC1_EPI4: [[VECTOR_PH]]:
2424
; MAINVF4IC1_EPI4: br label %[[VECTOR_BODY:.*]]
@@ -38,12 +38,12 @@ define void @f0(i8 %n, i32 %len, ptr %p) !prof !0 {
3838
; MAINVF4IC1_EPI4: br i1 [[TMP12]], label %[[VEC_EPILOG_MIDDLE_BLOCK:.*]], label %[[VEC_EPILOG_VECTOR_BODY]], !prof [[PROF10:![0-9]+]], !llvm.loop [[LOOP11:![0-9]+]]
3939
; MAINVF4IC1_EPI4: [[VEC_EPILOG_MIDDLE_BLOCK]]:
4040
; MAINVF4IC1_EPI4: [[CMP_N8:%.*]] = icmp eq i32 [[TMP0]], [[N_VEC3]]
41-
; MAINVF4IC1_EPI4: br i1 [[CMP_N8]], label %[[EXIT_LOOPEXIT]], label %[[VEC_EPILOG_SCALAR_PH]], !prof [[PROF8]]
41+
; MAINVF4IC1_EPI4: br i1 [[CMP_N8]], label %[[EXIT_LOOPEXIT]], label %[[VEC_EPILOG_SCALAR_PH]], !prof [[PROF13:![0-9]+]]
4242
; MAINVF4IC1_EPI4: [[VEC_EPILOG_SCALAR_PH]]:
4343
; MAINVF4IC1_EPI4: br label %[[LOOP:.*]]
4444
; MAINVF4IC1_EPI4: [[LOOP]]:
4545
; MAINVF4IC1_EPI4: [[CMP_LOOP:%.*]] = icmp ult i32 [[I32:%.*]], [[LEN]]
46-
; MAINVF4IC1_EPI4: br i1 [[CMP_LOOP]], label %[[LOOP]], label %[[EXIT_LOOPEXIT]], !prof [[PROF13:![0-9]+]], !llvm.loop [[LOOP14:![0-9]+]]
46+
; MAINVF4IC1_EPI4: br i1 [[CMP_LOOP]], label %[[LOOP]], label %[[EXIT_LOOPEXIT]], !prof [[PROF14:![0-9]+]], !llvm.loop [[LOOP15:![0-9]+]]
4747
; MAINVF4IC1_EPI4: [[EXIT_LOOPEXIT]]:
4848
; MAINVF4IC1_EPI4: br label %[[EXIT]]
4949
; MAINVF4IC1_EPI4: [[EXIT]]:
@@ -119,19 +119,20 @@ exit:
119119
; MAINVF4IC1_EPI4: [[PROF0]] = !{!"function_entry_count", i64 13}
120120
; MAINVF4IC1_EPI4: [[PROF1]] = !{!"branch_weights", i32 12, i32 1}
121121
; MAINVF4IC1_EPI4: [[PROF2]] = !{!"branch_weights", i32 1, i32 127}
122-
; MAINVF4IC1_EPI4: [[PROF3]] = !{!"branch_weights", i32 1, i32 307}
122+
; MAINVF4IC1_EPI4: [[PROF3]] = !{!"branch_weights", i32 1, i32 153}
123123
; MAINVF4IC1_EPI4: [[LOOP4]] = distinct !{[[LOOP4]], [[META5:![0-9]+]], [[META6:![0-9]+]], [[META7:![0-9]+]]}
124124
; MAINVF4IC1_EPI4: [[META5]] = !{!"llvm.loop.isvectorized", i32 1}
125125
; MAINVF4IC1_EPI4: [[META6]] = !{!"llvm.loop.unroll.runtime.disable"}
126-
; MAINVF4IC1_EPI4: [[META7]] = !{!"llvm.loop.estimated_trip_count", i32 308}
127-
; MAINVF4IC1_EPI4: [[PROF8]] = !{!"branch_weights", i32 1, i32 3}
128-
; MAINVF4IC1_EPI4: [[PROF9]] = !{!"branch_weights", i32 4, i32 0}
126+
; MAINVF4IC1_EPI4: [[META7]] = !{!"llvm.loop.estimated_trip_count", i32 154}
127+
; MAINVF4IC1_EPI4: [[PROF8]] = !{!"branch_weights", i32 1, i32 7}
128+
; MAINVF4IC1_EPI4: [[PROF9]] = !{!"branch_weights", i32 4, i32 4}
129129
; MAINVF4IC1_EPI4: [[PROF10]] = !{!"branch_weights", i32 1, i32 0}
130130
; MAINVF4IC1_EPI4: [[LOOP11]] = distinct !{[[LOOP11]], [[META5]], [[META6]], [[META12:![0-9]+]]}
131131
; MAINVF4IC1_EPI4: [[META12]] = !{!"llvm.loop.estimated_trip_count", i32 0}
132-
; MAINVF4IC1_EPI4: [[PROF13]] = !{!"branch_weights", i32 2, i32 1}
133-
; MAINVF4IC1_EPI4: [[LOOP14]] = distinct !{[[LOOP14]], [[META5]], [[META15:![0-9]+]]}
134-
; MAINVF4IC1_EPI4: [[META15]] = !{!"llvm.loop.estimated_trip_count", i32 3}
132+
; MAINVF4IC1_EPI4: [[PROF13]] = !{!"branch_weights", i32 1, i32 3}
133+
; MAINVF4IC1_EPI4: [[PROF14]] = !{!"branch_weights", i32 2, i32 1}
134+
; MAINVF4IC1_EPI4: [[LOOP15]] = distinct !{[[LOOP15]], [[META5]], [[META16:![0-9]+]]}
135+
; MAINVF4IC1_EPI4: [[META16]] = !{!"llvm.loop.estimated_trip_count", i32 3}
135136
;.
136137
; MAINVF4IC2_EPI4: [[PROF0]] = !{!"function_entry_count", i64 13}
137138
; MAINVF4IC2_EPI4: [[PROF1]] = !{!"branch_weights", i32 12, i32 1}

0 commit comments

Comments
 (0)