Skip to content

Commit 92a64f9

Browse files
committed
[VPlan] Compute interleave count for VPlan.
Move selectInterleaveCount to LoopVectorizationPlanner and retrieve some information directly from VPlan. Register pressure was already computed for a VPlan; with this patch we now also check for reductions directly on the VPlan and count how many load and store operations remain in the loop. This should be mostly NFC: we may compute slightly different interleave counts only in some edge cases, e.g. where dead loads have been removed. This shouldn't happen in practice, and the patch doesn't cause changes across a large test corpus on AArch64. Computing the interleave count based on VPlan allows for making better decisions in the presence of VPlan optimizations, for example when operations on interleave groups are narrowed. Note that there are a few test changes for tests that were still checking the legacy cost-model output when it was computed in selectInterleaveCount.
1 parent c300a99 commit 92a64f9

File tree

5 files changed

+97
-55
lines changed

5 files changed

+97
-55
lines changed

llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -486,6 +486,9 @@ class LoopVectorizationPlanner {
486486
/// all profitable VFs in ProfitableVFs.
487487
VectorizationFactor computeBestVF();
488488

489+
unsigned selectInterleaveCount(VPlan &Plan, ElementCount VF,
490+
InstructionCost LoopCost);
491+
489492
/// Generate the IR code for the vectorized loop captured in VPlan \p BestPlan
490493
/// according to the best selected \p VF and \p UF.
491494
///

llvm/lib/Transforms/Vectorize/LoopVectorize.cpp

Lines changed: 74 additions & 38 deletions
Original file line numberDiff line numberDiff line change
@@ -974,13 +974,6 @@ class LoopVectorizationCostModel {
974974
/// 64 bit loop indices.
975975
std::pair<unsigned, unsigned> getSmallestAndWidestTypes();
976976

977-
/// \return The desired interleave count.
978-
/// If interleave count has been specified by metadata it will be returned.
979-
/// Otherwise, the interleave count is computed and returned. VF and LoopCost
980-
/// are the selected vectorization factor and the cost of the selected VF.
981-
unsigned selectInterleaveCount(VPlan &Plan, ElementCount VF,
982-
InstructionCost LoopCost);
983-
984977
/// Memory access instruction may be vectorized in more than one way.
985978
/// Form of instruction after vectorization depends on cost.
986979
/// This function takes cost-based decisions for Load/Store instructions
@@ -4653,8 +4646,8 @@ void LoopVectorizationCostModel::collectElementTypesForWidening() {
46534646
}
46544647

46554648
unsigned
4656-
LoopVectorizationCostModel::selectInterleaveCount(VPlan &Plan, ElementCount VF,
4657-
InstructionCost LoopCost) {
4649+
LoopVectorizationPlanner::selectInterleaveCount(VPlan &Plan, ElementCount VF,
4650+
InstructionCost LoopCost) {
46584651
// -- The interleave heuristics --
46594652
// We interleave the loop in order to expose ILP and reduce the loop overhead.
46604653
// There are many micro-architectural considerations that we can't predict
@@ -4669,11 +4662,11 @@ LoopVectorizationCostModel::selectInterleaveCount(VPlan &Plan, ElementCount VF,
46694662
// 3. We don't interleave if we think that we will spill registers to memory
46704663
// due to the increased register pressure.
46714664

4672-
if (!isScalarEpilogueAllowed())
4665+
if (!CM.isScalarEpilogueAllowed())
46734666
return 1;
46744667

4675-
// Do not interleave if EVL is preferred and no User IC is specified.
4676-
if (foldTailWithEVL()) {
4668+
if (any_of(Plan.getVectorLoopRegion()->getEntryBasicBlock()->phis(),
4669+
IsaPred<VPEVLBasedIVPHIRecipe>)) {
46774670
LLVM_DEBUG(dbgs() << "LV: Preference for VP intrinsics indicated. "
46784671
"Unroll factor forced to be 1.\n");
46794672
return 1;
@@ -4686,15 +4679,20 @@ LoopVectorizationCostModel::selectInterleaveCount(VPlan &Plan, ElementCount VF,
46864679
// We don't attempt to perform interleaving for loops with uncountable early
46874680
// exits because the VPInstruction::AnyOf code cannot currently handle
46884681
// multiple parts.
4689-
if (Legal->hasUncountableEarlyExit())
4682+
if (Plan.hasEarlyExit())
46904683
return 1;
46914684

4692-
const bool HasReductions = !Legal->getReductionVars().empty();
4685+
const bool HasReductions =
4686+
any_of(Plan.getVectorLoopRegion()->getEntryBasicBlock()->phis(),
4687+
IsaPred<VPReductionPHIRecipe>);
46934688

46944689
// If we did not calculate the cost for VF (because the user selected the VF)
46954690
// then we calculate the cost of VF here.
46964691
if (LoopCost == 0) {
4697-
LoopCost = expectedCost(VF);
4692+
if (VF.isScalar())
4693+
LoopCost = CM.expectedCost(VF);
4694+
else
4695+
LoopCost = cost(Plan, VF);
46984696
assert(LoopCost.isValid() && "Expected to have chosen a VF with valid cost");
46994697

47004698
// Loop body is free and there is no need for interleaving.
@@ -4703,7 +4701,7 @@ LoopVectorizationCostModel::selectInterleaveCount(VPlan &Plan, ElementCount VF,
47034701
}
47044702

47054703
VPRegisterUsage R =
4706-
calculateRegisterUsageForPlan(Plan, {VF}, TTI, ValuesToIgnore)[0];
4704+
calculateRegisterUsageForPlan(Plan, {VF}, TTI, CM.ValuesToIgnore)[0];
47074705
// We divide by these constants so assume that we have at least one
47084706
// instruction that uses at least one register.
47094707
for (auto &Pair : R.MaxLocalUsers) {
@@ -4766,23 +4764,24 @@ LoopVectorizationCostModel::selectInterleaveCount(VPlan &Plan, ElementCount VF,
47664764

47674765
// Try to get the exact trip count, or an estimate based on profiling data or
47684766
// ConstantMax from PSE, failing that.
4769-
auto BestKnownTC = getSmallBestKnownTC(PSE, TheLoop);
4767+
auto BestKnownTC = getSmallBestKnownTC(PSE, OrigLoop);
47704768

47714769
// For fixed length VFs treat a scalable trip count as unknown.
47724770
if (BestKnownTC && (BestKnownTC->isFixed() || VF.isScalable())) {
47734771
// Re-evaluate trip counts and VFs to be in the same numerical space.
4774-
unsigned AvailableTC = estimateElementCount(*BestKnownTC, VScaleForTuning);
4775-
unsigned EstimatedVF = estimateElementCount(VF, VScaleForTuning);
4772+
unsigned AvailableTC =
4773+
estimateElementCount(*BestKnownTC, CM.getVScaleForTuning());
4774+
unsigned EstimatedVF = estimateElementCount(VF, CM.getVScaleForTuning());
47764775

47774776
// At least one iteration must be scalar when this constraint holds. So the
47784777
// maximum available iterations for interleaving is one less.
4779-
if (requiresScalarEpilogue(VF.isVector()))
4778+
if (CM.requiresScalarEpilogue(VF.isVector()))
47804779
--AvailableTC;
47814780

47824781
unsigned InterleaveCountLB = bit_floor(std::max(
47834782
1u, std::min(AvailableTC / (EstimatedVF * 2), MaxInterleaveCount)));
47844783

4785-
if (getSmallConstantTripCount(PSE.getSE(), TheLoop).isNonZero()) {
4784+
if (getSmallConstantTripCount(PSE.getSE(), OrigLoop).isNonZero()) {
47864785
// If the best known trip count is exact, we select between two
47874786
// prospective ICs, where
47884787
//
@@ -4843,7 +4842,7 @@ LoopVectorizationCostModel::selectInterleaveCount(VPlan &Plan, ElementCount VF,
48434842
// vectorized the loop we will have done the runtime check and so interleaving
48444843
// won't require further checks.
48454844
bool ScalarInterleavingRequiresPredication =
4846-
(VF.isScalar() && any_of(TheLoop->blocks(), [this](BasicBlock *BB) {
4845+
(VF.isScalar() && any_of(OrigLoop->blocks(), [this](BasicBlock *BB) {
48474846
return Legal->blockNeedsPredication(BB);
48484847
}));
48494848
bool ScalarInterleavingRequiresRuntimePointerCheck =
@@ -4866,8 +4865,39 @@ LoopVectorizationCostModel::selectInterleaveCount(VPlan &Plan, ElementCount VF,
48664865

48674866
// Interleave until store/load ports (estimated by max interleave count) are
48684867
// saturated.
4869-
unsigned NumStores = Legal->getNumStores();
4870-
unsigned NumLoads = Legal->getNumLoads();
4868+
unsigned NumStores = 0;
4869+
unsigned NumLoads = 0;
4870+
for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(
4871+
vp_depth_first_deep(Plan.getVectorLoopRegion()->getEntry()))) {
4872+
for (VPRecipeBase &R : *VPBB) {
4873+
if (isa<VPWidenLoadRecipe, VPWidenLoadEVLRecipe>(&R)) {
4874+
NumLoads++;
4875+
continue;
4876+
}
4877+
if (isa<VPWidenStoreRecipe, VPWidenStoreEVLRecipe>(&R)) {
4878+
NumStores++;
4879+
continue;
4880+
}
4881+
4882+
if (auto *InterleaveR = dyn_cast<VPInterleaveRecipe>(&R)) {
4883+
if (unsigned StoreOps = InterleaveR->getNumStoreOperands())
4884+
NumStores += StoreOps;
4885+
else
4886+
NumLoads += InterleaveR->getNumDefinedValues();
4887+
continue;
4888+
}
4889+
if (auto *RepR = dyn_cast<VPReplicateRecipe>(&R)) {
4890+
NumLoads += isa<LoadInst>(RepR->getUnderlyingInstr());
4891+
NumStores += isa<StoreInst>(RepR->getUnderlyingInstr());
4892+
continue;
4893+
}
4894+
if (isa<VPHistogramRecipe>(&R)) {
4895+
NumLoads++;
4896+
NumStores++;
4897+
continue;
4898+
}
4899+
}
4900+
}
48714901
unsigned StoresIC = IC / (NumStores ? NumStores : 1);
48724902
unsigned LoadsIC = IC / (NumLoads ? NumLoads : 1);
48734903

@@ -4877,12 +4907,15 @@ LoopVectorizationCostModel::selectInterleaveCount(VPlan &Plan, ElementCount VF,
48774907
// do the final reduction after the loop.
48784908
bool HasSelectCmpReductions =
48794909
HasReductions &&
4880-
any_of(Legal->getReductionVars(), [&](auto &Reduction) -> bool {
4881-
const RecurrenceDescriptor &RdxDesc = Reduction.second;
4882-
RecurKind RK = RdxDesc.getRecurrenceKind();
4883-
return RecurrenceDescriptor::isAnyOfRecurrenceKind(RK) ||
4884-
RecurrenceDescriptor::isFindIVRecurrenceKind(RK);
4885-
});
4910+
any_of(Plan.getVectorLoopRegion()->getEntryBasicBlock()->phis(),
4911+
[](VPRecipeBase &R) {
4912+
auto *RedR = dyn_cast<VPReductionPHIRecipe>(&R);
4913+
4914+
return RedR && (RecurrenceDescriptor::isAnyOfRecurrenceKind(
4915+
RedR->getRecurrenceKind()) ||
4916+
RecurrenceDescriptor::isFindIVRecurrenceKind(
4917+
RedR->getRecurrenceKind()));
4918+
});
48864919
if (HasSelectCmpReductions) {
48874920
LLVM_DEBUG(dbgs() << "LV: Not interleaving select-cmp reductions.\n");
48884921
return 1;
@@ -4893,12 +4926,14 @@ LoopVectorizationCostModel::selectInterleaveCount(VPlan &Plan, ElementCount VF,
48934926
// we're interleaving is inside another loop. For tree-wise reductions
48944927
// set the limit to 2, and for ordered reductions it's best to disable
48954928
// interleaving entirely.
4896-
if (HasReductions && TheLoop->getLoopDepth() > 1) {
4929+
if (HasReductions && OrigLoop->getLoopDepth() > 1) {
48974930
bool HasOrderedReductions =
4898-
any_of(Legal->getReductionVars(), [&](auto &Reduction) -> bool {
4899-
const RecurrenceDescriptor &RdxDesc = Reduction.second;
4900-
return RdxDesc.isOrdered();
4901-
});
4931+
any_of(Plan.getVectorLoopRegion()->getEntryBasicBlock()->phis(),
4932+
[](VPRecipeBase &R) {
4933+
auto *RedR = dyn_cast<VPReductionPHIRecipe>(&R);
4934+
4935+
return RedR && RedR->isOrdered();
4936+
});
49024937
if (HasOrderedReductions) {
49034938
LLVM_DEBUG(
49044939
dbgs() << "LV: Not interleaving scalar ordered reductions.\n");
@@ -10114,8 +10149,11 @@ bool LoopVectorizePass::processLoop(Loop *L) {
1011410149

1011510150
GeneratedRTChecks Checks(PSE, DT, LI, TTI, F->getDataLayout(), CM.CostKind);
1011610151
if (LVP.hasPlanWithVF(VF.Width)) {
10152+
VPCostContext CostCtx(CM.TTI, *CM.TLI, CM.Legal->getWidestInductionType(),
10153+
CM, CM.CostKind);
10154+
1011710155
// Select the interleave count.
10118-
IC = CM.selectInterleaveCount(LVP.getPlanFor(VF.Width), VF.Width, VF.Cost);
10156+
IC = LVP.selectInterleaveCount(LVP.getPlanFor(VF.Width), VF.Width, VF.Cost);
1011910157

1012010158
unsigned SelectedIC = std::max(IC, UserIC);
1012110159
// Optimistically generate runtime checks if they are needed. Drop them if
@@ -10137,8 +10175,6 @@ bool LoopVectorizePass::processLoop(Loop *L) {
1013710175
// Check if it is profitable to vectorize with runtime checks.
1013810176
bool ForceVectorization =
1013910177
Hints.getForce() == LoopVectorizeHints::FK_Enabled;
10140-
VPCostContext CostCtx(CM.TTI, *CM.TLI, CM.Legal->getWidestInductionType(),
10141-
CM, CM.CostKind);
1014210178
if (!ForceVectorization &&
1014310179
!isOutsideLoopWorkProfitable(Checks, VF, L, PSE, CostCtx,
1014410180
LVP.getPlanFor(VF.Width), SEL,

llvm/lib/Transforms/Vectorize/VPlan.h

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -4229,7 +4229,10 @@ class VPlan {
42294229
/// block with multiple predecessors (one for the exit via the latch and one
42304230
/// via the other early exit).
42314231
bool hasEarlyExit() const {
4232-
return ExitBlocks.size() > 1 ||
4232+
return count_if(ExitBlocks,
4233+
[](VPIRBasicBlock *EB) {
4234+
return EB->getNumPredecessors() != 0;
4235+
}) > 1 ||
42334236
(ExitBlocks.size() == 1 && ExitBlocks[0]->getNumPredecessors() > 1);
42344237
}
42354238

llvm/test/Transforms/LoopVectorize/AArch64/predication_costs.ll

Lines changed: 14 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -19,7 +19,7 @@ target triple = "aarch64--linux-gnu"
1919
; (udiv(2) + extractelement(8) + insertelement(4)) / 2 = 7
2020
;
2121
; CHECK: Scalarizing and predicating: %tmp4 = udiv i32 %tmp2, %tmp3
22-
; CHECK: Found an estimated cost of 7 for VF 2 For instruction: %tmp4 = udiv i32 %tmp2, %tmp3
22+
; CHECK: Cost of 7 for VF 2: profitable to scalarize %tmp4 = udiv i32 %tmp2, %tmp3
2323
;
2424
define i32 @predicated_udiv(ptr %a, ptr %b, i1 %c, i64 %n) {
2525
entry:
@@ -60,7 +60,7 @@ for.end:
6060
; (store(4) + extractelement(4)) / 2 = 4
6161
;
6262
; CHECK: Scalarizing and predicating: store i32 %tmp2, ptr %tmp0, align 4
63-
; CHECK: Found an estimated cost of 4 for VF 2 For instruction: store i32 %tmp2, ptr %tmp0, align 4
63+
; CHECK: Cost of 4 for VF 2: profitable to scalarize store i32 %tmp2, ptr %tmp0, align 4
6464
;
6565
define void @predicated_store(ptr %a, i1 %c, i32 %x, i64 %n) {
6666
entry:
@@ -93,8 +93,8 @@ for.end:
9393
; CHECK: Found scalar instruction: %addr = phi ptr [ %a, %entry ], [ %addr.next, %for.inc ]
9494
; CHECK: Found scalar instruction: %addr.next = getelementptr inbounds i32, ptr %addr, i64 1
9595
; CHECK: Scalarizing and predicating: store i32 %tmp2, ptr %addr, align 4
96-
; CHECK: Found an estimated cost of 0 for VF 2 For instruction: %addr = phi ptr [ %a, %entry ], [ %addr.next, %for.inc ]
97-
; CHECK: Found an estimated cost of 4 for VF 2 For instruction: store i32 %tmp2, ptr %addr, align 4
96+
; CHECK: Cost of 0 for VF 2: induction instruction %addr = phi ptr [ %a, %entry ], [ %addr.next, %for.inc ]
97+
; CHECK: Cost of 4 for VF 2: profitable to scalarize store i32 %tmp2, ptr %addr, align 4
9898
;
9999
define void @predicated_store_phi(ptr %a, i1 %c, i32 %x, i64 %n) {
100100
entry:
@@ -135,9 +135,10 @@ for.end:
135135
;
136136
; CHECK: Scalarizing: %tmp3 = add nsw i32 %tmp2, %x
137137
; CHECK: Scalarizing and predicating: %tmp4 = udiv i32 %tmp2, %tmp3
138-
; CHECK: Found an estimated cost of 3 for VF 2 For instruction: %tmp3 = add nsw i32 %tmp2, %x
139-
; CHECK: Found an estimated cost of 5 for VF 2 For instruction: %tmp4 = udiv i32 %tmp2, %tmp3
138+
; CHECK: Cost of 3 for VF 2: profitable to scalarize %tmp3 = add nsw i32 %tmp2, %x
139+
; CHECK: Cost of 5 for VF 2: profitable to scalarize %tmp4 = udiv i32 %tmp2, %tmp3
140140
;
141+
141142
define i32 @predicated_udiv_scalarized_operand(ptr %a, i1 %c, i32 %x, i64 %n) {
142143
entry:
143144
br label %for.body
@@ -180,8 +181,8 @@ for.end:
180181
;
181182
; CHECK: Scalarizing: %tmp2 = add nsw i32 %tmp1, %x
182183
; CHECK: Scalarizing and predicating: store i32 %tmp2, ptr %tmp0, align 4
183-
; CHECK: Found an estimated cost of 3 for VF 2 For instruction: %tmp2 = add nsw i32 %tmp1, %x
184-
; CHECK: Found an estimated cost of 2 for VF 2 For instruction: store i32 %tmp2, ptr %tmp0, align 4
184+
; CHECK: Cost of 3 for VF 2: profitable to scalarize %tmp2 = add nsw i32 %tmp1, %x
185+
; CHECK: Cost of 2 for VF 2: profitable to scalarize store i32 %tmp2, ptr %tmp0, align 4
185186
;
186187
define void @predicated_store_scalarized_operand(ptr %a, i1 %c, i32 %x, i64 %n) {
187188
entry:
@@ -232,11 +233,11 @@ for.end:
232233
; CHECK: Scalarizing and predicating: %tmp4 = udiv i32 %tmp3, %tmp2
233234
; CHECK: Scalarizing: %tmp5 = sub i32 %tmp4, %x
234235
; CHECK: Scalarizing and predicating: store i32 %tmp5, ptr %tmp0, align 4
235-
; CHECK: Found an estimated cost of 1 for VF 2 For instruction: %tmp2 = add i32 %tmp1, %x
236-
; CHECK: Found an estimated cost of 7 for VF 2 For instruction: %tmp3 = sdiv i32 %tmp1, %tmp2
237-
; CHECK: Found an estimated cost of 7 for VF 2 For instruction: %tmp4 = udiv i32 %tmp3, %tmp2
238-
; CHECK: Found an estimated cost of 3 for VF 2 For instruction: %tmp5 = sub i32 %tmp4, %x
239-
; CHECK: Found an estimated cost of 2 for VF 2 For instruction: store i32 %tmp5, ptr %tmp0, align 4
236+
; CHECK: Cost of 7 for VF 2: profitable to scalarize %tmp4 = udiv i32 %tmp3, %tmp2
237+
; CHECK: Cost of 7 for VF 2: profitable to scalarize %tmp3 = sdiv i32 %tmp1, %tmp2
238+
; CHECK: Cost of 2 for VF 2: profitable to scalarize store i32 %tmp5, ptr %tmp0, align 4
239+
; CHECK: Cost of 3 for VF 2: profitable to scalarize %tmp5 = sub i32 %tmp4, %x
240+
; CHECK: Cost of 1 for VF 2: WIDEN ir<%tmp2> = add ir<%tmp1>, ir<%x>
240241
;
241242
define void @predication_multi_context(ptr %a, i1 %c, i32 %x, i64 %n) {
242243
entry:

llvm/test/Transforms/LoopVectorize/SystemZ/load-store-scalarization-cost.ll

Lines changed: 2 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -27,7 +27,6 @@ for.end:
2727
; CHECK: LV: Scalarizing: %tmp1 = load i32, ptr %tmp0, align 4
2828
; CHECK: LV: Scalarizing: store i32 %tmp2, ptr %tmp0, align 4
2929

30-
; CHECK: LV: Found an estimated cost of 4 for VF 4 For instruction: %tmp1 = load i32, ptr %tmp0, align 4
31-
; CHECK: LV: Found an estimated cost of 4 for VF 4 For instruction: store i32 %tmp2, ptr %tmp0, align 4
30+
; CHECK: Cost of 4 for VF 4: REPLICATE ir<%tmp1> = load ir<%tmp0>
31+
; CHECK: Cost of 4 for VF 4: REPLICATE store ir<%tmp2>, ir<%tmp0>
3232
}
33-

0 commit comments

Comments
 (0)