@@ -974,13 +974,6 @@ class LoopVectorizationCostModel {
974
974
/// 64 bit loop indices.
975
975
std::pair<unsigned , unsigned > getSmallestAndWidestTypes ();
976
976
977
- // / \return The desired interleave count.
978
- // / If interleave count has been specified by metadata it will be returned.
979
- // / Otherwise, the interleave count is computed and returned. VF and LoopCost
980
- // / are the selected vectorization factor and the cost of the selected VF.
981
- unsigned selectInterleaveCount (VPlan &Plan, ElementCount VF,
982
- InstructionCost LoopCost);
983
-
984
977
/// Memory access instruction may be vectorized in more than one way.
985
978
/// Form of instruction after vectorization depends on cost.
986
979
/// This function takes cost-based decisions for Load/Store instructions
@@ -4653,8 +4646,8 @@ void LoopVectorizationCostModel::collectElementTypesForWidening() {
4653
4646
}
4654
4647
4655
4648
unsigned
4656
- LoopVectorizationCostModel ::selectInterleaveCount (VPlan &Plan, ElementCount VF,
4657
- InstructionCost LoopCost) {
4649
+ LoopVectorizationPlanner ::selectInterleaveCount (VPlan &Plan, ElementCount VF,
4650
+ InstructionCost LoopCost) {
4658
4651
// -- The interleave heuristics --
4659
4652
// We interleave the loop in order to expose ILP and reduce the loop overhead.
4660
4653
// There are many micro-architectural considerations that we can't predict
@@ -4669,11 +4662,11 @@ LoopVectorizationCostModel::selectInterleaveCount(VPlan &Plan, ElementCount VF,
4669
4662
// 3. We don't interleave if we think that we will spill registers to memory
4670
4663
// due to the increased register pressure.
4671
4664
4672
- if (!isScalarEpilogueAllowed ())
4665
+ if (!CM. isScalarEpilogueAllowed ())
4673
4666
return 1 ;
4674
4667
4675
- // Do not interleave if EVL is preferred and no User IC is specified.
4676
- if ( foldTailWithEVL ( )) {
4668
+ if ( any_of (Plan. getVectorLoopRegion ()-> getEntryBasicBlock ()-> phis (),
4669
+ IsaPred<VPEVLBasedIVPHIRecipe> )) {
4677
4670
LLVM_DEBUG (dbgs () << " LV: Preference for VP intrinsics indicated. "
4678
4671
" Unroll factor forced to be 1.\n " );
4679
4672
return 1 ;
@@ -4686,15 +4679,20 @@ LoopVectorizationCostModel::selectInterleaveCount(VPlan &Plan, ElementCount VF,
4686
4679
// We don't attempt to perform interleaving for loops with uncountable early
4687
4680
// exits because the VPInstruction::AnyOf code cannot currently handle
4688
4681
// multiple parts.
4689
- if (Legal-> hasUncountableEarlyExit ())
4682
+ if (Plan. hasEarlyExit ())
4690
4683
return 1 ;
4691
4684
4692
- const bool HasReductions = !Legal->getReductionVars ().empty ();
4685
+ const bool HasReductions =
4686
+ any_of (Plan.getVectorLoopRegion ()->getEntryBasicBlock ()->phis (),
4687
+ IsaPred<VPReductionPHIRecipe>);
4693
4688
4694
4689
// If we did not calculate the cost for VF (because the user selected the VF)
4695
4690
// then we calculate the cost of VF here.
4696
4691
if (LoopCost == 0 ) {
4697
- LoopCost = expectedCost (VF);
4692
+ if (VF.isScalar ())
4693
+ LoopCost = CM.expectedCost (VF);
4694
+ else
4695
+ LoopCost = cost (Plan, VF);
4698
4696
assert (LoopCost.isValid () && " Expected to have chosen a VF with valid cost" );
4699
4697
4700
4698
// Loop body is free and there is no need for interleaving.
@@ -4703,7 +4701,7 @@ LoopVectorizationCostModel::selectInterleaveCount(VPlan &Plan, ElementCount VF,
4703
4701
}
4704
4702
4705
4703
VPRegisterUsage R =
4706
- calculateRegisterUsageForPlan (Plan, {VF}, TTI, ValuesToIgnore)[0 ];
4704
+ calculateRegisterUsageForPlan (Plan, {VF}, TTI, CM. ValuesToIgnore )[0 ];
4707
4705
// We divide by these constants so assume that we have at least one
4708
4706
// instruction that uses at least one register.
4709
4707
for (auto &Pair : R.MaxLocalUsers ) {
@@ -4766,23 +4764,24 @@ LoopVectorizationCostModel::selectInterleaveCount(VPlan &Plan, ElementCount VF,
4766
4764
4767
4765
// Try to get the exact trip count, or an estimate based on profiling data or
4768
4766
// ConstantMax from PSE, failing that.
4769
- auto BestKnownTC = getSmallBestKnownTC (PSE, TheLoop );
4767
+ auto BestKnownTC = getSmallBestKnownTC (PSE, OrigLoop );
4770
4768
4771
4769
// For fixed length VFs treat a scalable trip count as unknown.
4772
4770
if (BestKnownTC && (BestKnownTC->isFixed () || VF.isScalable ())) {
4773
4771
// Re-evaluate trip counts and VFs to be in the same numerical space.
4774
- unsigned AvailableTC = estimateElementCount (*BestKnownTC, VScaleForTuning);
4775
- unsigned EstimatedVF = estimateElementCount (VF, VScaleForTuning);
4772
+ unsigned AvailableTC =
4773
+ estimateElementCount (*BestKnownTC, CM.getVScaleForTuning ());
4774
+ unsigned EstimatedVF = estimateElementCount (VF, CM.getVScaleForTuning ());
4776
4775
4777
4776
// At least one iteration must be scalar when this constraint holds. So the
4778
4777
// maximum available iterations for interleaving is one less.
4779
- if (requiresScalarEpilogue (VF.isVector ()))
4778
+ if (CM. requiresScalarEpilogue (VF.isVector ()))
4780
4779
--AvailableTC;
4781
4780
4782
4781
unsigned InterleaveCountLB = bit_floor (std::max (
4783
4782
1u , std::min (AvailableTC / (EstimatedVF * 2 ), MaxInterleaveCount)));
4784
4783
4785
- if (getSmallConstantTripCount (PSE.getSE (), TheLoop ).isNonZero ()) {
4784
+ if (getSmallConstantTripCount (PSE.getSE (), OrigLoop ).isNonZero ()) {
4786
4785
// If the best known trip count is exact, we select between two
4787
4786
// prospective ICs, where
4788
4787
//
@@ -4843,7 +4842,7 @@ LoopVectorizationCostModel::selectInterleaveCount(VPlan &Plan, ElementCount VF,
4843
4842
// vectorized the loop we will have done the runtime check and so interleaving
4844
4843
// won't require further checks.
4845
4844
bool ScalarInterleavingRequiresPredication =
4846
- (VF.isScalar () && any_of (TheLoop ->blocks (), [this ](BasicBlock *BB) {
4845
+ (VF.isScalar () && any_of (OrigLoop ->blocks (), [this ](BasicBlock *BB) {
4847
4846
return Legal->blockNeedsPredication (BB);
4848
4847
}));
4849
4848
bool ScalarInterleavingRequiresRuntimePointerCheck =
@@ -4866,8 +4865,39 @@ LoopVectorizationCostModel::selectInterleaveCount(VPlan &Plan, ElementCount VF,
4866
4865
4867
4866
// Interleave until store/load ports (estimated by max interleave count) are
4868
4867
// saturated.
4869
- unsigned NumStores = Legal->getNumStores ();
4870
- unsigned NumLoads = Legal->getNumLoads ();
4868
+ unsigned NumStores = 0 ;
4869
+ unsigned NumLoads = 0 ;
4870
+ for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(
4871
+ vp_depth_first_deep (Plan.getVectorLoopRegion ()->getEntry ()))) {
4872
+ for (VPRecipeBase &R : *VPBB) {
4873
+ if (isa<VPWidenLoadRecipe, VPWidenLoadEVLRecipe>(&R)) {
4874
+ NumLoads++;
4875
+ continue ;
4876
+ }
4877
+ if (isa<VPWidenStoreRecipe, VPWidenStoreEVLRecipe>(&R)) {
4878
+ NumStores++;
4879
+ continue ;
4880
+ }
4881
+
4882
+ if (auto *InterleaveR = dyn_cast<VPInterleaveRecipe>(&R)) {
4883
+ if (unsigned StoreOps = InterleaveR->getNumStoreOperands ())
4884
+ NumStores += StoreOps;
4885
+ else
4886
+ NumLoads += InterleaveR->getNumDefinedValues ();
4887
+ continue ;
4888
+ }
4889
+ if (auto *RepR = dyn_cast<VPReplicateRecipe>(&R)) {
4890
+ NumLoads += isa<LoadInst>(RepR->getUnderlyingInstr ());
4891
+ NumStores += isa<StoreInst>(RepR->getUnderlyingInstr ());
4892
+ continue ;
4893
+ }
4894
+ if (isa<VPHistogramRecipe>(&R)) {
4895
+ NumLoads++;
4896
+ NumStores++;
4897
+ continue ;
4898
+ }
4899
+ }
4900
+ }
4871
4901
unsigned StoresIC = IC / (NumStores ? NumStores : 1 );
4872
4902
unsigned LoadsIC = IC / (NumLoads ? NumLoads : 1 );
4873
4903
@@ -4877,12 +4907,15 @@ LoopVectorizationCostModel::selectInterleaveCount(VPlan &Plan, ElementCount VF,
4877
4907
// do the final reduction after the loop.
4878
4908
bool HasSelectCmpReductions =
4879
4909
HasReductions &&
4880
- any_of (Legal->getReductionVars (), [&](auto &Reduction) -> bool {
4881
- const RecurrenceDescriptor &RdxDesc = Reduction.second ;
4882
- RecurKind RK = RdxDesc.getRecurrenceKind ();
4883
- return RecurrenceDescriptor::isAnyOfRecurrenceKind (RK) ||
4884
- RecurrenceDescriptor::isFindIVRecurrenceKind (RK);
4885
- });
4910
+ any_of (Plan.getVectorLoopRegion ()->getEntryBasicBlock ()->phis (),
4911
+ [](VPRecipeBase &R) {
4912
+ auto *RedR = dyn_cast<VPReductionPHIRecipe>(&R);
4913
+
4914
+ return RedR && (RecurrenceDescriptor::isAnyOfRecurrenceKind (
4915
+ RedR->getRecurrenceKind ()) ||
4916
+ RecurrenceDescriptor::isFindIVRecurrenceKind (
4917
+ RedR->getRecurrenceKind ()));
4918
+ });
4886
4919
if (HasSelectCmpReductions) {
4887
4920
LLVM_DEBUG (dbgs () << " LV: Not interleaving select-cmp reductions.\n " );
4888
4921
return 1 ;
@@ -4893,12 +4926,14 @@ LoopVectorizationCostModel::selectInterleaveCount(VPlan &Plan, ElementCount VF,
4893
4926
// we're interleaving is inside another loop. For tree-wise reductions
4894
4927
// set the limit to 2, and for ordered reductions it's best to disable
4895
4928
// interleaving entirely.
4896
- if (HasReductions && TheLoop ->getLoopDepth () > 1 ) {
4929
+ if (HasReductions && OrigLoop ->getLoopDepth () > 1 ) {
4897
4930
bool HasOrderedReductions =
4898
- any_of (Legal->getReductionVars (), [&](auto &Reduction) -> bool {
4899
- const RecurrenceDescriptor &RdxDesc = Reduction.second ;
4900
- return RdxDesc.isOrdered ();
4901
- });
4931
+ any_of (Plan.getVectorLoopRegion ()->getEntryBasicBlock ()->phis (),
4932
+ [](VPRecipeBase &R) {
4933
+ auto *RedR = dyn_cast<VPReductionPHIRecipe>(&R);
4934
+
4935
+ return RedR && RedR->isOrdered ();
4936
+ });
4902
4937
if (HasOrderedReductions) {
4903
4938
LLVM_DEBUG (
4904
4939
dbgs () << " LV: Not interleaving scalar ordered reductions.\n " );
@@ -10114,8 +10149,11 @@ bool LoopVectorizePass::processLoop(Loop *L) {
10114
10149
10115
10150
GeneratedRTChecks Checks (PSE, DT, LI, TTI, F->getDataLayout (), CM.CostKind );
10116
10151
if (LVP.hasPlanWithVF (VF.Width )) {
10152
+ VPCostContext CostCtx (CM.TTI , *CM.TLI , CM.Legal ->getWidestInductionType (),
10153
+ CM, CM.CostKind );
10154
+
10117
10155
// Select the interleave count.
10118
- IC = CM .selectInterleaveCount (LVP.getPlanFor (VF.Width ), VF.Width , VF.Cost );
10156
+ IC = LVP .selectInterleaveCount (LVP.getPlanFor (VF.Width ), VF.Width , VF.Cost );
10119
10157
10120
10158
unsigned SelectedIC = std::max (IC, UserIC);
10121
10159
// Optimistically generate runtime checks if they are needed. Drop them if
@@ -10137,8 +10175,6 @@ bool LoopVectorizePass::processLoop(Loop *L) {
10137
10175
// Check if it is profitable to vectorize with runtime checks.
10138
10176
bool ForceVectorization =
10139
10177
Hints.getForce () == LoopVectorizeHints::FK_Enabled;
10140
- VPCostContext CostCtx (CM.TTI , *CM.TLI , CM.Legal ->getWidestInductionType (),
10141
- CM, CM.CostKind );
10142
10178
if (!ForceVectorization &&
10143
10179
!isOutsideLoopWorkProfitable (Checks, VF, L, PSE, CostCtx,
10144
10180
LVP.getPlanFor (VF.Width ), SEL,
0 commit comments