@@ -955,13 +955,6 @@ class LoopVectorizationCostModel {
955
955
/// 64 bit loop indices.
956
956
std::pair<unsigned , unsigned > getSmallestAndWidestTypes ();
957
957
958
- // / \return The desired interleave count.
959
- // / If interleave count has been specified by metadata it will be returned.
960
- // / Otherwise, the interleave count is computed and returned. VF and LoopCost
961
- // / are the selected vectorization factor and the cost of the selected VF.
962
- unsigned selectInterleaveCount (VPlan &Plan, ElementCount VF,
963
- InstructionCost LoopCost);
964
-
965
958
/// Memory access instruction may be vectorized in more than one way.
966
959
/// Form of instruction after vectorization depends on cost.
967
960
/// This function takes cost-based decisions for Load/Store instructions
@@ -4634,8 +4627,8 @@ void LoopVectorizationCostModel::collectElementTypesForWidening() {
4634
4627
}
4635
4628
4636
4629
unsigned
4637
- LoopVectorizationCostModel ::selectInterleaveCount (VPlan &Plan, ElementCount VF,
4638
- InstructionCost LoopCost) {
4630
+ LoopVectorizationPlanner ::selectInterleaveCount (VPlan &Plan, ElementCount VF,
4631
+ InstructionCost LoopCost) {
4639
4632
// -- The interleave heuristics --
4640
4633
// We interleave the loop in order to expose ILP and reduce the loop overhead.
4641
4634
// There are many micro-architectural considerations that we can't predict
@@ -4650,11 +4643,11 @@ LoopVectorizationCostModel::selectInterleaveCount(VPlan &Plan, ElementCount VF,
4650
4643
// 3. We don't interleave if we think that we will spill registers to memory
4651
4644
// due to the increased register pressure.
4652
4645
4653
- if (!isScalarEpilogueAllowed ())
4646
+ if (!CM. isScalarEpilogueAllowed ())
4654
4647
return 1 ;
4655
4648
4656
- // Do not interleave if EVL is preferred and no User IC is specified.
4657
- if ( foldTailWithEVL ( )) {
4649
+ if ( any_of (Plan. getVectorLoopRegion ()-> getEntryBasicBlock ()-> phis (),
4650
+ IsaPred<VPEVLBasedIVPHIRecipe> )) {
4658
4651
LLVM_DEBUG (dbgs () << " LV: Preference for VP intrinsics indicated. "
4659
4652
" Unroll factor forced to be 1.\n " );
4660
4653
return 1 ;
@@ -4667,15 +4660,20 @@ LoopVectorizationCostModel::selectInterleaveCount(VPlan &Plan, ElementCount VF,
4667
4660
// We don't attempt to perform interleaving for loops with uncountable early
4668
4661
// exits because the VPInstruction::AnyOf code cannot currently handle
4669
4662
// multiple parts.
4670
- if (Legal-> hasUncountableEarlyExit ())
4663
+ if (Plan. hasEarlyExit ())
4671
4664
return 1 ;
4672
4665
4673
- const bool HasReductions = !Legal->getReductionVars ().empty ();
4666
+ const bool HasReductions =
4667
+ any_of (Plan.getVectorLoopRegion ()->getEntryBasicBlock ()->phis (),
4668
+ IsaPred<VPReductionPHIRecipe>);
4674
4669
4675
4670
// If we did not calculate the cost for VF (because the user selected the VF)
4676
4671
// then we calculate the cost of VF here.
4677
4672
if (LoopCost == 0 ) {
4678
- LoopCost = expectedCost (VF);
4673
+ if (VF.isScalar ())
4674
+ LoopCost = CM.expectedCost (VF);
4675
+ else
4676
+ LoopCost = cost (Plan, VF);
4679
4677
assert (LoopCost.isValid () && " Expected to have chosen a VF with valid cost" );
4680
4678
4681
4679
// Loop body is free and there is no need for interleaving.
@@ -4684,7 +4682,7 @@ LoopVectorizationCostModel::selectInterleaveCount(VPlan &Plan, ElementCount VF,
4684
4682
}
4685
4683
4686
4684
VPRegisterUsage R =
4687
- calculateRegisterUsageForPlan (Plan, {VF}, TTI, ValuesToIgnore)[0 ];
4685
+ calculateRegisterUsageForPlan (Plan, {VF}, TTI, CM. ValuesToIgnore )[0 ];
4688
4686
// We divide by these constants so assume that we have at least one
4689
4687
// instruction that uses at least one register.
4690
4688
for (auto &Pair : R.MaxLocalUsers ) {
@@ -4745,21 +4743,21 @@ LoopVectorizationCostModel::selectInterleaveCount(VPlan &Plan, ElementCount VF,
4745
4743
MaxInterleaveCount = ForceTargetMaxVectorInterleaveFactor;
4746
4744
}
4747
4745
4748
- unsigned EstimatedVF = getEstimatedRuntimeVF (VF, VScaleForTuning );
4746
+ unsigned EstimatedVF = getEstimatedRuntimeVF (VF, CM. getVScaleForTuning () );
4749
4747
4750
4748
// Try to get the exact trip count, or an estimate based on profiling data or
4751
4749
// ConstantMax from PSE, failing that.
4752
- if (auto BestKnownTC = getSmallBestKnownTC (PSE, TheLoop )) {
4750
+ if (auto BestKnownTC = getSmallBestKnownTC (PSE, OrigLoop )) {
4753
4751
// At least one iteration must be scalar when this constraint holds. So the
4754
4752
// maximum available iterations for interleaving is one less.
4755
- unsigned AvailableTC = requiresScalarEpilogue (VF.isVector ())
4753
+ unsigned AvailableTC = CM. requiresScalarEpilogue (VF.isVector ())
4756
4754
? BestKnownTC->getFixedValue () - 1
4757
4755
: BestKnownTC->getFixedValue ();
4758
4756
4759
4757
unsigned InterleaveCountLB = bit_floor (std::max (
4760
4758
1u , std::min (AvailableTC / (EstimatedVF * 2 ), MaxInterleaveCount)));
4761
4759
4762
- if (getSmallConstantTripCount (PSE.getSE (), TheLoop ).isNonZero ()) {
4760
+ if (getSmallConstantTripCount (PSE.getSE (), OrigLoop ).isNonZero ()) {
4763
4761
// If the best known trip count is exact, we select between two
4764
4762
// prospective ICs, where
4765
4763
//
@@ -4820,7 +4818,7 @@ LoopVectorizationCostModel::selectInterleaveCount(VPlan &Plan, ElementCount VF,
4820
4818
// vectorized the loop we will have done the runtime check and so interleaving
4821
4819
// won't require further checks.
4822
4820
bool ScalarInterleavingRequiresPredication =
4823
- (VF.isScalar () && any_of (TheLoop ->blocks (), [this ](BasicBlock *BB) {
4821
+ (VF.isScalar () && any_of (OrigLoop ->blocks (), [this ](BasicBlock *BB) {
4824
4822
return Legal->blockNeedsPredication (BB);
4825
4823
}));
4826
4824
bool ScalarInterleavingRequiresRuntimePointerCheck =
@@ -4843,8 +4841,39 @@ LoopVectorizationCostModel::selectInterleaveCount(VPlan &Plan, ElementCount VF,
4843
4841
4844
4842
// Interleave until store/load ports (estimated by max interleave count) are
4845
4843
// saturated.
4846
- unsigned NumStores = Legal->getNumStores ();
4847
- unsigned NumLoads = Legal->getNumLoads ();
4844
+ unsigned NumStores = 0 ;
4845
+ unsigned NumLoads = 0 ;
4846
+ for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(
4847
+ vp_depth_first_deep (Plan.getVectorLoopRegion ()->getEntry ()))) {
4848
+ for (VPRecipeBase &R : *VPBB) {
4849
+ if (isa<VPWidenLoadRecipe, VPWidenLoadEVLRecipe>(&R)) {
4850
+ NumLoads++;
4851
+ continue ;
4852
+ }
4853
+ if (isa<VPWidenStoreRecipe, VPWidenStoreEVLRecipe>(&R)) {
4854
+ NumStores++;
4855
+ continue ;
4856
+ }
4857
+
4858
+ if (auto *InterleaveR = dyn_cast<VPInterleaveRecipe>(&R)) {
4859
+ if (unsigned StoreOps = InterleaveR->getNumStoreOperands ())
4860
+ NumStores += StoreOps;
4861
+ else
4862
+ NumLoads += InterleaveR->getNumDefinedValues ();
4863
+ continue ;
4864
+ }
4865
+ if (auto *RepR = dyn_cast<VPReplicateRecipe>(&R)) {
4866
+ NumLoads += isa<LoadInst>(RepR->getUnderlyingInstr ());
4867
+ NumStores += isa<StoreInst>(RepR->getUnderlyingInstr ());
4868
+ continue ;
4869
+ }
4870
+ if (isa<VPHistogramRecipe>(&R)) {
4871
+ NumLoads++;
4872
+ NumStores++;
4873
+ continue ;
4874
+ }
4875
+ }
4876
+ }
4848
4877
unsigned StoresIC = IC / (NumStores ? NumStores : 1 );
4849
4878
unsigned LoadsIC = IC / (NumLoads ? NumLoads : 1 );
4850
4879
@@ -4854,12 +4883,15 @@ LoopVectorizationCostModel::selectInterleaveCount(VPlan &Plan, ElementCount VF,
4854
4883
// do the final reduction after the loop.
4855
4884
bool HasSelectCmpReductions =
4856
4885
HasReductions &&
4857
- any_of (Legal->getReductionVars (), [&](auto &Reduction) -> bool {
4858
- const RecurrenceDescriptor &RdxDesc = Reduction.second ;
4859
- RecurKind RK = RdxDesc.getRecurrenceKind ();
4860
- return RecurrenceDescriptor::isAnyOfRecurrenceKind (RK) ||
4861
- RecurrenceDescriptor::isFindIVRecurrenceKind (RK);
4862
- });
4886
+ any_of (Plan.getVectorLoopRegion ()->getEntryBasicBlock ()->phis (),
4887
+ [](VPRecipeBase &R) {
4888
+ auto *RedR = dyn_cast<VPReductionPHIRecipe>(&R);
4889
+
4890
+ return RedR && (RecurrenceDescriptor::isAnyOfRecurrenceKind (
4891
+ RedR->getRecurrenceKind ()) ||
4892
+ RecurrenceDescriptor::isFindIVRecurrenceKind (
4893
+ RedR->getRecurrenceKind ()));
4894
+ });
4863
4895
if (HasSelectCmpReductions) {
4864
4896
LLVM_DEBUG (dbgs () << " LV: Not interleaving select-cmp reductions.\n " );
4865
4897
return 1 ;
@@ -4870,12 +4902,14 @@ LoopVectorizationCostModel::selectInterleaveCount(VPlan &Plan, ElementCount VF,
4870
4902
// we're interleaving is inside another loop. For tree-wise reductions
4871
4903
// set the limit to 2, and for ordered reductions it's best to disable
4872
4904
// interleaving entirely.
4873
- if (HasReductions && TheLoop ->getLoopDepth () > 1 ) {
4905
+ if (HasReductions && OrigLoop ->getLoopDepth () > 1 ) {
4874
4906
bool HasOrderedReductions =
4875
- any_of (Legal->getReductionVars (), [&](auto &Reduction) -> bool {
4876
- const RecurrenceDescriptor &RdxDesc = Reduction.second ;
4877
- return RdxDesc.isOrdered ();
4878
- });
4907
+ any_of (Plan.getVectorLoopRegion ()->getEntryBasicBlock ()->phis (),
4908
+ [](VPRecipeBase &R) {
4909
+ auto *RedR = dyn_cast<VPReductionPHIRecipe>(&R);
4910
+
4911
+ return RedR && RedR->isOrdered ();
4912
+ });
4879
4913
if (HasOrderedReductions) {
4880
4914
LLVM_DEBUG (
4881
4915
dbgs () << " LV: Not interleaving scalar ordered reductions.\n " );
@@ -10089,8 +10123,11 @@ bool LoopVectorizePass::processLoop(Loop *L) {
10089
10123
10090
10124
GeneratedRTChecks Checks (PSE, DT, LI, TTI, F->getDataLayout (), CM.CostKind );
10091
10125
if (LVP.hasPlanWithVF (VF.Width )) {
10126
+ VPCostContext CostCtx (CM.TTI , *CM.TLI , CM.Legal ->getWidestInductionType (),
10127
+ CM, CM.CostKind );
10128
+
10092
10129
// Select the interleave count.
10093
- IC = CM .selectInterleaveCount (LVP.getPlanFor (VF.Width ), VF.Width , VF.Cost );
10130
+ IC = LVP .selectInterleaveCount (LVP.getPlanFor (VF.Width ), VF.Width , VF.Cost );
10094
10131
10095
10132
unsigned SelectedIC = std::max (IC, UserIC);
10096
10133
// Optimistically generate runtime checks if they are needed. Drop them if
@@ -10112,8 +10149,6 @@ bool LoopVectorizePass::processLoop(Loop *L) {
10112
10149
// Check if it is profitable to vectorize with runtime checks.
10113
10150
bool ForceVectorization =
10114
10151
Hints.getForce () == LoopVectorizeHints::FK_Enabled;
10115
- VPCostContext CostCtx (CM.TTI , *CM.TLI , CM.Legal ->getWidestInductionType (),
10116
- CM, CM.CostKind );
10117
10152
if (!ForceVectorization &&
10118
10153
!isOutsideLoopWorkProfitable (Checks, VF, L, PSE, CostCtx,
10119
10154
LVP.getPlanFor (VF.Width ), SEL,
0 commit comments