Commit ff7c816
[VPlan] Run narrowInterleaveGroups during general VPlan optimizations.
Move narrowInterleaveGroups to the general VPlan optimization stage. To do so, narrowInterleaveGroups now has to find a suitable VF for which all interleave groups are consecutive and saturate the full vector width. If such a VF is found, the original VPlan is split in two: a) a new clone that contains all VFs of the plan except VFToOptimize, and b) the original plan with VFToOptimize as its single VF. The original plan is then optimized. If a new copy for the other VFs has been created, it is returned and the caller has to add it to the list of candidate plans. Together with llvm#149702, this allows the narrowed interleave groups to be taken into account when interleaving.
1 parent e138c95 commit ff7c816
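
To make the VF-selection and plan-splitting flow described in the message easier to follow, here is a minimal, self-contained sketch in plain C++. It deliberately uses simplified stand-in types (FakePlan, Group) and hypothetical helpers (pickVFToOptimize, splitOffRemainingVFs) rather than the real VPlan API; the saturation check below is only a stand-in for isConsecutiveInterleaveGroup, and RegBits plays the role of VectorRegWidth.

// Minimal sketch, assuming simplified stand-in types (FakePlan, Group) and
// hypothetical helpers (pickVFToOptimize, splitOffRemainingVFs); this is not
// the VPlan API, it only models the flow described in the commit message.
#include <algorithm>
#include <cstddef>
#include <cstdio>
#include <memory>
#include <optional>
#include <vector>

struct Group { unsigned Members, ElemBits; }; // one interleave group

struct FakePlan {
  std::vector<unsigned> VFs;  // candidate fixed VFs (scalable VFs ignored)
  std::vector<Group> Groups;  // interleave groups in the loop body
};

// Stand-in for isConsecutiveInterleaveGroup: a group "saturates the full
// vector width" if its members times the VF fill the register exactly.
static bool saturates(const Group &G, unsigned VF, unsigned RegBits) {
  return G.Members * G.ElemBits * VF == RegBits;
}

// Find a single VF for which every interleave group saturates the register.
static std::optional<unsigned> pickVFToOptimize(const FakePlan &P,
                                                unsigned RegBits) {
  for (unsigned VF : P.VFs)
    if (std::all_of(P.Groups.begin(), P.Groups.end(),
                    [&](const Group &G) { return saturates(G, VF, RegBits); }))
      return VF;
  return std::nullopt;
}

// Split the plan: the original keeps only VFToOptimize, the clone keeps all
// other VFs and is handed back to the caller as an extra candidate plan.
static std::unique_ptr<FakePlan> splitOffRemainingVFs(FakePlan &P,
                                                      unsigned VFToOptimize) {
  if (P.VFs.size() == 1)
    return nullptr; // nothing to split off
  auto Clone = std::make_unique<FakePlan>(P);
  Clone->VFs.erase(
      std::remove(Clone->VFs.begin(), Clone->VFs.end(), VFToOptimize),
      Clone->VFs.end());
  P.VFs = {VFToOptimize};
  return Clone;
}

int main() {
  // Two 2-member groups of 32-bit elements, candidate VFs {2, 4}, and a
  // 128-bit register: VF=2 saturates the register (2 * 32 * 2 == 128).
  FakePlan P{{2, 4}, {{2, 32}, {2, 32}}};
  if (auto VF = pickVFToOptimize(P, /*RegBits=*/128)) {
    std::unique_ptr<FakePlan> Remaining = splitOffRemainingVFs(P, *VF);
    std::size_t Others = Remaining ? Remaining->VFs.size() : 0;
    std::printf("optimize VF=%u, clone keeps %zu other VF(s)\n", *VF, Others);
  }
  return 0;
}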

6 files changed, +184 -71 lines changed

llvm/lib/Transforms/Vectorize/LoopVectorize.cpp

Lines changed: 8 additions & 3 deletions
@@ -7253,9 +7253,6 @@ DenseMap<const SCEV *, Value *> LoopVectorizationPlanner::executePlan(
   VPBasicBlock *VectorPH = cast<VPBasicBlock>(BestVPlan.getVectorPreheader());
   VPlanTransforms::optimizeForVFAndUF(BestVPlan, BestVF, BestUF, PSE);
   VPlanTransforms::simplifyRecipes(BestVPlan, *Legal->getWidestInductionType());
-  VPlanTransforms::narrowInterleaveGroups(
-      BestVPlan, BestVF,
-      TTI.getRegisterBitWidth(TargetTransformInfo::RGK_FixedWidthVector));
   VPlanTransforms::removeDeadRecipes(BestVPlan);
 
   VPlanTransforms::convertToConcreteRecipes(BestVPlan,
@@ -8364,6 +8361,14 @@ void LoopVectorizationPlanner::buildVPlansWithVPRecipes(ElementCount MinVF,
         !VPlanTransforms::runPass(VPlanTransforms::tryAddExplicitVectorLength,
                                   *Plan, CM.getMaxSafeElements()))
       break;
+
+    if (auto P = VPlanTransforms::narrowInterleaveGroups(
+            *Plan,
+            TTI.getRegisterBitWidth(
+                TargetTransformInfo::RGK_FixedWidthVector),
+            SubRange))
+      VPlans.push_back(std::move(P));
+
     assert(verifyVPlanIsValid(*Plan) && "VPlan is invalid");
     VPlans.push_back(std::move(Plan));
   }

llvm/lib/Transforms/Vectorize/VPlan.cpp

Lines changed: 3 additions & 0 deletions
@@ -976,6 +976,8 @@ void VPlan::prepareToExecute(Value *TripCountV, Value *VectorTripCountV,
   } else {
     VFxUF.setUnderlyingValue(createStepForVF(Builder, TCTy, State.VF, UF));
   }
+
+  this->UF.setUnderlyingValue(ConstantInt::get(TCTy, UF));
 }
 
 VPIRBasicBlock *VPlan::getExitBlock(BasicBlock *IRBB) const {
@@ -1252,6 +1254,7 @@ VPlan *VPlan::duplicate() {
   }
   Old2NewVPValues[&VectorTripCount] = &NewPlan->VectorTripCount;
   Old2NewVPValues[&VF] = &NewPlan->VF;
+  Old2NewVPValues[&UF] = &NewPlan->UF;
   Old2NewVPValues[&VFxUF] = &NewPlan->VFxUF;
   if (BackedgeTakenCount) {
     NewPlan->BackedgeTakenCount = new VPValue();

llvm/lib/Transforms/Vectorize/VPlan.h

Lines changed: 6 additions & 0 deletions
@@ -3895,6 +3895,9 @@ class VPlan {
   /// Represents the vectorization factor of the loop.
   VPValue VF;
 
+  /// Represents the symbolic unroll factor of the loop.
+  VPValue UF;
+
   /// Represents the loop-invariant VF * UF of the vector loop region.
   VPValue VFxUF;
 
@@ -4050,6 +4053,9 @@
   /// Returns the VF of the vector loop region.
   VPValue &getVF() { return VF; };
 
+  /// Returns the symbolic UF of the vector loop region.
+  VPValue &getSymbolicUF() { return UF; };
+
   /// Returns VF * UF of the vector loop region.
   VPValue &getVFxUF() { return VFxUF; }
 

llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp

Lines changed: 57 additions & 19 deletions
@@ -3146,19 +3146,20 @@ static bool isAlreadyNarrow(VPValue *VPV) {
   return RepR && RepR->isSingleScalar();
 }
 
-void VPlanTransforms::narrowInterleaveGroups(VPlan &Plan, ElementCount VF,
-                                             unsigned VectorRegWidth) {
+std::unique_ptr<VPlan>
+VPlanTransforms::narrowInterleaveGroups(VPlan &Plan, unsigned VectorRegWidth,
+                                        VFRange &Range) {
   using namespace llvm::VPlanPatternMatch;
   VPRegionBlock *VectorLoop = Plan.getVectorLoopRegion();
-  if (VF.isScalable() || !VectorLoop)
-    return;
+  if (Plan.hasScalableVF() || !VectorLoop)
+    return nullptr;
 
   VPCanonicalIVPHIRecipe *CanonicalIV = Plan.getCanonicalIV();
   Type *CanonicalIVType = CanonicalIV->getScalarType();
   VPTypeAnalysis TypeInfo(CanonicalIVType);
 
-  unsigned FixedVF = VF.getFixedValue();
   SmallVector<VPInterleaveRecipe *> StoreGroups;
+  std::optional<unsigned> VFToOptimize;
   for (auto &R : *VectorLoop->getEntryBasicBlock()) {
     if (isa<VPCanonicalIVPHIRecipe>(&R) ||
         match(&R, m_BranchOnCount(m_VPValue(), m_VPValue())))
@@ -3173,30 +3174,47 @@ void VPlanTransforms::narrowInterleaveGroups(VPlan &Plan, ElementCount VF,
     // * recipes writing to memory except interleave groups
     // Only support plans with a canonical induction phi.
     if (R.isPhi())
-      return;
+      return nullptr;
 
     auto *InterleaveR = dyn_cast<VPInterleaveRecipe>(&R);
     if (R.mayWriteToMemory() && !InterleaveR)
-      return;
+      return nullptr;
 
     // Do not narrow interleave groups if there are VectorPointer recipes and
     // the plan was unrolled. The recipe implicitly uses VF from
    // VPTransformState.
     // TODO: Remove restriction once the VF for the VectorPointer offset is
     // modeled explicitly as operand.
     if (isa<VPVectorPointerRecipe>(&R) && Plan.getUF() > 1)
-      return;
+      return nullptr;
 
     // All other ops are allowed, but we reject uses that cannot be converted
     // when checking all allowed consumers (store interleave groups) below.
     if (!InterleaveR)
       continue;
 
-    // Bail out on non-consecutive interleave groups.
-    if (!isConsecutiveInterleaveGroup(InterleaveR, FixedVF, TypeInfo,
-                                      VectorRegWidth))
-      return;
-
+    // Try to find a single VF, where all interleave groups are consecutive and
+    // saturate the full vector width. If we already have a candidate VF, check
+    // if it is applicable for the current InterleaveR, otherwise look for a
+    // suitable VF across the Plans VFs.
+    //
+    if (VFToOptimize) {
+      if (!isConsecutiveInterleaveGroup(InterleaveR, *VFToOptimize, TypeInfo,
+                                        VectorRegWidth))
+        return nullptr;
+    } else {
+      for (ElementCount VF : Plan.vectorFactors()) {
+        if (!VF.isFixed())
+          continue;
+        if (isConsecutiveInterleaveGroup(InterleaveR, VF.getFixedValue(),
+                                         TypeInfo, VectorRegWidth)) {
+          VFToOptimize = VF.getFixedValue();
+          break;
+        }
+      }
+      if (!VFToOptimize)
+        return nullptr;
+    }
     // Skip read interleave groups.
     if (InterleaveR->getStoredValues().empty())
       continue;
@@ -3232,24 +3250,44 @@ void VPlanTransforms::narrowInterleaveGroups(VPlan &Plan, ElementCount VF,
     auto *WideMember0 = dyn_cast_or_null<VPWidenRecipe>(
         InterleaveR->getStoredValues()[0]->getDefiningRecipe());
     if (!WideMember0)
-      return;
+      return nullptr;
     for (const auto &[I, V] : enumerate(InterleaveR->getStoredValues())) {
       auto *R = dyn_cast_or_null<VPWidenRecipe>(V->getDefiningRecipe());
       if (!R || R->getOpcode() != WideMember0->getOpcode() ||
          R->getNumOperands() > 2)
-        return;
+        return nullptr;
      if (any_of(enumerate(R->operands()),
                 [WideMember0, Idx = I](const auto &P) {
                   const auto &[OpIdx, OpV] = P;
                   return !canNarrowLoad(WideMember0, OpIdx, OpV, Idx);
                 }))
-        return;
+        return nullptr;
     }
     StoreGroups.push_back(InterleaveR);
   }
 
   if (StoreGroups.empty())
-    return;
+    return nullptr;
+
+  // All interleave groups in Plan can be narrowed for VFToOptimize. Split the
+  // original Plan into 2: a) a new clone which contains all VFs of Plan, except
+  // VFToOptimize, and b) the original Plan with VFToOptimize as single VF.
+  std::unique_ptr<VPlan> NewPlan;
+  if (size(Plan.vectorFactors()) != 1) {
+    NewPlan = std::unique_ptr<VPlan>(Plan.duplicate());
+    Plan.setVF(ElementCount::getFixed(*VFToOptimize));
+    bool First = true;
+    for (ElementCount VF : NewPlan->vectorFactors()) {
+      if (VF.isFixed() && VF.getFixedValue() == *VFToOptimize)
+        continue;
+      if (First) {
+        NewPlan->setVF(VF);
+        First = false;
+        continue;
+      }
+      NewPlan->addVF(VF);
+    }
+  }
 
   // Convert InterleaveGroup \p R to a single VPWidenLoadRecipe.
   auto NarrowOp = [](VPValue *V) -> VPValue * {
@@ -3314,11 +3352,11 @@ void VPlanTransforms::narrowInterleaveGroups(VPlan &Plan, ElementCount VF,
   // original iteration.
   auto *CanIV = Plan.getCanonicalIV();
   auto *Inc = cast<VPInstruction>(CanIV->getBackedgeValue());
-  Inc->setOperand(1, Plan.getOrAddLiveIn(ConstantInt::get(
-                         CanIV->getScalarType(), 1 * Plan.getUF())));
+  Inc->setOperand(1, &Plan.getSymbolicUF());
   Plan.getVF().replaceAllUsesWith(
       Plan.getOrAddLiveIn(ConstantInt::get(CanIV->getScalarType(), 1)));
   removeDeadRecipes(Plan);
+  return NewPlan;
 }
 
 /// Add branch weight metadata, if the \p Plan's middle block is terminated by a
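
The last hunk above is the subtle part of the transform: in the narrowed plan, uses of VF are replaced by the constant 1, and the canonical IV now steps by the symbolic UF value (rather than a constant derived from getUF(), since the final unroll factor is not fixed yet at this stage). For intuition, here is a tiny stand-alone model of the iteration counts, assuming concrete values VF = 4, UF = 2 and a trip count of 16; this is plain C++, not VPlan code.

// Plain C++ model (not VPlan code), assuming VF = 4, UF = 2, trip count 16.
#include <cstdio>

int main() {
  const unsigned VF = 4, UF = 2, TC = 16;

  // Regular vectorized loop: the canonical IV advances by VF * UF, so each
  // vector iteration covers VF * UF original scalar iterations.
  for (unsigned IV = 0; IV < TC; IV += VF * UF)
    std::printf("wide:     IV = %2u, covers %u original iterations\n", IV, VF * UF);

  // Narrowed plan: interleave groups became single wide loads/stores, VF is
  // treated as 1 in the loop body, and the IV advances by UF only, so each
  // vector iteration covers one original iteration per unrolled part.
  for (unsigned IV = 0; IV < TC; IV += UF)
    std::printf("narrowed: IV = %2u, covers %u original iterations\n", IV, UF);
  return 0;
}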

llvm/lib/Transforms/Vectorize/VPlanTransforms.h

Lines changed: 13 additions & 8 deletions
@@ -234,14 +234,19 @@ struct VPlanTransforms {
   /// Add explicit broadcasts for live-ins and VPValues defined in \p Plan's entry block if they are used as vectors.
   static void materializeBroadcasts(VPlan &Plan);
 
-  /// Try to convert a plan with interleave groups with VF elements to a plan
-  /// with the interleave groups replaced by wide loads and stores processing VF
-  /// elements, if all transformed interleave groups access the full vector
-  /// width (checked via \o VectorRegWidth). This effectively is a very simple
-  /// form of loop-aware SLP, where we use interleave groups to identify
-  /// candidates.
-  static void narrowInterleaveGroups(VPlan &Plan, ElementCount VF,
-                                     unsigned VectorRegWidth);
+  /// Try to find a single VF among \p Plan's VFs for which all interleave
+  /// groups (with VF elements) can be replaced by wide loads ans tores
+  /// processing VF elements, if all transformed interleave groups access the
+  /// full vector width (checked via \o VectorRegWidth). If the transformation
+  /// can be applied, the original \p Plan will be split in 2, if is has
+  /// multiple VFs: a) a new clone which contains all VFs of Plan, except
+  /// VFToOptimize, and b) the original Plan with VFToOptimize as single VF. In
+  /// that case, the new clone is returned.
+  ///
+  /// This effectively is a very simple form of loop-aware SLP, where we use
+  /// interleave groups to identify candidates.
+  static std::unique_ptr<VPlan>
+  narrowInterleaveGroups(VPlan &Plan, unsigned VectorRegWidth, VFRange &Range);
 
   /// Predicate and linearize the control-flow in the only loop region of
   /// \p Plan. If \p FoldTail is true, create a mask guarding the loop
