diff --git a/llvm/include/llvm/Transforms/Utils/UnrollLoop.h b/llvm/include/llvm/Transforms/Utils/UnrollLoop.h index 765c613b04a44..86eb78dc70372 100644 --- a/llvm/include/llvm/Transforms/Utils/UnrollLoop.h +++ b/llvm/include/llvm/Transforms/Utils/UnrollLoop.h @@ -163,6 +163,9 @@ LLVM_ABI bool computeUnrollCount( TargetTransformInfo::UnrollingPreferences &UP, TargetTransformInfo::PeelingPreferences &PP, bool &UseUpperBound); +LLVM_ABI std::optional +canParallelizeReductionWhenUnrolling(PHINode &Phi, Loop *L, + ScalarEvolution *SE); } // end namespace llvm #endif // LLVM_TRANSFORMS_UTILS_UNROLLLOOP_H diff --git a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp index 90d3d92d6bbf5..f6e557e0564d7 100644 --- a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp +++ b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp @@ -25,6 +25,7 @@ #include "llvm/Support/Debug.h" #include "llvm/TargetParser/AArch64TargetParser.h" #include "llvm/Transforms/InstCombine/InstCombiner.h" +#include "llvm/Transforms/Utils/UnrollLoop.h" #include "llvm/Transforms/Vectorize/LoopVectorizationLegality.h" #include #include @@ -4787,6 +4788,22 @@ getAppleRuntimeUnrollPreferences(Loop *L, ScalarEvolution &SE, if (!L->getExitBlock()) return; + // Check if the loop contains any reductions that could be parallelized when + // unrolling. If so, enable partial unrolling, if the trip count is know to be + // a multiple of 2. + bool HasParellelizableReductions = + L->getNumBlocks() == 1 && + any_of(L->getHeader()->phis(), + [&SE, L](PHINode &Phi) { + return canParallelizeReductionWhenUnrolling(Phi, L, &SE); + }) && + isLoopSizeWithinBudget(L, TTI, 12, nullptr); + if (HasParellelizableReductions && + SE.getSmallConstantTripMultiple(L, L->getExitingBlock()) % 2 == 0) { + UP.Partial = true; + UP.MaxCount = 4; + } + const SCEV *BTC = SE.getSymbolicMaxBackedgeTakenCount(L); if (isa(BTC) || isa(BTC) || (SE.getSmallConstantMaxTripCount(L) > 0 && @@ -4802,6 +4819,11 @@ getAppleRuntimeUnrollPreferences(Loop *L, ScalarEvolution &SE, // Limit to loops with trip counts that are cheap to expand. UP.SCEVExpansionBudget = 1; + if (HasParellelizableReductions) { + UP.Runtime = true; + UP.DefaultUnrollRuntimeCount = 4; + } + // Try to unroll small, single block loops, if they have load/store // dependencies, to expose more parallel memory access streams. BasicBlock *Header = L->getHeader(); diff --git a/llvm/lib/Transforms/Utils/LoopUnroll.cpp b/llvm/lib/Transforms/Utils/LoopUnroll.cpp index 86b268de43cf6..63b311a7b1580 100644 --- a/llvm/lib/Transforms/Utils/LoopUnroll.cpp +++ b/llvm/lib/Transforms/Utils/LoopUnroll.cpp @@ -41,6 +41,7 @@ #include "llvm/IR/DiagnosticInfo.h" #include "llvm/IR/Dominators.h" #include "llvm/IR/Function.h" +#include "llvm/IR/IRBuilder.h" #include "llvm/IR/Instruction.h" #include "llvm/IR/Instructions.h" #include "llvm/IR/IntrinsicInst.h" @@ -660,6 +661,38 @@ llvm::UnrollLoop(Loop *L, UnrollLoopOptions ULO, LoopInfo *LI, OrigPHINode.push_back(cast(I)); } + // Collect phi nodes for reductions for which we can introduce multiple + // parallel reduction phis and compute the final reduction result after the + // loop. This requires a single exit block after unrolling. This is ensured by + // restricting to single-block loops where the unrolled iterations are known + // to not exit. + DenseMap Reductions; + bool CanAddAdditionalAccumulators = + !CompletelyUnroll && L->getNumBlocks() == 1 && + (ULO.Runtime || + (ExitInfos.contains(Header) && ((ExitInfos[Header].TripCount != 0 && + ExitInfos[Header].BreakoutTrip == 0)))); + + // Limit parallelizing reductions to unroll counts of 4 or less for now. + // TODO: The number of parallel reductions should depend on the number of + // execution units. We also don't have to add a parallel reduction phi per + // unrolled iteration, but could for example add a parallel phi for every 2 + // unrolled iterations. + if (CanAddAdditionalAccumulators && ULO.Count <= 4) { + for (PHINode &Phi : Header->phis()) { + auto RdxDesc = canParallelizeReductionWhenUnrolling(Phi, L, SE); + if (!RdxDesc) + continue; + + // Only handle duplicate phis for a single reduction for now. + // TODO: Handle any number of reductions + if (!Reductions.empty()) + continue; + + Reductions[&Phi] = *RdxDesc; + } + } + std::vector Headers; std::vector Latches; Headers.push_back(Header); @@ -710,6 +743,7 @@ llvm::UnrollLoop(Loop *L, UnrollLoopOptions ULO, LoopInfo *LI, // latch. This is a reasonable default placement if we don't have block // frequencies, and if we do, well the layout will be adjusted later. auto BlockInsertPt = std::next(LatchBlock->getIterator()); + SmallVector PartialReductions; for (unsigned It = 1; It != ULO.Count; ++It) { SmallVector NewBlocks; SmallDenseMap NewLoops; @@ -733,6 +767,31 @@ llvm::UnrollLoop(Loop *L, UnrollLoopOptions ULO, LoopInfo *LI, for (PHINode *OrigPHI : OrigPHINode) { PHINode *NewPHI = cast(VMap[OrigPHI]); Value *InVal = NewPHI->getIncomingValueForBlock(LatchBlock); + + // Use cloned phis as parallel phis for partial reductions, which will + // get combined to the final reduction result after the loop. + if (Reductions.contains(OrigPHI)) { + // Collect partial reduction results. + if (PartialReductions.empty()) + PartialReductions.push_back(cast(InVal)); + PartialReductions.push_back(cast(VMap[InVal])); + + // Update the start value for the cloned phis to use the identity + // value for the reduction. + const RecurrenceDescriptor &RdxDesc = Reductions[OrigPHI]; + NewPHI->setIncomingValueForBlock( + L->getLoopPreheader(), + getRecurrenceIdentity(RdxDesc.getRecurrenceKind(), + OrigPHI->getType(), + RdxDesc.getFastMathFlags())); + + // Update NewPHI to use the cloned value for the iteration and move + // to header. + NewPHI->replaceUsesOfWith(InVal, VMap[InVal]); + NewPHI->moveBefore(OrigPHI->getIterator()); + continue; + } + if (Instruction *InValI = dyn_cast(InVal)) if (It > 1 && L->contains(InValI)) InVal = LastValueMap[InValI]; @@ -832,7 +891,11 @@ llvm::UnrollLoop(Loop *L, UnrollLoopOptions ULO, LoopInfo *LI, PN->replaceAllUsesWith(PN->getIncomingValueForBlock(Preheader)); PN->eraseFromParent(); } else if (ULO.Count > 1) { + if (Reductions.contains(PN)) + continue; + Value *InVal = PN->removeIncomingValue(LatchBlock, false); + // If this value was defined in the loop, take the value defined by the // last iteration of the loop. if (Instruction *InValI = dyn_cast(InVal)) { @@ -1010,6 +1073,38 @@ llvm::UnrollLoop(Loop *L, UnrollLoopOptions ULO, LoopInfo *LI, } } + // If there are partial reductions, create code in the exit block to compute + // the final result and update users of the final result. + if (!PartialReductions.empty()) { + BasicBlock *ExitBlock = L->getExitBlock(); + assert(ExitBlock && + "Can only introduce parallel reduction phis with single exit block"); + assert(Reductions.size() == 1 && + "currently only a single reduction is supported"); + Value *FinalRdxValue = PartialReductions.back(); + Value *RdxResult = nullptr; + for (PHINode &Phi : ExitBlock->phis()) { + if (Phi.getIncomingValueForBlock(L->getLoopLatch()) != FinalRdxValue) + continue; + if (!RdxResult) { + RdxResult = PartialReductions.front(); + IRBuilder Builder(ExitBlock, ExitBlock->getFirstNonPHIIt()); + RecurKind RK = Reductions.begin()->second.getRecurrenceKind(); + for (Instruction *RdxPart : drop_begin(PartialReductions)) { + RdxResult = Builder.CreateBinOp( + (Instruction::BinaryOps)RecurrenceDescriptor::getOpcode(RK), + RdxPart, RdxResult, "bin.rdx"); + } + NeedToFixLCSSA = true; + for (Instruction *RdxPart : PartialReductions) + RdxPart->dropPoisonGeneratingFlags(); + } + + Phi.replaceAllUsesWith(RdxResult); + continue; + } + } + if (DTUToUse) { // Apply updates to the DomTree. DT = &DTU.getDomTree(); @@ -1111,3 +1206,38 @@ MDNode *llvm::GetUnrollMetadata(MDNode *LoopID, StringRef Name) { } return nullptr; } + +std::optional +llvm::canParallelizeReductionWhenUnrolling(PHINode &Phi, Loop *L, + ScalarEvolution *SE) { + RecurrenceDescriptor RedDes; + if (!RecurrenceDescriptor::isReductionPHI(&Phi, L, RedDes, + /*DemandedBits=*/nullptr, + /*AC=*/nullptr, /*DT=*/nullptr, SE)) + return std::nullopt; + RecurKind RK = RedDes.getRecurrenceKind(); + // Skip unsupported reductions. + // TODO: Handle additional reductions, including FP and min-max + // reductions. + if (!RecurrenceDescriptor::isIntegerRecurrenceKind(RK) || + RecurrenceDescriptor::isAnyOfRecurrenceKind(RK) || + RecurrenceDescriptor::isFindIVRecurrenceKind(RK) || + RecurrenceDescriptor::isMinMaxRecurrenceKind(RK)) + return std::nullopt; + + // Don't unroll reductions with constant ops; those can be folded to a + // single induction update. + if (any_of(cast(Phi.getIncomingValueForBlock(L->getLoopLatch())) + ->operands(), + IsaPred)) + return std::nullopt; + + BasicBlock *Latch = L->getLoopLatch(); + if (!Latch || + !is_contained( + cast(Phi.getIncomingValueForBlock(Latch))->operands(), + &Phi)) + return std::nullopt; + + return RedDes; +} diff --git a/llvm/test/Transforms/LoopUnroll/AArch64/apple-unrolling.ll b/llvm/test/Transforms/LoopUnroll/AArch64/apple-unrolling.ll index 0b78beea54aa9..e219e2720b5db 100644 --- a/llvm/test/Transforms/LoopUnroll/AArch64/apple-unrolling.ll +++ b/llvm/test/Transforms/LoopUnroll/AArch64/apple-unrolling.ll @@ -585,16 +585,34 @@ define i32 @test_add_reduction_unroll_partial(ptr %a, i64 noundef %n) { ; APPLE-NEXT: [[ENTRY:.*]]: ; APPLE-NEXT: br label %[[LOOP:.*]] ; APPLE: [[LOOP]]: -; APPLE-NEXT: [[IV:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ] +; APPLE-NEXT: [[IV:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[IV_NEXT_3:%.*]], %[[LOOP]] ] +; APPLE-NEXT: [[RDX_1:%.*]] = phi i32 [ 0, %[[ENTRY]] ], [ [[RDX_NEXT_1:%.*]], %[[LOOP]] ] +; APPLE-NEXT: [[RDX_2:%.*]] = phi i32 [ 0, %[[ENTRY]] ], [ [[RDX_NEXT_2:%.*]], %[[LOOP]] ] +; APPLE-NEXT: [[RDX_3:%.*]] = phi i32 [ 0, %[[ENTRY]] ], [ [[RDX_NEXT_3:%.*]], %[[LOOP]] ] ; APPLE-NEXT: [[RDX:%.*]] = phi i32 [ 0, %[[ENTRY]] ], [ [[RDX_NEXT:%.*]], %[[LOOP]] ] ; APPLE-NEXT: [[GEP_A:%.*]] = getelementptr inbounds nuw i32, ptr [[A]], i64 [[IV]] ; APPLE-NEXT: [[TMP0:%.*]] = load i32, ptr [[GEP_A]], align 2 -; APPLE-NEXT: [[RDX_NEXT]] = add nuw nsw i32 [[RDX]], [[TMP0]] -; APPLE-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1 -; APPLE-NEXT: [[EC:%.*]] = icmp eq i64 [[IV_NEXT]], 1024 -; APPLE-NEXT: br i1 [[EC]], label %[[EXIT:.*]], label %[[LOOP]] +; APPLE-NEXT: [[RDX_NEXT]] = add i32 [[RDX]], [[TMP0]] +; APPLE-NEXT: [[IV_NEXT:%.*]] = add nuw nsw i64 [[IV]], 1 +; APPLE-NEXT: [[GEP_A_1:%.*]] = getelementptr inbounds nuw i32, ptr [[A]], i64 [[IV_NEXT]] +; APPLE-NEXT: [[TMP1:%.*]] = load i32, ptr [[GEP_A_1]], align 2 +; APPLE-NEXT: [[RDX_NEXT_1]] = add i32 [[RDX_1]], [[TMP1]] +; APPLE-NEXT: [[IV_NEXT_1:%.*]] = add nuw nsw i64 [[IV]], 2 +; APPLE-NEXT: [[GEP_A_2:%.*]] = getelementptr inbounds nuw i32, ptr [[A]], i64 [[IV_NEXT_1]] +; APPLE-NEXT: [[TMP2:%.*]] = load i32, ptr [[GEP_A_2]], align 2 +; APPLE-NEXT: [[RDX_NEXT_2]] = add i32 [[RDX_2]], [[TMP2]] +; APPLE-NEXT: [[IV_NEXT_2:%.*]] = add nuw nsw i64 [[IV]], 3 +; APPLE-NEXT: [[GEP_A_3:%.*]] = getelementptr inbounds nuw i32, ptr [[A]], i64 [[IV_NEXT_2]] +; APPLE-NEXT: [[TMP3:%.*]] = load i32, ptr [[GEP_A_3]], align 2 +; APPLE-NEXT: [[RDX_NEXT_3]] = add i32 [[RDX_3]], [[TMP3]] +; APPLE-NEXT: [[IV_NEXT_3]] = add nuw nsw i64 [[IV]], 4 +; APPLE-NEXT: [[EC_3:%.*]] = icmp eq i64 [[IV_NEXT_3]], 1024 +; APPLE-NEXT: br i1 [[EC_3]], label %[[EXIT:.*]], label %[[LOOP]] ; APPLE: [[EXIT]]: -; APPLE-NEXT: [[BIN_RDX2:%.*]] = phi i32 [ [[RDX_NEXT]], %[[LOOP]] ] +; APPLE-NEXT: [[RES:%.*]] = phi i32 [ [[RDX_NEXT_3]], %[[LOOP]] ] +; APPLE-NEXT: [[BIN_RDX:%.*]] = add i32 [[RDX_NEXT_1]], [[RDX_NEXT]] +; APPLE-NEXT: [[BIN_RDX1:%.*]] = add i32 [[RDX_NEXT_2]], [[BIN_RDX]] +; APPLE-NEXT: [[BIN_RDX2:%.*]] = add i32 [[RDX_NEXT_3]], [[BIN_RDX1]] ; APPLE-NEXT: ret i32 [[BIN_RDX2]] ; ; OTHER-LABEL: define i32 @test_add_reduction_unroll_partial( @@ -603,27 +621,33 @@ define i32 @test_add_reduction_unroll_partial(ptr %a, i64 noundef %n) { ; OTHER-NEXT: br label %[[LOOP:.*]] ; OTHER: [[LOOP]]: ; OTHER-NEXT: [[IV:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[IV_NEXT_3:%.*]], %[[LOOP]] ] -; OTHER-NEXT: [[RDX:%.*]] = phi i32 [ 0, %[[ENTRY]] ], [ [[RDX_NEXT_3:%.*]], %[[LOOP]] ] +; OTHER-NEXT: [[RDX_1:%.*]] = phi i32 [ 0, %[[ENTRY]] ], [ [[RDX_NEXT_1:%.*]], %[[LOOP]] ] +; OTHER-NEXT: [[RDX_2:%.*]] = phi i32 [ 0, %[[ENTRY]] ], [ [[RDX_NEXT_2:%.*]], %[[LOOP]] ] +; OTHER-NEXT: [[RDX_3:%.*]] = phi i32 [ 0, %[[ENTRY]] ], [ [[RDX_NEXT_3:%.*]], %[[LOOP]] ] +; OTHER-NEXT: [[RDX:%.*]] = phi i32 [ 0, %[[ENTRY]] ], [ [[RDX_NEXT:%.*]], %[[LOOP]] ] ; OTHER-NEXT: [[GEP_A:%.*]] = getelementptr inbounds nuw i32, ptr [[A]], i64 [[IV]] ; OTHER-NEXT: [[TMP0:%.*]] = load i32, ptr [[GEP_A]], align 2 -; OTHER-NEXT: [[RDX_NEXT:%.*]] = add nuw nsw i32 [[RDX]], [[TMP0]] +; OTHER-NEXT: [[RDX_NEXT]] = add i32 [[RDX]], [[TMP0]] ; OTHER-NEXT: [[IV_NEXT:%.*]] = add nuw nsw i64 [[IV]], 1 ; OTHER-NEXT: [[GEP_A_1:%.*]] = getelementptr inbounds nuw i32, ptr [[A]], i64 [[IV_NEXT]] ; OTHER-NEXT: [[TMP1:%.*]] = load i32, ptr [[GEP_A_1]], align 2 -; OTHER-NEXT: [[RDX_2:%.*]] = add nuw nsw i32 [[RDX_NEXT]], [[TMP1]] +; OTHER-NEXT: [[RDX_NEXT_1]] = add i32 [[RDX_1]], [[TMP1]] ; OTHER-NEXT: [[IV_NEXT_1:%.*]] = add nuw nsw i64 [[IV]], 2 ; OTHER-NEXT: [[GEP_A_2:%.*]] = getelementptr inbounds nuw i32, ptr [[A]], i64 [[IV_NEXT_1]] ; OTHER-NEXT: [[TMP2:%.*]] = load i32, ptr [[GEP_A_2]], align 2 -; OTHER-NEXT: [[RDX_NEXT_2:%.*]] = add nuw nsw i32 [[RDX_2]], [[TMP2]] +; OTHER-NEXT: [[RDX_NEXT_2]] = add i32 [[RDX_2]], [[TMP2]] ; OTHER-NEXT: [[IV_NEXT_2:%.*]] = add nuw nsw i64 [[IV]], 3 ; OTHER-NEXT: [[GEP_A_3:%.*]] = getelementptr inbounds nuw i32, ptr [[A]], i64 [[IV_NEXT_2]] ; OTHER-NEXT: [[TMP3:%.*]] = load i32, ptr [[GEP_A_3]], align 2 -; OTHER-NEXT: [[RDX_NEXT_3]] = add nuw nsw i32 [[RDX_NEXT_2]], [[TMP3]] +; OTHER-NEXT: [[RDX_NEXT_3]] = add i32 [[RDX_3]], [[TMP3]] ; OTHER-NEXT: [[IV_NEXT_3]] = add nuw nsw i64 [[IV]], 4 ; OTHER-NEXT: [[EC_3:%.*]] = icmp eq i64 [[IV_NEXT_3]], 1024 ; OTHER-NEXT: br i1 [[EC_3]], label %[[EXIT:.*]], label %[[LOOP]] ; OTHER: [[EXIT]]: -; OTHER-NEXT: [[BIN_RDX2:%.*]] = phi i32 [ [[RDX_NEXT_3]], %[[LOOP]] ] +; OTHER-NEXT: [[RES:%.*]] = phi i32 [ [[RDX_NEXT_3]], %[[LOOP]] ] +; OTHER-NEXT: [[BIN_RDX:%.*]] = add i32 [[RDX_NEXT_1]], [[RDX_NEXT]] +; OTHER-NEXT: [[BIN_RDX1:%.*]] = add i32 [[RDX_NEXT_2]], [[BIN_RDX]] +; OTHER-NEXT: [[BIN_RDX2:%.*]] = add i32 [[RDX_NEXT_3]], [[BIN_RDX1]] ; OTHER-NEXT: ret i32 [[BIN_RDX2]] ; entry: @@ -725,21 +749,42 @@ define i32 @test_add_and_mul_reduction_unroll_partial(ptr %a, i64 noundef %n) { ; APPLE-NEXT: [[ENTRY:.*]]: ; APPLE-NEXT: br label %[[LOOP:.*]] ; APPLE: [[LOOP]]: -; APPLE-NEXT: [[IV:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ] -; APPLE-NEXT: [[RDX:%.*]] = phi i32 [ 0, %[[ENTRY]] ], [ [[RDX_NEXT:%.*]], %[[LOOP]] ] -; APPLE-NEXT: [[RDX_2:%.*]] = phi i32 [ 0, %[[ENTRY]] ], [ [[RDX_2_NEXT:%.*]], %[[LOOP]] ] +; APPLE-NEXT: [[IV:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[IV_NEXT_3:%.*]], %[[LOOP]] ] +; APPLE-NEXT: [[RDX_1:%.*]] = phi i32 [ 0, %[[ENTRY]] ], [ [[BIN_RDX3:%.*]], %[[LOOP]] ] +; APPLE-NEXT: [[RDX_21:%.*]] = phi i32 [ 0, %[[ENTRY]] ], [ [[RDX_NEXT_2:%.*]], %[[LOOP]] ] +; APPLE-NEXT: [[RDX_3:%.*]] = phi i32 [ 0, %[[ENTRY]] ], [ [[RDX_NEXT_3:%.*]], %[[LOOP]] ] +; APPLE-NEXT: [[RDX:%.*]] = phi i32 [ 0, %[[ENTRY]] ], [ [[RES_2:%.*]], %[[LOOP]] ] +; APPLE-NEXT: [[RDX_2:%.*]] = phi i32 [ 0, %[[ENTRY]] ], [ [[RDX_2_NEXT_3:%.*]], %[[LOOP]] ] ; APPLE-NEXT: [[GEP_A:%.*]] = getelementptr inbounds nuw i32, ptr [[A]], i64 [[IV]] ; APPLE-NEXT: [[TMP0:%.*]] = load i32, ptr [[GEP_A]], align 2 -; APPLE-NEXT: [[RDX_NEXT]] = add nuw nsw i32 [[RDX]], [[TMP0]] -; APPLE-NEXT: [[RDX_2_NEXT]] = mul i32 [[RDX_2]], [[TMP0]] -; APPLE-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1 -; APPLE-NEXT: [[EC:%.*]] = icmp eq i64 [[IV_NEXT]], 1024 -; APPLE-NEXT: br i1 [[EC]], label %[[EXIT:.*]], label %[[LOOP]] +; APPLE-NEXT: [[RES_2]] = add i32 [[RDX]], [[TMP0]] +; APPLE-NEXT: [[RDX_2_NEXT:%.*]] = mul i32 [[RDX_2]], [[TMP0]] +; APPLE-NEXT: [[IV_NEXT:%.*]] = add nuw nsw i64 [[IV]], 1 +; APPLE-NEXT: [[GEP_A_1:%.*]] = getelementptr inbounds nuw i32, ptr [[A]], i64 [[IV_NEXT]] +; APPLE-NEXT: [[TMP1:%.*]] = load i32, ptr [[GEP_A_1]], align 2 +; APPLE-NEXT: [[BIN_RDX3]] = add i32 [[RDX_1]], [[TMP1]] +; APPLE-NEXT: [[RDX_2_NEXT_1:%.*]] = mul i32 [[RDX_2_NEXT]], [[TMP1]] +; APPLE-NEXT: [[IV_NEXT_1:%.*]] = add nuw nsw i64 [[IV]], 2 +; APPLE-NEXT: [[GEP_A_2:%.*]] = getelementptr inbounds nuw i32, ptr [[A]], i64 [[IV_NEXT_1]] +; APPLE-NEXT: [[TMP2:%.*]] = load i32, ptr [[GEP_A_2]], align 2 +; APPLE-NEXT: [[RDX_NEXT_2]] = add i32 [[RDX_21]], [[TMP2]] +; APPLE-NEXT: [[RDX_2_NEXT_2:%.*]] = mul i32 [[RDX_2_NEXT_1]], [[TMP2]] +; APPLE-NEXT: [[IV_NEXT_2:%.*]] = add nuw nsw i64 [[IV]], 3 +; APPLE-NEXT: [[GEP_A_3:%.*]] = getelementptr inbounds nuw i32, ptr [[A]], i64 [[IV_NEXT_2]] +; APPLE-NEXT: [[TMP3:%.*]] = load i32, ptr [[GEP_A_3]], align 2 +; APPLE-NEXT: [[RDX_NEXT_3]] = add i32 [[RDX_3]], [[TMP3]] +; APPLE-NEXT: [[RDX_2_NEXT_3]] = mul i32 [[RDX_2_NEXT_2]], [[TMP3]] +; APPLE-NEXT: [[IV_NEXT_3]] = add nuw nsw i64 [[IV]], 4 +; APPLE-NEXT: [[EC_3:%.*]] = icmp eq i64 [[IV_NEXT_3]], 1024 +; APPLE-NEXT: br i1 [[EC_3]], label %[[EXIT:.*]], label %[[LOOP]] ; APPLE: [[EXIT]]: -; APPLE-NEXT: [[BIN_RDX3:%.*]] = phi i32 [ [[RDX_NEXT]], %[[LOOP]] ] -; APPLE-NEXT: [[RES_2:%.*]] = phi i32 [ [[RDX_2_NEXT]], %[[LOOP]] ] +; APPLE-NEXT: [[RES_1:%.*]] = phi i32 [ [[RDX_NEXT_3]], %[[LOOP]] ] +; APPLE-NEXT: [[RES_3:%.*]] = phi i32 [ [[RDX_2_NEXT_3]], %[[LOOP]] ] ; APPLE-NEXT: [[SUM:%.*]] = add i32 [[BIN_RDX3]], [[RES_2]] -; APPLE-NEXT: ret i32 [[SUM]] +; APPLE-NEXT: [[BIN_RDX2:%.*]] = add i32 [[RDX_NEXT_2]], [[SUM]] +; APPLE-NEXT: [[BIN_RDX4:%.*]] = add i32 [[RDX_NEXT_3]], [[BIN_RDX2]] +; APPLE-NEXT: [[SUM1:%.*]] = add i32 [[BIN_RDX4]], [[RES_3]] +; APPLE-NEXT: ret i32 [[SUM1]] ; ; OTHER-LABEL: define i32 @test_add_and_mul_reduction_unroll_partial( ; OTHER-SAME: ptr [[A:%.*]], i64 noundef [[N:%.*]]) #[[ATTR0]] { @@ -747,23 +792,25 @@ define i32 @test_add_and_mul_reduction_unroll_partial(ptr %a, i64 noundef %n) { ; OTHER-NEXT: br label %[[LOOP:.*]] ; OTHER: [[LOOP]]: ; OTHER-NEXT: [[IV:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[IV_NEXT_1:%.*]], %[[LOOP]] ] -; OTHER-NEXT: [[RDX:%.*]] = phi i32 [ 0, %[[ENTRY]] ], [ [[RDX_NEXT_1:%.*]], %[[LOOP]] ] +; OTHER-NEXT: [[RDX_1:%.*]] = phi i32 [ 0, %[[ENTRY]] ], [ [[RDX_NEXT_1:%.*]], %[[LOOP]] ] +; OTHER-NEXT: [[RDX:%.*]] = phi i32 [ 0, %[[ENTRY]] ], [ [[RDX_NEXT:%.*]], %[[LOOP]] ] ; OTHER-NEXT: [[RDX_2:%.*]] = phi i32 [ 0, %[[ENTRY]] ], [ [[RDX_2_NEXT_1:%.*]], %[[LOOP]] ] ; OTHER-NEXT: [[GEP_A:%.*]] = getelementptr inbounds nuw i32, ptr [[A]], i64 [[IV]] ; OTHER-NEXT: [[TMP0:%.*]] = load i32, ptr [[GEP_A]], align 2 -; OTHER-NEXT: [[RDX_NEXT:%.*]] = add nuw nsw i32 [[RDX]], [[TMP0]] +; OTHER-NEXT: [[RDX_NEXT]] = add i32 [[RDX]], [[TMP0]] ; OTHER-NEXT: [[RDX_2_NEXT:%.*]] = mul i32 [[RDX_2]], [[TMP0]] ; OTHER-NEXT: [[IV_NEXT:%.*]] = add nuw nsw i64 [[IV]], 1 ; OTHER-NEXT: [[GEP_A_1:%.*]] = getelementptr inbounds nuw i32, ptr [[A]], i64 [[IV_NEXT]] ; OTHER-NEXT: [[TMP1:%.*]] = load i32, ptr [[GEP_A_1]], align 2 -; OTHER-NEXT: [[RDX_NEXT_1]] = add nuw nsw i32 [[RDX_NEXT]], [[TMP1]] +; OTHER-NEXT: [[RDX_NEXT_1]] = add i32 [[RDX_1]], [[TMP1]] ; OTHER-NEXT: [[RDX_2_NEXT_1]] = mul i32 [[RDX_2_NEXT]], [[TMP1]] ; OTHER-NEXT: [[IV_NEXT_1]] = add nuw nsw i64 [[IV]], 2 ; OTHER-NEXT: [[EC_1:%.*]] = icmp eq i64 [[IV_NEXT_1]], 1024 ; OTHER-NEXT: br i1 [[EC_1]], label %[[EXIT:.*]], label %[[LOOP]] ; OTHER: [[EXIT]]: -; OTHER-NEXT: [[BIN_RDX:%.*]] = phi i32 [ [[RDX_NEXT_1]], %[[LOOP]] ] +; OTHER-NEXT: [[RES_1:%.*]] = phi i32 [ [[RDX_NEXT_1]], %[[LOOP]] ] ; OTHER-NEXT: [[RES_2:%.*]] = phi i32 [ [[RDX_2_NEXT_1]], %[[LOOP]] ] +; OTHER-NEXT: [[BIN_RDX:%.*]] = add i32 [[RDX_NEXT_1]], [[RDX_NEXT]] ; OTHER-NEXT: [[SUM:%.*]] = add i32 [[BIN_RDX]], [[RES_2]] ; OTHER-NEXT: ret i32 [[SUM]] ; @@ -794,18 +841,72 @@ define i32 @test_add_reduction_runtime(ptr %a, i64 noundef %n) { ; APPLE-LABEL: define i32 @test_add_reduction_runtime( ; APPLE-SAME: ptr [[A:%.*]], i64 noundef [[N:%.*]]) #[[ATTR0]] { ; APPLE-NEXT: [[ENTRY:.*]]: +; APPLE-NEXT: [[TMP0:%.*]] = add i64 [[N]], -1 +; APPLE-NEXT: [[XTRAITER:%.*]] = and i64 [[N]], 3 +; APPLE-NEXT: [[TMP1:%.*]] = icmp ult i64 [[TMP0]], 3 +; APPLE-NEXT: br i1 [[TMP1]], label %[[EXIT_UNR_LCSSA:.*]], label %[[ENTRY_NEW:.*]] +; APPLE: [[ENTRY_NEW]]: +; APPLE-NEXT: [[UNROLL_ITER:%.*]] = sub i64 [[N]], [[XTRAITER]] ; APPLE-NEXT: br label %[[LOOP:.*]] ; APPLE: [[LOOP]]: -; APPLE-NEXT: [[IV_EPIL:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[IV_NEXT_EPIL:%.*]], %[[LOOP]] ] -; APPLE-NEXT: [[RDX_EPIL:%.*]] = phi i32 [ 0, %[[ENTRY]] ], [ [[RDX_NEXT_EPIL:%.*]], %[[LOOP]] ] +; APPLE-NEXT: [[IV_EPIL:%.*]] = phi i64 [ 0, %[[ENTRY_NEW]] ], [ [[IV_NEXT_3:%.*]], %[[LOOP]] ] +; APPLE-NEXT: [[RDX_1:%.*]] = phi i32 [ 0, %[[ENTRY_NEW]] ], [ [[RDX_NEXT_1:%.*]], %[[LOOP]] ] +; APPLE-NEXT: [[RDX_2:%.*]] = phi i32 [ 0, %[[ENTRY_NEW]] ], [ [[RDX_NEXT_2:%.*]], %[[LOOP]] ] +; APPLE-NEXT: [[RDX_3:%.*]] = phi i32 [ 0, %[[ENTRY_NEW]] ], [ [[RDX_NEXT_3:%.*]], %[[LOOP]] ] +; APPLE-NEXT: [[RDX:%.*]] = phi i32 [ 0, %[[ENTRY_NEW]] ], [ [[RDX_NEXT:%.*]], %[[LOOP]] ] +; APPLE-NEXT: [[NITER:%.*]] = phi i64 [ 0, %[[ENTRY_NEW]] ], [ [[NITER_NEXT_3:%.*]], %[[LOOP]] ] ; APPLE-NEXT: [[GEP_A_EPIL:%.*]] = getelementptr inbounds nuw i32, ptr [[A]], i64 [[IV_EPIL]] ; APPLE-NEXT: [[TMP6:%.*]] = load i32, ptr [[GEP_A_EPIL]], align 2 -; APPLE-NEXT: [[RDX_NEXT_EPIL]] = add nuw nsw i32 [[RDX_EPIL]], [[TMP6]] -; APPLE-NEXT: [[IV_NEXT_EPIL]] = add nuw nsw i64 [[IV_EPIL]], 1 +; APPLE-NEXT: [[RDX_NEXT]] = add i32 [[RDX]], [[TMP6]] +; APPLE-NEXT: [[IV_NEXT:%.*]] = add nuw nsw i64 [[IV_EPIL]], 1 +; APPLE-NEXT: [[GEP_A_1:%.*]] = getelementptr inbounds nuw i32, ptr [[A]], i64 [[IV_NEXT]] +; APPLE-NEXT: [[TMP3:%.*]] = load i32, ptr [[GEP_A_1]], align 2 +; APPLE-NEXT: [[RDX_NEXT_1]] = add i32 [[RDX_1]], [[TMP3]] +; APPLE-NEXT: [[IV_NEXT_1:%.*]] = add nuw nsw i64 [[IV_EPIL]], 2 +; APPLE-NEXT: [[GEP_A_2:%.*]] = getelementptr inbounds nuw i32, ptr [[A]], i64 [[IV_NEXT_1]] +; APPLE-NEXT: [[TMP4:%.*]] = load i32, ptr [[GEP_A_2]], align 2 +; APPLE-NEXT: [[RDX_NEXT_2]] = add i32 [[RDX_2]], [[TMP4]] +; APPLE-NEXT: [[IV_NEXT_2:%.*]] = add nuw nsw i64 [[IV_EPIL]], 3 +; APPLE-NEXT: [[GEP_A_3:%.*]] = getelementptr inbounds nuw i32, ptr [[A]], i64 [[IV_NEXT_2]] +; APPLE-NEXT: [[TMP5:%.*]] = load i32, ptr [[GEP_A_3]], align 2 +; APPLE-NEXT: [[RDX_NEXT_3]] = add i32 [[RDX_3]], [[TMP5]] +; APPLE-NEXT: [[IV_NEXT_3]] = add nuw nsw i64 [[IV_EPIL]], 4 +; APPLE-NEXT: [[NITER_NEXT_3]] = add nuw i64 [[NITER]], 4 +; APPLE-NEXT: [[NITER_NCMP_3:%.*]] = icmp eq i64 [[NITER_NEXT_3]], [[UNROLL_ITER]] +; APPLE-NEXT: br i1 [[NITER_NCMP_3]], label %[[EXIT_UNR_LCSSA_LOOPEXIT:.*]], label %[[LOOP]] +; APPLE: [[EXIT_UNR_LCSSA_LOOPEXIT]]: +; APPLE-NEXT: [[RES_PH_PH:%.*]] = phi i32 [ [[RDX_NEXT_3]], %[[LOOP]] ] +; APPLE-NEXT: [[IV_UNR_PH:%.*]] = phi i64 [ [[IV_NEXT_3]], %[[LOOP]] ] +; APPLE-NEXT: [[RDX_UNR_PH:%.*]] = phi i32 [ [[RDX_NEXT_3]], %[[LOOP]] ] +; APPLE-NEXT: [[BIN_RDX:%.*]] = add i32 [[RDX_NEXT_1]], [[RDX_NEXT]] +; APPLE-NEXT: [[BIN_RDX2:%.*]] = add i32 [[RDX_NEXT_2]], [[BIN_RDX]] +; APPLE-NEXT: [[BIN_RDX3:%.*]] = add i32 [[RDX_NEXT_3]], [[BIN_RDX2]] +; APPLE-NEXT: br label %[[EXIT_UNR_LCSSA]] +; APPLE: [[EXIT_UNR_LCSSA]]: +; APPLE-NEXT: [[RES_PH:%.*]] = phi i32 [ poison, %[[ENTRY]] ], [ [[BIN_RDX3]], %[[EXIT_UNR_LCSSA_LOOPEXIT]] ] +; APPLE-NEXT: [[IV_UNR:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[IV_UNR_PH]], %[[EXIT_UNR_LCSSA_LOOPEXIT]] ] +; APPLE-NEXT: [[RDX_UNR:%.*]] = phi i32 [ 0, %[[ENTRY]] ], [ [[BIN_RDX3]], %[[EXIT_UNR_LCSSA_LOOPEXIT]] ] +; APPLE-NEXT: [[LCMP_MOD:%.*]] = icmp ne i64 [[XTRAITER]], 0 +; APPLE-NEXT: br i1 [[LCMP_MOD]], label %[[LOOP_EPIL_PREHEADER:.*]], label %[[EXIT:.*]] +; APPLE: [[LOOP_EPIL_PREHEADER]]: +; APPLE-NEXT: br label %[[LOOP_EPIL:.*]] +; APPLE: [[LOOP_EPIL]]: +; APPLE-NEXT: [[IV_EPIL1:%.*]] = phi i64 [ [[IV_UNR]], %[[LOOP_EPIL_PREHEADER]] ], [ [[IV_NEXT_EPIL:%.*]], %[[LOOP_EPIL]] ] +; APPLE-NEXT: [[RDX_EPIL:%.*]] = phi i32 [ [[RDX_UNR]], %[[LOOP_EPIL_PREHEADER]] ], [ [[RDX_NEXT_EPIL:%.*]], %[[LOOP_EPIL]] ] +; APPLE-NEXT: [[EPIL_ITER:%.*]] = phi i64 [ 0, %[[LOOP_EPIL_PREHEADER]] ], [ [[EPIL_ITER_NEXT:%.*]], %[[LOOP_EPIL]] ] +; APPLE-NEXT: [[GEP_A_EPIL1:%.*]] = getelementptr inbounds nuw i32, ptr [[A]], i64 [[IV_EPIL1]] +; APPLE-NEXT: [[TMP7:%.*]] = load i32, ptr [[GEP_A_EPIL1]], align 2 +; APPLE-NEXT: [[RDX_NEXT_EPIL]] = add nuw nsw i32 [[RDX_EPIL]], [[TMP7]] +; APPLE-NEXT: [[IV_NEXT_EPIL]] = add nuw nsw i64 [[IV_EPIL1]], 1 ; APPLE-NEXT: [[EC_EPIL:%.*]] = icmp eq i64 [[IV_NEXT_EPIL]], [[N]] -; APPLE-NEXT: br i1 [[EC_EPIL]], label %[[EXIT:.*]], label %[[LOOP]] +; APPLE-NEXT: [[EPIL_ITER_NEXT]] = add i64 [[EPIL_ITER]], 1 +; APPLE-NEXT: [[EPIL_ITER_CMP:%.*]] = icmp ne i64 [[EPIL_ITER_NEXT]], [[XTRAITER]] +; APPLE-NEXT: br i1 [[EPIL_ITER_CMP]], label %[[LOOP_EPIL]], label %[[EXIT_EPILOG_LCSSA:.*]], !llvm.loop [[LOOP3:![0-9]+]] +; APPLE: [[EXIT_EPILOG_LCSSA]]: +; APPLE-NEXT: [[RES_PH1:%.*]] = phi i32 [ [[RDX_NEXT_EPIL]], %[[LOOP_EPIL]] ] +; APPLE-NEXT: br label %[[EXIT]] ; APPLE: [[EXIT]]: -; APPLE-NEXT: [[RES:%.*]] = phi i32 [ [[RDX_NEXT_EPIL]], %[[LOOP]] ] +; APPLE-NEXT: [[RES:%.*]] = phi i32 [ [[RES_PH]], %[[EXIT_UNR_LCSSA]] ], [ [[RES_PH1]], %[[EXIT_EPILOG_LCSSA]] ] ; APPLE-NEXT: ret i32 [[RES]] ; ; OTHER-LABEL: define i32 @test_add_reduction_runtime( @@ -820,23 +921,26 @@ define i32 @test_add_reduction_runtime(ptr %a, i64 noundef %n) { ; OTHER-NEXT: br label %[[LOOP:.*]] ; OTHER: [[LOOP]]: ; OTHER-NEXT: [[IV:%.*]] = phi i64 [ 0, %[[ENTRY_NEW]] ], [ [[IV_NEXT_3:%.*]], %[[LOOP]] ] -; OTHER-NEXT: [[RDX:%.*]] = phi i32 [ 0, %[[ENTRY_NEW]] ], [ [[RDX_NEXT_3:%.*]], %[[LOOP]] ] +; OTHER-NEXT: [[RDX_1:%.*]] = phi i32 [ 0, %[[ENTRY_NEW]] ], [ [[RDX_NEXT_1:%.*]], %[[LOOP]] ] +; OTHER-NEXT: [[RDX_2:%.*]] = phi i32 [ 0, %[[ENTRY_NEW]] ], [ [[RDX_NEXT_2:%.*]], %[[LOOP]] ] +; OTHER-NEXT: [[RDX_3:%.*]] = phi i32 [ 0, %[[ENTRY_NEW]] ], [ [[RDX_NEXT_3:%.*]], %[[LOOP]] ] +; OTHER-NEXT: [[RDX:%.*]] = phi i32 [ 0, %[[ENTRY_NEW]] ], [ [[RDX_NEXT:%.*]], %[[LOOP]] ] ; OTHER-NEXT: [[NITER:%.*]] = phi i64 [ 0, %[[ENTRY_NEW]] ], [ [[NITER_NEXT_3:%.*]], %[[LOOP]] ] ; OTHER-NEXT: [[GEP_A:%.*]] = getelementptr inbounds nuw i32, ptr [[A]], i64 [[IV]] ; OTHER-NEXT: [[TMP2:%.*]] = load i32, ptr [[GEP_A]], align 2 -; OTHER-NEXT: [[RDX_NEXT:%.*]] = add nuw nsw i32 [[RDX]], [[TMP2]] +; OTHER-NEXT: [[RDX_NEXT]] = add i32 [[RDX]], [[TMP2]] ; OTHER-NEXT: [[IV_NEXT:%.*]] = add nuw nsw i64 [[IV]], 1 ; OTHER-NEXT: [[GEP_A_1:%.*]] = getelementptr inbounds nuw i32, ptr [[A]], i64 [[IV_NEXT]] ; OTHER-NEXT: [[TMP3:%.*]] = load i32, ptr [[GEP_A_1]], align 2 -; OTHER-NEXT: [[RDX_2:%.*]] = add nuw nsw i32 [[RDX_NEXT]], [[TMP3]] +; OTHER-NEXT: [[RDX_NEXT_1]] = add i32 [[RDX_1]], [[TMP3]] ; OTHER-NEXT: [[IV_NEXT_1:%.*]] = add nuw nsw i64 [[IV]], 2 ; OTHER-NEXT: [[GEP_A_2:%.*]] = getelementptr inbounds nuw i32, ptr [[A]], i64 [[IV_NEXT_1]] ; OTHER-NEXT: [[TMP4:%.*]] = load i32, ptr [[GEP_A_2]], align 2 -; OTHER-NEXT: [[RDX_NEXT_2:%.*]] = add nuw nsw i32 [[RDX_2]], [[TMP4]] +; OTHER-NEXT: [[RDX_NEXT_2]] = add i32 [[RDX_2]], [[TMP4]] ; OTHER-NEXT: [[IV_NEXT_2:%.*]] = add nuw nsw i64 [[IV]], 3 ; OTHER-NEXT: [[GEP_A_3:%.*]] = getelementptr inbounds nuw i32, ptr [[A]], i64 [[IV_NEXT_2]] ; OTHER-NEXT: [[TMP5:%.*]] = load i32, ptr [[GEP_A_3]], align 2 -; OTHER-NEXT: [[RDX_NEXT_3]] = add nuw nsw i32 [[RDX_NEXT_2]], [[TMP5]] +; OTHER-NEXT: [[RDX_NEXT_3]] = add i32 [[RDX_3]], [[TMP5]] ; OTHER-NEXT: [[IV_NEXT_3]] = add nuw nsw i64 [[IV]], 4 ; OTHER-NEXT: [[NITER_NEXT_3]] = add i64 [[NITER]], 4 ; OTHER-NEXT: [[NITER_NCMP_3:%.*]] = icmp eq i64 [[NITER_NEXT_3]], [[UNROLL_ITER]] @@ -845,11 +949,14 @@ define i32 @test_add_reduction_runtime(ptr %a, i64 noundef %n) { ; OTHER-NEXT: [[RES_PH_PH:%.*]] = phi i32 [ [[RDX_NEXT_3]], %[[LOOP]] ] ; OTHER-NEXT: [[IV_UNR_PH:%.*]] = phi i64 [ [[IV_NEXT_3]], %[[LOOP]] ] ; OTHER-NEXT: [[RDX_UNR_PH:%.*]] = phi i32 [ [[RDX_NEXT_3]], %[[LOOP]] ] +; OTHER-NEXT: [[BIN_RDX:%.*]] = add i32 [[RDX_NEXT_1]], [[RDX_NEXT]] +; OTHER-NEXT: [[BIN_RDX2:%.*]] = add i32 [[RDX_NEXT_2]], [[BIN_RDX]] +; OTHER-NEXT: [[BIN_RDX3:%.*]] = add i32 [[RDX_NEXT_3]], [[BIN_RDX2]] ; OTHER-NEXT: br label %[[EXIT_UNR_LCSSA]] ; OTHER: [[EXIT_UNR_LCSSA]]: -; OTHER-NEXT: [[RES_PH:%.*]] = phi i32 [ poison, %[[ENTRY]] ], [ [[RES_PH_PH]], %[[EXIT_UNR_LCSSA_LOOPEXIT]] ] +; OTHER-NEXT: [[RES_PH:%.*]] = phi i32 [ poison, %[[ENTRY]] ], [ [[BIN_RDX3]], %[[EXIT_UNR_LCSSA_LOOPEXIT]] ] ; OTHER-NEXT: [[IV_UNR:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[IV_UNR_PH]], %[[EXIT_UNR_LCSSA_LOOPEXIT]] ] -; OTHER-NEXT: [[RDX_UNR:%.*]] = phi i32 [ 0, %[[ENTRY]] ], [ [[RDX_UNR_PH]], %[[EXIT_UNR_LCSSA_LOOPEXIT]] ] +; OTHER-NEXT: [[RDX_UNR:%.*]] = phi i32 [ 0, %[[ENTRY]] ], [ [[BIN_RDX3]], %[[EXIT_UNR_LCSSA_LOOPEXIT]] ] ; OTHER-NEXT: [[LCMP_MOD:%.*]] = icmp ne i64 [[XTRAITER]], 0 ; OTHER-NEXT: br i1 [[LCMP_MOD]], label %[[LOOP_EPIL_PREHEADER:.*]], label %[[EXIT:.*]] ; OTHER: [[LOOP_EPIL_PREHEADER]]: @@ -894,6 +1001,7 @@ exit: ; APPLE: [[LOOP0]] = distinct !{[[LOOP0]], [[META1:![0-9]+]]} ; APPLE: [[META1]] = !{!"llvm.loop.unroll.disable"} ; APPLE: [[LOOP2]] = distinct !{[[LOOP2]], [[META1]]} +; APPLE: [[LOOP3]] = distinct !{[[LOOP3]], [[META1]]} ;. ; OTHER: [[LOOP0]] = distinct !{[[LOOP0]], [[META1:![0-9]+]]} ; OTHER: [[META1]] = !{!"llvm.loop.unroll.disable"} diff --git a/llvm/test/Transforms/LoopUnroll/ARM/instr-size-costs.ll b/llvm/test/Transforms/LoopUnroll/ARM/instr-size-costs.ll index 216bf489bc66e..64a2e558634b5 100644 --- a/llvm/test/Transforms/LoopUnroll/ARM/instr-size-costs.ll +++ b/llvm/test/Transforms/LoopUnroll/ARM/instr-size-costs.ll @@ -196,13 +196,14 @@ define i32 @test_i32_select_optsize(ptr %a, ptr %b, ptr %c) #0 { ; CHECK-V8: loop: ; CHECK-V8-NEXT: [[IV:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[COUNT_1:%.*]], [[LOOP]] ] ; CHECK-V8-NEXT: [[ACC:%.*]] = phi i32 [ 0, [[ENTRY]] ], [ [[ACC_NEXT_1:%.*]], [[LOOP]] ] +; CHECK-V8-NEXT: [[ACC1:%.*]] = phi i32 [ 0, [[ENTRY]] ], [ [[ACC_NEXT_2:%.*]], [[LOOP]] ] ; CHECK-V8-NEXT: [[ADDR_A:%.*]] = getelementptr i32, ptr [[A:%.*]], i32 [[IV]] ; CHECK-V8-NEXT: [[ADDR_B:%.*]] = getelementptr i32, ptr [[B:%.*]], i32 [[IV]] ; CHECK-V8-NEXT: [[DATA_A:%.*]] = load i32, ptr [[ADDR_A]], align 4 ; CHECK-V8-NEXT: [[DATA_B:%.*]] = load i32, ptr [[ADDR_B]], align 4 ; CHECK-V8-NEXT: [[UGT:%.*]] = icmp ugt i32 [[DATA_A]], [[DATA_B]] ; CHECK-V8-NEXT: [[UMAX:%.*]] = select i1 [[UGT]], i32 [[DATA_A]], i32 [[DATA_B]] -; CHECK-V8-NEXT: [[ACC_NEXT:%.*]] = add i32 [[UMAX]], [[ACC]] +; CHECK-V8-NEXT: [[ACC_NEXT_2]] = add i32 [[UMAX]], [[ACC1]] ; CHECK-V8-NEXT: [[ADDR_C:%.*]] = getelementptr i32, ptr [[C:%.*]], i32 [[IV]] ; CHECK-V8-NEXT: store i32 [[UMAX]], ptr [[ADDR_C]], align 4 ; CHECK-V8-NEXT: [[COUNT:%.*]] = add nuw nsw i32 [[IV]], 1 @@ -212,14 +213,15 @@ define i32 @test_i32_select_optsize(ptr %a, ptr %b, ptr %c) #0 { ; CHECK-V8-NEXT: [[DATA_B_1:%.*]] = load i32, ptr [[ADDR_B_1]], align 4 ; CHECK-V8-NEXT: [[UGT_1:%.*]] = icmp ugt i32 [[DATA_A_1]], [[DATA_B_1]] ; CHECK-V8-NEXT: [[UMAX_1:%.*]] = select i1 [[UGT_1]], i32 [[DATA_A_1]], i32 [[DATA_B_1]] -; CHECK-V8-NEXT: [[ACC_NEXT_1]] = add i32 [[UMAX_1]], [[ACC_NEXT]] +; CHECK-V8-NEXT: [[ACC_NEXT_1]] = add i32 [[UMAX_1]], [[ACC]] ; CHECK-V8-NEXT: [[ADDR_C_1:%.*]] = getelementptr i32, ptr [[C]], i32 [[COUNT]] ; CHECK-V8-NEXT: store i32 [[UMAX_1]], ptr [[ADDR_C_1]], align 4 ; CHECK-V8-NEXT: [[COUNT_1]] = add nuw nsw i32 [[IV]], 2 ; CHECK-V8-NEXT: [[END_1:%.*]] = icmp ne i32 [[COUNT_1]], 100 ; CHECK-V8-NEXT: br i1 [[END_1]], label [[LOOP]], label [[EXIT:%.*]] ; CHECK-V8: exit: -; CHECK-V8-NEXT: [[ACC_NEXT_LCSSA:%.*]] = phi i32 [ [[ACC_NEXT_1]], [[LOOP]] ] +; CHECK-V8-NEXT: [[ACC_NEXT_LCSSA1:%.*]] = phi i32 [ [[ACC_NEXT_1]], [[LOOP]] ] +; CHECK-V8-NEXT: [[ACC_NEXT_LCSSA:%.*]] = add i32 [[ACC_NEXT_1]], [[ACC_NEXT_2]] ; CHECK-V8-NEXT: ret i32 [[ACC_NEXT_LCSSA]] ; entry: @@ -252,13 +254,14 @@ define i32 @test_i32_select_minsize(ptr %a, ptr %b, ptr %c) #1 { ; CHECK-V8: loop: ; CHECK-V8-NEXT: [[IV:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[COUNT_1:%.*]], [[LOOP]] ] ; CHECK-V8-NEXT: [[ACC:%.*]] = phi i32 [ 0, [[ENTRY]] ], [ [[ACC_NEXT_1:%.*]], [[LOOP]] ] +; CHECK-V8-NEXT: [[ACC1:%.*]] = phi i32 [ 0, [[ENTRY]] ], [ [[ACC_NEXT_2:%.*]], [[LOOP]] ] ; CHECK-V8-NEXT: [[ADDR_A:%.*]] = getelementptr i32, ptr [[A:%.*]], i32 [[IV]] ; CHECK-V8-NEXT: [[ADDR_B:%.*]] = getelementptr i32, ptr [[B:%.*]], i32 [[IV]] ; CHECK-V8-NEXT: [[DATA_A:%.*]] = load i32, ptr [[ADDR_A]], align 4 ; CHECK-V8-NEXT: [[DATA_B:%.*]] = load i32, ptr [[ADDR_B]], align 4 ; CHECK-V8-NEXT: [[UGT:%.*]] = icmp ugt i32 [[DATA_A]], [[DATA_B]] ; CHECK-V8-NEXT: [[UMAX:%.*]] = select i1 [[UGT]], i32 [[DATA_A]], i32 [[DATA_B]] -; CHECK-V8-NEXT: [[ACC_NEXT:%.*]] = add i32 [[UMAX]], [[ACC]] +; CHECK-V8-NEXT: [[ACC_NEXT_2]] = add i32 [[UMAX]], [[ACC1]] ; CHECK-V8-NEXT: [[ADDR_C:%.*]] = getelementptr i32, ptr [[C:%.*]], i32 [[IV]] ; CHECK-V8-NEXT: store i32 [[UMAX]], ptr [[ADDR_C]], align 4 ; CHECK-V8-NEXT: [[COUNT:%.*]] = add nuw nsw i32 [[IV]], 1 @@ -268,14 +271,15 @@ define i32 @test_i32_select_minsize(ptr %a, ptr %b, ptr %c) #1 { ; CHECK-V8-NEXT: [[DATA_B_1:%.*]] = load i32, ptr [[ADDR_B_1]], align 4 ; CHECK-V8-NEXT: [[UGT_1:%.*]] = icmp ugt i32 [[DATA_A_1]], [[DATA_B_1]] ; CHECK-V8-NEXT: [[UMAX_1:%.*]] = select i1 [[UGT_1]], i32 [[DATA_A_1]], i32 [[DATA_B_1]] -; CHECK-V8-NEXT: [[ACC_NEXT_1]] = add i32 [[UMAX_1]], [[ACC_NEXT]] +; CHECK-V8-NEXT: [[ACC_NEXT_1]] = add i32 [[UMAX_1]], [[ACC]] ; CHECK-V8-NEXT: [[ADDR_C_1:%.*]] = getelementptr i32, ptr [[C]], i32 [[COUNT]] ; CHECK-V8-NEXT: store i32 [[UMAX_1]], ptr [[ADDR_C_1]], align 4 ; CHECK-V8-NEXT: [[COUNT_1]] = add nuw nsw i32 [[IV]], 2 ; CHECK-V8-NEXT: [[END_1:%.*]] = icmp ne i32 [[COUNT_1]], 100 ; CHECK-V8-NEXT: br i1 [[END_1]], label [[LOOP]], label [[EXIT:%.*]] ; CHECK-V8: exit: -; CHECK-V8-NEXT: [[ACC_NEXT_LCSSA:%.*]] = phi i32 [ [[ACC_NEXT_1]], [[LOOP]] ] +; CHECK-V8-NEXT: [[ACC_NEXT_LCSSA1:%.*]] = phi i32 [ [[ACC_NEXT_1]], [[LOOP]] ] +; CHECK-V8-NEXT: [[ACC_NEXT_LCSSA:%.*]] = add i32 [[ACC_NEXT_1]], [[ACC_NEXT_2]] ; CHECK-V8-NEXT: ret i32 [[ACC_NEXT_LCSSA]] ; entry: diff --git a/llvm/test/Transforms/LoopUnroll/partial-unroll-reductions.ll b/llvm/test/Transforms/LoopUnroll/partial-unroll-reductions.ll index 953dc278b6644..17cc1f31546a3 100644 --- a/llvm/test/Transforms/LoopUnroll/partial-unroll-reductions.ll +++ b/llvm/test/Transforms/LoopUnroll/partial-unroll-reductions.ll @@ -8,27 +8,33 @@ define i32 @test_add(ptr %src, i64 %n, i32 %start) { ; CHECK-NEXT: br label %[[LOOP:.*]] ; CHECK: [[LOOP]]: ; CHECK-NEXT: [[IV:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[IV_NEXT_3:%.*]], %[[LOOP]] ] -; CHECK-NEXT: [[RDX:%.*]] = phi i32 [ [[START]], %[[ENTRY]] ], [ [[RDX_NEXT_3:%.*]], %[[LOOP]] ] +; CHECK-NEXT: [[RDX_1:%.*]] = phi i32 [ 0, %[[ENTRY]] ], [ [[RDX_NEXT_3:%.*]], %[[LOOP]] ] +; CHECK-NEXT: [[RDX_NEXT_1:%.*]] = phi i32 [ 0, %[[ENTRY]] ], [ [[RDX_NEXT_2:%.*]], %[[LOOP]] ] +; CHECK-NEXT: [[RDX_3:%.*]] = phi i32 [ 0, %[[ENTRY]] ], [ [[RDX_NEXT_24:%.*]], %[[LOOP]] ] +; CHECK-NEXT: [[RDX:%.*]] = phi i32 [ [[START]], %[[ENTRY]] ], [ [[RDX_NEXT:%.*]], %[[LOOP]] ] ; CHECK-NEXT: [[IV_NEXT:%.*]] = add nuw nsw i64 [[IV]], 1 ; CHECK-NEXT: [[GEP_SRC:%.*]] = getelementptr i32, ptr [[SRC]], i64 [[IV]] ; CHECK-NEXT: [[L:%.*]] = load i32, ptr [[GEP_SRC]], align 1 -; CHECK-NEXT: [[RDX_NEXT:%.*]] = add i32 [[RDX]], [[L]] +; CHECK-NEXT: [[RDX_NEXT]] = add i32 [[RDX]], [[L]] ; CHECK-NEXT: [[IV_NEXT_1:%.*]] = add nuw nsw i64 [[IV]], 2 ; CHECK-NEXT: [[GEP_SRC_1:%.*]] = getelementptr i32, ptr [[SRC]], i64 [[IV_NEXT]] ; CHECK-NEXT: [[L_1:%.*]] = load i32, ptr [[GEP_SRC_1]], align 1 -; CHECK-NEXT: [[RDX_NEXT_1:%.*]] = add i32 [[RDX_NEXT]], [[L_1]] +; CHECK-NEXT: [[RDX_NEXT_3]] = add i32 [[RDX_1]], [[L_1]] ; CHECK-NEXT: [[IV_NEXT_2:%.*]] = add nuw nsw i64 [[IV]], 3 ; CHECK-NEXT: [[GEP_SRC_2:%.*]] = getelementptr i32, ptr [[SRC]], i64 [[IV_NEXT_1]] ; CHECK-NEXT: [[L_2:%.*]] = load i32, ptr [[GEP_SRC_2]], align 1 -; CHECK-NEXT: [[RDX_NEXT_2:%.*]] = add i32 [[RDX_NEXT_1]], [[L_2]] +; CHECK-NEXT: [[RDX_NEXT_2]] = add i32 [[RDX_NEXT_1]], [[L_2]] ; CHECK-NEXT: [[IV_NEXT_3]] = add nuw nsw i64 [[IV]], 4 ; CHECK-NEXT: [[GEP_SRC_24:%.*]] = getelementptr i32, ptr [[SRC]], i64 [[IV_NEXT_2]] ; CHECK-NEXT: [[L_24:%.*]] = load i32, ptr [[GEP_SRC_24]], align 1 -; CHECK-NEXT: [[RDX_NEXT_3]] = add i32 [[RDX_NEXT_2]], [[L_24]] +; CHECK-NEXT: [[RDX_NEXT_24]] = add i32 [[RDX_3]], [[L_24]] ; CHECK-NEXT: [[EC_3:%.*]] = icmp ne i64 [[IV_NEXT_3]], 1000 ; CHECK-NEXT: br i1 [[EC_3]], label %[[LOOP]], label %[[EXIT:.*]] ; CHECK: [[EXIT]]: -; CHECK-NEXT: [[RDX_NEXT_LCSSA:%.*]] = phi i32 [ [[RDX_NEXT_3]], %[[LOOP]] ] +; CHECK-NEXT: [[RDX_NEXT_LCSSA1:%.*]] = phi i32 [ [[RDX_NEXT_24]], %[[LOOP]] ] +; CHECK-NEXT: [[BIN_RDX:%.*]] = add i32 [[RDX_NEXT_3]], [[RDX_NEXT]] +; CHECK-NEXT: [[BIN_RDX1:%.*]] = add i32 [[RDX_NEXT_2]], [[BIN_RDX]] +; CHECK-NEXT: [[RDX_NEXT_LCSSA:%.*]] = add i32 [[RDX_NEXT_24]], [[BIN_RDX1]] ; CHECK-NEXT: ret i32 [[RDX_NEXT_LCSSA]] ; entry: @@ -203,33 +209,39 @@ define i32 @test_add_and_mul_reduction(ptr %src, i64 %n, i32 %start) { ; CHECK-NEXT: br label %[[LOOP:.*]] ; CHECK: [[LOOP]]: ; CHECK-NEXT: [[IV:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[IV_NEXT_3:%.*]], %[[LOOP]] ] -; CHECK-NEXT: [[RDX_1:%.*]] = phi i32 [ [[START]], %[[ENTRY]] ], [ [[RDX_1_NEXT_3:%.*]], %[[LOOP]] ] +; CHECK-NEXT: [[RDX_1_1:%.*]] = phi i32 [ 0, %[[ENTRY]] ], [ [[RDX_1_NEXT_1:%.*]], %[[LOOP]] ] +; CHECK-NEXT: [[RDX_1_2:%.*]] = phi i32 [ 0, %[[ENTRY]] ], [ [[RDX_1_NEXT_2:%.*]], %[[LOOP]] ] +; CHECK-NEXT: [[RDX_1_3:%.*]] = phi i32 [ 0, %[[ENTRY]] ], [ [[RDX_1_NEXT_24:%.*]], %[[LOOP]] ] +; CHECK-NEXT: [[RDX_1:%.*]] = phi i32 [ [[START]], %[[ENTRY]] ], [ [[RDX_1_NEXT:%.*]], %[[LOOP]] ] ; CHECK-NEXT: [[RDX_2:%.*]] = phi i32 [ [[START]], %[[ENTRY]] ], [ [[RDX_2_NEXT_3:%.*]], %[[LOOP]] ] ; CHECK-NEXT: [[IV_NEXT:%.*]] = add nuw nsw i64 [[IV]], 1 ; CHECK-NEXT: [[GEP_SRC:%.*]] = getelementptr i32, ptr [[SRC]], i64 [[IV]] ; CHECK-NEXT: [[L:%.*]] = load i32, ptr [[GEP_SRC]], align 1 -; CHECK-NEXT: [[RDX_1_NEXT:%.*]] = add i32 [[RDX_1]], [[L]] +; CHECK-NEXT: [[RDX_1_NEXT]] = add i32 [[RDX_1]], [[L]] ; CHECK-NEXT: [[RDX_2_NEXT:%.*]] = mul i32 [[RDX_2]], [[L]] ; CHECK-NEXT: [[IV_NEXT_1:%.*]] = add nuw nsw i64 [[IV]], 2 ; CHECK-NEXT: [[GEP_SRC_1:%.*]] = getelementptr i32, ptr [[SRC]], i64 [[IV_NEXT]] ; CHECK-NEXT: [[L_1:%.*]] = load i32, ptr [[GEP_SRC_1]], align 1 -; CHECK-NEXT: [[RDX_1_2:%.*]] = add i32 [[RDX_1_NEXT]], [[L_1]] +; CHECK-NEXT: [[RDX_1_NEXT_1]] = add i32 [[RDX_1_1]], [[L_1]] ; CHECK-NEXT: [[RDX_2_2:%.*]] = mul i32 [[RDX_2_NEXT]], [[L_1]] ; CHECK-NEXT: [[IV_NEXT_2:%.*]] = add nuw nsw i64 [[IV]], 3 ; CHECK-NEXT: [[GEP_SRC_2:%.*]] = getelementptr i32, ptr [[SRC]], i64 [[IV_NEXT_1]] ; CHECK-NEXT: [[L_2:%.*]] = load i32, ptr [[GEP_SRC_2]], align 1 -; CHECK-NEXT: [[RDX_1_NEXT_2:%.*]] = add i32 [[RDX_1_2]], [[L_2]] +; CHECK-NEXT: [[RDX_1_NEXT_2]] = add i32 [[RDX_1_2]], [[L_2]] ; CHECK-NEXT: [[RDX_2_NEXT_2:%.*]] = mul i32 [[RDX_2_2]], [[L_2]] ; CHECK-NEXT: [[IV_NEXT_3]] = add nuw nsw i64 [[IV]], 4 ; CHECK-NEXT: [[GEP_SRC_24:%.*]] = getelementptr i32, ptr [[SRC]], i64 [[IV_NEXT_2]] ; CHECK-NEXT: [[L_24:%.*]] = load i32, ptr [[GEP_SRC_24]], align 1 -; CHECK-NEXT: [[RDX_1_NEXT_3]] = add i32 [[RDX_1_NEXT_2]], [[L_24]] +; CHECK-NEXT: [[RDX_1_NEXT_24]] = add i32 [[RDX_1_3]], [[L_24]] ; CHECK-NEXT: [[RDX_2_NEXT_3]] = mul i32 [[RDX_2_NEXT_2]], [[L_24]] ; CHECK-NEXT: [[EC_3:%.*]] = icmp ne i64 [[IV_NEXT_3]], 1000 ; CHECK-NEXT: br i1 [[EC_3]], label %[[LOOP]], label %[[EXIT:.*]] ; CHECK: [[EXIT]]: -; CHECK-NEXT: [[RDX_1_NEXT_LCSSA:%.*]] = phi i32 [ [[RDX_1_NEXT_3]], %[[LOOP]] ] +; CHECK-NEXT: [[RDX_1_NEXT_LCSSA1:%.*]] = phi i32 [ [[RDX_1_NEXT_24]], %[[LOOP]] ] ; CHECK-NEXT: [[BIN_RDX5:%.*]] = phi i32 [ [[RDX_2_NEXT_3]], %[[LOOP]] ] +; CHECK-NEXT: [[BIN_RDX:%.*]] = add i32 [[RDX_1_NEXT_1]], [[RDX_1_NEXT]] +; CHECK-NEXT: [[BIN_RDX1:%.*]] = add i32 [[RDX_1_NEXT_2]], [[BIN_RDX]] +; CHECK-NEXT: [[RDX_1_NEXT_LCSSA:%.*]] = add i32 [[RDX_1_NEXT_24]], [[BIN_RDX1]] ; CHECK-NEXT: [[RES:%.*]] = add i32 [[RDX_1_NEXT_LCSSA]], [[BIN_RDX5]] ; CHECK-NEXT: ret i32 [[RES]] ; diff --git a/llvm/test/Transforms/LoopUnroll/runtime-loop5.ll b/llvm/test/Transforms/LoopUnroll/runtime-loop5.ll index fa9f9026d524c..a53e28ce106d9 100644 --- a/llvm/test/Transforms/LoopUnroll/runtime-loop5.ll +++ b/llvm/test/Transforms/LoopUnroll/runtime-loop5.ll @@ -75,23 +75,26 @@ define i3 @test(ptr %a, i3 %n) { ; UNROLL-4-NEXT: br label [[FOR_BODY:%.*]] ; UNROLL-4: for.body: ; UNROLL-4-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ 0, [[FOR_BODY_PREHEADER_NEW]] ], [ [[INDVARS_IV_NEXT_3:%.*]], [[FOR_BODY]] ] -; UNROLL-4-NEXT: [[SUM_02:%.*]] = phi i3 [ 0, [[FOR_BODY_PREHEADER_NEW]] ], [ [[ADD_3:%.*]], [[FOR_BODY]] ] +; UNROLL-4-NEXT: [[SUM_02_1:%.*]] = phi i3 [ [[ADD:%.*]], [[FOR_BODY]] ], [ 0, [[FOR_BODY_PREHEADER_NEW]] ] +; UNROLL-4-NEXT: [[ADD_1:%.*]] = phi i3 [ [[ADD_2:%.*]], [[FOR_BODY]] ], [ 0, [[FOR_BODY_PREHEADER_NEW]] ] +; UNROLL-4-NEXT: [[SUM_02_3:%.*]] = phi i3 [ [[ADD_3:%.*]], [[FOR_BODY]] ], [ 0, [[FOR_BODY_PREHEADER_NEW]] ] +; UNROLL-4-NEXT: [[SUM_02:%.*]] = phi i3 [ [[ADD1:%.*]], [[FOR_BODY]] ], [ 0, [[FOR_BODY_PREHEADER_NEW]] ] ; UNROLL-4-NEXT: [[NITER:%.*]] = phi i3 [ 0, [[FOR_BODY_PREHEADER_NEW]] ], [ [[NITER_NEXT_3:%.*]], [[FOR_BODY]] ] ; UNROLL-4-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i3, ptr [[A:%.*]], i64 [[INDVARS_IV]] ; UNROLL-4-NEXT: [[TMP2:%.*]] = load i3, ptr [[ARRAYIDX]], align 1 -; UNROLL-4-NEXT: [[ADD:%.*]] = add nsw i3 [[TMP2]], [[SUM_02]] +; UNROLL-4-NEXT: [[ADD1]] = add i3 [[TMP2]], [[SUM_02]] ; UNROLL-4-NEXT: [[INDVARS_IV_NEXT:%.*]] = add nuw nsw i64 [[INDVARS_IV]], 1 ; UNROLL-4-NEXT: [[ARRAYIDX_1:%.*]] = getelementptr inbounds i3, ptr [[A]], i64 [[INDVARS_IV_NEXT]] ; UNROLL-4-NEXT: [[TMP3:%.*]] = load i3, ptr [[ARRAYIDX_1]], align 1 -; UNROLL-4-NEXT: [[ADD_1:%.*]] = add nsw i3 [[TMP3]], [[ADD]] +; UNROLL-4-NEXT: [[ADD]] = add i3 [[TMP3]], [[SUM_02_1]] ; UNROLL-4-NEXT: [[INDVARS_IV_NEXT_1:%.*]] = add nuw nsw i64 [[INDVARS_IV]], 2 ; UNROLL-4-NEXT: [[ARRAYIDX_2:%.*]] = getelementptr inbounds i3, ptr [[A]], i64 [[INDVARS_IV_NEXT_1]] ; UNROLL-4-NEXT: [[TMP4:%.*]] = load i3, ptr [[ARRAYIDX_2]], align 1 -; UNROLL-4-NEXT: [[ADD_2:%.*]] = add nsw i3 [[TMP4]], [[ADD_1]] +; UNROLL-4-NEXT: [[ADD_2]] = add i3 [[TMP4]], [[ADD_1]] ; UNROLL-4-NEXT: [[INDVARS_IV_NEXT_2:%.*]] = add nuw nsw i64 [[INDVARS_IV]], 3 ; UNROLL-4-NEXT: [[ARRAYIDX_3:%.*]] = getelementptr inbounds i3, ptr [[A]], i64 [[INDVARS_IV_NEXT_2]] ; UNROLL-4-NEXT: [[TMP5:%.*]] = load i3, ptr [[ARRAYIDX_3]], align 1 -; UNROLL-4-NEXT: [[ADD_3]] = add nsw i3 [[TMP5]], [[ADD_2]] +; UNROLL-4-NEXT: [[ADD_3]] = add i3 [[TMP5]], [[SUM_02_3]] ; UNROLL-4-NEXT: [[INDVARS_IV_NEXT_3]] = add nuw nsw i64 [[INDVARS_IV]], 4 ; UNROLL-4-NEXT: [[NITER_NEXT_3]] = add i3 [[NITER]], -4 ; UNROLL-4-NEXT: [[NITER_NCMP_3:%.*]] = icmp eq i3 [[NITER_NEXT_3]], [[UNROLL_ITER]] @@ -100,11 +103,14 @@ define i3 @test(ptr %a, i3 %n) { ; UNROLL-4-NEXT: [[ADD_LCSSA_PH_PH:%.*]] = phi i3 [ [[ADD_3]], [[FOR_BODY]] ] ; UNROLL-4-NEXT: [[INDVARS_IV_UNR_PH:%.*]] = phi i64 [ [[INDVARS_IV_NEXT_3]], [[FOR_BODY]] ] ; UNROLL-4-NEXT: [[SUM_02_UNR_PH:%.*]] = phi i3 [ [[ADD_3]], [[FOR_BODY]] ] +; UNROLL-4-NEXT: [[BIN_RDX:%.*]] = add i3 [[ADD]], [[ADD1]] +; UNROLL-4-NEXT: [[BIN_RDX2:%.*]] = add i3 [[ADD_2]], [[BIN_RDX]] +; UNROLL-4-NEXT: [[BIN_RDX3:%.*]] = add i3 [[ADD_3]], [[BIN_RDX2]] ; UNROLL-4-NEXT: br label [[FOR_END_LOOPEXIT_UNR_LCSSA]] ; UNROLL-4: for.end.loopexit.unr-lcssa: -; UNROLL-4-NEXT: [[ADD_LCSSA_PH:%.*]] = phi i3 [ poison, [[FOR_BODY_PREHEADER]] ], [ [[ADD_LCSSA_PH_PH]], [[FOR_END_LOOPEXIT_UNR_LCSSA_LOOPEXIT]] ] +; UNROLL-4-NEXT: [[ADD_LCSSA_PH:%.*]] = phi i3 [ poison, [[FOR_BODY_PREHEADER]] ], [ [[BIN_RDX3]], [[FOR_END_LOOPEXIT_UNR_LCSSA_LOOPEXIT]] ] ; UNROLL-4-NEXT: [[INDVARS_IV_UNR:%.*]] = phi i64 [ 0, [[FOR_BODY_PREHEADER]] ], [ [[INDVARS_IV_UNR_PH]], [[FOR_END_LOOPEXIT_UNR_LCSSA_LOOPEXIT]] ] -; UNROLL-4-NEXT: [[SUM_02_UNR:%.*]] = phi i3 [ 0, [[FOR_BODY_PREHEADER]] ], [ [[SUM_02_UNR_PH]], [[FOR_END_LOOPEXIT_UNR_LCSSA_LOOPEXIT]] ] +; UNROLL-4-NEXT: [[SUM_02_UNR:%.*]] = phi i3 [ 0, [[FOR_BODY_PREHEADER]] ], [ [[BIN_RDX3]], [[FOR_END_LOOPEXIT_UNR_LCSSA_LOOPEXIT]] ] ; UNROLL-4-NEXT: [[LCMP_MOD:%.*]] = icmp ne i3 [[XTRAITER]], 0 ; UNROLL-4-NEXT: br i1 [[LCMP_MOD]], label [[FOR_BODY_EPIL_PREHEADER:%.*]], label [[FOR_END_LOOPEXIT:%.*]] ; UNROLL-4: for.body.epil.preheader: diff --git a/llvm/test/Transforms/LoopUnroll/runtime-unroll-reductions.ll b/llvm/test/Transforms/LoopUnroll/runtime-unroll-reductions.ll index 89f06ad373aa9..34cd6e69eb567 100644 --- a/llvm/test/Transforms/LoopUnroll/runtime-unroll-reductions.ll +++ b/llvm/test/Transforms/LoopUnroll/runtime-unroll-reductions.ll @@ -14,15 +14,16 @@ define i32 @test_add_reduction(ptr %a, i64 %n) { ; CHECK-NEXT: br label %[[LOOP:.*]] ; CHECK: [[LOOP]]: ; CHECK-NEXT: [[IV:%.*]] = phi i64 [ 0, %[[ENTRY_NEW]] ], [ [[IV_NEXT_1:%.*]], %[[LOOP]] ] -; CHECK-NEXT: [[RDX:%.*]] = phi i32 [ 0, %[[ENTRY_NEW]] ], [ [[RDX_NEXT_1:%.*]], %[[LOOP]] ] +; CHECK-NEXT: [[RDX_1:%.*]] = phi i32 [ 0, %[[ENTRY_NEW]] ], [ [[RDX_NEXT_1:%.*]], %[[LOOP]] ] +; CHECK-NEXT: [[RDX:%.*]] = phi i32 [ 0, %[[ENTRY_NEW]] ], [ [[RDX_NEXT:%.*]], %[[LOOP]] ] ; CHECK-NEXT: [[NITER:%.*]] = phi i64 [ 0, %[[ENTRY_NEW]] ], [ [[NITER_NEXT_1:%.*]], %[[LOOP]] ] ; CHECK-NEXT: [[GEP_A:%.*]] = getelementptr inbounds nuw i32, ptr [[A]], i64 [[IV]] ; CHECK-NEXT: [[TMP2:%.*]] = load i32, ptr [[GEP_A]], align 2 -; CHECK-NEXT: [[RDX_NEXT:%.*]] = add nuw nsw i32 [[RDX]], [[TMP2]] +; CHECK-NEXT: [[RDX_NEXT]] = add i32 [[RDX]], [[TMP2]] ; CHECK-NEXT: [[IV_NEXT:%.*]] = add nuw nsw i64 [[IV]], 1 ; CHECK-NEXT: [[GEP_A_1:%.*]] = getelementptr inbounds nuw i32, ptr [[A]], i64 [[IV_NEXT]] ; CHECK-NEXT: [[TMP3:%.*]] = load i32, ptr [[GEP_A_1]], align 2 -; CHECK-NEXT: [[RDX_NEXT_1]] = add nuw nsw i32 [[RDX_NEXT]], [[TMP3]] +; CHECK-NEXT: [[RDX_NEXT_1]] = add i32 [[RDX_1]], [[TMP3]] ; CHECK-NEXT: [[IV_NEXT_1]] = add nuw nsw i64 [[IV]], 2 ; CHECK-NEXT: [[NITER_NEXT_1]] = add i64 [[NITER]], 2 ; CHECK-NEXT: [[NITER_NCMP_1:%.*]] = icmp eq i64 [[NITER_NEXT_1]], [[UNROLL_ITER]] @@ -31,11 +32,12 @@ define i32 @test_add_reduction(ptr %a, i64 %n) { ; CHECK-NEXT: [[RES_PH_PH:%.*]] = phi i32 [ [[RDX_NEXT_1]], %[[LOOP]] ] ; CHECK-NEXT: [[IV_UNR_PH:%.*]] = phi i64 [ [[IV_NEXT_1]], %[[LOOP]] ] ; CHECK-NEXT: [[RDX_UNR_PH:%.*]] = phi i32 [ [[RDX_NEXT_1]], %[[LOOP]] ] +; CHECK-NEXT: [[BIN_RDX:%.*]] = add i32 [[RDX_NEXT_1]], [[RDX_NEXT]] ; CHECK-NEXT: br label %[[EXIT_UNR_LCSSA]] ; CHECK: [[EXIT_UNR_LCSSA]]: -; CHECK-NEXT: [[RES_PH:%.*]] = phi i32 [ poison, %[[ENTRY]] ], [ [[RES_PH_PH]], %[[EXIT_UNR_LCSSA_LOOPEXIT]] ] +; CHECK-NEXT: [[RES_PH:%.*]] = phi i32 [ poison, %[[ENTRY]] ], [ [[BIN_RDX]], %[[EXIT_UNR_LCSSA_LOOPEXIT]] ] ; CHECK-NEXT: [[IV_UNR:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[IV_UNR_PH]], %[[EXIT_UNR_LCSSA_LOOPEXIT]] ] -; CHECK-NEXT: [[RDX_UNR:%.*]] = phi i32 [ 0, %[[ENTRY]] ], [ [[RDX_UNR_PH]], %[[EXIT_UNR_LCSSA_LOOPEXIT]] ] +; CHECK-NEXT: [[RDX_UNR:%.*]] = phi i32 [ 0, %[[ENTRY]] ], [ [[BIN_RDX]], %[[EXIT_UNR_LCSSA_LOOPEXIT]] ] ; CHECK-NEXT: [[LCMP_MOD:%.*]] = icmp ne i64 [[XTRAITER]], 0 ; CHECK-NEXT: br i1 [[LCMP_MOD]], label %[[LOOP_EPIL_PREHEADER:.*]], label %[[EXIT:.*]] ; CHECK: [[LOOP_EPIL_PREHEADER]]: diff --git a/llvm/test/Transforms/LoopUnroll/runtime-unroll-remainder.ll b/llvm/test/Transforms/LoopUnroll/runtime-unroll-remainder.ll index a3cfeac1f86be..2f8b5729316f1 100644 --- a/llvm/test/Transforms/LoopUnroll/runtime-unroll-remainder.ll +++ b/llvm/test/Transforms/LoopUnroll/runtime-unroll-remainder.ll @@ -16,9 +16,12 @@ define i32 @unroll(ptr nocapture readonly %a, ptr nocapture readonly %b, i32 %N) ; CHECK-NEXT: [[UNROLL_ITER:%.*]] = and i64 [[WIDE_TRIP_COUNT]], 4294967292 ; CHECK-NEXT: br label [[FOR_BODY:%.*]] ; CHECK: for.cond.cleanup.loopexit.unr-lcssa.loopexit: +; CHECK-NEXT: [[BIN_RDX:%.*]] = add i32 [[ADD_5:%.*]], [[ADD_2:%.*]] +; CHECK-NEXT: [[BIN_RDX2:%.*]] = add i32 [[ADD_4:%.*]], [[BIN_RDX]] +; CHECK-NEXT: [[ADD_3:%.*]] = add i32 [[ADD_6:%.*]], [[BIN_RDX2]] ; CHECK-NEXT: br label [[FOR_COND_CLEANUP_LOOPEXIT_UNR_LCSSA]] ; CHECK: for.cond.cleanup.loopexit.unr-lcssa: -; CHECK-NEXT: [[ADD_LCSSA_PH:%.*]] = phi i32 [ poison, [[FOR_BODY_LR_PH]] ], [ [[ADD_3:%.*]], [[FOR_COND_CLEANUP_LOOPEXIT_UNR_LCSSA_LOOPEXIT:%.*]] ] +; CHECK-NEXT: [[ADD_LCSSA_PH:%.*]] = phi i32 [ poison, [[FOR_BODY_LR_PH]] ], [ [[ADD_3]], [[FOR_COND_CLEANUP_LOOPEXIT_UNR_LCSSA_LOOPEXIT:%.*]] ] ; CHECK-NEXT: [[INDVARS_IV_UNR:%.*]] = phi i64 [ 0, [[FOR_BODY_LR_PH]] ], [ [[INDVARS_IV_NEXT_3:%.*]], [[FOR_COND_CLEANUP_LOOPEXIT_UNR_LCSSA_LOOPEXIT]] ] ; CHECK-NEXT: [[C_010_UNR:%.*]] = phi i32 [ 0, [[FOR_BODY_LR_PH]] ], [ [[ADD_3]], [[FOR_COND_CLEANUP_LOOPEXIT_UNR_LCSSA_LOOPEXIT]] ] ; CHECK-NEXT: [[LCMP_MOD_NOT:%.*]] = icmp eq i64 [[XTRAITER]], 0 @@ -64,35 +67,38 @@ define i32 @unroll(ptr nocapture readonly %a, ptr nocapture readonly %b, i32 %N) ; CHECK-NEXT: ret i32 [[C_0_LCSSA]] ; CHECK: for.body: ; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ 0, [[FOR_BODY_LR_PH_NEW]] ], [ [[INDVARS_IV_NEXT_3]], [[FOR_BODY]] ] -; CHECK-NEXT: [[C_010:%.*]] = phi i32 [ 0, [[FOR_BODY_LR_PH_NEW]] ], [ [[ADD_3]], [[FOR_BODY]] ] +; CHECK-NEXT: [[C_010_1:%.*]] = phi i32 [ 0, [[FOR_BODY_LR_PH_NEW]] ], [ [[ADD_5]], [[FOR_BODY]] ] +; CHECK-NEXT: [[C_010_2:%.*]] = phi i32 [ 0, [[FOR_BODY_LR_PH_NEW]] ], [ [[ADD_4]], [[FOR_BODY]] ] +; CHECK-NEXT: [[C_010_3:%.*]] = phi i32 [ 0, [[FOR_BODY_LR_PH_NEW]] ], [ [[ADD_6]], [[FOR_BODY]] ] +; CHECK-NEXT: [[C_010:%.*]] = phi i32 [ 0, [[FOR_BODY_LR_PH_NEW]] ], [ [[ADD_2]], [[FOR_BODY]] ] ; CHECK-NEXT: [[NITER:%.*]] = phi i64 [ 0, [[FOR_BODY_LR_PH_NEW]] ], [ [[NITER_NEXT_3:%.*]], [[FOR_BODY]] ] ; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds nuw i32, ptr [[A]], i64 [[INDVARS_IV]] -; CHECK-NEXT: [[TMP7:%.*]] = load i32, ptr [[ARRAYIDX]], align 4 -; CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds nuw i32, ptr [[B]], i64 [[INDVARS_IV]] -; CHECK-NEXT: [[TMP8:%.*]] = load i32, ptr [[ARRAYIDX2]], align 4 -; CHECK-NEXT: [[MUL:%.*]] = mul nsw i32 [[TMP8]], [[TMP7]] -; CHECK-NEXT: [[ADD:%.*]] = add nsw i32 [[MUL]], [[C_010]] +; CHECK-NEXT: [[TMP11:%.*]] = load i32, ptr [[ARRAYIDX]], align 4 +; CHECK-NEXT: [[ARRAYIDX2_2:%.*]] = getelementptr inbounds nuw i32, ptr [[B]], i64 [[INDVARS_IV]] +; CHECK-NEXT: [[TMP12:%.*]] = load i32, ptr [[ARRAYIDX2_2]], align 4 +; CHECK-NEXT: [[MUL_2:%.*]] = mul nsw i32 [[TMP12]], [[TMP11]] +; CHECK-NEXT: [[ADD_2]] = add i32 [[MUL_2]], [[C_010]] ; CHECK-NEXT: [[INDVARS_IV_NEXT:%.*]] = or disjoint i64 [[INDVARS_IV]], 1 ; CHECK-NEXT: [[ARRAYIDX_1:%.*]] = getelementptr inbounds nuw i32, ptr [[A]], i64 [[INDVARS_IV_NEXT]] ; CHECK-NEXT: [[TMP9:%.*]] = load i32, ptr [[ARRAYIDX_1]], align 4 ; CHECK-NEXT: [[ARRAYIDX2_1:%.*]] = getelementptr inbounds nuw i32, ptr [[B]], i64 [[INDVARS_IV_NEXT]] ; CHECK-NEXT: [[TMP10:%.*]] = load i32, ptr [[ARRAYIDX2_1]], align 4 ; CHECK-NEXT: [[MUL_1:%.*]] = mul nsw i32 [[TMP10]], [[TMP9]] -; CHECK-NEXT: [[ADD_1:%.*]] = add nsw i32 [[MUL_1]], [[ADD]] +; CHECK-NEXT: [[ADD_5]] = add i32 [[MUL_1]], [[C_010_1]] ; CHECK-NEXT: [[INDVARS_IV_NEXT_1:%.*]] = or disjoint i64 [[INDVARS_IV]], 2 ; CHECK-NEXT: [[ARRAYIDX_2:%.*]] = getelementptr inbounds nuw i32, ptr [[A]], i64 [[INDVARS_IV_NEXT_1]] -; CHECK-NEXT: [[TMP11:%.*]] = load i32, ptr [[ARRAYIDX_2]], align 4 -; CHECK-NEXT: [[ARRAYIDX2_2:%.*]] = getelementptr inbounds nuw i32, ptr [[B]], i64 [[INDVARS_IV_NEXT_1]] -; CHECK-NEXT: [[TMP12:%.*]] = load i32, ptr [[ARRAYIDX2_2]], align 4 -; CHECK-NEXT: [[MUL_2:%.*]] = mul nsw i32 [[TMP12]], [[TMP11]] -; CHECK-NEXT: [[ADD_2:%.*]] = add nsw i32 [[MUL_2]], [[ADD_1]] +; CHECK-NEXT: [[TMP15:%.*]] = load i32, ptr [[ARRAYIDX_2]], align 4 +; CHECK-NEXT: [[ARRAYIDX2_4:%.*]] = getelementptr inbounds nuw i32, ptr [[B]], i64 [[INDVARS_IV_NEXT_1]] +; CHECK-NEXT: [[TMP16:%.*]] = load i32, ptr [[ARRAYIDX2_4]], align 4 +; CHECK-NEXT: [[MUL_4:%.*]] = mul nsw i32 [[TMP16]], [[TMP15]] +; CHECK-NEXT: [[ADD_4]] = add i32 [[MUL_4]], [[C_010_2]] ; CHECK-NEXT: [[INDVARS_IV_NEXT_2:%.*]] = or disjoint i64 [[INDVARS_IV]], 3 ; CHECK-NEXT: [[ARRAYIDX_3:%.*]] = getelementptr inbounds nuw i32, ptr [[A]], i64 [[INDVARS_IV_NEXT_2]] ; CHECK-NEXT: [[TMP13:%.*]] = load i32, ptr [[ARRAYIDX_3]], align 4 ; CHECK-NEXT: [[ARRAYIDX2_3:%.*]] = getelementptr inbounds nuw i32, ptr [[B]], i64 [[INDVARS_IV_NEXT_2]] ; CHECK-NEXT: [[TMP14:%.*]] = load i32, ptr [[ARRAYIDX2_3]], align 4 ; CHECK-NEXT: [[MUL_3:%.*]] = mul nsw i32 [[TMP14]], [[TMP13]] -; CHECK-NEXT: [[ADD_3]] = add nsw i32 [[MUL_3]], [[ADD_2]] +; CHECK-NEXT: [[ADD_6]] = add i32 [[MUL_3]], [[C_010_3]] ; CHECK-NEXT: [[INDVARS_IV_NEXT_3]] = add nuw nsw i64 [[INDVARS_IV]], 4 ; CHECK-NEXT: [[NITER_NEXT_3]] = add i64 [[NITER]], 4 ; CHECK-NEXT: [[NITER_NCMP_3:%.*]] = icmp eq i64 [[NITER_NEXT_3]], [[UNROLL_ITER]] diff --git a/llvm/test/Transforms/PhaseOrdering/SystemZ/sub-xor.ll b/llvm/test/Transforms/PhaseOrdering/SystemZ/sub-xor.ll index 5386bf939918a..bb39a4cc86d0d 100644 --- a/llvm/test/Transforms/PhaseOrdering/SystemZ/sub-xor.ll +++ b/llvm/test/Transforms/PhaseOrdering/SystemZ/sub-xor.ll @@ -18,15 +18,15 @@ define dso_local zeroext i32 @foo(ptr noundef %a) #0 { ; CHECK-NEXT: br label [[FOR_BODY4:%.*]] ; CHECK: for.body4: ; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INDVARS_IV_NEXT_7:%.*]], [[FOR_BODY4]] ] -; CHECK-NEXT: [[SUM_11:%.*]] = phi i32 [ 0, [[ENTRY]] ], [ [[ADD_7:%.*]], [[FOR_BODY4]] ] +; CHECK-NEXT: [[SUM_12:%.*]] = phi i32 [ 0, [[ENTRY]] ], [ [[ADD_8:%.*]], [[FOR_BODY4]] ] ; CHECK-NEXT: [[IDX_NEG:%.*]] = sub nsw i64 0, [[INDVARS_IV]] ; CHECK-NEXT: [[ADD_PTR:%.*]] = getelementptr inbounds i32, ptr getelementptr inbounds nuw (i8, ptr @ARR, i64 396), i64 [[IDX_NEG]] ; CHECK-NEXT: [[TMP0:%.*]] = load i32, ptr [[ADD_PTR]], align 4, !tbaa [[TBAA3:![0-9]+]] -; CHECK-NEXT: [[ADD:%.*]] = add i32 [[TMP0]], [[SUM_11]] +; CHECK-NEXT: [[ADD_7:%.*]] = add i32 [[TMP0]], [[SUM_12]] ; CHECK-NEXT: [[INDVARS_IV_NEXT_NEG:%.*]] = xor i64 [[INDVARS_IV]], -1 ; CHECK-NEXT: [[ADD_PTR_110:%.*]] = getelementptr inbounds i32, ptr getelementptr inbounds nuw (i8, ptr @ARR, i64 396), i64 [[INDVARS_IV_NEXT_NEG]] -; CHECK-NEXT: [[TMP1:%.*]] = load i32, ptr [[ADD_PTR_110]], align 4, !tbaa [[TBAA3]] -; CHECK-NEXT: [[ADD_111:%.*]] = add i32 [[TMP1]], [[ADD]] +; CHECK-NEXT: [[TMP7:%.*]] = load i32, ptr [[ADD_PTR_110]], align 4, !tbaa [[TBAA3]] +; CHECK-NEXT: [[ADD_111:%.*]] = add i32 [[TMP7]], [[ADD_7]] ; CHECK-NEXT: [[INDVARS_IV_NEXT_112_NEG:%.*]] = sub nuw nsw i64 -2, [[INDVARS_IV]] ; CHECK-NEXT: [[ADD_PTR_217:%.*]] = getelementptr inbounds i32, ptr getelementptr inbounds nuw (i8, ptr @ARR, i64 396), i64 [[INDVARS_IV_NEXT_112_NEG]] ; CHECK-NEXT: [[TMP2:%.*]] = load i32, ptr [[ADD_PTR_217]], align 4, !tbaa [[TBAA3]] @@ -46,28 +46,28 @@ define dso_local zeroext i32 @foo(ptr noundef %a) #0 { ; CHECK-NEXT: [[INDVARS_IV_NEXT_5_NEG:%.*]] = sub nuw nsw i64 -6, [[INDVARS_IV]] ; CHECK-NEXT: [[ADD_PTR_6:%.*]] = getelementptr inbounds i32, ptr getelementptr inbounds nuw (i8, ptr @ARR, i64 396), i64 [[INDVARS_IV_NEXT_5_NEG]] ; CHECK-NEXT: [[TMP6:%.*]] = load i32, ptr [[ADD_PTR_6]], align 4, !tbaa [[TBAA3]] -; CHECK-NEXT: [[ADD_6:%.*]] = add i32 [[TMP6]], [[ADD_5]] +; CHECK-NEXT: [[SUM_11:%.*]] = add i32 [[TMP6]], [[ADD_5]] ; CHECK-NEXT: [[INDVARS_IV_NEXT_6_NEG:%.*]] = sub nuw nsw i64 -7, [[INDVARS_IV]] ; CHECK-NEXT: [[ADD_PTR_7:%.*]] = getelementptr inbounds i32, ptr getelementptr inbounds nuw (i8, ptr @ARR, i64 396), i64 [[INDVARS_IV_NEXT_6_NEG]] -; CHECK-NEXT: [[TMP7:%.*]] = load i32, ptr [[ADD_PTR_7]], align 4, !tbaa [[TBAA3]] -; CHECK-NEXT: [[ADD_7]] = add i32 [[TMP7]], [[ADD_6]] +; CHECK-NEXT: [[TMP1:%.*]] = load i32, ptr [[ADD_PTR_7]], align 4, !tbaa [[TBAA3]] +; CHECK-NEXT: [[ADD_8]] = add i32 [[TMP1]], [[SUM_11]] ; CHECK-NEXT: [[INDVARS_IV_NEXT_7]] = add nuw nsw i64 [[INDVARS_IV]], 8 ; CHECK-NEXT: [[EXITCOND_NOT_7:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT_7]], 32 ; CHECK-NEXT: br i1 [[EXITCOND_NOT_7]], label [[FOR_BODY4_1:%.*]], label [[FOR_BODY4]], !llvm.loop [[LOOP7:![0-9]+]] ; CHECK: for.body4.1: ; CHECK-NEXT: [[INDVARS_IV_1:%.*]] = phi i64 [ [[INDVARS_IV_NEXT_1_7:%.*]], [[FOR_BODY4_1]] ], [ 0, [[FOR_BODY4]] ] -; CHECK-NEXT: [[SUM_11_1:%.*]] = phi i32 [ [[ADD_1_7:%.*]], [[FOR_BODY4_1]] ], [ [[ADD_7]], [[FOR_BODY4]] ] +; CHECK-NEXT: [[SUM_11_1:%.*]] = phi i32 [ [[ADD_1_7:%.*]], [[FOR_BODY4_1]] ], [ [[ADD_8]], [[FOR_BODY4]] ] ; CHECK-NEXT: [[IDX_NEG_1:%.*]] = sub nsw i64 0, [[INDVARS_IV_1]] ; CHECK-NEXT: [[ADD_PTR_1:%.*]] = getelementptr inbounds i32, ptr getelementptr inbounds nuw (i8, ptr @ARR, i64 396), i64 [[IDX_NEG_1]] -; CHECK-NEXT: [[TMP8:%.*]] = load i32, ptr [[ADD_PTR_1]], align 4, !tbaa [[TBAA3]] +; CHECK-NEXT: [[TMP18:%.*]] = load i32, ptr [[ADD_PTR_1]], align 4, !tbaa [[TBAA3]] ; CHECK-NEXT: [[INDVARS_IV_NEXT_1_NEG:%.*]] = xor i64 [[INDVARS_IV_1]], -1 ; CHECK-NEXT: [[ADD_PTR_1_1:%.*]] = getelementptr inbounds i32, ptr getelementptr inbounds nuw (i8, ptr @ARR, i64 396), i64 [[INDVARS_IV_NEXT_1_NEG]] -; CHECK-NEXT: [[TMP9:%.*]] = load i32, ptr [[ADD_PTR_1_1]], align 4, !tbaa [[TBAA3]] -; CHECK-NEXT: [[TMP10:%.*]] = add i32 [[TMP8]], [[TMP9]] +; CHECK-NEXT: [[TMP19:%.*]] = load i32, ptr [[ADD_PTR_1_1]], align 4, !tbaa [[TBAA3]] +; CHECK-NEXT: [[TMP20:%.*]] = add i32 [[TMP18]], [[TMP19]] ; CHECK-NEXT: [[INDVARS_IV_NEXT_1_1_NEG:%.*]] = sub nuw nsw i64 -2, [[INDVARS_IV_1]] ; CHECK-NEXT: [[ADD_PTR_1_2:%.*]] = getelementptr inbounds i32, ptr getelementptr inbounds nuw (i8, ptr @ARR, i64 396), i64 [[INDVARS_IV_NEXT_1_1_NEG]] ; CHECK-NEXT: [[TMP11:%.*]] = load i32, ptr [[ADD_PTR_1_2]], align 4, !tbaa [[TBAA3]] -; CHECK-NEXT: [[TMP12:%.*]] = add i32 [[TMP10]], [[TMP11]] +; CHECK-NEXT: [[TMP12:%.*]] = add i32 [[TMP20]], [[TMP11]] ; CHECK-NEXT: [[INDVARS_IV_NEXT_1_2_NEG:%.*]] = sub nuw nsw i64 -3, [[INDVARS_IV_1]] ; CHECK-NEXT: [[ADD_PTR_1_3:%.*]] = getelementptr inbounds i32, ptr getelementptr inbounds nuw (i8, ptr @ARR, i64 396), i64 [[INDVARS_IV_NEXT_1_2_NEG]] ; CHECK-NEXT: [[TMP13:%.*]] = load i32, ptr [[ADD_PTR_1_3]], align 4, !tbaa [[TBAA3]] @@ -79,15 +79,15 @@ define dso_local zeroext i32 @foo(ptr noundef %a) #0 { ; CHECK-NEXT: [[INDVARS_IV_NEXT_1_4_NEG:%.*]] = sub nuw nsw i64 -5, [[INDVARS_IV_1]] ; CHECK-NEXT: [[ADD_PTR_1_5:%.*]] = getelementptr inbounds i32, ptr getelementptr inbounds nuw (i8, ptr @ARR, i64 396), i64 [[INDVARS_IV_NEXT_1_4_NEG]] ; CHECK-NEXT: [[TMP17:%.*]] = load i32, ptr [[ADD_PTR_1_5]], align 4, !tbaa [[TBAA3]] -; CHECK-NEXT: [[TMP18:%.*]] = add i32 [[TMP16]], [[TMP17]] +; CHECK-NEXT: [[TMP32:%.*]] = add i32 [[TMP16]], [[TMP17]] ; CHECK-NEXT: [[INDVARS_IV_NEXT_1_5_NEG:%.*]] = sub nuw nsw i64 -6, [[INDVARS_IV_1]] ; CHECK-NEXT: [[ADD_PTR_1_6:%.*]] = getelementptr inbounds i32, ptr getelementptr inbounds nuw (i8, ptr @ARR, i64 396), i64 [[INDVARS_IV_NEXT_1_5_NEG]] -; CHECK-NEXT: [[TMP19:%.*]] = load i32, ptr [[ADD_PTR_1_6]], align 4, !tbaa [[TBAA3]] -; CHECK-NEXT: [[TMP20:%.*]] = add i32 [[TMP18]], [[TMP19]] +; CHECK-NEXT: [[TMP33:%.*]] = load i32, ptr [[ADD_PTR_1_6]], align 4, !tbaa [[TBAA3]] +; CHECK-NEXT: [[TMP34:%.*]] = add i32 [[TMP32]], [[TMP33]] ; CHECK-NEXT: [[INDVARS_IV_NEXT_1_6_NEG:%.*]] = sub nuw nsw i64 -7, [[INDVARS_IV_1]] ; CHECK-NEXT: [[ADD_PTR_1_7:%.*]] = getelementptr inbounds i32, ptr getelementptr inbounds nuw (i8, ptr @ARR, i64 396), i64 [[INDVARS_IV_NEXT_1_6_NEG]] ; CHECK-NEXT: [[TMP21:%.*]] = load i32, ptr [[ADD_PTR_1_7]], align 4, !tbaa [[TBAA3]] -; CHECK-NEXT: [[TMP22:%.*]] = add i32 [[TMP20]], [[TMP21]] +; CHECK-NEXT: [[TMP22:%.*]] = add i32 [[TMP34]], [[TMP21]] ; CHECK-NEXT: [[TMP23:%.*]] = shl i32 [[TMP22]], 1 ; CHECK-NEXT: [[ADD_1_7]] = add i32 [[TMP23]], [[SUM_11_1]] ; CHECK-NEXT: [[INDVARS_IV_NEXT_1_7]] = add nuw nsw i64 [[INDVARS_IV_1]], 8