diff options
author | Dimitry Andric <dim@FreeBSD.org> | 2016-08-16 21:02:59 +0000 |
---|---|---|
committer | Dimitry Andric <dim@FreeBSD.org> | 2016-08-16 21:02:59 +0000 |
commit | 3ca95b020283db6244cab92ede73c969253b6a31 (patch) | |
tree | d16e791e58694facd8f68d3e2797a1eaa8018afc /contrib/llvm/lib/Transforms/Utils/LoopUnrollRuntime.cpp | |
parent | 27067774dce3388702a4cf744d7096c6fb71b688 (diff) | |
parent | c3aee98e721333f265a88d6bf348e6e468f027d4 (diff) | |
download | src-3ca95b020283db6244cab92ede73c969253b6a31.tar.gz src-3ca95b020283db6244cab92ede73c969253b6a31.zip |
Update llvm to release_39 branch r276489, and resolve conflicts.
Notes
Notes:
svn path=/projects/clang390-import/; revision=304240
Diffstat (limited to 'contrib/llvm/lib/Transforms/Utils/LoopUnrollRuntime.cpp')
-rw-r--r-- | contrib/llvm/lib/Transforms/Utils/LoopUnrollRuntime.cpp | 497 |
1 files changed, 380 insertions, 117 deletions
diff --git a/contrib/llvm/lib/Transforms/Utils/LoopUnrollRuntime.cpp b/contrib/llvm/lib/Transforms/Utils/LoopUnrollRuntime.cpp index 0d68f18ad0e5..861a50cf354d 100644 --- a/contrib/llvm/lib/Transforms/Utils/LoopUnrollRuntime.cpp +++ b/contrib/llvm/lib/Transforms/Utils/LoopUnrollRuntime.cpp @@ -16,8 +16,8 @@ // case, we need to generate code to execute these 'left over' iterations. // // The current strategy generates an if-then-else sequence prior to the -// unrolled loop to execute the 'left over' iterations. Other strategies -// include generate a loop before or after the unrolled loop. +// unrolled loop to execute the 'left over' iterations before or after the +// unrolled loop. // //===----------------------------------------------------------------------===// @@ -60,91 +60,220 @@ STATISTIC(NumRuntimeUnrolled, /// than the unroll factor. /// static void ConnectProlog(Loop *L, Value *BECount, unsigned Count, - BasicBlock *LastPrologBB, BasicBlock *PrologEnd, - BasicBlock *OrigPH, BasicBlock *NewPH, - ValueToValueMapTy &VMap, DominatorTree *DT, - LoopInfo *LI, bool PreserveLCSSA) { + BasicBlock *PrologExit, BasicBlock *PreHeader, + BasicBlock *NewPreHeader, ValueToValueMapTy &VMap, + DominatorTree *DT, LoopInfo *LI, bool PreserveLCSSA) { BasicBlock *Latch = L->getLoopLatch(); assert(Latch && "Loop must have a latch"); + BasicBlock *PrologLatch = cast<BasicBlock>(VMap[Latch]); // Create a PHI node for each outgoing value from the original loop // (which means it is an outgoing value from the prolog code too). // The new PHI node is inserted in the prolog end basic block. - // The new PHI name is added as an operand of a PHI node in either + // The new PHI node value is added as an operand of a PHI node in either // the loop header or the loop exit block. - for (succ_iterator SBI = succ_begin(Latch), SBE = succ_end(Latch); - SBI != SBE; ++SBI) { - for (BasicBlock::iterator BBI = (*SBI)->begin(); - PHINode *PN = dyn_cast<PHINode>(BBI); ++BBI) { - + for (BasicBlock *Succ : successors(Latch)) { + for (Instruction &BBI : *Succ) { + PHINode *PN = dyn_cast<PHINode>(&BBI); + // Exit when we passed all PHI nodes. + if (!PN) + break; // Add a new PHI node to the prolog end block and add the // appropriate incoming values. - PHINode *NewPN = PHINode::Create(PN->getType(), 2, PN->getName()+".unr", - PrologEnd->getTerminator()); + PHINode *NewPN = PHINode::Create(PN->getType(), 2, PN->getName() + ".unr", + PrologExit->getFirstNonPHI()); // Adding a value to the new PHI node from the original loop preheader. // This is the value that skips all the prolog code. if (L->contains(PN)) { - NewPN->addIncoming(PN->getIncomingValueForBlock(NewPH), OrigPH); + NewPN->addIncoming(PN->getIncomingValueForBlock(NewPreHeader), + PreHeader); } else { - NewPN->addIncoming(UndefValue::get(PN->getType()), OrigPH); + NewPN->addIncoming(UndefValue::get(PN->getType()), PreHeader); } Value *V = PN->getIncomingValueForBlock(Latch); if (Instruction *I = dyn_cast<Instruction>(V)) { if (L->contains(I)) { - V = VMap[I]; + V = VMap.lookup(I); } } // Adding a value to the new PHI node from the last prolog block // that was created. - NewPN->addIncoming(V, LastPrologBB); + NewPN->addIncoming(V, PrologLatch); // Update the existing PHI node operand with the value from the // new PHI node. How this is done depends on if the existing // PHI node is in the original loop block, or the exit block. if (L->contains(PN)) { - PN->setIncomingValue(PN->getBasicBlockIndex(NewPH), NewPN); + PN->setIncomingValue(PN->getBasicBlockIndex(NewPreHeader), NewPN); } else { - PN->addIncoming(NewPN, PrologEnd); + PN->addIncoming(NewPN, PrologExit); } } } - // Create a branch around the orignal loop, which is taken if there are no + // Create a branch around the original loop, which is taken if there are no // iterations remaining to be executed after running the prologue. - Instruction *InsertPt = PrologEnd->getTerminator(); + Instruction *InsertPt = PrologExit->getTerminator(); IRBuilder<> B(InsertPt); assert(Count != 0 && "nonsensical Count!"); - // If BECount <u (Count - 1) then (BECount + 1) & (Count - 1) == (BECount + 1) - // (since Count is a power of 2). This means %xtraiter is (BECount + 1) and - // and all of the iterations of this loop were executed by the prologue. Note - // that if BECount <u (Count - 1) then (BECount + 1) cannot unsigned-overflow. + // If BECount <u (Count - 1) then (BECount + 1) % Count == (BECount + 1) + // This means %xtraiter is (BECount + 1) and all of the iterations of this + // loop were executed by the prologue. Note that if BECount <u (Count - 1) + // then (BECount + 1) cannot unsigned-overflow. Value *BrLoopExit = B.CreateICmpULT(BECount, ConstantInt::get(BECount->getType(), Count - 1)); BasicBlock *Exit = L->getUniqueExitBlock(); assert(Exit && "Loop must have a single exit block only"); // Split the exit to maintain loop canonicalization guarantees - SmallVector<BasicBlock*, 4> Preds(pred_begin(Exit), pred_end(Exit)); + SmallVector<BasicBlock*, 4> Preds(predecessors(Exit)); SplitBlockPredecessors(Exit, Preds, ".unr-lcssa", DT, LI, PreserveLCSSA); // Add the branch to the exit block (around the unrolled loop) - B.CreateCondBr(BrLoopExit, Exit, NewPH); + B.CreateCondBr(BrLoopExit, Exit, NewPreHeader); + InsertPt->eraseFromParent(); +} + +/// Connect the unrolling epilog code to the original loop. +/// The unrolling epilog code contains code to execute the +/// 'extra' iterations if the run-time trip count modulo the +/// unroll count is non-zero. +/// +/// This function performs the following: +/// - Update PHI nodes at the unrolling loop exit and epilog loop exit +/// - Create PHI nodes at the unrolling loop exit to combine +/// values that exit the unrolling loop code and jump around it. +/// - Update PHI operands in the epilog loop by the new PHI nodes +/// - Branch around the epilog loop if extra iters (ModVal) is zero. +/// +static void ConnectEpilog(Loop *L, Value *ModVal, BasicBlock *NewExit, + BasicBlock *Exit, BasicBlock *PreHeader, + BasicBlock *EpilogPreHeader, BasicBlock *NewPreHeader, + ValueToValueMapTy &VMap, DominatorTree *DT, + LoopInfo *LI, bool PreserveLCSSA) { + BasicBlock *Latch = L->getLoopLatch(); + assert(Latch && "Loop must have a latch"); + BasicBlock *EpilogLatch = cast<BasicBlock>(VMap[Latch]); + + // Loop structure should be the following: + // + // PreHeader + // NewPreHeader + // Header + // ... + // Latch + // NewExit (PN) + // EpilogPreHeader + // EpilogHeader + // ... + // EpilogLatch + // Exit (EpilogPN) + + // Update PHI nodes at NewExit and Exit. + for (Instruction &BBI : *NewExit) { + PHINode *PN = dyn_cast<PHINode>(&BBI); + // Exit when we passed all PHI nodes. + if (!PN) + break; + // PN should be used in another PHI located in Exit block as + // Exit was split by SplitBlockPredecessors into Exit and NewExit + // Basicaly it should look like: + // NewExit: + // PN = PHI [I, Latch] + // ... + // Exit: + // EpilogPN = PHI [PN, EpilogPreHeader] + // + // There is EpilogPreHeader incoming block instead of NewExit as + // NewExit was spilt 1 more time to get EpilogPreHeader. + assert(PN->hasOneUse() && "The phi should have 1 use"); + PHINode *EpilogPN = cast<PHINode> (PN->use_begin()->getUser()); + assert(EpilogPN->getParent() == Exit && "EpilogPN should be in Exit block"); + + // Add incoming PreHeader from branch around the Loop + PN->addIncoming(UndefValue::get(PN->getType()), PreHeader); + + Value *V = PN->getIncomingValueForBlock(Latch); + Instruction *I = dyn_cast<Instruction>(V); + if (I && L->contains(I)) + // If value comes from an instruction in the loop add VMap value. + V = VMap.lookup(I); + // For the instruction out of the loop, constant or undefined value + // insert value itself. + EpilogPN->addIncoming(V, EpilogLatch); + + assert(EpilogPN->getBasicBlockIndex(EpilogPreHeader) >= 0 && + "EpilogPN should have EpilogPreHeader incoming block"); + // Change EpilogPreHeader incoming block to NewExit. + EpilogPN->setIncomingBlock(EpilogPN->getBasicBlockIndex(EpilogPreHeader), + NewExit); + // Now PHIs should look like: + // NewExit: + // PN = PHI [I, Latch], [undef, PreHeader] + // ... + // Exit: + // EpilogPN = PHI [PN, NewExit], [VMap[I], EpilogLatch] + } + + // Create PHI nodes at NewExit (from the unrolling loop Latch and PreHeader). + // Update corresponding PHI nodes in epilog loop. + for (BasicBlock *Succ : successors(Latch)) { + // Skip this as we already updated phis in exit blocks. + if (!L->contains(Succ)) + continue; + for (Instruction &BBI : *Succ) { + PHINode *PN = dyn_cast<PHINode>(&BBI); + // Exit when we passed all PHI nodes. + if (!PN) + break; + // Add new PHI nodes to the loop exit block and update epilog + // PHIs with the new PHI values. + PHINode *NewPN = PHINode::Create(PN->getType(), 2, PN->getName() + ".unr", + NewExit->getFirstNonPHI()); + // Adding a value to the new PHI node from the unrolling loop preheader. + NewPN->addIncoming(PN->getIncomingValueForBlock(NewPreHeader), PreHeader); + // Adding a value to the new PHI node from the unrolling loop latch. + NewPN->addIncoming(PN->getIncomingValueForBlock(Latch), Latch); + + // Update the existing PHI node operand with the value from the new PHI + // node. Corresponding instruction in epilog loop should be PHI. + PHINode *VPN = cast<PHINode>(VMap[&BBI]); + VPN->setIncomingValue(VPN->getBasicBlockIndex(EpilogPreHeader), NewPN); + } + } + + Instruction *InsertPt = NewExit->getTerminator(); + IRBuilder<> B(InsertPt); + Value *BrLoopExit = B.CreateIsNotNull(ModVal, "lcmp.mod"); + assert(Exit && "Loop must have a single exit block only"); + // Split the exit to maintain loop canonicalization guarantees + SmallVector<BasicBlock*, 4> Preds(predecessors(Exit)); + SplitBlockPredecessors(Exit, Preds, ".epilog-lcssa", DT, LI, + PreserveLCSSA); + // Add the branch to the exit block (around the unrolling loop) + B.CreateCondBr(BrLoopExit, EpilogPreHeader, Exit); InsertPt->eraseFromParent(); } /// Create a clone of the blocks in a loop and connect them together. -/// If UnrollProlog is true, loop structure will not be cloned, otherwise a new -/// loop will be created including all cloned blocks, and the iterator of it -/// switches to count NewIter down to 0. +/// If CreateRemainderLoop is false, loop structure will not be cloned, +/// otherwise a new loop will be created including all cloned blocks, and the +/// iterator of it switches to count NewIter down to 0. +/// The cloned blocks should be inserted between InsertTop and InsertBot. +/// If loop structure is cloned InsertTop should be new preheader, InsertBot +/// new loop exit. /// -static void CloneLoopBlocks(Loop *L, Value *NewIter, const bool UnrollProlog, +static void CloneLoopBlocks(Loop *L, Value *NewIter, + const bool CreateRemainderLoop, + const bool UseEpilogRemainder, BasicBlock *InsertTop, BasicBlock *InsertBot, + BasicBlock *Preheader, std::vector<BasicBlock *> &NewBlocks, LoopBlocksDFS &LoopBlocks, ValueToValueMapTy &VMap, LoopInfo *LI) { - BasicBlock *Preheader = L->getLoopPreheader(); + StringRef suffix = UseEpilogRemainder ? "epil" : "prol"; BasicBlock *Header = L->getHeader(); BasicBlock *Latch = L->getLoopLatch(); Function *F = Header->getParent(); @@ -152,7 +281,7 @@ static void CloneLoopBlocks(Loop *L, Value *NewIter, const bool UnrollProlog, LoopBlocksDFS::RPOIterator BlockEnd = LoopBlocks.endRPO(); Loop *NewLoop = nullptr; Loop *ParentLoop = L->getParentLoop(); - if (!UnrollProlog) { + if (CreateRemainderLoop) { NewLoop = new Loop(); if (ParentLoop) ParentLoop->addChildLoop(NewLoop); @@ -163,7 +292,7 @@ static void CloneLoopBlocks(Loop *L, Value *NewIter, const bool UnrollProlog, // For each block in the original loop, create a new copy, // and update the value map with the newly created values. for (LoopBlocksDFS::RPOIterator BB = BlockBegin; BB != BlockEnd; ++BB) { - BasicBlock *NewBB = CloneBasicBlock(*BB, VMap, ".prol", F); + BasicBlock *NewBB = CloneBasicBlock(*BB, VMap, "." + suffix, F); NewBlocks.push_back(NewBB); if (NewLoop) @@ -176,19 +305,20 @@ static void CloneLoopBlocks(Loop *L, Value *NewIter, const bool UnrollProlog, // For the first block, add a CFG connection to this newly // created block. InsertTop->getTerminator()->setSuccessor(0, NewBB); - } + if (Latch == *BB) { - // For the last block, if UnrollProlog is true, create a direct jump to - // InsertBot. If not, create a loop back to cloned head. + // For the last block, if CreateRemainderLoop is false, create a direct + // jump to InsertBot. If not, create a loop back to cloned head. VMap.erase((*BB)->getTerminator()); BasicBlock *FirstLoopBB = cast<BasicBlock>(VMap[Header]); BranchInst *LatchBR = cast<BranchInst>(NewBB->getTerminator()); IRBuilder<> Builder(LatchBR); - if (UnrollProlog) { + if (!CreateRemainderLoop) { Builder.CreateBr(InsertBot); } else { - PHINode *NewIdx = PHINode::Create(NewIter->getType(), 2, "prol.iter", + PHINode *NewIdx = PHINode::Create(NewIter->getType(), 2, + suffix + ".iter", FirstLoopBB->getFirstNonPHI()); Value *IdxSub = Builder.CreateSub(NewIdx, ConstantInt::get(NewIdx->getType(), 1), @@ -207,9 +337,15 @@ static void CloneLoopBlocks(Loop *L, Value *NewIter, const bool UnrollProlog, // cloned loop. for (BasicBlock::iterator I = Header->begin(); isa<PHINode>(I); ++I) { PHINode *NewPHI = cast<PHINode>(VMap[&*I]); - if (UnrollProlog) { - VMap[&*I] = NewPHI->getIncomingValueForBlock(Preheader); - cast<BasicBlock>(VMap[Header])->getInstList().erase(NewPHI); + if (!CreateRemainderLoop) { + if (UseEpilogRemainder) { + unsigned idx = NewPHI->getBasicBlockIndex(Preheader); + NewPHI->setIncomingBlock(idx, InsertTop); + NewPHI->removeIncomingValue(Latch, false); + } else { + VMap[&*I] = NewPHI->getIncomingValueForBlock(Preheader); + cast<BasicBlock>(VMap[Header])->getInstList().erase(NewPHI); + } } else { unsigned idx = NewPHI->getBasicBlockIndex(Preheader); NewPHI->setIncomingBlock(idx, InsertTop); @@ -217,8 +353,8 @@ static void CloneLoopBlocks(Loop *L, Value *NewIter, const bool UnrollProlog, idx = NewPHI->getBasicBlockIndex(Latch); Value *InVal = NewPHI->getIncomingValue(idx); NewPHI->setIncomingBlock(idx, NewLatch); - if (VMap[InVal]) - NewPHI->setIncomingValue(idx, VMap[InVal]); + if (Value *V = VMap.lookup(InVal)) + NewPHI->setIncomingValue(idx, V); } } if (NewLoop) { @@ -254,11 +390,11 @@ static void CloneLoopBlocks(Loop *L, Value *NewIter, const bool UnrollProlog, } } -/// Insert code in the prolog code when unrolling a loop with a +/// Insert code in the prolog/epilog code when unrolling a loop with a /// run-time trip-count. /// /// This method assumes that the loop unroll factor is total number -/// of loop bodes in the loop after unrolling. (Some folks refer +/// of loop bodies in the loop after unrolling. (Some folks refer /// to the unroll factor as the number of *extra* copies added). /// We assume also that the loop unroll factor is a power-of-two. So, after /// unrolling the loop, the number of loop bodies executed is 2, @@ -266,37 +402,56 @@ static void CloneLoopBlocks(Loop *L, Value *NewIter, const bool UnrollProlog, /// instruction in SimplifyCFG.cpp. Then, the backend decides how code for /// the switch instruction is generated. /// +/// ***Prolog case*** /// extraiters = tripcount % loopfactor /// if (extraiters == 0) jump Loop: -/// else jump Prol +/// else jump Prol: /// Prol: LoopBody; /// extraiters -= 1 // Omitted if unroll factor is 2. /// if (extraiters != 0) jump Prol: // Omitted if unroll factor is 2. -/// if (tripcount < loopfactor) jump End +/// if (tripcount < loopfactor) jump End: /// Loop: /// ... /// End: /// -bool llvm::UnrollRuntimeLoopProlog(Loop *L, unsigned Count, - bool AllowExpensiveTripCount, LoopInfo *LI, - ScalarEvolution *SE, DominatorTree *DT, - bool PreserveLCSSA) { +/// ***Epilog case*** +/// extraiters = tripcount % loopfactor +/// if (tripcount < loopfactor) jump LoopExit: +/// unroll_iters = tripcount - extraiters +/// Loop: LoopBody; (executes unroll_iter times); +/// unroll_iter -= 1 +/// if (unroll_iter != 0) jump Loop: +/// LoopExit: +/// if (extraiters == 0) jump EpilExit: +/// Epil: LoopBody; (executes extraiters times) +/// extraiters -= 1 // Omitted if unroll factor is 2. +/// if (extraiters != 0) jump Epil: // Omitted if unroll factor is 2. +/// EpilExit: + +bool llvm::UnrollRuntimeLoopRemainder(Loop *L, unsigned Count, + bool AllowExpensiveTripCount, + bool UseEpilogRemainder, + LoopInfo *LI, ScalarEvolution *SE, + DominatorTree *DT, bool PreserveLCSSA) { // for now, only unroll loops that contain a single exit if (!L->getExitingBlock()) return false; // Make sure the loop is in canonical form, and there is a single // exit block only. - if (!L->isLoopSimplifyForm() || !L->getUniqueExitBlock()) + if (!L->isLoopSimplifyForm()) + return false; + BasicBlock *Exit = L->getUniqueExitBlock(); // successor out of loop + if (!Exit) return false; - // Use Scalar Evolution to compute the trip count. This allows more - // loops to be unrolled than relying on induction var simplification + // Use Scalar Evolution to compute the trip count. This allows more loops to + // be unrolled than relying on induction var simplification. if (!SE) return false; - // Only unroll loops with a computable trip count and the trip count needs - // to be an int value (allowing a pointer type is a TODO item) + // Only unroll loops with a computable trip count, and the trip count needs + // to be an int value (allowing a pointer type is a TODO item). const SCEV *BECountSC = SE->getBackedgeTakenCount(L); if (isa<SCEVCouldNotCompute>(BECountSC) || !BECountSC->getType()->isIntegerTy()) @@ -304,21 +459,19 @@ bool llvm::UnrollRuntimeLoopProlog(Loop *L, unsigned Count, unsigned BEWidth = cast<IntegerType>(BECountSC->getType())->getBitWidth(); - // Add 1 since the backedge count doesn't include the first loop iteration + // Add 1 since the backedge count doesn't include the first loop iteration. const SCEV *TripCountSC = SE->getAddExpr(BECountSC, SE->getConstant(BECountSC->getType(), 1)); if (isa<SCEVCouldNotCompute>(TripCountSC)) return false; BasicBlock *Header = L->getHeader(); + BasicBlock *PreHeader = L->getLoopPreheader(); + BranchInst *PreHeaderBR = cast<BranchInst>(PreHeader->getTerminator()); const DataLayout &DL = Header->getModule()->getDataLayout(); SCEVExpander Expander(*SE, DL, "loop-unroll"); - if (!AllowExpensiveTripCount && Expander.isHighCostExpansion(TripCountSC, L)) - return false; - - // We only handle cases when the unroll factor is a power of 2. - // Count is the loop unroll factor, the number of extra copies added + 1. - if (!isPowerOf2_32(Count)) + if (!AllowExpensiveTripCount && + Expander.isHighCostExpansion(TripCountSC, L, PreHeaderBR)) return false; // This constraint lets us deal with an overflowing trip count easily; see the @@ -326,51 +479,115 @@ bool llvm::UnrollRuntimeLoopProlog(Loop *L, unsigned Count, if (Log2_32(Count) > BEWidth) return false; - // If this loop is nested, then the loop unroller changes the code in - // parent loop, so the Scalar Evolution pass needs to be run again + // If this loop is nested, then the loop unroller changes the code in the + // parent loop, so the Scalar Evolution pass needs to be run again. if (Loop *ParentLoop = L->getParentLoop()) SE->forgetLoop(ParentLoop); - BasicBlock *PH = L->getLoopPreheader(); BasicBlock *Latch = L->getLoopLatch(); - // It helps to splits the original preheader twice, one for the end of the - // prolog code and one for a new loop preheader - BasicBlock *PEnd = SplitEdge(PH, Header, DT, LI); - BasicBlock *NewPH = SplitBlock(PEnd, PEnd->getTerminator(), DT, LI); - BranchInst *PreHeaderBR = cast<BranchInst>(PH->getTerminator()); + // Loop structure is the following: + // + // PreHeader + // Header + // ... + // Latch + // Exit + + BasicBlock *NewPreHeader; + BasicBlock *NewExit = nullptr; + BasicBlock *PrologExit = nullptr; + BasicBlock *EpilogPreHeader = nullptr; + BasicBlock *PrologPreHeader = nullptr; + + if (UseEpilogRemainder) { + // If epilog remainder + // Split PreHeader to insert a branch around loop for unrolling. + NewPreHeader = SplitBlock(PreHeader, PreHeader->getTerminator(), DT, LI); + NewPreHeader->setName(PreHeader->getName() + ".new"); + // Split Exit to create phi nodes from branch above. + SmallVector<BasicBlock*, 4> Preds(predecessors(Exit)); + NewExit = SplitBlockPredecessors(Exit, Preds, ".unr-lcssa", + DT, LI, PreserveLCSSA); + // Split NewExit to insert epilog remainder loop. + EpilogPreHeader = SplitBlock(NewExit, NewExit->getTerminator(), DT, LI); + EpilogPreHeader->setName(Header->getName() + ".epil.preheader"); + } else { + // If prolog remainder + // Split the original preheader twice to insert prolog remainder loop + PrologPreHeader = SplitEdge(PreHeader, Header, DT, LI); + PrologPreHeader->setName(Header->getName() + ".prol.preheader"); + PrologExit = SplitBlock(PrologPreHeader, PrologPreHeader->getTerminator(), + DT, LI); + PrologExit->setName(Header->getName() + ".prol.loopexit"); + // Split PrologExit to get NewPreHeader. + NewPreHeader = SplitBlock(PrologExit, PrologExit->getTerminator(), DT, LI); + NewPreHeader->setName(PreHeader->getName() + ".new"); + } + // Loop structure should be the following: + // Epilog Prolog + // + // PreHeader PreHeader + // *NewPreHeader *PrologPreHeader + // Header *PrologExit + // ... *NewPreHeader + // Latch Header + // *NewExit ... + // *EpilogPreHeader Latch + // Exit Exit + + // Calculate conditions for branch around loop for unrolling + // in epilog case and around prolog remainder loop in prolog case. // Compute the number of extra iterations required, which is: - // extra iterations = run-time trip count % (loop unroll factor + 1) + // extra iterations = run-time trip count % loop unroll factor + PreHeaderBR = cast<BranchInst>(PreHeader->getTerminator()); Value *TripCount = Expander.expandCodeFor(TripCountSC, TripCountSC->getType(), PreHeaderBR); Value *BECount = Expander.expandCodeFor(BECountSC, BECountSC->getType(), PreHeaderBR); - IRBuilder<> B(PreHeaderBR); - Value *ModVal = B.CreateAnd(TripCount, Count - 1, "xtraiter"); - - // If ModVal is zero, we know that either - // 1. there are no iteration to be run in the prologue loop - // OR - // 2. the addition computing TripCount overflowed - // - // If (2) is true, we know that TripCount really is (1 << BEWidth) and so the - // number of iterations that remain to be run in the original loop is a - // multiple Count == (1 << Log2(Count)) because Log2(Count) <= BEWidth (we - // explicitly check this above). - - Value *BranchVal = B.CreateIsNotNull(ModVal, "lcmp.mod"); - - // Branch to either the extra iterations or the cloned/unrolled loop - // We will fix up the true branch label when adding loop body copies - B.CreateCondBr(BranchVal, PEnd, PEnd); - assert(PreHeaderBR->isUnconditional() && - PreHeaderBR->getSuccessor(0) == PEnd && - "CFG edges in Preheader are not correct"); + Value *ModVal; + // Calculate ModVal = (BECount + 1) % Count. + // Note that TripCount is BECount + 1. + if (isPowerOf2_32(Count)) { + // When Count is power of 2 we don't BECount for epilog case, however we'll + // need it for a branch around unrolling loop for prolog case. + ModVal = B.CreateAnd(TripCount, Count - 1, "xtraiter"); + // 1. There are no iterations to be run in the prolog/epilog loop. + // OR + // 2. The addition computing TripCount overflowed. + // + // If (2) is true, we know that TripCount really is (1 << BEWidth) and so + // the number of iterations that remain to be run in the original loop is a + // multiple Count == (1 << Log2(Count)) because Log2(Count) <= BEWidth (we + // explicitly check this above). + } else { + // As (BECount + 1) can potentially unsigned overflow we count + // (BECount % Count) + 1 which is overflow safe as BECount % Count < Count. + Value *ModValTmp = B.CreateURem(BECount, + ConstantInt::get(BECount->getType(), + Count)); + Value *ModValAdd = B.CreateAdd(ModValTmp, + ConstantInt::get(ModValTmp->getType(), 1)); + // At that point (BECount % Count) + 1 could be equal to Count. + // To handle this case we need to take mod by Count one more time. + ModVal = B.CreateURem(ModValAdd, + ConstantInt::get(BECount->getType(), Count), + "xtraiter"); + } + Value *BranchVal = + UseEpilogRemainder ? B.CreateICmpULT(BECount, + ConstantInt::get(BECount->getType(), + Count - 1)) : + B.CreateIsNotNull(ModVal, "lcmp.mod"); + BasicBlock *RemainderLoop = UseEpilogRemainder ? NewExit : PrologPreHeader; + BasicBlock *UnrollingLoop = UseEpilogRemainder ? NewPreHeader : PrologExit; + // Branch to either remainder (extra iterations) loop or unrolling loop. + B.CreateCondBr(BranchVal, RemainderLoop, UnrollingLoop); PreHeaderBR->eraseFromParent(); Function *F = Header->getParent(); // Get an ordered list of blocks in the loop to help with the ordering of the - // cloned blocks in the prolog code + // cloned blocks in the prolog/epilog code LoopBlocksDFS LoopBlocks(L); LoopBlocks.perform(LI); @@ -382,34 +599,80 @@ bool llvm::UnrollRuntimeLoopProlog(Loop *L, unsigned Count, std::vector<BasicBlock *> NewBlocks; ValueToValueMapTy VMap; - bool UnrollPrologue = Count == 2; + // For unroll factor 2 remainder loop will have 1 iterations. + // Do not create 1 iteration loop. + bool CreateRemainderLoop = (Count != 2); // Clone all the basic blocks in the loop. If Count is 2, we don't clone // the loop, otherwise we create a cloned loop to execute the extra // iterations. This function adds the appropriate CFG connections. - CloneLoopBlocks(L, ModVal, UnrollPrologue, PH, PEnd, NewBlocks, LoopBlocks, - VMap, LI); - - // Insert the cloned blocks into function just before the original loop - F->getBasicBlockList().splice(PEnd->getIterator(), F->getBasicBlockList(), - NewBlocks[0]->getIterator(), F->end()); - - // Rewrite the cloned instruction operands to use the values - // created when the clone is created. - for (unsigned i = 0, e = NewBlocks.size(); i != e; ++i) { - for (BasicBlock::iterator I = NewBlocks[i]->begin(), - E = NewBlocks[i]->end(); - I != E; ++I) { - RemapInstruction(&*I, VMap, - RF_NoModuleLevelChanges | RF_IgnoreMissingEntries); + BasicBlock *InsertBot = UseEpilogRemainder ? Exit : PrologExit; + BasicBlock *InsertTop = UseEpilogRemainder ? EpilogPreHeader : PrologPreHeader; + CloneLoopBlocks(L, ModVal, CreateRemainderLoop, UseEpilogRemainder, InsertTop, + InsertBot, NewPreHeader, NewBlocks, LoopBlocks, VMap, LI); + + // Insert the cloned blocks into the function. + F->getBasicBlockList().splice(InsertBot->getIterator(), + F->getBasicBlockList(), + NewBlocks[0]->getIterator(), + F->end()); + + // Loop structure should be the following: + // Epilog Prolog + // + // PreHeader PreHeader + // NewPreHeader PrologPreHeader + // Header PrologHeader + // ... ... + // Latch PrologLatch + // NewExit PrologExit + // EpilogPreHeader NewPreHeader + // EpilogHeader Header + // ... ... + // EpilogLatch Latch + // Exit Exit + + // Rewrite the cloned instruction operands to use the values created when the + // clone is created. + for (BasicBlock *BB : NewBlocks) { + for (Instruction &I : *BB) { + RemapInstruction(&I, VMap, + RF_NoModuleLevelChanges | RF_IgnoreMissingLocals); } } - // Connect the prolog code to the original loop and update the - // PHI functions. - BasicBlock *LastLoopBB = cast<BasicBlock>(VMap[Latch]); - ConnectProlog(L, BECount, Count, LastLoopBB, PEnd, PH, NewPH, VMap, DT, LI, - PreserveLCSSA); + if (UseEpilogRemainder) { + // Connect the epilog code to the original loop and update the + // PHI functions. + ConnectEpilog(L, ModVal, NewExit, Exit, PreHeader, + EpilogPreHeader, NewPreHeader, VMap, DT, LI, + PreserveLCSSA); + + // Update counter in loop for unrolling. + // I should be multiply of Count. + IRBuilder<> B2(NewPreHeader->getTerminator()); + Value *TestVal = B2.CreateSub(TripCount, ModVal, "unroll_iter"); + BranchInst *LatchBR = cast<BranchInst>(Latch->getTerminator()); + B2.SetInsertPoint(LatchBR); + PHINode *NewIdx = PHINode::Create(TestVal->getType(), 2, "niter", + Header->getFirstNonPHI()); + Value *IdxSub = + B2.CreateSub(NewIdx, ConstantInt::get(NewIdx->getType(), 1), + NewIdx->getName() + ".nsub"); + Value *IdxCmp; + if (LatchBR->getSuccessor(0) == Header) + IdxCmp = B2.CreateIsNotNull(IdxSub, NewIdx->getName() + ".ncmp"); + else + IdxCmp = B2.CreateIsNull(IdxSub, NewIdx->getName() + ".ncmp"); + NewIdx->addIncoming(TestVal, NewPreHeader); + NewIdx->addIncoming(IdxSub, Latch); + LatchBR->setCondition(IdxCmp); + } else { + // Connect the prolog code to the original loop and update the + // PHI functions. + ConnectProlog(L, BECount, Count, PrologExit, PreHeader, NewPreHeader, + VMap, DT, LI, PreserveLCSSA); + } NumRuntimeUnrolled++; return true; } |