LLVM API Documentation

LoopUnrollRuntime.cpp
Go to the documentation of this file.
00001 //===-- LoopUnrollRuntime.cpp - Runtime Loop unrolling utilities ----------===//
00002 //
00003 //                     The LLVM Compiler Infrastructure
00004 //
00005 // This file is distributed under the University of Illinois Open Source
00006 // License. See LICENSE.TXT for details.
00007 //
00008 //===----------------------------------------------------------------------===//
00009 //
00010 // This file implements some loop unrolling utilities for loops with run-time
00011 // trip counts.  See LoopUnroll.cpp for unrolling loops with compile-time
00012 // trip counts.
00013 //
00014 // The functions in this file are used to generate extra code when the
00015 // run-time trip count modulo the unroll factor is not 0.  When this is the
00016 // case, we need to generate code to execute these 'left over' iterations.
00017 //
00018 // The current strategy generates an if-then-else sequence prior to the
00019 // unrolled loop to execute the 'left over' iterations.  Other strategies
00020 // include generating a loop before or after the unrolled loop.
00021 //
00022 //===----------------------------------------------------------------------===//
00023 
00024 #include "llvm/Transforms/Utils/UnrollLoop.h"
00025 #include "llvm/ADT/Statistic.h"
00026 #include "llvm/Analysis/LoopIterator.h"
00027 #include "llvm/Analysis/LoopPass.h"
00028 #include "llvm/Analysis/ScalarEvolution.h"
00029 #include "llvm/Analysis/ScalarEvolutionExpander.h"
00030 #include "llvm/IR/BasicBlock.h"
00031 #include "llvm/IR/Metadata.h"
00032 #include "llvm/Support/Debug.h"
00033 #include "llvm/Support/raw_ostream.h"
00034 #include "llvm/Transforms/Utils/BasicBlockUtils.h"
00035 #include "llvm/Transforms/Utils/Cloning.h"
00036 #include <algorithm>
00037 
00038 using namespace llvm;
00039 
00040 #define DEBUG_TYPE "loop-unroll"
00041 
00042 STATISTIC(NumRuntimeUnrolled,
00043           "Number of loops unrolled with run-time trip counts");
00044 
00045 /// Connect the unrolling prolog code to the original loop.
00046 /// The unrolling prolog code contains code to execute the
00047 /// 'extra' iterations if the run-time trip count modulo the
00048 /// unroll count is non-zero.
00049 ///
00050 /// This function performs the following:
00051 /// - Create PHI nodes in the prolog end block (PrologEnd) merging values that
00052 ///   exit the prolog (LastPrologBB) with those that skip it (OrigPH).
00053 /// - Feed each new PHI into the matching PHI of the loop header (replacing
00054 ///   the NewPH operand) or of the exit block (as a new PrologEnd operand).
00055 /// - Branch from PrologEnd around the original loop if TripCount is less
00056 ///   than Count (the unroll factor); otherwise fall into NewPH.
00057 /// VMap maps loop values to their prolog copies; P is handed to the splitters.
00058 static void ConnectProlog(Loop *L, Value *TripCount, unsigned Count,
00059                           BasicBlock *LastPrologBB, BasicBlock *PrologEnd,
00060                           BasicBlock *OrigPH, BasicBlock *NewPH,
00061                           ValueToValueMapTy &VMap, Pass *P) {
00062   BasicBlock *Latch = L->getLoopLatch();
00063   assert(Latch && "Loop must have a latch");
00064 
00065   // Create a PHI node for each outgoing value from the original loop
00066   // (which means it is an outgoing value from the prolog code too).
00067   // The new PHI node is inserted in the prolog end basic block.
00068   // The new PHI name is added as an operand of a PHI node in either
00069   // the loop header or the loop exit block.
00070   for (succ_iterator SBI = succ_begin(Latch), SBE = succ_end(Latch);
00071        SBI != SBE; ++SBI) {
00072     for (BasicBlock::iterator BBI = (*SBI)->begin();
00073          PHINode *PN = dyn_cast<PHINode>(BBI); ++BBI) {
00074 
00075       // Add a new PHI node to the prolog end block and add the
00076       // appropriate incoming values.
00077       PHINode *NewPN = PHINode::Create(PN->getType(), 2, PN->getName()+".unr",
00078                                        PrologEnd->getTerminator());
00079       // Adding a value to the new PHI node from the original loop preheader.
00080       // This is the value that skips all the prolog code.
00081       if (L->contains(PN)) {
00082         NewPN->addIncoming(PN->getIncomingValueForBlock(NewPH), OrigPH);
00083       } else { // PN is outside the loop: use a null placeholder value.
00084         NewPN->addIncoming(Constant::getNullValue(PN->getType()), OrigPH);
00085       }
00086 
00087       Value *V = PN->getIncomingValueForBlock(Latch);
00088       if (Instruction *I = dyn_cast<Instruction>(V)) {
00089         if (L->contains(I)) {
00090           V = VMap[I];
00091         }
00092       }
00093       // Adding a value to the new PHI node from the last prolog block
00094       // that was created.
00095       NewPN->addIncoming(V, LastPrologBB);
00096 
00097       // Update the existing PHI node operand with the value from the
00098       // new PHI node.  How this is done depends on if the existing
00099       // PHI node is in the original loop block, or the exit block.
00100       if (L->contains(PN)) {
00101         PN->setIncomingValue(PN->getBasicBlockIndex(NewPH), NewPN);
00102       } else {
00103         PN->addIncoming(NewPN, PrologEnd);
00104       }
00105     }
00106   }
00107 
00108   // Create a branch around the original loop, which is taken if the
00109   // trip count is less than the unroll factor.
00110   Instruction *InsertPt = PrologEnd->getTerminator();
00111   Instruction *BrLoopExit =
00112     new ICmpInst(InsertPt, ICmpInst::ICMP_ULT, TripCount,
00113                  ConstantInt::get(TripCount->getType(), Count));
00114   BasicBlock *Exit = L->getUniqueExitBlock();
00115   assert(Exit && "Loop must have a single exit block only");
00116   // Split the exit to maintain loop canonicalization guarantees
00117   SmallVector<BasicBlock*, 4> Preds(pred_begin(Exit), pred_end(Exit));
00118   if (!Exit->isLandingPad()) {
00119     SplitBlockPredecessors(Exit, Preds, ".unr-lcssa", P);
00120   } else {
00121     SmallVector<BasicBlock*, 2> NewBBs;
00122     SplitLandingPadPredecessors(Exit, Preds, ".unr1-lcssa", ".unr2-lcssa",
00123                                 P, NewBBs);
00124   }
00125   // Add the branch to the exit block (around the unrolled loop)
00126   BranchInst::Create(Exit, NewPH, BrLoopExit, InsertPt);
00127   InsertPt->eraseFromParent(); // Drop PrologEnd's old terminator.
00128 }
00129 
00130 /// Clone every block of loop L (in the RPO order of LoopBlocks) and wire the
00131 /// clones between InsertTop and InsertBot, recording them in NewBlocks/VMap.
00132 /// If UnrollProlog is true, no loop structure is created and the cloned latch
00133 /// jumps straight to InsertBot; otherwise the clones form a new loop (marked
00134 /// "llvm.loop.unroll.disable") driven by a counter running from NewIter to 0.
00135 static void CloneLoopBlocks(Loop *L, Value *NewIter, const bool UnrollProlog,
00136                             BasicBlock *InsertTop, BasicBlock *InsertBot,
00137                             std::vector<BasicBlock *> &NewBlocks,
00138                             LoopBlocksDFS &LoopBlocks, ValueToValueMapTy &VMap,
00139                             LoopInfo *LI) {
00140   BasicBlock *Preheader = L->getLoopPreheader();
00141   BasicBlock *Header = L->getHeader();
00142   BasicBlock *Latch = L->getLoopLatch();
00143   Function *F = Header->getParent();
00144   LoopBlocksDFS::RPOIterator BlockBegin = LoopBlocks.beginRPO();
00145   LoopBlocksDFS::RPOIterator BlockEnd = LoopBlocks.endRPO();
00146   Loop *NewLoop = 0; // Stays null if UnrollProlog. NOTE(review): use nullptr.
00147   Loop *ParentLoop = L->getParentLoop();
00148   if (!UnrollProlog) {
00149     NewLoop = new Loop();
00150     if (ParentLoop)
00151       ParentLoop->addChildLoop(NewLoop);
00152     else
00153       LI->addTopLevelLoop(NewLoop);
00154   }
00155 
00156   // For each block in the original loop, create a new copy,
00157   // and update the value map with the newly created values.
00158   for (LoopBlocksDFS::RPOIterator BB = BlockBegin; BB != BlockEnd; ++BB) {
00159     BasicBlock *NewBB = CloneBasicBlock(*BB, VMap, ".prol", F);
00160     NewBlocks.push_back(NewBB);
00161 
00162     if (NewLoop)
00163       NewLoop->addBasicBlockToLoop(NewBB, LI->getBase());
00164     else if (ParentLoop)
00165       ParentLoop->addBasicBlockToLoop(NewBB, LI->getBase());
00166 
00167     VMap[*BB] = NewBB;
00168     if (Header == *BB) {
00169       // For the first block, add a CFG connection to this newly
00170       // created block.
00171       InsertTop->getTerminator()->setSuccessor(0, NewBB);
00172 
00173     }
00174     if (Latch == *BB) {
00175       // For the last block, if UnrollProlog is true, create a direct jump to
00176       // InsertBot. If not, create a loop back to cloned head.
00177       VMap.erase((*BB)->getTerminator()); // Cloned branch is replaced below.
00178       BasicBlock *FirstLoopBB = cast<BasicBlock>(VMap[Header]);
00179       BranchInst *LatchBR = cast<BranchInst>(NewBB->getTerminator());
00180       if (UnrollProlog) {
00181         LatchBR->eraseFromParent();
00182         BranchInst::Create(InsertBot, NewBB);
00183       } else {
00184         PHINode *NewIdx = PHINode::Create(NewIter->getType(), 2, "prol.iter",
00185                                           FirstLoopBB->getFirstNonPHI());
00186         IRBuilder<> Builder(LatchBR);
00187         Value *IdxSub =
00188             Builder.CreateSub(NewIdx, ConstantInt::get(NewIdx->getType(), 1),
00189                               NewIdx->getName() + ".sub");
00190         Value *IdxCmp =
00191             Builder.CreateIsNotNull(IdxSub, NewIdx->getName() + ".cmp");
00192         BranchInst::Create(FirstLoopBB, InsertBot, IdxCmp, NewBB);
00193         NewIdx->addIncoming(NewIter, InsertTop);
00194         NewIdx->addIncoming(IdxSub, NewBB);
00195         LatchBR->eraseFromParent();
00196       }
00197     }
00198   }
00199 
00200   // Change the incoming values to the ones defined in the preheader or
00201   // cloned loop.
00202   for (BasicBlock::iterator I = Header->begin(); isa<PHINode>(I); ++I) {
00203     PHINode *NewPHI = cast<PHINode>(VMap[I]);
00204     if (UnrollProlog) {
00205       VMap[I] = NewPHI->getIncomingValueForBlock(Preheader);
00206       cast<BasicBlock>(VMap[Header])->getInstList().erase(NewPHI);
00207     } else {
00208       unsigned idx = NewPHI->getBasicBlockIndex(Preheader);
00209       NewPHI->setIncomingBlock(idx, InsertTop);
00210       BasicBlock *NewLatch = cast<BasicBlock>(VMap[Latch]);
00211       idx = NewPHI->getBasicBlockIndex(Latch);
00212       Value *InVal = NewPHI->getIncomingValue(idx);
00213       NewPHI->setIncomingBlock(idx, NewLatch);
00214       if (VMap[InVal])
00215         NewPHI->setIncomingValue(idx, VMap[InVal]);
00216     }
00217   }
00218   if (NewLoop) {
00219     // Add unroll disable metadata to disable future unrolling for this loop.
00220     SmallVector<Value *, 4> Vals;
00221     // Reserve first location for self reference to the LoopID metadata node.
00222     Vals.push_back(nullptr);
00223     MDNode *LoopID = NewLoop->getLoopID();
00224     if (LoopID) {
00225       // First remove any existing loop unrolling metadata.
00226       for (unsigned i = 1, ie = LoopID->getNumOperands(); i < ie; ++i) {
00227         bool IsUnrollMetadata = false;
00228         MDNode *MD = dyn_cast<MDNode>(LoopID->getOperand(i));
00229         if (MD) {
00230           const MDString *S = dyn_cast<MDString>(MD->getOperand(0));
00231           IsUnrollMetadata = S && S->getString().startswith("llvm.loop.unroll.");
00232         }
00233         if (!IsUnrollMetadata) Vals.push_back(LoopID->getOperand(i));
00234       }
00235     }
00236 
00237     LLVMContext &Context = NewLoop->getHeader()->getContext();
00238     SmallVector<Value *, 1> DisableOperands;
00239     DisableOperands.push_back(MDString::get(Context, "llvm.loop.unroll.disable"));
00240     MDNode *DisableNode = MDNode::get(Context, DisableOperands);
00241     Vals.push_back(DisableNode);
00242 
00243     MDNode *NewLoopID = MDNode::get(Context, Vals);
00244     // Set operand 0 to refer to the loop id itself.
00245     NewLoopID->replaceOperandWith(0, NewLoopID);
00246     NewLoop->setLoopID(NewLoopID);
00247   }
00248 }
00249 
00250 /// Insert code in the prolog code when unrolling a loop with a
00251 /// run-time trip-count.
00252 ///
00253 /// This method assumes that the loop unroll factor is total number
00254 /// of loop bodies in the loop after unrolling. (Some folks refer
00255 /// to the unroll factor as the number of *extra* copies added).
00256 /// We assume also that the loop unroll factor is a power-of-two. So, after
00257 /// unrolling the loop, the number of loop bodies executed is 2,
00258 /// 4, 8, etc.  Note - LLVM converts the if-then-sequence to a switch
00259 /// instruction in SimplifyCFG.cpp.  Then, the backend decides how code for
00260 /// the switch instruction is generated.
00261 ///
00262 ///        extraiters = tripcount % loopfactor
00263 ///        if (extraiters == 0) jump Loop:
00264 ///        else jump Prol
00265 /// Prol:  LoopBody;
00266 ///        extraiters -= 1                 // Omitted if unroll factor is 2.
00267 ///        if (extraiters != 0) jump Prol: // Omitted if unroll factor is 2.
00268 ///        if (tripcount < loopfactor) jump End
00269 /// Loop:
00270 /// ...
00271 /// End:
00272 /// Returns true if the prolog was emitted; false if a precondition fails.
00273 bool llvm::UnrollRuntimeLoopProlog(Loop *L, unsigned Count, LoopInfo *LI,
00274                                    LPPassManager *LPM) {
00275   // for now, only unroll loops that contain a single exit
00276   if (!L->getExitingBlock())
00277     return false;
00278 
00279   // Make sure the loop is in canonical form, and there is a single
00280   // exit block only.
00281   if (!L->isLoopSimplifyForm() || !L->getUniqueExitBlock())
00282     return false;
00283 
00284   // Use Scalar Evolution to compute the trip count.  This allows more
00285   // loops to be unrolled than relying on induction var simplification
00286   if (!LPM)
00287     return false;
00288   ScalarEvolution *SE = LPM->getAnalysisIfAvailable<ScalarEvolution>();
00289   if (!SE)
00290     return false;
00291 
00292   // Only unroll loops with a computable trip count and the trip count needs
00293   // to be an int value (allowing a pointer type is a TODO item)
00294   const SCEV *BECount = SE->getBackedgeTakenCount(L);
00295   if (isa<SCEVCouldNotCompute>(BECount) || !BECount->getType()->isIntegerTy())
00296     return false;
00297 
00298   // If BECount is all ones (-1), we can't compute trip-count without overflow.
00299   if (BECount->isAllOnesValue())
00300     return false;
00301 
00302   // Add 1 since the backedge count doesn't include the first loop iteration
00303   const SCEV *TripCountSC =
00304     SE->getAddExpr(BECount, SE->getConstant(BECount->getType(), 1));
00305   if (isa<SCEVCouldNotCompute>(TripCountSC))
00306     return false;
00307 
00308   // We only handle cases when the unroll factor is a power of 2.
00309   // Count is the loop unroll factor, the number of extra copies added + 1.
00310   if ((Count & (Count-1)) != 0)
00311     return false;
00312 
00313   // If this loop is nested, then the loop unroller changes the code in
00314   // parent loop, so the Scalar Evolution pass needs to be run again
00315   if (Loop *ParentLoop = L->getParentLoop())
00316     SE->forgetLoop(ParentLoop);
00317 
00318   BasicBlock *PH = L->getLoopPreheader();
00319   BasicBlock *Header = L->getHeader();
00320   BasicBlock *Latch = L->getLoopLatch();
00321   // It helps to split the original preheader twice, once for the end of the
00322   // prolog code and once for a new loop preheader
00323   BasicBlock *PEnd = SplitEdge(PH, Header, LPM->getAsPass());
00324   BasicBlock *NewPH = SplitBlock(PEnd, PEnd->getTerminator(), LPM->getAsPass());
00325   BranchInst *PreHeaderBR = cast<BranchInst>(PH->getTerminator());
00326 
00327   // Compute the number of extra iterations required, which is:
00328   //  extra iterations = run-time trip count % loop unroll factor
00329   SCEVExpander Expander(*SE, "loop-unroll");
00330   Value *TripCount = Expander.expandCodeFor(TripCountSC, TripCountSC->getType(),
00331                                             PreHeaderBR);
00332 
00333   IRBuilder<> B(PreHeaderBR);
00334   Value *ModVal = B.CreateAnd(TripCount, Count - 1, "xtraiter");
00335 
00336   // Check for no extra iterations; if so, jump to the cloned/unrolled loop.
00337   // We have to check that the trip count computation didn't overflow when
00338   // adding one to the backedge taken count.
00339   Value *LCmp = B.CreateIsNotNull(ModVal, "lcmp.mod");
00340   Value *OverflowCheck = B.CreateIsNull(TripCount, "lcmp.overflow");
00341   Value *BranchVal = B.CreateOr(OverflowCheck, LCmp, "lcmp.or");
00342 
00343   // Branch to either the extra iterations or the cloned/unrolled loop
00344   // We will fix up the true branch label when adding loop body copies
00345   BranchInst::Create(PEnd, PEnd, BranchVal, PreHeaderBR);
00346   assert(PreHeaderBR->isUnconditional() &&
00347          PreHeaderBR->getSuccessor(0) == PEnd &&
00348          "CFG edges in Preheader are not correct");
00349   PreHeaderBR->eraseFromParent();
00350   Function *F = Header->getParent();
00351   // Get an ordered list of blocks in the loop to help with the ordering of the
00352   // cloned blocks in the prolog code
00353   LoopBlocksDFS LoopBlocks(L);
00354   LoopBlocks.perform(LI);
00355 
00356   //
00357   // For each extra loop iteration, create a copy of the loop's basic blocks
00358   // and generate a condition that branches to the copy depending on the
00359   // number of 'left over' iterations.
00360   //
00361   std::vector<BasicBlock *> NewBlocks;
00362   ValueToValueMapTy VMap;
00363 
00364   // If unroll count is 2 and we can't overflow in tripcount computation (which
00365   // is BECount + 1), then we don't need a loop for prologue, and we can unroll
00366   // it. We can be sure that we don't overflow only if tripcount is a constant.
00367   bool UnrollPrologue = (Count == 2 && isa<ConstantInt>(TripCount));
00368 
00369   // Clone all the basic blocks in the loop. If UnrollPrologue is set, we
00370   // don't clone the loop structure, otherwise we create a cloned loop to
00371   // execute the extra iterations. This function adds the CFG connections.
00372   CloneLoopBlocks(L, ModVal, UnrollPrologue, PH, PEnd, NewBlocks, LoopBlocks,
00373                   VMap, LI);
00374 
00375   // Insert the cloned blocks into function just before the original loop
00376   F->getBasicBlockList().splice(PEnd, F->getBasicBlockList(), NewBlocks[0],
00377                                 F->end());
00378 
00379   // Rewrite the cloned instruction operands to use the values
00380   // created when the clone is created.
00381   for (unsigned i = 0, e = NewBlocks.size(); i != e; ++i) {
00382     for (BasicBlock::iterator I = NewBlocks[i]->begin(),
00383                               E = NewBlocks[i]->end();
00384          I != E; ++I) {
00385       RemapInstruction(I, VMap,
00386                        RF_NoModuleLevelChanges | RF_IgnoreMissingEntries);
00387     }
00388   }
00389 
00390   // Connect the prolog code to the original loop and update the
00391   // PHI functions.
00392   BasicBlock *LastLoopBB = cast<BasicBlock>(VMap[Latch]);
00393   ConnectProlog(L, TripCount, Count, LastLoopBB, PEnd, PH, NewPH, VMap,
00394                 LPM->getAsPass());
00395   NumRuntimeUnrolled++;
00396   return true;
00397 }