LLVM  9.0.0svn
LoopUnrollAndJam.cpp
Go to the documentation of this file.
1 //===-- LoopUnrollAndJam.cpp - Loop unrolling utilities -------------------===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 // This file implements loop unroll and jam as a routine, much like
10 // LoopUnroll.cpp implements loop unroll.
11 //
12 //===----------------------------------------------------------------------===//
13 
14 #include "llvm/ADT/SmallPtrSet.h"
15 #include "llvm/ADT/Statistic.h"
21 #include "llvm/Analysis/LoopPass.h"
26 #include "llvm/IR/BasicBlock.h"
27 #include "llvm/IR/DataLayout.h"
29 #include "llvm/IR/Dominators.h"
30 #include "llvm/IR/IntrinsicInst.h"
31 #include "llvm/IR/LLVMContext.h"
32 #include "llvm/Support/Debug.h"
40 using namespace llvm;
41 
42 #define DEBUG_TYPE "loop-unroll-and-jam"
43 
44 STATISTIC(NumUnrolledAndJammed, "Number of loops unroll and jammed");
45 STATISTIC(NumCompletelyUnrolledAndJammed, "Number of loops unroll and jammed");
46 
48 
49 // Partition blocks in an outer/inner loop pair into blocks before and after
50 // the loop
51 static bool partitionOuterLoopBlocks(Loop *L, Loop *SubLoop,
52  BasicBlockSet &ForeBlocks,
53  BasicBlockSet &SubLoopBlocks,
54  BasicBlockSet &AftBlocks,
55  DominatorTree *DT) {
56  BasicBlock *SubLoopLatch = SubLoop->getLoopLatch();
57  SubLoopBlocks.insert(SubLoop->block_begin(), SubLoop->block_end());
58 
59  for (BasicBlock *BB : L->blocks()) {
60  if (!SubLoop->contains(BB)) {
61  if (DT->dominates(SubLoopLatch, BB))
62  AftBlocks.insert(BB);
63  else
64  ForeBlocks.insert(BB);
65  }
66  }
67 
68  // Check that all blocks in ForeBlocks together dominate the subloop
69  // TODO: This might ideally be done better with a dominator/postdominators.
70  BasicBlock *SubLoopPreHeader = SubLoop->getLoopPreheader();
71  for (BasicBlock *BB : ForeBlocks) {
72  if (BB == SubLoopPreHeader)
73  continue;
74  Instruction *TI = BB->getTerminator();
75  for (unsigned i = 0, e = TI->getNumSuccessors(); i != e; ++i)
76  if (!ForeBlocks.count(TI->getSuccessor(i)))
77  return false;
78  }
79 
80  return true;
81 }
82 
83 // Looks at the phi nodes in Header for values coming from Latch. For these
84 // instructions and all their operands calls Visit on them, keeping going for
85 // all the operands in AftBlocks. Returns false if Visit returns false,
86 // otherwise returns true. This is used to process the instructions in the
87 // Aft blocks that need to be moved before the subloop. It is used in two
88 // places. One to check that the required set of instructions can be moved
89 // before the loop. Then to collect the instructions to actually move in
90 // moveHeaderPhiOperandsToForeBlocks.
91 template <typename T>
92 static bool processHeaderPhiOperands(BasicBlock *Header, BasicBlock *Latch,
93  BasicBlockSet &AftBlocks, T Visit) {
95  for (auto &Phi : Header->phis()) {
96  Value *V = Phi.getIncomingValueForBlock(Latch);
97  if (Instruction *I = dyn_cast<Instruction>(V))
98  Worklist.push_back(I);
99  }
100 
101  while (!Worklist.empty()) {
102  Instruction *I = Worklist.back();
103  Worklist.pop_back();
104  if (!Visit(I))
105  return false;
106 
107  if (AftBlocks.count(I->getParent()))
108  for (auto &U : I->operands())
109  if (Instruction *II = dyn_cast<Instruction>(U))
110  Worklist.push_back(II);
111  }
112 
113  return true;
114 }
115 
116 // Move the phi operands of Header from Latch out of AftBlocks to InsertLoc.
118  BasicBlock *Latch,
119  Instruction *InsertLoc,
120  BasicBlockSet &AftBlocks) {
121  // We need to ensure we move the instructions in the correct order,
122  // starting with the earliest required instruction and moving forward.
123  std::vector<Instruction *> Visited;
124  processHeaderPhiOperands(Header, Latch, AftBlocks,
125  [&Visited, &AftBlocks](Instruction *I) {
126  if (AftBlocks.count(I->getParent()))
127  Visited.push_back(I);
128  return true;
129  });
130 
131  // Move all instructions in program order to before the InsertLoc
132  BasicBlock *InsertLocBB = InsertLoc->getParent();
133  for (Instruction *I : reverse(Visited)) {
134  if (I->getParent() != InsertLocBB)
135  I->moveBefore(InsertLoc);
136  }
137 }
138 
139 /*
140  This method performs Unroll and Jam. For a simple loop like:
141  for (i = ..)
142  Fore(i)
143  for (j = ..)
144  SubLoop(i, j)
145  Aft(i)
146 
147  Instead of doing normal inner or outer unrolling, we do:
148  for (i = .., i+=2)
149  Fore(i)
150  Fore(i+1)
151  for (j = ..)
152  SubLoop(i, j)
153  SubLoop(i+1, j)
154  Aft(i)
155  Aft(i+1)
156 
157  So the outer loop is essetially unrolled and then the inner loops are fused
158  ("jammed") together into a single loop. This can increase speed when there
159  are loads in SubLoop that are invariant to i, as they become shared between
160  the now jammed inner loops.
161 
162  We do this by spliting the blocks in the loop into Fore, Subloop and Aft.
163  Fore blocks are those before the inner loop, Aft are those after. Normal
164  Unroll code is used to copy each of these sets of blocks and the results are
165  combined together into the final form above.
166 
167  isSafeToUnrollAndJam should be used prior to calling this to make sure the
168  unrolling will be valid. Checking profitablility is also advisable.
169 
170  If EpilogueLoop is non-null, it receives the epilogue loop (if it was
171  necessary to create one and not fully unrolled).
172 */
174  Loop *L, unsigned Count, unsigned TripCount, unsigned TripMultiple,
175  bool UnrollRemainder, LoopInfo *LI, ScalarEvolution *SE, DominatorTree *DT,
176  AssumptionCache *AC, OptimizationRemarkEmitter *ORE, Loop **EpilogueLoop) {
177 
178  // When we enter here we should have already checked that it is safe
179  BasicBlock *Header = L->getHeader();
180  assert(L->getSubLoops().size() == 1);
181  Loop *SubLoop = *L->begin();
182 
183  // Don't enter the unroll code if there is nothing to do.
184  if (TripCount == 0 && Count < 2) {
185  LLVM_DEBUG(dbgs() << "Won't unroll-and-jam; almost nothing to do\n");
187  }
188 
189  assert(Count > 0);
190  assert(TripMultiple > 0);
191  assert(TripCount == 0 || TripCount % TripMultiple == 0);
192 
193  // Are we eliminating the loop control altogether?
194  bool CompletelyUnroll = (Count == TripCount);
195 
196  // We use the runtime remainder in cases where we don't know trip multiple
197  if (TripMultiple == 1 || TripMultiple % Count != 0) {
198  if (!UnrollRuntimeLoopRemainder(L, Count, /*AllowExpensiveTripCount*/ false,
199  /*UseEpilogRemainder*/ true,
200  UnrollRemainder, /*ForgetAllSCEV*/ false,
201  LI, SE, DT, AC, true, EpilogueLoop)) {
202  LLVM_DEBUG(dbgs() << "Won't unroll-and-jam; remainder loop could not be "
203  "generated when assuming runtime trip count\n");
205  }
206  }
207 
208  // Notify ScalarEvolution that the loop will be substantially changed,
209  // if not outright eliminated.
210  if (SE) {
211  SE->forgetLoop(L);
212  SE->forgetLoop(SubLoop);
213  }
214 
215  using namespace ore;
216  // Report the unrolling decision.
217  if (CompletelyUnroll) {
218  LLVM_DEBUG(dbgs() << "COMPLETELY UNROLL AND JAMMING loop %"
219  << Header->getName() << " with trip count " << TripCount
220  << "!\n");
221  ORE->emit(OptimizationRemark(DEBUG_TYPE, "FullyUnrolled", L->getStartLoc(),
222  L->getHeader())
223  << "completely unroll and jammed loop with "
224  << NV("UnrollCount", TripCount) << " iterations");
225  } else {
226  auto DiagBuilder = [&]() {
227  OptimizationRemark Diag(DEBUG_TYPE, "PartialUnrolled", L->getStartLoc(),
228  L->getHeader());
229  return Diag << "unroll and jammed loop by a factor of "
230  << NV("UnrollCount", Count);
231  };
232 
233  LLVM_DEBUG(dbgs() << "UNROLL AND JAMMING loop %" << Header->getName()
234  << " by " << Count);
235  if (TripMultiple != 1) {
236  LLVM_DEBUG(dbgs() << " with " << TripMultiple << " trips per branch");
237  ORE->emit([&]() {
238  return DiagBuilder() << " with " << NV("TripMultiple", TripMultiple)
239  << " trips per branch";
240  });
241  } else {
242  LLVM_DEBUG(dbgs() << " with run-time trip count");
243  ORE->emit([&]() { return DiagBuilder() << " with run-time trip count"; });
244  }
245  LLVM_DEBUG(dbgs() << "!\n");
246  }
247 
248  BasicBlock *Preheader = L->getLoopPreheader();
249  BasicBlock *LatchBlock = L->getLoopLatch();
250  BranchInst *BI = dyn_cast<BranchInst>(LatchBlock->getTerminator());
251  assert(Preheader && LatchBlock && Header);
252  assert(BI && !BI->isUnconditional());
253  bool ContinueOnTrue = L->contains(BI->getSuccessor(0));
254  BasicBlock *LoopExit = BI->getSuccessor(ContinueOnTrue);
255  bool SubLoopContinueOnTrue = SubLoop->contains(
256  SubLoop->getLoopLatch()->getTerminator()->getSuccessor(0));
257 
258  // Partition blocks in an outer/inner loop pair into blocks before and after
259  // the loop
260  BasicBlockSet SubLoopBlocks;
261  BasicBlockSet ForeBlocks;
262  BasicBlockSet AftBlocks;
263  partitionOuterLoopBlocks(L, SubLoop, ForeBlocks, SubLoopBlocks, AftBlocks,
264  DT);
265 
266  // We keep track of the entering/first and exiting/last block of each of
267  // Fore/SubLoop/Aft in each iteration. This helps make the stapling up of
268  // blocks easier.
269  std::vector<BasicBlock *> ForeBlocksFirst;
270  std::vector<BasicBlock *> ForeBlocksLast;
271  std::vector<BasicBlock *> SubLoopBlocksFirst;
272  std::vector<BasicBlock *> SubLoopBlocksLast;
273  std::vector<BasicBlock *> AftBlocksFirst;
274  std::vector<BasicBlock *> AftBlocksLast;
275  ForeBlocksFirst.push_back(Header);
276  ForeBlocksLast.push_back(SubLoop->getLoopPreheader());
277  SubLoopBlocksFirst.push_back(SubLoop->getHeader());
278  SubLoopBlocksLast.push_back(SubLoop->getExitingBlock());
279  AftBlocksFirst.push_back(SubLoop->getExitBlock());
280  AftBlocksLast.push_back(L->getExitingBlock());
281  // Maps Blocks[0] -> Blocks[It]
282  ValueToValueMapTy LastValueMap;
283 
284  // Move any instructions from fore phi operands from AftBlocks into Fore.
286  Header, LatchBlock, SubLoop->getLoopPreheader()->getTerminator(),
287  AftBlocks);
288 
289  // The current on-the-fly SSA update requires blocks to be processed in
290  // reverse postorder so that LastValueMap contains the correct value at each
291  // exit.
292  LoopBlocksDFS DFS(L);
293  DFS.perform(LI);
294  // Stash the DFS iterators before adding blocks to the loop.
295  LoopBlocksDFS::RPOIterator BlockBegin = DFS.beginRPO();
296  LoopBlocksDFS::RPOIterator BlockEnd = DFS.endRPO();
297 
298  if (Header->getParent()->isDebugInfoForProfiling())
299  for (BasicBlock *BB : L->getBlocks())
300  for (Instruction &I : *BB)
301  if (!isa<DbgInfoIntrinsic>(&I))
302  if (const DILocation *DIL = I.getDebugLoc()) {
303  auto NewDIL = DIL->cloneByMultiplyingDuplicationFactor(Count);
304  if (NewDIL)
305  I.setDebugLoc(NewDIL.getValue());
306  else
307  LLVM_DEBUG(dbgs()
308  << "Failed to create new discriminator: "
309  << DIL->getFilename() << " Line: " << DIL->getLine());
310  }
311 
312  // Copy all blocks
313  for (unsigned It = 1; It != Count; ++It) {
314  std::vector<BasicBlock *> NewBlocks;
315  // Maps Blocks[It] -> Blocks[It-1]
316  DenseMap<Value *, Value *> PrevItValueMap;
317 
318  for (LoopBlocksDFS::RPOIterator BB = BlockBegin; BB != BlockEnd; ++BB) {
319  ValueToValueMapTy VMap;
320  BasicBlock *New = CloneBasicBlock(*BB, VMap, "." + Twine(It));
321  Header->getParent()->getBasicBlockList().push_back(New);
322 
323  if (ForeBlocks.count(*BB)) {
324  L->addBasicBlockToLoop(New, *LI);
325 
326  if (*BB == ForeBlocksFirst[0])
327  ForeBlocksFirst.push_back(New);
328  if (*BB == ForeBlocksLast[0])
329  ForeBlocksLast.push_back(New);
330  } else if (SubLoopBlocks.count(*BB)) {
331  SubLoop->addBasicBlockToLoop(New, *LI);
332 
333  if (*BB == SubLoopBlocksFirst[0])
334  SubLoopBlocksFirst.push_back(New);
335  if (*BB == SubLoopBlocksLast[0])
336  SubLoopBlocksLast.push_back(New);
337  } else if (AftBlocks.count(*BB)) {
338  L->addBasicBlockToLoop(New, *LI);
339 
340  if (*BB == AftBlocksFirst[0])
341  AftBlocksFirst.push_back(New);
342  if (*BB == AftBlocksLast[0])
343  AftBlocksLast.push_back(New);
344  } else {
345  llvm_unreachable("BB being cloned should be in Fore/Sub/Aft");
346  }
347 
348  // Update our running maps of newest clones
349  PrevItValueMap[New] = (It == 1 ? *BB : LastValueMap[*BB]);
350  LastValueMap[*BB] = New;
351  for (ValueToValueMapTy::iterator VI = VMap.begin(), VE = VMap.end();
352  VI != VE; ++VI) {
353  PrevItValueMap[VI->second] =
354  const_cast<Value *>(It == 1 ? VI->first : LastValueMap[VI->first]);
355  LastValueMap[VI->first] = VI->second;
356  }
357 
358  NewBlocks.push_back(New);
359 
360  // Update DomTree:
361  if (*BB == ForeBlocksFirst[0])
362  DT->addNewBlock(New, ForeBlocksLast[It - 1]);
363  else if (*BB == SubLoopBlocksFirst[0])
364  DT->addNewBlock(New, SubLoopBlocksLast[It - 1]);
365  else if (*BB == AftBlocksFirst[0])
366  DT->addNewBlock(New, AftBlocksLast[It - 1]);
367  else {
368  // Each set of blocks (Fore/Sub/Aft) will have the same internal domtree
369  // structure.
370  auto BBDomNode = DT->getNode(*BB);
371  auto BBIDom = BBDomNode->getIDom();
372  BasicBlock *OriginalBBIDom = BBIDom->getBlock();
373  assert(OriginalBBIDom);
374  assert(LastValueMap[cast<Value>(OriginalBBIDom)]);
375  DT->addNewBlock(
376  New, cast<BasicBlock>(LastValueMap[cast<Value>(OriginalBBIDom)]));
377  }
378  }
379 
380  // Remap all instructions in the most recent iteration
381  for (BasicBlock *NewBlock : NewBlocks) {
382  for (Instruction &I : *NewBlock) {
383  ::remapInstruction(&I, LastValueMap);
384  if (auto *II = dyn_cast<IntrinsicInst>(&I))
385  if (II->getIntrinsicID() == Intrinsic::assume)
386  AC->registerAssumption(II);
387  }
388  }
389 
390  // Alter the ForeBlocks phi's, pointing them at the latest version of the
391  // value from the previous iteration's phis
392  for (PHINode &Phi : ForeBlocksFirst[It]->phis()) {
393  Value *OldValue = Phi.getIncomingValueForBlock(AftBlocksLast[It]);
394  assert(OldValue && "should have incoming edge from Aft[It]");
395  Value *NewValue = OldValue;
396  if (Value *PrevValue = PrevItValueMap[OldValue])
397  NewValue = PrevValue;
398 
399  assert(Phi.getNumOperands() == 2);
400  Phi.setIncomingBlock(0, ForeBlocksLast[It - 1]);
401  Phi.setIncomingValue(0, NewValue);
402  Phi.removeIncomingValue(1);
403  }
404  }
405 
406  // Now that all the basic blocks for the unrolled iterations are in place,
407  // finish up connecting the blocks and phi nodes. At this point LastValueMap
408  // is the last unrolled iterations values.
409 
410  // Update Phis in BB from OldBB to point to NewBB
411  auto updatePHIBlocks = [](BasicBlock *BB, BasicBlock *OldBB,
412  BasicBlock *NewBB) {
413  for (PHINode &Phi : BB->phis()) {
414  int I = Phi.getBasicBlockIndex(OldBB);
415  Phi.setIncomingBlock(I, NewBB);
416  }
417  };
418  // Update Phis in BB from OldBB to point to NewBB and use the latest value
419  // from LastValueMap
420  auto updatePHIBlocksAndValues = [](BasicBlock *BB, BasicBlock *OldBB,
421  BasicBlock *NewBB,
422  ValueToValueMapTy &LastValueMap) {
423  for (PHINode &Phi : BB->phis()) {
424  for (unsigned b = 0; b < Phi.getNumIncomingValues(); ++b) {
425  if (Phi.getIncomingBlock(b) == OldBB) {
426  Value *OldValue = Phi.getIncomingValue(b);
427  if (Value *LastValue = LastValueMap[OldValue])
428  Phi.setIncomingValue(b, LastValue);
429  Phi.setIncomingBlock(b, NewBB);
430  break;
431  }
432  }
433  }
434  };
435  // Move all the phis from Src into Dest
436  auto movePHIs = [](BasicBlock *Src, BasicBlock *Dest) {
437  Instruction *insertPoint = Dest->getFirstNonPHI();
438  while (PHINode *Phi = dyn_cast<PHINode>(Src->begin()))
439  Phi->moveBefore(insertPoint);
440  };
441 
442  // Update the PHI values outside the loop to point to the last block
443  updatePHIBlocksAndValues(LoopExit, AftBlocksLast[0], AftBlocksLast.back(),
444  LastValueMap);
445 
446  // Update ForeBlocks successors and phi nodes
447  BranchInst *ForeTerm =
448  cast<BranchInst>(ForeBlocksLast.back()->getTerminator());
449  BasicBlock *Dest = SubLoopBlocksFirst[0];
450  ForeTerm->setSuccessor(0, Dest);
451 
452  if (CompletelyUnroll) {
453  while (PHINode *Phi = dyn_cast<PHINode>(ForeBlocksFirst[0]->begin())) {
454  Phi->replaceAllUsesWith(Phi->getIncomingValueForBlock(Preheader));
455  Phi->getParent()->getInstList().erase(Phi);
456  }
457  } else {
458  // Update the PHI values to point to the last aft block
459  updatePHIBlocksAndValues(ForeBlocksFirst[0], AftBlocksLast[0],
460  AftBlocksLast.back(), LastValueMap);
461  }
462 
463  for (unsigned It = 1; It != Count; It++) {
464  // Remap ForeBlock successors from previous iteration to this
465  BranchInst *ForeTerm =
466  cast<BranchInst>(ForeBlocksLast[It - 1]->getTerminator());
467  BasicBlock *Dest = ForeBlocksFirst[It];
468  ForeTerm->setSuccessor(0, Dest);
469  }
470 
471  // Subloop successors and phis
472  BranchInst *SubTerm =
473  cast<BranchInst>(SubLoopBlocksLast.back()->getTerminator());
474  SubTerm->setSuccessor(!SubLoopContinueOnTrue, SubLoopBlocksFirst[0]);
475  SubTerm->setSuccessor(SubLoopContinueOnTrue, AftBlocksFirst[0]);
476  updatePHIBlocks(SubLoopBlocksFirst[0], ForeBlocksLast[0],
477  ForeBlocksLast.back());
478  updatePHIBlocks(SubLoopBlocksFirst[0], SubLoopBlocksLast[0],
479  SubLoopBlocksLast.back());
480 
481  for (unsigned It = 1; It != Count; It++) {
482  // Replace the conditional branch of the previous iteration subloop with an
483  // unconditional one to this one
484  BranchInst *SubTerm =
485  cast<BranchInst>(SubLoopBlocksLast[It - 1]->getTerminator());
486  BranchInst::Create(SubLoopBlocksFirst[It], SubTerm);
487  SubTerm->eraseFromParent();
488 
489  updatePHIBlocks(SubLoopBlocksFirst[It], ForeBlocksLast[It],
490  ForeBlocksLast.back());
491  updatePHIBlocks(SubLoopBlocksFirst[It], SubLoopBlocksLast[It],
492  SubLoopBlocksLast.back());
493  movePHIs(SubLoopBlocksFirst[It], SubLoopBlocksFirst[0]);
494  }
495 
496  // Aft blocks successors and phis
497  BranchInst *Term = cast<BranchInst>(AftBlocksLast.back()->getTerminator());
498  if (CompletelyUnroll) {
499  BranchInst::Create(LoopExit, Term);
500  Term->eraseFromParent();
501  } else {
502  Term->setSuccessor(!ContinueOnTrue, ForeBlocksFirst[0]);
503  }
504  updatePHIBlocks(AftBlocksFirst[0], SubLoopBlocksLast[0],
505  SubLoopBlocksLast.back());
506 
507  for (unsigned It = 1; It != Count; It++) {
508  // Replace the conditional branch of the previous iteration subloop with an
509  // unconditional one to this one
510  BranchInst *AftTerm =
511  cast<BranchInst>(AftBlocksLast[It - 1]->getTerminator());
512  BranchInst::Create(AftBlocksFirst[It], AftTerm);
513  AftTerm->eraseFromParent();
514 
515  updatePHIBlocks(AftBlocksFirst[It], SubLoopBlocksLast[It],
516  SubLoopBlocksLast.back());
517  movePHIs(AftBlocksFirst[It], AftBlocksFirst[0]);
518  }
519 
520  // Dominator Tree. Remove the old links between Fore, Sub and Aft, adding the
521  // new ones required.
522  if (Count != 1) {
524  DTUpdates.emplace_back(DominatorTree::UpdateKind::Delete, ForeBlocksLast[0],
525  SubLoopBlocksFirst[0]);
527  SubLoopBlocksLast[0], AftBlocksFirst[0]);
528 
530  ForeBlocksLast.back(), SubLoopBlocksFirst[0]);
532  SubLoopBlocksLast.back(), AftBlocksFirst[0]);
533  DT->applyUpdates(DTUpdates);
534  }
535 
536  // Merge adjacent basic blocks, if possible.
537  SmallPtrSet<BasicBlock *, 16> MergeBlocks;
538  MergeBlocks.insert(ForeBlocksLast.begin(), ForeBlocksLast.end());
539  MergeBlocks.insert(SubLoopBlocksLast.begin(), SubLoopBlocksLast.end());
540  MergeBlocks.insert(AftBlocksLast.begin(), AftBlocksLast.end());
542  while (!MergeBlocks.empty()) {
543  BasicBlock *BB = *MergeBlocks.begin();
545  if (Term && Term->isUnconditional() && L->contains(Term->getSuccessor(0))) {
546  BasicBlock *Dest = Term->getSuccessor(0);
547  BasicBlock *Fold = Dest->getUniquePredecessor();
548  if (MergeBlockIntoPredecessor(Dest, &DTU, LI)) {
549  // Don't remove BB and add Fold as they are the same BB
550  assert(Fold == BB);
551  (void)Fold;
552  MergeBlocks.erase(Dest);
553  } else
554  MergeBlocks.erase(BB);
555  } else
556  MergeBlocks.erase(BB);
557  }
558 
559  // At this point, the code is well formed. We now do a quick sweep over the
560  // inserted code, doing constant propagation and dead code elimination as we
561  // go.
562  simplifyLoopAfterUnroll(SubLoop, true, LI, SE, DT, AC);
563  simplifyLoopAfterUnroll(L, !CompletelyUnroll && Count > 1, LI, SE, DT, AC);
564 
565  NumCompletelyUnrolledAndJammed += CompletelyUnroll;
566  ++NumUnrolledAndJammed;
567 
568 #ifndef NDEBUG
569  // We shouldn't have done anything to break loop simplify form or LCSSA.
570  Loop *OuterL = L->getParentLoop();
571  Loop *OutestLoop = OuterL ? OuterL : (!CompletelyUnroll ? L : SubLoop);
572  assert(OutestLoop->isRecursivelyLCSSAForm(*DT, *LI));
573  if (!CompletelyUnroll)
575  assert(SubLoop->isLoopSimplifyForm());
576  assert(DT->verify());
577 #endif
578 
579  // Update LoopInfo if the loop is completely removed.
580  if (CompletelyUnroll)
581  LI->erase(L);
582 
583  return CompletelyUnroll ? LoopUnrollResult::FullyUnrolled
585 }
586 
587 static bool getLoadsAndStores(BasicBlockSet &Blocks,
588  SmallVector<Value *, 4> &MemInstr) {
589  // Scan the BBs and collect legal loads and stores.
590  // Returns false if non-simple loads/stores are found.
591  for (BasicBlock *BB : Blocks) {
592  for (Instruction &I : *BB) {
593  if (auto *Ld = dyn_cast<LoadInst>(&I)) {
594  if (!Ld->isSimple())
595  return false;
596  MemInstr.push_back(&I);
597  } else if (auto *St = dyn_cast<StoreInst>(&I)) {
598  if (!St->isSimple())
599  return false;
600  MemInstr.push_back(&I);
601  } else if (I.mayReadOrWriteMemory()) {
602  return false;
603  }
604  }
605  }
606  return true;
607 }
608 
611  unsigned LoopDepth, bool InnerLoop,
612  DependenceInfo &DI) {
613  // Use DA to check for dependencies between loads and stores that make unroll
614  // and jam invalid
615  for (Value *I : Earlier) {
616  for (Value *J : Later) {
617  Instruction *Src = cast<Instruction>(I);
618  Instruction *Dst = cast<Instruction>(J);
619  if (Src == Dst)
620  continue;
621  // Ignore Input dependencies.
622  if (isa<LoadInst>(Src) && isa<LoadInst>(Dst))
623  continue;
624 
625  // Track dependencies, and if we find them take a conservative approach
626  // by allowing only = or < (not >), altough some > would be safe
627  // (depending upon unroll width).
628  // For the inner loop, we need to disallow any (> <) dependencies
629  // FIXME: Allow > so long as distance is less than unroll width
630  if (auto D = DI.depends(Src, Dst, true)) {
631  assert(D->isOrdered() && "Expected an output, flow or anti dep.");
632 
633  if (D->isConfused()) {
634  LLVM_DEBUG(dbgs() << " Confused dependency between:\n"
635  << " " << *Src << "\n"
636  << " " << *Dst << "\n");
637  return false;
638  }
639  if (!InnerLoop) {
640  if (D->getDirection(LoopDepth) & Dependence::DVEntry::GT) {
641  LLVM_DEBUG(dbgs() << " > dependency between:\n"
642  << " " << *Src << "\n"
643  << " " << *Dst << "\n");
644  return false;
645  }
646  } else {
647  assert(LoopDepth + 1 <= D->getLevels());
648  if (D->getDirection(LoopDepth) & Dependence::DVEntry::GT &&
649  D->getDirection(LoopDepth + 1) & Dependence::DVEntry::LT) {
650  LLVM_DEBUG(dbgs() << " < > dependency between:\n"
651  << " " << *Src << "\n"
652  << " " << *Dst << "\n");
653  return false;
654  }
655  }
656  }
657  }
658  }
659  return true;
660 }
661 
662 static bool checkDependencies(Loop *L, BasicBlockSet &ForeBlocks,
663  BasicBlockSet &SubLoopBlocks,
664  BasicBlockSet &AftBlocks, DependenceInfo &DI) {
665  // Get all loads/store pairs for each blocks
666  SmallVector<Value *, 4> ForeMemInstr;
667  SmallVector<Value *, 4> SubLoopMemInstr;
668  SmallVector<Value *, 4> AftMemInstr;
669  if (!getLoadsAndStores(ForeBlocks, ForeMemInstr) ||
670  !getLoadsAndStores(SubLoopBlocks, SubLoopMemInstr) ||
671  !getLoadsAndStores(AftBlocks, AftMemInstr))
672  return false;
673 
674  // Check for dependencies between any blocks that may change order
675  unsigned LoopDepth = L->getLoopDepth();
676  return checkDependencies(ForeMemInstr, SubLoopMemInstr, LoopDepth, false,
677  DI) &&
678  checkDependencies(ForeMemInstr, AftMemInstr, LoopDepth, false, DI) &&
679  checkDependencies(SubLoopMemInstr, AftMemInstr, LoopDepth, false,
680  DI) &&
681  checkDependencies(SubLoopMemInstr, SubLoopMemInstr, LoopDepth, true,
682  DI);
683 }
684 
686  DependenceInfo &DI) {
687  /* We currently handle outer loops like this:
688  |
689  ForeFirst <----\ }
690  Blocks | } ForeBlocks
691  ForeLast | }
692  | |
693  SubLoopFirst <\ | }
694  Blocks | | } SubLoopBlocks
695  SubLoopLast -/ | }
696  | |
697  AftFirst | }
698  Blocks | } AftBlocks
699  AftLast ------/ }
700  |
701 
702  There are (theoretically) any number of blocks in ForeBlocks, SubLoopBlocks
703  and AftBlocks, providing that there is one edge from Fores to SubLoops,
704  one edge from SubLoops to Afts and a single outer loop exit (from Afts).
705  In practice we currently limit Aft blocks to a single block, and limit
706  things further in the profitablility checks of the unroll and jam pass.
707 
708  Because of the way we rearrange basic blocks, we also require that
709  the Fore blocks on all unrolled iterations are safe to move before the
710  SubLoop blocks of all iterations. So we require that the phi node looping
711  operands of ForeHeader can be moved to at least the end of ForeEnd, so that
712  we can arrange cloned Fore Blocks before the subloop and match up Phi's
713  correctly.
714 
715  i.e. The old order of blocks used to be F1 S1_1 S1_2 A1 F2 S2_1 S2_2 A2.
716  It needs to be safe to tranform this to F1 F2 S1_1 S2_1 S1_2 S2_2 A1 A2.
717 
718  There are then a number of checks along the lines of no calls, no
719  exceptions, inner loop IV is consistent, etc. Note that for loops requiring
720  runtime unrolling, UnrollRuntimeLoopRemainder can also fail in
721  UnrollAndJamLoop if the trip count cannot be easily calculated.
722  */
723 
724  if (!L->isLoopSimplifyForm() || L->getSubLoops().size() != 1)
725  return false;
726  Loop *SubLoop = L->getSubLoops()[0];
727  if (!SubLoop->isLoopSimplifyForm())
728  return false;
729 
730  BasicBlock *Header = L->getHeader();
731  BasicBlock *Latch = L->getLoopLatch();
732  BasicBlock *Exit = L->getExitingBlock();
733  BasicBlock *SubLoopHeader = SubLoop->getHeader();
734  BasicBlock *SubLoopLatch = SubLoop->getLoopLatch();
735  BasicBlock *SubLoopExit = SubLoop->getExitingBlock();
736 
737  if (Latch != Exit)
738  return false;
739  if (SubLoopLatch != SubLoopExit)
740  return false;
741 
742  if (Header->hasAddressTaken() || SubLoopHeader->hasAddressTaken()) {
743  LLVM_DEBUG(dbgs() << "Won't unroll-and-jam; Address taken\n");
744  return false;
745  }
746 
747  // Split blocks into Fore/SubLoop/Aft based on dominators
748  BasicBlockSet SubLoopBlocks;
749  BasicBlockSet ForeBlocks;
750  BasicBlockSet AftBlocks;
751  if (!partitionOuterLoopBlocks(L, SubLoop, ForeBlocks, SubLoopBlocks,
752  AftBlocks, &DT)) {
753  LLVM_DEBUG(dbgs() << "Won't unroll-and-jam; Incompatible loop layout\n");
754  return false;
755  }
756 
757  // Aft blocks may need to move instructions to fore blocks, which becomes more
758  // difficult if there are multiple (potentially conditionally executed)
759  // blocks. For now we just exclude loops with multiple aft blocks.
760  if (AftBlocks.size() != 1) {
761  LLVM_DEBUG(dbgs() << "Won't unroll-and-jam; Can't currently handle "
762  "multiple blocks after the loop\n");
763  return false;
764  }
765 
766  // Check inner loop backedge count is consistent on all iterations of the
767  // outer loop
768  if (!hasIterationCountInvariantInParent(SubLoop, SE)) {
769  LLVM_DEBUG(dbgs() << "Won't unroll-and-jam; Inner loop iteration count is "
770  "not consistent on each iteration\n");
771  return false;
772  }
773 
774  // Check the loop safety info for exceptions.
776  LSI.computeLoopSafetyInfo(L);
777  if (LSI.anyBlockMayThrow()) {
778  LLVM_DEBUG(dbgs() << "Won't unroll-and-jam; Something may throw\n");
779  return false;
780  }
781 
782  // We've ruled out the easy stuff and now need to check that there are no
783  // interdependencies which may prevent us from moving the:
784  // ForeBlocks before Subloop and AftBlocks.
785  // Subloop before AftBlocks.
786  // ForeBlock phi operands before the subloop
787 
788  // Make sure we can move all instructions we need to before the subloop
790  Header, Latch, AftBlocks, [&AftBlocks, &SubLoop](Instruction *I) {
791  if (SubLoop->contains(I->getParent()))
792  return false;
793  if (AftBlocks.count(I->getParent())) {
794  // If we hit a phi node in afts we know we are done (probably
795  // LCSSA)
796  if (isa<PHINode>(I))
797  return false;
798  // Can't move instructions with side effects or memory
799  // reads/writes
800  if (I->mayHaveSideEffects() || I->mayReadOrWriteMemory())
801  return false;
802  }
803  // Keep going
804  return true;
805  })) {
806  LLVM_DEBUG(dbgs() << "Won't unroll-and-jam; can't move required "
807  "instructions after subloop to before it\n");
808  return false;
809  }
810 
811  // Check for memory dependencies which prohibit the unrolling we are doing.
812  // Because of the way we are unrolling Fore/Sub/Aft blocks, we need to check
813  // there are no dependencies between Fore-Sub, Fore-Aft, Sub-Aft and Sub-Sub.
814  if (!checkDependencies(L, ForeBlocks, SubLoopBlocks, AftBlocks, DI)) {
815  LLVM_DEBUG(dbgs() << "Won't unroll-and-jam; failed dependency check\n");
816  return false;
817  }
818 
819  return true;
820 }
virtual void computeLoopSafetyInfo(const Loop *CurLoop)
Computes safety information for a loop checks loop body & header for the possibility of may throw exc...
Definition: MustExecute.cpp:44
SymbolTableList< Instruction >::iterator eraseFromParent()
This method unlinks &#39;this&#39; from the containing basic block and deletes it.
Definition: Instruction.cpp:67
reference emplace_back(ArgTypes &&... Args)
Definition: SmallVector.h:641
BlockT * getLoopLatch() const
If there is a single latch block for this loop, return it.
Definition: LoopInfoImpl.h:244
const_iterator begin(StringRef path, Style style=Style::native)
Get begin iterator over path.
Definition: Path.cpp:224
DiagnosticInfoOptimizationBase::Argument NV
This class represents lattice values for constants.
Definition: AllocatorList.h:23
bool MergeBlockIntoPredecessor(BasicBlock *BB, DomTreeUpdater *DTU=nullptr, LoopInfo *LI=nullptr, MemorySSAUpdater *MSSAU=nullptr, MemoryDependenceResults *MemDep=nullptr)
Attempts to merge a block into its predecessor, if possible.
BasicBlock * getSuccessor(unsigned Idx) const
Return the specified successor. This instruction must be a terminator.
static bool getLoadsAndStores(BasicBlockSet &Blocks, SmallVector< Value *, 4 > &MemInstr)
bool isRecursivelyLCSSAForm(DominatorTree &DT, const LoopInfo &LI) const
Return true if this Loop and all inner subloops are in LCSSA form.
Definition: LoopInfo.cpp:416
unsigned getLoopDepth() const
Return the nesting level of this loop.
Definition: LoopInfo.h:94
The main scalar evolution driver.
BlockT * getLoopPreheader() const
If there is a preheader for this loop, return it.
Definition: LoopInfoImpl.h:193
A cache of @llvm.assume calls within a function.
static bool processHeaderPhiOperands(BasicBlock *Header, BasicBlock *Latch, BasicBlockSet &AftBlocks, T Visit)
BasicBlock * getSuccessor(unsigned i) const
bool hasIterationCountInvariantInParent(Loop *L, ScalarEvolution &SE)
Check inner loop (L) backedge count is known to be invariant on all iterations of its outer loop...
Definition: LoopUtils.cpp:664
STATISTIC(NumFunctions, "Total number of functions")
const Instruction * getTerminator() const LLVM_READONLY
Returns the terminator instruction if the block is well formed or null if the block is not well forme...
Definition: BasicBlock.cpp:137
DependenceInfo - This class is the main dependence-analysis driver.
static void moveHeaderPhiOperandsToForeBlocks(BasicBlock *Header, BasicBlock *Latch, Instruction *InsertLoc, BasicBlockSet &AftBlocks)
static bool checkDependencies(SmallVector< Value *, 4 > &Earlier, SmallVector< Value *, 4 > &Later, unsigned LoopDepth, bool InnerLoop, DependenceInfo &DI)
iterator begin()
Instruction iterator methods.
Definition: BasicBlock.h:268
Simple and conservative implementation of LoopSafetyInfo that can give false-positive answers to its ...
Definition: MustExecute.h:97
bool verify(VerificationLevel VL=VerificationLevel::Full) const
verify - checks if the tree is correct.
Twine - A lightweight data structure for efficiently representing the concatenation of temporary valu...
Definition: Twine.h:80
void erase(Loop *L)
Update LoopInfo after removing the last backedge from a loop.
Definition: LoopInfo.cpp:815
BlockT * getHeader() const
Definition: LoopInfo.h:102
auto reverse(ContainerTy &&C, typename std::enable_if< has_rbegin< ContainerTy >::value >::type *=nullptr) -> decltype(make_range(C.rbegin(), C.rend()))
Definition: STLExtras.h:273
void addBasicBlockToLoop(BlockT *NewBB, LoopInfoBase< BlockT, LoopT > &LI)
This method is used by other analyses to update loop information.
Definition: LoopInfoImpl.h:270
const BasicBlock * getUniquePredecessor() const
Return the predecessor of this block if it has a unique predecessor block.
Definition: BasicBlock.cpp:246
bool isSafeToUnrollAndJam(Loop *L, ScalarEvolution &SE, DominatorTree &DT, DependenceInfo &DI)
This header provides classes for managing per-loop analyses.
void replaceAllUsesWith(Value *V)
Change all uses of this to point to a new Value.
Definition: Value.cpp:429
Debug location.
Concrete subclass of DominatorTreeBase that is used to compute a normal dominator tree...
Definition: Dominators.h:144
unsigned getNumSuccessors() const
Return the number of successors that this instruction has.
void remapInstruction(Instruction *I, ValueToValueMapTy &VMap)
Convert the instruction operands from referencing the current values into those specified by VMap...
Definition: LoopUnroll.cpp:67
The loop was fully unrolled into straight-line code.
void applyUpdates(ArrayRef< UpdateType > Updates)
Inform the dominator tree about a sequence of CFG edge insertions and deletions and perform a batch u...
SmallPtrSet< BasicBlock *, 4 > BasicBlockSet
bool isDebugInfoForProfiling() const
Returns true if we should emit debug info for profiling.
Definition: Metadata.cpp:1508
std::unique_ptr< Dependence > depends(Instruction *Src, Instruction *Dst, bool PossiblyLoopIndependent)
depends - Tests for a dependence between the Src and Dst instructions.
LLVM Basic Block Representation.
Definition: BasicBlock.h:57
Conditional or Unconditional Branch instruction.
DomTreeNodeBase * getIDom() const
LLVM_NODISCARD bool empty() const
Definition: SmallPtrSet.h:91
static bool partitionOuterLoopBlocks(Loop *L, Loop *SubLoop, BasicBlockSet &ForeBlocks, BasicBlockSet &SubLoopBlocks, BasicBlockSet &AftBlocks, DominatorTree *DT)
std::pair< iterator, bool > insert(PtrType Ptr)
Inserts Ptr if and only if there is no element in the container equal to Ptr.
Definition: SmallPtrSet.h:370
bool mayHaveSideEffects() const
Return true if the instruction may have side effects.
Definition: Instruction.h:572
Diagnostic information for applied optimization remarks.
const Instruction & back() const
Definition: BasicBlock.h:282
std::vector< BasicBlock * >::const_reverse_iterator RPOIterator
Definition: LoopIterator.h:101
op_range operands()
Definition: User.h:237
size_type count(ConstPtrType Ptr) const
count - Return 1 if the specified pointer is in the set, 0 otherwise.
Definition: SmallPtrSet.h:381
The loop was partially unrolled – we still have a loop, but with a smaller trip count.
DebugLoc getStartLoc() const
Return the debug location of the start of this loop.
Definition: LoopInfo.cpp:569
DomTreeNodeBase< NodeT > * getNode(const NodeT *BB) const
getNode - return the (Post)DominatorTree node for the specified basic block.
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
void setSuccessor(unsigned idx, BasicBlock *NewSucc)
iterator end()
Definition: ValueMap.h:141
size_type size() const
Definition: SmallPtrSet.h:92
bool hasAddressTaken() const
Returns true if there are any uses of this basic block other than direct branches, switches, etc.
Definition: BasicBlock.h:391
bool contains(const LoopT *L) const
Return true if the specified loop is contained within in this loop.
Definition: LoopInfo.h:112
SmallPtrSet - This class implements a set which is optimized for holding SmallSize or less elements...
Definition: SmallPtrSet.h:417
void emit(DiagnosticInfoOptimizationBase &OptDiag)
Output the remark via the diagnostic handler and to the optimization record file. ...
bool UnrollRuntimeLoopRemainder(Loop *L, unsigned Count, bool AllowExpensiveTripCount, bool UseEpilogRemainder, bool UnrollRemainder, bool ForgetAllSCEV, LoopInfo *LI, ScalarEvolution *SE, DominatorTree *DT, AssumptionCache *AC, bool PreserveLCSSA, Loop **ResultLoop=nullptr)
Insert code in the prolog/epilog code when unrolling a loop with a run-time trip-count.
bool erase(PtrType Ptr)
erase - If the set contains the specified pointer, remove it and return true, otherwise return false...
Definition: SmallPtrSet.h:377
This is a &#39;vector&#39; (really, a variable-sized array), optimized for the case when the array is small...
Definition: SmallVector.h:837
bool dominates(const Instruction *Def, const Use &U) const
Return true if Def dominates a use in User.
Definition: Dominators.cpp:248
iterator begin() const
Definition: LoopInfo.h:144
static GCRegistry::Add< StatepointGC > D("statepoint-example", "an example strategy for statepoint")
static BranchInst * Create(BasicBlock *IfTrue, Instruction *InsertBefore=nullptr)
#define DEBUG_TYPE
raw_ostream & dbgs()
dbgs() - This returns a reference to a raw_ostream for debugging messages.
Definition: Debug.cpp:132
Store the result of a depth first search within basic blocks contained by a single loop...
Definition: LoopIterator.h:97
void push_back(pointer val)
Definition: ilist.h:311
BasicBlock * CloneBasicBlock(const BasicBlock *BB, ValueToValueMapTy &VMap, const Twine &NameSuffix="", Function *F=nullptr, ClonedCodeInfo *CodeInfo=nullptr, DebugInfoFinder *DIFinder=nullptr)
Return a copy of the specified basic block, but without embedding the block into a particular functio...
static void DFS(BasicBlock *Root, SetVector< BasicBlock *> &Set)
LoopT * getParentLoop() const
Definition: LoopInfo.h:103
const std::vector< LoopT * > & getSubLoops() const
Return the loops contained entirely within this loop.
Definition: LoopInfo.h:133
bool isLoopSimplifyForm() const
Return true if the Loop is in the form that the LoopSimplify form transforms loops to...
Definition: LoopInfo.cpp:425
void registerAssumption(CallInst *CI)
Add an @llvm.assume intrinsic to this function&#39;s cache.
void forgetLoop(const Loop *L)
This method should be called by the client when it has changed a loop in a way that may effect Scalar...
iterator begin() const
Definition: SmallPtrSet.h:396
LLVM_NODISCARD bool empty() const
Definition: SmallVector.h:55
Represents a single loop in the control flow graph.
Definition: LoopInfo.h:510
ArrayRef< BlockT * > getBlocks() const
Get a list of the basic blocks which make up this loop.
Definition: LoopInfo.h:151
StringRef getName() const
Return a constant reference to the value&#39;s name.
Definition: Value.cpp:214
const Function * getParent() const
Return the enclosing method, or null if none.
Definition: BasicBlock.h:106
#define I(x, y, z)
Definition: MD5.cpp:58
DomTreeNodeBase< NodeT > * addNewBlock(NodeT *BB, NodeT *DomBB)
Add a new node to the dominator tree information.
LLVM_NODISCARD std::enable_if<!is_simple_type< Y >::value, typename cast_retty< X, const Y >::ret_type >::type dyn_cast(const Y &Val)
Definition: Casting.h:332
const BasicBlockListType & getBasicBlockList() const
Get the underlying elements of the Function...
Definition: Function.h:657
block_iterator block_end() const
Definition: LoopInfo.h:157
iterator_range< const_phi_iterator > phis() const
Returns a range that iterates over the phis in the basic block.
Definition: BasicBlock.h:324
bool isUnconditional() const
assert(ImpDefSCC.getReg()==AMDGPU::SCC &&ImpDefSCC.isDef())
LLVM Value Representation.
Definition: Value.h:72
The loop was not modified.
void moveBefore(Instruction *MovePos)
Unlink this instruction from its current basic block and insert it into the basic block that MovePos ...
Definition: Instruction.cpp:86
virtual bool anyBlockMayThrow() const
Returns true iff any block of the loop for which this info is contains an instruction that may throw ...
Definition: MustExecute.cpp:40
#define LLVM_DEBUG(X)
Definition: Debug.h:122
iterator_range< block_iterator > blocks() const
Definition: LoopInfo.h:158
block_iterator block_begin() const
Definition: LoopInfo.h:156
BlockT * getExitingBlock() const
If getExitingBlocks would return exactly one block, return that block.
Definition: LoopInfoImpl.h:49
The optimization diagnostic interface.
iterator begin()
Definition: ValueMap.h:140
const BasicBlock * getParent() const
Definition: Instruction.h:66
LoopUnrollResult
Represents the result of a UnrollLoop invocation.
Definition: UnrollLoop.h:53
LoopUnrollResult UnrollAndJamLoop(Loop *L, unsigned Count, unsigned TripCount, unsigned TripMultiple, bool UnrollRemainder, LoopInfo *LI, ScalarEvolution *SE, DominatorTree *DT, AssumptionCache *AC, OptimizationRemarkEmitter *ORE, Loop **EpilogueLoop=nullptr)
bool mayReadOrWriteMemory() const
Return true if this instruction may read or write memory.
Definition: Instruction.h:532
void simplifyLoopAfterUnroll(Loop *L, bool SimplifyIVs, LoopInfo *LI, ScalarEvolution *SE, DominatorTree *DT, AssumptionCache *AC)
Perform some cleanup and simplifications on loops after unrolling.
Definition: LoopUnroll.cpp:199