LLVM  9.0.0svn
LoopUnrollAndJam.cpp
Go to the documentation of this file.
1 //===-- LoopUnrollAndJam.cpp - Loop unrolling utilities -------------------===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 // This file implements loop unroll and jam as a routine, much like
10 // LoopUnroll.cpp implements loop unroll.
11 //
12 //===----------------------------------------------------------------------===//
13 
14 #include "llvm/ADT/SmallPtrSet.h"
15 #include "llvm/ADT/Statistic.h"
21 #include "llvm/Analysis/LoopPass.h"
26 #include "llvm/IR/BasicBlock.h"
27 #include "llvm/IR/DataLayout.h"
29 #include "llvm/IR/Dominators.h"
30 #include "llvm/IR/IntrinsicInst.h"
31 #include "llvm/IR/LLVMContext.h"
32 #include "llvm/Support/Debug.h"
40 using namespace llvm;
41 
42 #define DEBUG_TYPE "loop-unroll-and-jam"
43 
44 STATISTIC(NumUnrolledAndJammed, "Number of loops unroll and jammed");
45 STATISTIC(NumCompletelyUnrolledAndJammed, "Number of loops unroll and jammed");
46 
48 
49 // Partition blocks in an outer/inner loop pair into blocks before and after
50 // the loop
51 static bool partitionOuterLoopBlocks(Loop *L, Loop *SubLoop,
52  BasicBlockSet &ForeBlocks,
53  BasicBlockSet &SubLoopBlocks,
54  BasicBlockSet &AftBlocks,
55  DominatorTree *DT) {
56  BasicBlock *SubLoopLatch = SubLoop->getLoopLatch();
57  SubLoopBlocks.insert(SubLoop->block_begin(), SubLoop->block_end());
58 
59  for (BasicBlock *BB : L->blocks()) {
60  if (!SubLoop->contains(BB)) {
61  if (DT->dominates(SubLoopLatch, BB))
62  AftBlocks.insert(BB);
63  else
64  ForeBlocks.insert(BB);
65  }
66  }
67 
68  // Check that all blocks in ForeBlocks together dominate the subloop
69  // TODO: This might ideally be done better with a dominator/postdominators.
70  BasicBlock *SubLoopPreHeader = SubLoop->getLoopPreheader();
71  for (BasicBlock *BB : ForeBlocks) {
72  if (BB == SubLoopPreHeader)
73  continue;
74  Instruction *TI = BB->getTerminator();
75  for (unsigned i = 0, e = TI->getNumSuccessors(); i != e; ++i)
76  if (!ForeBlocks.count(TI->getSuccessor(i)))
77  return false;
78  }
79 
80  return true;
81 }
82 
83 // Looks at the phi nodes in Header for values coming from Latch. For these
84 // instructions and all their operands calls Visit on them, keeping going for
85 // all the operands in AftBlocks. Returns false if Visit returns false,
86 // otherwise returns true. This is used to process the instructions in the
87 // Aft blocks that need to be moved before the subloop. It is used in two
88 // places. One to check that the required set of instructions can be moved
89 // before the loop. Then to collect the instructions to actually move in
90 // moveHeaderPhiOperandsToForeBlocks.
91 template <typename T>
92 static bool processHeaderPhiOperands(BasicBlock *Header, BasicBlock *Latch,
93  BasicBlockSet &AftBlocks, T Visit) {
95  for (auto &Phi : Header->phis()) {
96  Value *V = Phi.getIncomingValueForBlock(Latch);
97  if (Instruction *I = dyn_cast<Instruction>(V))
98  Worklist.push_back(I);
99  }
100 
101  while (!Worklist.empty()) {
102  Instruction *I = Worklist.back();
103  Worklist.pop_back();
104  if (!Visit(I))
105  return false;
106 
107  if (AftBlocks.count(I->getParent()))
108  for (auto &U : I->operands())
109  if (Instruction *II = dyn_cast<Instruction>(U))
110  Worklist.push_back(II);
111  }
112 
113  return true;
114 }
115 
116 // Move the phi operands of Header from Latch out of AftBlocks to InsertLoc.
118  BasicBlock *Latch,
119  Instruction *InsertLoc,
120  BasicBlockSet &AftBlocks) {
121  // We need to ensure we move the instructions in the correct order,
122  // starting with the earliest required instruction and moving forward.
123  std::vector<Instruction *> Visited;
124  processHeaderPhiOperands(Header, Latch, AftBlocks,
125  [&Visited, &AftBlocks](Instruction *I) {
126  if (AftBlocks.count(I->getParent()))
127  Visited.push_back(I);
128  return true;
129  });
130 
131  // Move all instructions in program order to before the InsertLoc
132  BasicBlock *InsertLocBB = InsertLoc->getParent();
133  for (Instruction *I : reverse(Visited)) {
134  if (I->getParent() != InsertLocBB)
135  I->moveBefore(InsertLoc);
136  }
137 }
138 
139 /*
140  This method performs Unroll and Jam. For a simple loop like:
141  for (i = ..)
142  Fore(i)
143  for (j = ..)
144  SubLoop(i, j)
145  Aft(i)
146 
147  Instead of doing normal inner or outer unrolling, we do:
148  for (i = .., i+=2)
149  Fore(i)
150  Fore(i+1)
151  for (j = ..)
152  SubLoop(i, j)
153  SubLoop(i+1, j)
154  Aft(i)
155  Aft(i+1)
156 
157  So the outer loop is essetially unrolled and then the inner loops are fused
158  ("jammed") together into a single loop. This can increase speed when there
159  are loads in SubLoop that are invariant to i, as they become shared between
160  the now jammed inner loops.
161 
162  We do this by spliting the blocks in the loop into Fore, Subloop and Aft.
163  Fore blocks are those before the inner loop, Aft are those after. Normal
164  Unroll code is used to copy each of these sets of blocks and the results are
165  combined together into the final form above.
166 
167  isSafeToUnrollAndJam should be used prior to calling this to make sure the
168  unrolling will be valid. Checking profitablility is also advisable.
169 
170  If EpilogueLoop is non-null, it receives the epilogue loop (if it was
171  necessary to create one and not fully unrolled).
172 */
174  Loop *L, unsigned Count, unsigned TripCount, unsigned TripMultiple,
175  bool UnrollRemainder, LoopInfo *LI, ScalarEvolution *SE, DominatorTree *DT,
176  AssumptionCache *AC, OptimizationRemarkEmitter *ORE, Loop **EpilogueLoop) {
177 
178  // When we enter here we should have already checked that it is safe
179  BasicBlock *Header = L->getHeader();
180  assert(L->getSubLoops().size() == 1);
181  Loop *SubLoop = *L->begin();
182 
183  // Don't enter the unroll code if there is nothing to do.
184  if (TripCount == 0 && Count < 2) {
185  LLVM_DEBUG(dbgs() << "Won't unroll-and-jam; almost nothing to do\n");
187  }
188 
189  assert(Count > 0);
190  assert(TripMultiple > 0);
191  assert(TripCount == 0 || TripCount % TripMultiple == 0);
192 
193  // Are we eliminating the loop control altogether?
194  bool CompletelyUnroll = (Count == TripCount);
195 
196  // We use the runtime remainder in cases where we don't know trip multiple
197  if (TripMultiple == 1 || TripMultiple % Count != 0) {
198  if (!UnrollRuntimeLoopRemainder(L, Count, /*AllowExpensiveTripCount*/ false,
199  /*UseEpilogRemainder*/ true,
200  UnrollRemainder, LI, SE, DT, AC, true,
201  EpilogueLoop)) {
202  LLVM_DEBUG(dbgs() << "Won't unroll-and-jam; remainder loop could not be "
203  "generated when assuming runtime trip count\n");
205  }
206  }
207 
208  // Notify ScalarEvolution that the loop will be substantially changed,
209  // if not outright eliminated.
210  if (SE) {
211  SE->forgetLoop(L);
212  SE->forgetLoop(SubLoop);
213  }
214 
215  using namespace ore;
216  // Report the unrolling decision.
217  if (CompletelyUnroll) {
218  LLVM_DEBUG(dbgs() << "COMPLETELY UNROLL AND JAMMING loop %"
219  << Header->getName() << " with trip count " << TripCount
220  << "!\n");
221  ORE->emit(OptimizationRemark(DEBUG_TYPE, "FullyUnrolled", L->getStartLoc(),
222  L->getHeader())
223  << "completely unroll and jammed loop with "
224  << NV("UnrollCount", TripCount) << " iterations");
225  } else {
226  auto DiagBuilder = [&]() {
227  OptimizationRemark Diag(DEBUG_TYPE, "PartialUnrolled", L->getStartLoc(),
228  L->getHeader());
229  return Diag << "unroll and jammed loop by a factor of "
230  << NV("UnrollCount", Count);
231  };
232 
233  LLVM_DEBUG(dbgs() << "UNROLL AND JAMMING loop %" << Header->getName()
234  << " by " << Count);
235  if (TripMultiple != 1) {
236  LLVM_DEBUG(dbgs() << " with " << TripMultiple << " trips per branch");
237  ORE->emit([&]() {
238  return DiagBuilder() << " with " << NV("TripMultiple", TripMultiple)
239  << " trips per branch";
240  });
241  } else {
242  LLVM_DEBUG(dbgs() << " with run-time trip count");
243  ORE->emit([&]() { return DiagBuilder() << " with run-time trip count"; });
244  }
245  LLVM_DEBUG(dbgs() << "!\n");
246  }
247 
248  BasicBlock *Preheader = L->getLoopPreheader();
249  BasicBlock *LatchBlock = L->getLoopLatch();
250  BranchInst *BI = dyn_cast<BranchInst>(LatchBlock->getTerminator());
251  assert(Preheader && LatchBlock && Header);
252  assert(BI && !BI->isUnconditional());
253  bool ContinueOnTrue = L->contains(BI->getSuccessor(0));
254  BasicBlock *LoopExit = BI->getSuccessor(ContinueOnTrue);
255  bool SubLoopContinueOnTrue = SubLoop->contains(
256  SubLoop->getLoopLatch()->getTerminator()->getSuccessor(0));
257 
258  // Partition blocks in an outer/inner loop pair into blocks before and after
259  // the loop
260  BasicBlockSet SubLoopBlocks;
261  BasicBlockSet ForeBlocks;
262  BasicBlockSet AftBlocks;
263  partitionOuterLoopBlocks(L, SubLoop, ForeBlocks, SubLoopBlocks, AftBlocks,
264  DT);
265 
266  // We keep track of the entering/first and exiting/last block of each of
267  // Fore/SubLoop/Aft in each iteration. This helps make the stapling up of
268  // blocks easier.
269  std::vector<BasicBlock *> ForeBlocksFirst;
270  std::vector<BasicBlock *> ForeBlocksLast;
271  std::vector<BasicBlock *> SubLoopBlocksFirst;
272  std::vector<BasicBlock *> SubLoopBlocksLast;
273  std::vector<BasicBlock *> AftBlocksFirst;
274  std::vector<BasicBlock *> AftBlocksLast;
275  ForeBlocksFirst.push_back(Header);
276  ForeBlocksLast.push_back(SubLoop->getLoopPreheader());
277  SubLoopBlocksFirst.push_back(SubLoop->getHeader());
278  SubLoopBlocksLast.push_back(SubLoop->getExitingBlock());
279  AftBlocksFirst.push_back(SubLoop->getExitBlock());
280  AftBlocksLast.push_back(L->getExitingBlock());
281  // Maps Blocks[0] -> Blocks[It]
282  ValueToValueMapTy LastValueMap;
283 
284  // Move any instructions from fore phi operands from AftBlocks into Fore.
286  Header, LatchBlock, SubLoop->getLoopPreheader()->getTerminator(),
287  AftBlocks);
288 
289  // The current on-the-fly SSA update requires blocks to be processed in
290  // reverse postorder so that LastValueMap contains the correct value at each
291  // exit.
292  LoopBlocksDFS DFS(L);
293  DFS.perform(LI);
294  // Stash the DFS iterators before adding blocks to the loop.
295  LoopBlocksDFS::RPOIterator BlockBegin = DFS.beginRPO();
296  LoopBlocksDFS::RPOIterator BlockEnd = DFS.endRPO();
297 
298  if (Header->getParent()->isDebugInfoForProfiling())
299  for (BasicBlock *BB : L->getBlocks())
300  for (Instruction &I : *BB)
301  if (!isa<DbgInfoIntrinsic>(&I))
302  if (const DILocation *DIL = I.getDebugLoc()) {
303  auto NewDIL = DIL->cloneByMultiplyingDuplicationFactor(Count);
304  if (NewDIL)
305  I.setDebugLoc(NewDIL.getValue());
306  else
307  LLVM_DEBUG(dbgs()
308  << "Failed to create new discriminator: "
309  << DIL->getFilename() << " Line: " << DIL->getLine());
310  }
311 
312  // Copy all blocks
313  for (unsigned It = 1; It != Count; ++It) {
314  std::vector<BasicBlock *> NewBlocks;
315  // Maps Blocks[It] -> Blocks[It-1]
316  DenseMap<Value *, Value *> PrevItValueMap;
317 
318  for (LoopBlocksDFS::RPOIterator BB = BlockBegin; BB != BlockEnd; ++BB) {
319  ValueToValueMapTy VMap;
320  BasicBlock *New = CloneBasicBlock(*BB, VMap, "." + Twine(It));
321  Header->getParent()->getBasicBlockList().push_back(New);
322 
323  if (ForeBlocks.count(*BB)) {
324  L->addBasicBlockToLoop(New, *LI);
325 
326  if (*BB == ForeBlocksFirst[0])
327  ForeBlocksFirst.push_back(New);
328  if (*BB == ForeBlocksLast[0])
329  ForeBlocksLast.push_back(New);
330  } else if (SubLoopBlocks.count(*BB)) {
331  SubLoop->addBasicBlockToLoop(New, *LI);
332 
333  if (*BB == SubLoopBlocksFirst[0])
334  SubLoopBlocksFirst.push_back(New);
335  if (*BB == SubLoopBlocksLast[0])
336  SubLoopBlocksLast.push_back(New);
337  } else if (AftBlocks.count(*BB)) {
338  L->addBasicBlockToLoop(New, *LI);
339 
340  if (*BB == AftBlocksFirst[0])
341  AftBlocksFirst.push_back(New);
342  if (*BB == AftBlocksLast[0])
343  AftBlocksLast.push_back(New);
344  } else {
345  llvm_unreachable("BB being cloned should be in Fore/Sub/Aft");
346  }
347 
348  // Update our running maps of newest clones
349  PrevItValueMap[New] = (It == 1 ? *BB : LastValueMap[*BB]);
350  LastValueMap[*BB] = New;
351  for (ValueToValueMapTy::iterator VI = VMap.begin(), VE = VMap.end();
352  VI != VE; ++VI) {
353  PrevItValueMap[VI->second] =
354  const_cast<Value *>(It == 1 ? VI->first : LastValueMap[VI->first]);
355  LastValueMap[VI->first] = VI->second;
356  }
357 
358  NewBlocks.push_back(New);
359 
360  // Update DomTree:
361  if (*BB == ForeBlocksFirst[0])
362  DT->addNewBlock(New, ForeBlocksLast[It - 1]);
363  else if (*BB == SubLoopBlocksFirst[0])
364  DT->addNewBlock(New, SubLoopBlocksLast[It - 1]);
365  else if (*BB == AftBlocksFirst[0])
366  DT->addNewBlock(New, AftBlocksLast[It - 1]);
367  else {
368  // Each set of blocks (Fore/Sub/Aft) will have the same internal domtree
369  // structure.
370  auto BBDomNode = DT->getNode(*BB);
371  auto BBIDom = BBDomNode->getIDom();
372  BasicBlock *OriginalBBIDom = BBIDom->getBlock();
373  assert(OriginalBBIDom);
374  assert(LastValueMap[cast<Value>(OriginalBBIDom)]);
375  DT->addNewBlock(
376  New, cast<BasicBlock>(LastValueMap[cast<Value>(OriginalBBIDom)]));
377  }
378  }
379 
380  // Remap all instructions in the most recent iteration
381  for (BasicBlock *NewBlock : NewBlocks) {
382  for (Instruction &I : *NewBlock) {
383  ::remapInstruction(&I, LastValueMap);
384  if (auto *II = dyn_cast<IntrinsicInst>(&I))
385  if (II->getIntrinsicID() == Intrinsic::assume)
386  AC->registerAssumption(II);
387  }
388  }
389 
390  // Alter the ForeBlocks phi's, pointing them at the latest version of the
391  // value from the previous iteration's phis
392  for (PHINode &Phi : ForeBlocksFirst[It]->phis()) {
393  Value *OldValue = Phi.getIncomingValueForBlock(AftBlocksLast[It]);
394  assert(OldValue && "should have incoming edge from Aft[It]");
395  Value *NewValue = OldValue;
396  if (Value *PrevValue = PrevItValueMap[OldValue])
397  NewValue = PrevValue;
398 
399  assert(Phi.getNumOperands() == 2);
400  Phi.setIncomingBlock(0, ForeBlocksLast[It - 1]);
401  Phi.setIncomingValue(0, NewValue);
402  Phi.removeIncomingValue(1);
403  }
404  }
405 
406  // Now that all the basic blocks for the unrolled iterations are in place,
407  // finish up connecting the blocks and phi nodes. At this point LastValueMap
408  // is the last unrolled iterations values.
409 
410  // Update Phis in BB from OldBB to point to NewBB
411  auto updatePHIBlocks = [](BasicBlock *BB, BasicBlock *OldBB,
412  BasicBlock *NewBB) {
413  for (PHINode &Phi : BB->phis()) {
414  int I = Phi.getBasicBlockIndex(OldBB);
415  Phi.setIncomingBlock(I, NewBB);
416  }
417  };
418  // Update Phis in BB from OldBB to point to NewBB and use the latest value
419  // from LastValueMap
420  auto updatePHIBlocksAndValues = [](BasicBlock *BB, BasicBlock *OldBB,
421  BasicBlock *NewBB,
422  ValueToValueMapTy &LastValueMap) {
423  for (PHINode &Phi : BB->phis()) {
424  for (unsigned b = 0; b < Phi.getNumIncomingValues(); ++b) {
425  if (Phi.getIncomingBlock(b) == OldBB) {
426  Value *OldValue = Phi.getIncomingValue(b);
427  if (Value *LastValue = LastValueMap[OldValue])
428  Phi.setIncomingValue(b, LastValue);
429  Phi.setIncomingBlock(b, NewBB);
430  break;
431  }
432  }
433  }
434  };
435  // Move all the phis from Src into Dest
436  auto movePHIs = [](BasicBlock *Src, BasicBlock *Dest) {
437  Instruction *insertPoint = Dest->getFirstNonPHI();
438  while (PHINode *Phi = dyn_cast<PHINode>(Src->begin()))
439  Phi->moveBefore(insertPoint);
440  };
441 
442  // Update the PHI values outside the loop to point to the last block
443  updatePHIBlocksAndValues(LoopExit, AftBlocksLast[0], AftBlocksLast.back(),
444  LastValueMap);
445 
446  // Update ForeBlocks successors and phi nodes
447  BranchInst *ForeTerm =
448  cast<BranchInst>(ForeBlocksLast.back()->getTerminator());
449  BasicBlock *Dest = SubLoopBlocksFirst[0];
450  ForeTerm->setSuccessor(0, Dest);
451 
452  if (CompletelyUnroll) {
453  while (PHINode *Phi = dyn_cast<PHINode>(ForeBlocksFirst[0]->begin())) {
454  Phi->replaceAllUsesWith(Phi->getIncomingValueForBlock(Preheader));
455  Phi->getParent()->getInstList().erase(Phi);
456  }
457  } else {
458  // Update the PHI values to point to the last aft block
459  updatePHIBlocksAndValues(ForeBlocksFirst[0], AftBlocksLast[0],
460  AftBlocksLast.back(), LastValueMap);
461  }
462 
463  for (unsigned It = 1; It != Count; It++) {
464  // Remap ForeBlock successors from previous iteration to this
465  BranchInst *ForeTerm =
466  cast<BranchInst>(ForeBlocksLast[It - 1]->getTerminator());
467  BasicBlock *Dest = ForeBlocksFirst[It];
468  ForeTerm->setSuccessor(0, Dest);
469  }
470 
471  // Subloop successors and phis
472  BranchInst *SubTerm =
473  cast<BranchInst>(SubLoopBlocksLast.back()->getTerminator());
474  SubTerm->setSuccessor(!SubLoopContinueOnTrue, SubLoopBlocksFirst[0]);
475  SubTerm->setSuccessor(SubLoopContinueOnTrue, AftBlocksFirst[0]);
476  updatePHIBlocks(SubLoopBlocksFirst[0], ForeBlocksLast[0],
477  ForeBlocksLast.back());
478  updatePHIBlocks(SubLoopBlocksFirst[0], SubLoopBlocksLast[0],
479  SubLoopBlocksLast.back());
480 
481  for (unsigned It = 1; It != Count; It++) {
482  // Replace the conditional branch of the previous iteration subloop with an
483  // unconditional one to this one
484  BranchInst *SubTerm =
485  cast<BranchInst>(SubLoopBlocksLast[It - 1]->getTerminator());
486  BranchInst::Create(SubLoopBlocksFirst[It], SubTerm);
487  SubTerm->eraseFromParent();
488 
489  updatePHIBlocks(SubLoopBlocksFirst[It], ForeBlocksLast[It],
490  ForeBlocksLast.back());
491  updatePHIBlocks(SubLoopBlocksFirst[It], SubLoopBlocksLast[It],
492  SubLoopBlocksLast.back());
493  movePHIs(SubLoopBlocksFirst[It], SubLoopBlocksFirst[0]);
494  }
495 
496  // Aft blocks successors and phis
497  BranchInst *Term = cast<BranchInst>(AftBlocksLast.back()->getTerminator());
498  if (CompletelyUnroll) {
499  BranchInst::Create(LoopExit, Term);
500  Term->eraseFromParent();
501  } else {
502  Term->setSuccessor(!ContinueOnTrue, ForeBlocksFirst[0]);
503  }
504  updatePHIBlocks(AftBlocksFirst[0], SubLoopBlocksLast[0],
505  SubLoopBlocksLast.back());
506 
507  for (unsigned It = 1; It != Count; It++) {
508  // Replace the conditional branch of the previous iteration subloop with an
509  // unconditional one to this one
510  BranchInst *AftTerm =
511  cast<BranchInst>(AftBlocksLast[It - 1]->getTerminator());
512  BranchInst::Create(AftBlocksFirst[It], AftTerm);
513  AftTerm->eraseFromParent();
514 
515  updatePHIBlocks(AftBlocksFirst[It], SubLoopBlocksLast[It],
516  SubLoopBlocksLast.back());
517  movePHIs(AftBlocksFirst[It], AftBlocksFirst[0]);
518  }
519 
520  // Dominator Tree. Remove the old links between Fore, Sub and Aft, adding the
521  // new ones required.
522  if (Count != 1) {
524  DTUpdates.emplace_back(DominatorTree::UpdateKind::Delete, ForeBlocksLast[0],
525  SubLoopBlocksFirst[0]);
527  SubLoopBlocksLast[0], AftBlocksFirst[0]);
528 
530  ForeBlocksLast.back(), SubLoopBlocksFirst[0]);
532  SubLoopBlocksLast.back(), AftBlocksFirst[0]);
533  DT->applyUpdates(DTUpdates);
534  }
535 
536  // Merge adjacent basic blocks, if possible.
537  SmallPtrSet<BasicBlock *, 16> MergeBlocks;
538  MergeBlocks.insert(ForeBlocksLast.begin(), ForeBlocksLast.end());
539  MergeBlocks.insert(SubLoopBlocksLast.begin(), SubLoopBlocksLast.end());
540  MergeBlocks.insert(AftBlocksLast.begin(), AftBlocksLast.end());
541  while (!MergeBlocks.empty()) {
542  BasicBlock *BB = *MergeBlocks.begin();
544  if (Term && Term->isUnconditional() && L->contains(Term->getSuccessor(0))) {
545  BasicBlock *Dest = Term->getSuccessor(0);
546  if (BasicBlock *Fold = foldBlockIntoPredecessor(Dest, LI, SE, DT)) {
547  // Don't remove BB and add Fold as they are the same BB
548  assert(Fold == BB);
549  (void)Fold;
550  MergeBlocks.erase(Dest);
551  } else
552  MergeBlocks.erase(BB);
553  } else
554  MergeBlocks.erase(BB);
555  }
556 
557  // At this point, the code is well formed. We now do a quick sweep over the
558  // inserted code, doing constant propagation and dead code elimination as we
559  // go.
560  simplifyLoopAfterUnroll(SubLoop, true, LI, SE, DT, AC);
561  simplifyLoopAfterUnroll(L, !CompletelyUnroll && Count > 1, LI, SE, DT, AC);
562 
563  NumCompletelyUnrolledAndJammed += CompletelyUnroll;
564  ++NumUnrolledAndJammed;
565 
566 #ifndef NDEBUG
567  // We shouldn't have done anything to break loop simplify form or LCSSA.
568  Loop *OuterL = L->getParentLoop();
569  Loop *OutestLoop = OuterL ? OuterL : (!CompletelyUnroll ? L : SubLoop);
570  assert(OutestLoop->isRecursivelyLCSSAForm(*DT, *LI));
571  if (!CompletelyUnroll)
573  assert(SubLoop->isLoopSimplifyForm());
574  assert(DT->verify());
575 #endif
576 
577  // Update LoopInfo if the loop is completely removed.
578  if (CompletelyUnroll)
579  LI->erase(L);
580 
581  return CompletelyUnroll ? LoopUnrollResult::FullyUnrolled
583 }
584 
585 static bool getLoadsAndStores(BasicBlockSet &Blocks,
586  SmallVector<Value *, 4> &MemInstr) {
587  // Scan the BBs and collect legal loads and stores.
588  // Returns false if non-simple loads/stores are found.
589  for (BasicBlock *BB : Blocks) {
590  for (Instruction &I : *BB) {
591  if (auto *Ld = dyn_cast<LoadInst>(&I)) {
592  if (!Ld->isSimple())
593  return false;
594  MemInstr.push_back(&I);
595  } else if (auto *St = dyn_cast<StoreInst>(&I)) {
596  if (!St->isSimple())
597  return false;
598  MemInstr.push_back(&I);
599  } else if (I.mayReadOrWriteMemory()) {
600  return false;
601  }
602  }
603  }
604  return true;
605 }
606 
609  unsigned LoopDepth, bool InnerLoop,
610  DependenceInfo &DI) {
611  // Use DA to check for dependencies between loads and stores that make unroll
612  // and jam invalid
613  for (Value *I : Earlier) {
614  for (Value *J : Later) {
615  Instruction *Src = cast<Instruction>(I);
616  Instruction *Dst = cast<Instruction>(J);
617  if (Src == Dst)
618  continue;
619  // Ignore Input dependencies.
620  if (isa<LoadInst>(Src) && isa<LoadInst>(Dst))
621  continue;
622 
623  // Track dependencies, and if we find them take a conservative approach
624  // by allowing only = or < (not >), altough some > would be safe
625  // (depending upon unroll width).
626  // For the inner loop, we need to disallow any (> <) dependencies
627  // FIXME: Allow > so long as distance is less than unroll width
628  if (auto D = DI.depends(Src, Dst, true)) {
629  assert(D->isOrdered() && "Expected an output, flow or anti dep.");
630 
631  if (D->isConfused()) {
632  LLVM_DEBUG(dbgs() << " Confused dependency between:\n"
633  << " " << *Src << "\n"
634  << " " << *Dst << "\n");
635  return false;
636  }
637  if (!InnerLoop) {
638  if (D->getDirection(LoopDepth) & Dependence::DVEntry::GT) {
639  LLVM_DEBUG(dbgs() << " > dependency between:\n"
640  << " " << *Src << "\n"
641  << " " << *Dst << "\n");
642  return false;
643  }
644  } else {
645  assert(LoopDepth + 1 <= D->getLevels());
646  if (D->getDirection(LoopDepth) & Dependence::DVEntry::GT &&
647  D->getDirection(LoopDepth + 1) & Dependence::DVEntry::LT) {
648  LLVM_DEBUG(dbgs() << " < > dependency between:\n"
649  << " " << *Src << "\n"
650  << " " << *Dst << "\n");
651  return false;
652  }
653  }
654  }
655  }
656  }
657  return true;
658 }
659 
660 static bool checkDependencies(Loop *L, BasicBlockSet &ForeBlocks,
661  BasicBlockSet &SubLoopBlocks,
662  BasicBlockSet &AftBlocks, DependenceInfo &DI) {
663  // Get all loads/store pairs for each blocks
664  SmallVector<Value *, 4> ForeMemInstr;
665  SmallVector<Value *, 4> SubLoopMemInstr;
666  SmallVector<Value *, 4> AftMemInstr;
667  if (!getLoadsAndStores(ForeBlocks, ForeMemInstr) ||
668  !getLoadsAndStores(SubLoopBlocks, SubLoopMemInstr) ||
669  !getLoadsAndStores(AftBlocks, AftMemInstr))
670  return false;
671 
672  // Check for dependencies between any blocks that may change order
673  unsigned LoopDepth = L->getLoopDepth();
674  return checkDependencies(ForeMemInstr, SubLoopMemInstr, LoopDepth, false,
675  DI) &&
676  checkDependencies(ForeMemInstr, AftMemInstr, LoopDepth, false, DI) &&
677  checkDependencies(SubLoopMemInstr, AftMemInstr, LoopDepth, false,
678  DI) &&
679  checkDependencies(SubLoopMemInstr, SubLoopMemInstr, LoopDepth, true,
680  DI);
681 }
682 
684  DependenceInfo &DI) {
685  /* We currently handle outer loops like this:
686  |
687  ForeFirst <----\ }
688  Blocks | } ForeBlocks
689  ForeLast | }
690  | |
691  SubLoopFirst <\ | }
692  Blocks | | } SubLoopBlocks
693  SubLoopLast -/ | }
694  | |
695  AftFirst | }
696  Blocks | } AftBlocks
697  AftLast ------/ }
698  |
699 
700  There are (theoretically) any number of blocks in ForeBlocks, SubLoopBlocks
701  and AftBlocks, providing that there is one edge from Fores to SubLoops,
702  one edge from SubLoops to Afts and a single outer loop exit (from Afts).
703  In practice we currently limit Aft blocks to a single block, and limit
704  things further in the profitablility checks of the unroll and jam pass.
705 
706  Because of the way we rearrange basic blocks, we also require that
707  the Fore blocks on all unrolled iterations are safe to move before the
708  SubLoop blocks of all iterations. So we require that the phi node looping
709  operands of ForeHeader can be moved to at least the end of ForeEnd, so that
710  we can arrange cloned Fore Blocks before the subloop and match up Phi's
711  correctly.
712 
713  i.e. The old order of blocks used to be F1 S1_1 S1_2 A1 F2 S2_1 S2_2 A2.
714  It needs to be safe to tranform this to F1 F2 S1_1 S2_1 S1_2 S2_2 A1 A2.
715 
716  There are then a number of checks along the lines of no calls, no
717  exceptions, inner loop IV is consistent, etc. Note that for loops requiring
718  runtime unrolling, UnrollRuntimeLoopRemainder can also fail in
719  UnrollAndJamLoop if the trip count cannot be easily calculated.
720  */
721 
722  if (!L->isLoopSimplifyForm() || L->getSubLoops().size() != 1)
723  return false;
724  Loop *SubLoop = L->getSubLoops()[0];
725  if (!SubLoop->isLoopSimplifyForm())
726  return false;
727 
728  BasicBlock *Header = L->getHeader();
729  BasicBlock *Latch = L->getLoopLatch();
730  BasicBlock *Exit = L->getExitingBlock();
731  BasicBlock *SubLoopHeader = SubLoop->getHeader();
732  BasicBlock *SubLoopLatch = SubLoop->getLoopLatch();
733  BasicBlock *SubLoopExit = SubLoop->getExitingBlock();
734 
735  if (Latch != Exit)
736  return false;
737  if (SubLoopLatch != SubLoopExit)
738  return false;
739 
740  if (Header->hasAddressTaken() || SubLoopHeader->hasAddressTaken()) {
741  LLVM_DEBUG(dbgs() << "Won't unroll-and-jam; Address taken\n");
742  return false;
743  }
744 
745  // Split blocks into Fore/SubLoop/Aft based on dominators
746  BasicBlockSet SubLoopBlocks;
747  BasicBlockSet ForeBlocks;
748  BasicBlockSet AftBlocks;
749  if (!partitionOuterLoopBlocks(L, SubLoop, ForeBlocks, SubLoopBlocks,
750  AftBlocks, &DT)) {
751  LLVM_DEBUG(dbgs() << "Won't unroll-and-jam; Incompatible loop layout\n");
752  return false;
753  }
754 
755  // Aft blocks may need to move instructions to fore blocks, which becomes more
756  // difficult if there are multiple (potentially conditionally executed)
757  // blocks. For now we just exclude loops with multiple aft blocks.
758  if (AftBlocks.size() != 1) {
759  LLVM_DEBUG(dbgs() << "Won't unroll-and-jam; Can't currently handle "
760  "multiple blocks after the loop\n");
761  return false;
762  }
763 
764  // Check inner loop backedge count is consistent on all iterations of the
765  // outer loop
766  if (!hasIterationCountInvariantInParent(SubLoop, SE)) {
767  LLVM_DEBUG(dbgs() << "Won't unroll-and-jam; Inner loop iteration count is "
768  "not consistent on each iteration\n");
769  return false;
770  }
771 
772  // Check the loop safety info for exceptions.
774  LSI.computeLoopSafetyInfo(L);
775  if (LSI.anyBlockMayThrow()) {
776  LLVM_DEBUG(dbgs() << "Won't unroll-and-jam; Something may throw\n");
777  return false;
778  }
779 
780  // We've ruled out the easy stuff and now need to check that there are no
781  // interdependencies which may prevent us from moving the:
782  // ForeBlocks before Subloop and AftBlocks.
783  // Subloop before AftBlocks.
784  // ForeBlock phi operands before the subloop
785 
786  // Make sure we can move all instructions we need to before the subloop
788  Header, Latch, AftBlocks, [&AftBlocks, &SubLoop](Instruction *I) {
789  if (SubLoop->contains(I->getParent()))
790  return false;
791  if (AftBlocks.count(I->getParent())) {
792  // If we hit a phi node in afts we know we are done (probably
793  // LCSSA)
794  if (isa<PHINode>(I))
795  return false;
796  // Can't move instructions with side effects or memory
797  // reads/writes
798  if (I->mayHaveSideEffects() || I->mayReadOrWriteMemory())
799  return false;
800  }
801  // Keep going
802  return true;
803  })) {
804  LLVM_DEBUG(dbgs() << "Won't unroll-and-jam; can't move required "
805  "instructions after subloop to before it\n");
806  return false;
807  }
808 
809  // Check for memory dependencies which prohibit the unrolling we are doing.
810  // Because of the way we are unrolling Fore/Sub/Aft blocks, we need to check
811  // there are no dependencies between Fore-Sub, Fore-Aft, Sub-Aft and Sub-Sub.
812  if (!checkDependencies(L, ForeBlocks, SubLoopBlocks, AftBlocks, DI)) {
813  LLVM_DEBUG(dbgs() << "Won't unroll-and-jam; failed dependency check\n");
814  return false;
815  }
816 
817  return true;
818 }
virtual void computeLoopSafetyInfo(const Loop *CurLoop)
Computes safety information for a loop checks loop body & header for the possibility of may throw exc...
Definition: MustExecute.cpp:44
SymbolTableList< Instruction >::iterator eraseFromParent()
This method unlinks &#39;this&#39; from the containing basic block and deletes it.
Definition: Instruction.cpp:67
BlockT * getLoopLatch() const
If there is a single latch block for this loop, return it.
Definition: LoopInfoImpl.h:224
bool UnrollRuntimeLoopRemainder(Loop *L, unsigned Count, bool AllowExpensiveTripCount, bool UseEpilogRemainder, bool UnrollRemainder, LoopInfo *LI, ScalarEvolution *SE, DominatorTree *DT, AssumptionCache *AC, bool PreserveLCSSA, Loop **ResultLoop=nullptr)
Insert code in the prolog/epilog code when unrolling a loop with a run-time trip-count.
const_iterator begin(StringRef path, Style style=Style::native)
Get begin iterator over path.
Definition: Path.cpp:249
DiagnosticInfoOptimizationBase::Argument NV
This class represents lattice values for constants.
Definition: AllocatorList.h:23
BasicBlock * getSuccessor(unsigned Idx) const
Return the specified successor. This instruction must be a terminator.
static bool getLoadsAndStores(BasicBlockSet &Blocks, SmallVector< Value *, 4 > &MemInstr)
bool isRecursivelyLCSSAForm(DominatorTree &DT, const LoopInfo &LI) const
Return true if this Loop and all inner subloops are in LCSSA form.
Definition: LoopInfo.cpp:183
unsigned getLoopDepth() const
Return the nesting level of this loop.
Definition: LoopInfo.h:91
The main scalar evolution driver.
BlockT * getLoopPreheader() const
If there is a preheader for this loop, return it.
Definition: LoopInfoImpl.h:173
A cache of @llvm.assume calls within a function.
BasicBlock * foldBlockIntoPredecessor(BasicBlock *BB, LoopInfo *LI, ScalarEvolution *SE, DominatorTree *DT)
Folds a basic block into its predecessor if it only has one predecessor, and that predecessor only ha...
Definition: LoopUnroll.cpp:99
static bool processHeaderPhiOperands(BasicBlock *Header, BasicBlock *Latch, BasicBlockSet &AftBlocks, T Visit)
BasicBlock * getSuccessor(unsigned i) const
bool hasIterationCountInvariantInParent(Loop *L, ScalarEvolution &SE)
Check inner loop (L) backedge count is known to be invariant on all iterations of its outer loop...
Definition: LoopUtils.cpp:651
STATISTIC(NumFunctions, "Total number of functions")
const Instruction * getTerminator() const LLVM_READONLY
Returns the terminator instruction if the block is well formed or null if the block is not well forme...
Definition: BasicBlock.cpp:137
DependenceInfo - This class is the main dependence-analysis driver.
static void moveHeaderPhiOperandsToForeBlocks(BasicBlock *Header, BasicBlock *Latch, Instruction *InsertLoc, BasicBlockSet &AftBlocks)
static bool checkDependencies(SmallVector< Value *, 4 > &Earlier, SmallVector< Value *, 4 > &Later, unsigned LoopDepth, bool InnerLoop, DependenceInfo &DI)
iterator begin()
Instruction iterator methods.
Definition: BasicBlock.h:268
Simple and conservative implementation of LoopSafetyInfo that can give false-positive answers to its ...
Definition: MustExecute.h:97
bool verify(VerificationLevel VL=VerificationLevel::Full) const
verify - checks if the tree is correct.
Twine - A lightweight data structure for efficiently representing the concatenation of temporary valu...
Definition: Twine.h:80
void erase(Loop *L)
Update LoopInfo after removing the last backedge from a loop.
Definition: LoopInfo.cpp:588
BlockT * getHeader() const
Definition: LoopInfo.h:99
auto reverse(ContainerTy &&C, typename std::enable_if< has_rbegin< ContainerTy >::value >::type *=nullptr) -> decltype(make_range(C.rbegin(), C.rend()))
Definition: STLExtras.h:266
void addBasicBlockToLoop(BlockT *NewBB, LoopInfoBase< BlockT, LoopT > &LI)
This method is used by other analyses to update loop information.
Definition: LoopInfoImpl.h:250
bool isSafeToUnrollAndJam(Loop *L, ScalarEvolution &SE, DominatorTree &DT, DependenceInfo &DI)
This header provides classes for managing per-loop analyses.
void replaceAllUsesWith(Value *V)
Change all uses of this to point to a new Value.
Definition: Value.cpp:429
Debug location.
Concrete subclass of DominatorTreeBase that is used to compute a normal dominator tree...
Definition: Dominators.h:144
unsigned getNumSuccessors() const
Return the number of successors that this instruction has.
void remapInstruction(Instruction *I, ValueToValueMapTy &VMap)
Convert the instruction operands from referencing the current values into those specified by VMap...
Definition: LoopUnroll.cpp:65
The loop was fully unrolled into straight-line code.
void applyUpdates(ArrayRef< UpdateType > Updates)
Inform the dominator tree about a sequence of CFG edge insertions and deletions and perform a batch u...
SmallPtrSet< BasicBlock *, 4 > BasicBlockSet
bool isDebugInfoForProfiling() const
Returns true if we should emit debug info for profiling.
Definition: Metadata.cpp:1511
std::unique_ptr< Dependence > depends(Instruction *Src, Instruction *Dst, bool PossiblyLoopIndependent)
depends - Tests for a dependence between the Src and Dst instructions.
LLVM Basic Block Representation.
Definition: BasicBlock.h:57
Conditional or Unconditional Branch instruction.
DomTreeNodeBase * getIDom() const
LLVM_NODISCARD bool empty() const
Definition: SmallPtrSet.h:91
static bool partitionOuterLoopBlocks(Loop *L, Loop *SubLoop, BasicBlockSet &ForeBlocks, BasicBlockSet &SubLoopBlocks, BasicBlockSet &AftBlocks, DominatorTree *DT)
std::pair< iterator, bool > insert(PtrType Ptr)
Inserts Ptr if and only if there is no element in the container equal to Ptr.
Definition: SmallPtrSet.h:370
bool mayHaveSideEffects() const
Return true if the instruction may have side effects.
Definition: Instruction.h:575
Diagnostic information for applied optimization remarks.
const Instruction & back() const
Definition: BasicBlock.h:282
std::vector< BasicBlock * >::const_reverse_iterator RPOIterator
Definition: LoopIterator.h:101
op_range operands()
Definition: User.h:237
size_type count(ConstPtrType Ptr) const
count - Return 1 if the specified pointer is in the set, 0 otherwise.
Definition: SmallPtrSet.h:381
The loop was partially unrolled – we still have a loop, but with a smaller trip count.
DebugLoc getStartLoc() const
Return the debug location of the start of this loop.
Definition: LoopInfo.cpp:342
DomTreeNodeBase< NodeT > * getNode(const NodeT *BB) const
getNode - return the (Post)DominatorTree node for the specified basic block.
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
void setSuccessor(unsigned idx, BasicBlock *NewSucc)
iterator end()
Definition: ValueMap.h:141
size_type size() const
Definition: SmallPtrSet.h:92
bool hasAddressTaken() const
Returns true if there are any uses of this basic block other than direct branches, switches, etc.
Definition: BasicBlock.h:391
bool contains(const LoopT *L) const
Return true if the specified loop is contained within in this loop.
Definition: LoopInfo.h:109
SmallPtrSet - This class implements a set which is optimized for holding SmallSize or less elements...
Definition: SmallPtrSet.h:417
void emit(DiagnosticInfoOptimizationBase &OptDiag)
Output the remark via the diagnostic handler and to the optimization record file. ...
bool erase(PtrType Ptr)
erase - If the set contains the specified pointer, remove it and return true, otherwise return false...
Definition: SmallPtrSet.h:377
This is a &#39;vector&#39; (really, a variable-sized array), optimized for the case when the array is small...
Definition: SmallVector.h:839
bool dominates(const Instruction *Def, const Use &U) const
Return true if Def dominates a use in User.
Definition: Dominators.cpp:248
iterator begin() const
Definition: LoopInfo.h:141
static GCRegistry::Add< StatepointGC > D("statepoint-example", "an example strategy for statepoint")
static BranchInst * Create(BasicBlock *IfTrue, Instruction *InsertBefore=nullptr)
#define DEBUG_TYPE
raw_ostream & dbgs()
dbgs() - This returns a reference to a raw_ostream for debugging messages.
Definition: Debug.cpp:132
Store the result of a depth first search within basic blocks contained by a single loop...
Definition: LoopIterator.h:97
void push_back(pointer val)
Definition: ilist.h:311
BasicBlock * CloneBasicBlock(const BasicBlock *BB, ValueToValueMapTy &VMap, const Twine &NameSuffix="", Function *F=nullptr, ClonedCodeInfo *CodeInfo=nullptr, DebugInfoFinder *DIFinder=nullptr)
Return a copy of the specified basic block, but without embedding the block into a particular functio...
static void DFS(BasicBlock *Root, SetVector< BasicBlock *> &Set)
LoopT * getParentLoop() const
Definition: LoopInfo.h:100
const std::vector< LoopT * > & getSubLoops() const
Return the loops contained entirely within this loop.
Definition: LoopInfo.h:130
bool isLoopSimplifyForm() const
Return true if the Loop is in the form that the LoopSimplify form transforms loops to...
Definition: LoopInfo.cpp:192
void registerAssumption(CallInst *CI)
Add an @llvm.assume intrinsic to this function&#39;s cache.
void forgetLoop(const Loop *L)
This method should be called by the client when it has changed a loop in a way that may effect Scalar...
iterator begin() const
Definition: SmallPtrSet.h:396
void emplace_back(ArgTypes &&... Args)
Definition: SmallVector.h:644
LLVM_NODISCARD bool empty() const
Definition: SmallVector.h:55
Represents a single loop in the control flow graph.
Definition: LoopInfo.h:464
ArrayRef< BlockT * > getBlocks() const
Get a list of the basic blocks which make up this loop.
Definition: LoopInfo.h:148
StringRef getName() const
Return a constant reference to the value&#39;s name.
Definition: Value.cpp:214
const Function * getParent() const
Return the enclosing method, or null if none.
Definition: BasicBlock.h:106
#define I(x, y, z)
Definition: MD5.cpp:58
DomTreeNodeBase< NodeT > * addNewBlock(NodeT *BB, NodeT *DomBB)
Add a new node to the dominator tree information.
LLVM_NODISCARD std::enable_if<!is_simple_type< Y >::value, typename cast_retty< X, const Y >::ret_type >::type dyn_cast(const Y &Val)
Definition: Casting.h:322
const BasicBlockListType & getBasicBlockList() const
Get the underlying elements of the Function...
Definition: Function.h:632
block_iterator block_end() const
Definition: LoopInfo.h:154
iterator_range< const_phi_iterator > phis() const
Returns a range that iterates over the phis in the basic block.
Definition: BasicBlock.h:324
bool isUnconditional() const
assert(ImpDefSCC.getReg()==AMDGPU::SCC &&ImpDefSCC.isDef())
LLVM Value Representation.
Definition: Value.h:72
The loop was not modified.
void moveBefore(Instruction *MovePos)
Unlink this instruction from its current basic block and insert it into the basic block that MovePos ...
Definition: Instruction.cpp:86
virtual bool anyBlockMayThrow() const
Returns true iff any block of the loop for which this info is contains an instruction that may throw ...
Definition: MustExecute.cpp:40
#define LLVM_DEBUG(X)
Definition: Debug.h:122
iterator_range< block_iterator > blocks() const
Definition: LoopInfo.h:155
block_iterator block_begin() const
Definition: LoopInfo.h:153
BlockT * getExitingBlock() const
If getExitingBlocks would return exactly one block, return that block.
Definition: LoopInfoImpl.h:49
The optimization diagnostic interface.
iterator begin()
Definition: ValueMap.h:140
const BasicBlock * getParent() const
Definition: Instruction.h:66
LoopUnrollResult
Represents the result of a UnrollLoop invocation.
Definition: UnrollLoop.h:51
LoopUnrollResult UnrollAndJamLoop(Loop *L, unsigned Count, unsigned TripCount, unsigned TripMultiple, bool UnrollRemainder, LoopInfo *LI, ScalarEvolution *SE, DominatorTree *DT, AssumptionCache *AC, OptimizationRemarkEmitter *ORE, Loop **EpilogueLoop=nullptr)
bool mayReadOrWriteMemory() const
Return true if this instruction may read or write memory.
Definition: Instruction.h:535
void simplifyLoopAfterUnroll(Loop *L, bool SimplifyIVs, LoopInfo *LI, ScalarEvolution *SE, DominatorTree *DT, AssumptionCache *AC)
Perform some cleanup and simplifications on loops after unrolling.
Definition: LoopUnroll.cpp:257