LLVM  3.7.0
LoopDistribute.cpp
Go to the documentation of this file.
1 //===- LoopDistribute.cpp - Loop Distribution Pass ------------------------===//
2 //
3 // The LLVM Compiler Infrastructure
4 //
5 // This file is distributed under the University of Illinois Open Source
6 // License. See LICENSE.TXT for details.
7 //
8 //===----------------------------------------------------------------------===//
9 //
10 // This file implements the Loop Distribution Pass. Its main focus is to
11 // distribute loops that cannot be vectorized due to dependence cycles. It
12 // tries to isolate the offending dependences into a new loop allowing
13 // vectorization of the remaining parts.
14 //
15 // For dependence analysis, the pass uses the LoopVectorizer's
16 // LoopAccessAnalysis. Because this analysis presumes no change in the order of
17 // memory operations, special care is taken to preserve the lexical order of
18 // these operations.
19 //
20 // Similarly to the Vectorizer, the pass also supports loop versioning to
21 // run-time disambiguate potentially overlapping arrays.
22 //
23 //===----------------------------------------------------------------------===//
24 
#include "llvm/ADT/DepthFirstIterator.h"
#include "llvm/ADT/EquivalenceClasses.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/Analysis/LoopAccessAnalysis.h"
#include "llvm/Analysis/LoopInfo.h"
#include "llvm/IR/Dominators.h"
#include "llvm/Pass.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Debug.h"
#include "llvm/Transforms/Utils/BasicBlockUtils.h"
#include "llvm/Transforms/Utils/Cloning.h"
#include "llvm/Transforms/Utils/LoopVersioning.h"
#include <list>
39 
40 #define LDIST_NAME "loop-distribute"
41 #define DEBUG_TYPE LDIST_NAME
42 
43 using namespace llvm;
44 
45 static cl::opt<bool>
46  LDistVerify("loop-distribute-verify", cl::Hidden,
47  cl::desc("Turn on DominatorTree and LoopInfo verification "
48  "after Loop Distribution"),
49  cl::init(false));
50 
52  "loop-distribute-non-if-convertible", cl::Hidden,
53  cl::desc("Whether to distribute into a loop that may not be "
54  "if-convertible by the loop vectorizer"),
55  cl::init(false));
56 
57 STATISTIC(NumLoopsDistributed, "Number of loops distributed");
58 
59 namespace {
60 /// \brief Maintains the set of instructions of the loop for a partition before
61 /// cloning. After cloning, it hosts the new loop.
62 class InstPartition {
63  typedef SmallPtrSet<Instruction *, 8> InstructionSet;
64 
65 public:
66  InstPartition(Instruction *I, Loop *L, bool DepCycle = false)
67  : DepCycle(DepCycle), OrigLoop(L), ClonedLoop(nullptr) {
68  Set.insert(I);
69  }
70 
71  /// \brief Returns whether this partition contains a dependence cycle.
72  bool hasDepCycle() const { return DepCycle; }
73 
74  /// \brief Adds an instruction to this partition.
75  void add(Instruction *I) { Set.insert(I); }
76 
77  /// \brief Collection accessors.
78  InstructionSet::iterator begin() { return Set.begin(); }
79  InstructionSet::iterator end() { return Set.end(); }
80  InstructionSet::const_iterator begin() const { return Set.begin(); }
81  InstructionSet::const_iterator end() const { return Set.end(); }
82  bool empty() const { return Set.empty(); }
83 
84  /// \brief Moves this partition into \p Other. This partition becomes empty
85  /// after this.
86  void moveTo(InstPartition &Other) {
87  Other.Set.insert(Set.begin(), Set.end());
88  Set.clear();
89  Other.DepCycle |= DepCycle;
90  }
91 
92  /// \brief Populates the partition with a transitive closure of all the
93  /// instructions that the seeded instructions dependent on.
94  void populateUsedSet() {
95  // FIXME: We currently don't use control-dependence but simply include all
96  // blocks (possibly empty at the end) and let simplifycfg mostly clean this
97  // up.
98  for (auto *B : OrigLoop->getBlocks())
99  Set.insert(B->getTerminator());
100 
101  // Follow the use-def chains to form a transitive closure of all the
102  // instructions that the originally seeded instructions depend on.
103  SmallVector<Instruction *, 8> Worklist(Set.begin(), Set.end());
104  while (!Worklist.empty()) {
105  Instruction *I = Worklist.pop_back_val();
106  // Insert instructions from the loop that we depend on.
107  for (Value *V : I->operand_values()) {
108  auto *I = dyn_cast<Instruction>(V);
109  if (I && OrigLoop->contains(I->getParent()) && Set.insert(I).second)
110  Worklist.push_back(I);
111  }
112  }
113  }
114 
115  /// \brief Clones the original loop.
116  ///
117  /// Updates LoopInfo and DominatorTree using the information that block \p
118  /// LoopDomBB dominates the loop.
119  Loop *cloneLoopWithPreheader(BasicBlock *InsertBefore, BasicBlock *LoopDomBB,
120  unsigned Index, LoopInfo *LI,
121  DominatorTree *DT) {
122  ClonedLoop = ::cloneLoopWithPreheader(InsertBefore, LoopDomBB, OrigLoop,
123  VMap, Twine(".ldist") + Twine(Index),
124  LI, DT, ClonedLoopBlocks);
125  return ClonedLoop;
126  }
127 
128  /// \brief The cloned loop. If this partition is mapped to the original loop,
129  /// this is null.
130  const Loop *getClonedLoop() const { return ClonedLoop; }
131 
132  /// \brief Returns the loop where this partition ends up after distribution.
133  /// If this partition is mapped to the original loop then use the block from
134  /// the loop.
135  const Loop *getDistributedLoop() const {
136  return ClonedLoop ? ClonedLoop : OrigLoop;
137  }
138 
139  /// \brief The VMap that is populated by cloning and then used in
140  /// remapinstruction to remap the cloned instructions.
141  ValueToValueMapTy &getVMap() { return VMap; }
142 
143  /// \brief Remaps the cloned instructions using VMap.
144  void remapInstructions() {
145  remapInstructionsInBlocks(ClonedLoopBlocks, VMap);
146  }
147 
148  /// \brief Based on the set of instructions selected for this partition,
149  /// removes the unnecessary ones.
150  void removeUnusedInsts() {
152 
153  for (auto *Block : OrigLoop->getBlocks())
154  for (auto &Inst : *Block)
155  if (!Set.count(&Inst)) {
156  Instruction *NewInst = &Inst;
157  if (!VMap.empty())
158  NewInst = cast<Instruction>(VMap[NewInst]);
159 
160  assert(!isa<BranchInst>(NewInst) &&
161  "Branches are marked used early on");
162  Unused.push_back(NewInst);
163  }
164 
165  // Delete the instructions backwards, as it has a reduced likelihood of
166  // having to update as many def-use and use-def chains.
167  for (auto I = Unused.rbegin(), E = Unused.rend(); I != E; ++I) {
168  auto *Inst = *I;
169 
170  if (!Inst->use_empty())
171  Inst->replaceAllUsesWith(UndefValue::get(Inst->getType()));
172  Inst->eraseFromParent();
173  }
174  }
175 
176  void print() const {
177  if (DepCycle)
178  dbgs() << " (cycle)\n";
179  for (auto *I : Set)
180  // Prefix with the block name.
181  dbgs() << " " << I->getParent()->getName() << ":" << *I << "\n";
182  }
183 
184  void printBlocks() const {
185  for (auto *BB : getDistributedLoop()->getBlocks())
186  dbgs() << *BB;
187  }
188 
189 private:
190  /// \brief Instructions from OrigLoop selected for this partition.
191  InstructionSet Set;
192 
193  /// \brief Whether this partition contains a dependence cycle.
194  bool DepCycle;
195 
196  /// \brief The original loop.
197  Loop *OrigLoop;
198 
199  /// \brief The cloned loop. If this partition is mapped to the original loop,
200  /// this is null.
201  Loop *ClonedLoop;
202 
203  /// \brief The blocks of ClonedLoop including the preheader. If this
204  /// partition is mapped to the original loop, this is empty.
205  SmallVector<BasicBlock *, 8> ClonedLoopBlocks;
206 
207  /// \brief These gets populated once the set of instructions have been
208  /// finalized. If this partition is mapped to the original loop, these are not
209  /// set.
210  ValueToValueMapTy VMap;
211 };
212 
213 /// \brief Holds the set of Partitions. It populates them, merges them and then
214 /// clones the loops.
215 class InstPartitionContainer {
216  typedef DenseMap<Instruction *, int> InstToPartitionIdT;
217 
218 public:
219  InstPartitionContainer(Loop *L, LoopInfo *LI, DominatorTree *DT)
220  : L(L), LI(LI), DT(DT) {}
221 
222  /// \brief Returns the number of partitions.
223  unsigned getSize() const { return PartitionContainer.size(); }
224 
225  /// \brief Adds \p Inst into the current partition if that is marked to
226  /// contain cycles. Otherwise start a new partition for it.
227  void addToCyclicPartition(Instruction *Inst) {
228  // If the current partition is non-cyclic. Start a new one.
229  if (PartitionContainer.empty() || !PartitionContainer.back().hasDepCycle())
230  PartitionContainer.emplace_back(Inst, L, /*DepCycle=*/true);
231  else
232  PartitionContainer.back().add(Inst);
233  }
234 
235  /// \brief Adds \p Inst into a partition that is not marked to contain
236  /// dependence cycles.
237  ///
238  // Initially we isolate memory instructions into as many partitions as
239  // possible, then later we may merge them back together.
240  void addToNewNonCyclicPartition(Instruction *Inst) {
241  PartitionContainer.emplace_back(Inst, L);
242  }
243 
244  /// \brief Merges adjacent non-cyclic partitions.
245  ///
246  /// The idea is that we currently only want to isolate the non-vectorizable
247  /// partition. We could later allow more distribution among these partition
248  /// too.
249  void mergeAdjacentNonCyclic() {
250  mergeAdjacentPartitionsIf(
251  [](const InstPartition *P) { return !P->hasDepCycle(); });
252  }
253 
254  /// \brief If a partition contains only conditional stores, we won't vectorize
255  /// it. Try to merge it with a previous cyclic partition.
256  void mergeNonIfConvertible() {
257  mergeAdjacentPartitionsIf([&](const InstPartition *Partition) {
258  if (Partition->hasDepCycle())
259  return true;
260 
261  // Now, check if all stores are conditional in this partition.
262  bool seenStore = false;
263 
264  for (auto *Inst : *Partition)
265  if (isa<StoreInst>(Inst)) {
266  seenStore = true;
268  return false;
269  }
270  return seenStore;
271  });
272  }
273 
274  /// \brief Merges the partitions according to various heuristics.
275  void mergeBeforePopulating() {
276  mergeAdjacentNonCyclic();
278  mergeNonIfConvertible();
279  }
280 
281  /// \brief Merges partitions in order to ensure that no loads are duplicated.
282  ///
283  /// We can't duplicate loads because that could potentially reorder them.
284  /// LoopAccessAnalysis provides dependency information with the context that
285  /// the order of memory operation is preserved.
286  ///
287  /// Return if any partitions were merged.
288  bool mergeToAvoidDuplicatedLoads() {
289  typedef DenseMap<Instruction *, InstPartition *> LoadToPartitionT;
290  typedef EquivalenceClasses<InstPartition *> ToBeMergedT;
291 
292  LoadToPartitionT LoadToPartition;
293  ToBeMergedT ToBeMerged;
294 
295  // Step through the partitions and create equivalence between partitions
296  // that contain the same load. Also put partitions in between them in the
297  // same equivalence class to avoid reordering of memory operations.
298  for (PartitionContainerT::iterator I = PartitionContainer.begin(),
299  E = PartitionContainer.end();
300  I != E; ++I) {
301  auto *PartI = &*I;
302 
303  // If a load occurs in two partitions PartI and PartJ, merge all
304  // partitions (PartI, PartJ] into PartI.
305  for (Instruction *Inst : *PartI)
306  if (isa<LoadInst>(Inst)) {
307  bool NewElt;
308  LoadToPartitionT::iterator LoadToPart;
309 
310  std::tie(LoadToPart, NewElt) =
311  LoadToPartition.insert(std::make_pair(Inst, PartI));
312  if (!NewElt) {
313  DEBUG(dbgs() << "Merging partitions due to this load in multiple "
314  << "partitions: " << PartI << ", "
315  << LoadToPart->second << "\n" << *Inst << "\n");
316 
317  auto PartJ = I;
318  do {
319  --PartJ;
320  ToBeMerged.unionSets(PartI, &*PartJ);
321  } while (&*PartJ != LoadToPart->second);
322  }
323  }
324  }
325  if (ToBeMerged.empty())
326  return false;
327 
328  // Merge the member of an equivalence class into its class leader. This
329  // makes the members empty.
330  for (ToBeMergedT::iterator I = ToBeMerged.begin(), E = ToBeMerged.end();
331  I != E; ++I) {
332  if (!I->isLeader())
333  continue;
334 
335  auto PartI = I->getData();
336  for (auto PartJ : make_range(std::next(ToBeMerged.member_begin(I)),
337  ToBeMerged.member_end())) {
338  PartJ->moveTo(*PartI);
339  }
340  }
341 
342  // Remove the empty partitions.
343  PartitionContainer.remove_if(
344  [](const InstPartition &P) { return P.empty(); });
345 
346  return true;
347  }
348 
349  /// \brief Sets up the mapping between instructions to partitions. If the
350  /// instruction is duplicated across multiple partitions, set the entry to -1.
351  void setupPartitionIdOnInstructions() {
352  int PartitionID = 0;
353  for (const auto &Partition : PartitionContainer) {
354  for (Instruction *Inst : Partition) {
355  bool NewElt;
356  InstToPartitionIdT::iterator Iter;
357 
358  std::tie(Iter, NewElt) =
359  InstToPartitionId.insert(std::make_pair(Inst, PartitionID));
360  if (!NewElt)
361  Iter->second = -1;
362  }
363  ++PartitionID;
364  }
365  }
366 
367  /// \brief Populates the partition with everything that the seeding
368  /// instructions require.
369  void populateUsedSet() {
370  for (auto &P : PartitionContainer)
371  P.populateUsedSet();
372  }
373 
374  /// \brief This performs the main chunk of the work of cloning the loops for
375  /// the partitions.
376  void cloneLoops(Pass *P) {
377  BasicBlock *OrigPH = L->getLoopPreheader();
378  // At this point the predecessor of the preheader is either the memcheck
379  // block or the top part of the original preheader.
380  BasicBlock *Pred = OrigPH->getSinglePredecessor();
381  assert(Pred && "Preheader does not have a single predecessor");
382  BasicBlock *ExitBlock = L->getExitBlock();
383  assert(ExitBlock && "No single exit block");
384  Loop *NewLoop;
385 
386  assert(!PartitionContainer.empty() && "at least two partitions expected");
387  // We're cloning the preheader along with the loop so we already made sure
388  // it was empty.
389  assert(&*OrigPH->begin() == OrigPH->getTerminator() &&
390  "preheader not empty");
391 
392  // Create a loop for each partition except the last. Clone the original
393  // loop before PH along with adding a preheader for the cloned loop. Then
394  // update PH to point to the newly added preheader.
395  BasicBlock *TopPH = OrigPH;
396  unsigned Index = getSize() - 1;
397  for (auto I = std::next(PartitionContainer.rbegin()),
398  E = PartitionContainer.rend();
399  I != E; ++I, --Index, TopPH = NewLoop->getLoopPreheader()) {
400  auto *Part = &*I;
401 
402  NewLoop = Part->cloneLoopWithPreheader(TopPH, Pred, Index, LI, DT);
403 
404  Part->getVMap()[ExitBlock] = TopPH;
405  Part->remapInstructions();
406  }
407  Pred->getTerminator()->replaceUsesOfWith(OrigPH, TopPH);
408 
409  // Now go in forward order and update the immediate dominator for the
410  // preheaders with the exiting block of the previous loop. Dominance
411  // within the loop is updated in cloneLoopWithPreheader.
412  for (auto Curr = PartitionContainer.cbegin(),
413  Next = std::next(PartitionContainer.cbegin()),
414  E = PartitionContainer.cend();
415  Next != E; ++Curr, ++Next)
417  Next->getDistributedLoop()->getLoopPreheader(),
418  Curr->getDistributedLoop()->getExitingBlock());
419  }
420 
421  /// \brief Removes the dead instructions from the cloned loops.
422  void removeUnusedInsts() {
423  for (auto &Partition : PartitionContainer)
424  Partition.removeUnusedInsts();
425  }
426 
427  /// \brief For each memory pointer, it computes the partitionId the pointer is
428  /// used in.
429  ///
430  /// This returns an array of int where the I-th entry corresponds to I-th
431  /// entry in LAI.getRuntimePointerCheck(). If the pointer is used in multiple
432  /// partitions its entry is set to -1.
434  computePartitionSetForPointers(const LoopAccessInfo &LAI) {
435  const RuntimePointerChecking *RtPtrCheck = LAI.getRuntimePointerChecking();
436 
437  unsigned N = RtPtrCheck->Pointers.size();
438  SmallVector<int, 8> PtrToPartitions(N);
439  for (unsigned I = 0; I < N; ++I) {
440  Value *Ptr = RtPtrCheck->Pointers[I].PointerValue;
441  auto Instructions =
442  LAI.getInstructionsForAccess(Ptr, RtPtrCheck->Pointers[I].IsWritePtr);
443 
444  int &Partition = PtrToPartitions[I];
445  // First set it to uninitialized.
446  Partition = -2;
447  for (Instruction *Inst : Instructions) {
448  // Note that this could be -1 if Inst is duplicated across multiple
449  // partitions.
450  int ThisPartition = this->InstToPartitionId[Inst];
451  if (Partition == -2)
452  Partition = ThisPartition;
453  // -1 means belonging to multiple partitions.
454  else if (Partition == -1)
455  break;
456  else if (Partition != (int)ThisPartition)
457  Partition = -1;
458  }
459  assert(Partition != -2 && "Pointer not belonging to any partition");
460  }
461 
462  return PtrToPartitions;
463  }
464 
465  void print(raw_ostream &OS) const {
466  unsigned Index = 0;
467  for (const auto &P : PartitionContainer) {
468  OS << "Partition " << Index++ << " (" << &P << "):\n";
469  P.print();
470  }
471  }
472 
473  void dump() const { print(dbgs()); }
474 
475 #ifndef NDEBUG
476  friend raw_ostream &operator<<(raw_ostream &OS,
477  const InstPartitionContainer &Partitions) {
478  Partitions.print(OS);
479  return OS;
480  }
481 #endif
482 
483  void printBlocks() const {
484  unsigned Index = 0;
485  for (const auto &P : PartitionContainer) {
486  dbgs() << "\nPartition " << Index++ << " (" << &P << "):\n";
487  P.printBlocks();
488  }
489  }
490 
491 private:
492  typedef std::list<InstPartition> PartitionContainerT;
493 
494  /// \brief List of partitions.
495  PartitionContainerT PartitionContainer;
496 
497  /// \brief Mapping from Instruction to partition Id. If the instruction
498  /// belongs to multiple partitions the entry contains -1.
499  InstToPartitionIdT InstToPartitionId;
500 
501  Loop *L;
502  LoopInfo *LI;
503  DominatorTree *DT;
504 
505  /// \brief The control structure to merge adjacent partitions if both satisfy
506  /// the \p Predicate.
507  template <class UnaryPredicate>
508  void mergeAdjacentPartitionsIf(UnaryPredicate Predicate) {
509  InstPartition *PrevMatch = nullptr;
510  for (auto I = PartitionContainer.begin(); I != PartitionContainer.end();) {
511  auto DoesMatch = Predicate(&*I);
512  if (PrevMatch == nullptr && DoesMatch) {
513  PrevMatch = &*I;
514  ++I;
515  } else if (PrevMatch != nullptr && DoesMatch) {
516  I->moveTo(*PrevMatch);
517  I = PartitionContainer.erase(I);
518  } else {
519  PrevMatch = nullptr;
520  ++I;
521  }
522  }
523  }
524 };
525 
526 /// \brief For each memory instruction, this class maintains difference of the
527 /// number of unsafe dependences that start out from this instruction minus
528 /// those that end here.
529 ///
530 /// By traversing the memory instructions in program order and accumulating this
531 /// number, we know whether any unsafe dependence crosses over a program point.
532 class MemoryInstructionDependences {
534 
535 public:
536  struct Entry {
537  Instruction *Inst;
538  unsigned NumUnsafeDependencesStartOrEnd;
539 
540  Entry(Instruction *Inst) : Inst(Inst), NumUnsafeDependencesStartOrEnd(0) {}
541  };
542 
543  typedef SmallVector<Entry, 8> AccessesType;
544 
545  AccessesType::const_iterator begin() const { return Accesses.begin(); }
546  AccessesType::const_iterator end() const { return Accesses.end(); }
547 
548  MemoryInstructionDependences(
549  const SmallVectorImpl<Instruction *> &Instructions,
550  const SmallVectorImpl<Dependence> &InterestingDependences) {
551  Accesses.append(Instructions.begin(), Instructions.end());
552 
553  DEBUG(dbgs() << "Backward dependences:\n");
554  for (auto &Dep : InterestingDependences)
555  if (Dep.isPossiblyBackward()) {
556  // Note that the designations source and destination follow the program
557  // order, i.e. source is always first. (The direction is given by the
558  // DepType.)
559  ++Accesses[Dep.Source].NumUnsafeDependencesStartOrEnd;
560  --Accesses[Dep.Destination].NumUnsafeDependencesStartOrEnd;
561 
562  DEBUG(Dep.print(dbgs(), 2, Instructions));
563  }
564  }
565 
566 private:
567  AccessesType Accesses;
568 };
569 
570 /// \brief Returns the instructions that use values defined in the loop.
571 static SmallVector<Instruction *, 8> findDefsUsedOutsideOfLoop(Loop *L) {
572  SmallVector<Instruction *, 8> UsedOutside;
573 
574  for (auto *Block : L->getBlocks())
575  // FIXME: I believe that this could use copy_if if the Inst reference could
576  // be adapted into a pointer.
577  for (auto &Inst : *Block) {
578  auto Users = Inst.users();
579  if (std::any_of(Users.begin(), Users.end(), [&](User *U) {
580  auto *Use = cast<Instruction>(U);
581  return !L->contains(Use->getParent());
582  }))
583  UsedOutside.push_back(&Inst);
584  }
585 
586  return UsedOutside;
587 }
588 
589 /// \brief The pass class.
590 class LoopDistribute : public FunctionPass {
591 public:
592  LoopDistribute() : FunctionPass(ID) {
594  }
595 
596  bool runOnFunction(Function &F) override {
597  LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
598  LAA = &getAnalysis<LoopAccessAnalysis>();
599  DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree();
600 
601  // Build up a worklist of inner-loops to vectorize. This is necessary as the
602  // act of distributing a loop creates new loops and can invalidate iterators
603  // across the loops.
604  SmallVector<Loop *, 8> Worklist;
605 
606  for (Loop *TopLevelLoop : *LI)
607  for (Loop *L : depth_first(TopLevelLoop))
608  // We only handle inner-most loops.
609  if (L->empty())
610  Worklist.push_back(L);
611 
612  // Now walk the identified inner loops.
613  bool Changed = false;
614  for (Loop *L : Worklist)
615  Changed |= processLoop(L);
616 
617  // Process each loop nest in the function.
618  return Changed;
619  }
620 
621  void getAnalysisUsage(AnalysisUsage &AU) const override {
627  }
628 
629  static char ID;
630 
631 private:
632  /// \brief Try to distribute an inner-most loop.
633  bool processLoop(Loop *L) {
634  assert(L->empty() && "Only process inner loops.");
635 
636  DEBUG(dbgs() << "\nLDist: In \"" << L->getHeader()->getParent()->getName()
637  << "\" checking " << *L << "\n");
638 
639  BasicBlock *PH = L->getLoopPreheader();
640  if (!PH) {
641  DEBUG(dbgs() << "Skipping; no preheader");
642  return false;
643  }
644  if (!L->getExitBlock()) {
645  DEBUG(dbgs() << "Skipping; multiple exit blocks");
646  return false;
647  }
648  // LAA will check that we only have a single exiting block.
649 
650  const LoopAccessInfo &LAI = LAA->getInfo(L, ValueToValueMap());
651 
652  // Currently, we only distribute to isolate the part of the loop with
653  // dependence cycles to enable partial vectorization.
654  if (LAI.canVectorizeMemory()) {
655  DEBUG(dbgs() << "Skipping; memory operations are safe for vectorization");
656  return false;
657  }
658  auto *InterestingDependences =
660  if (!InterestingDependences || InterestingDependences->empty()) {
661  DEBUG(dbgs() << "Skipping; No unsafe dependences to isolate");
662  return false;
663  }
664 
665  InstPartitionContainer Partitions(L, LI, DT);
666 
667  // First, go through each memory operation and assign them to consecutive
668  // partitions (the order of partitions follows program order). Put those
669  // with unsafe dependences into "cyclic" partition otherwise put each store
670  // in its own "non-cyclic" partition (we'll merge these later).
671  //
672  // Note that a memory operation (e.g. Load2 below) at a program point that
673  // has an unsafe dependence (Store3->Load1) spanning over it must be
674  // included in the same cyclic partition as the dependent operations. This
675  // is to preserve the original program order after distribution. E.g.:
676  //
677  // NumUnsafeDependencesStartOrEnd NumUnsafeDependencesActive
678  // Load1 -. 1 0->1
679  // Load2 | /Unsafe/ 0 1
680  // Store3 -' -1 1->0
681  // Load4 0 0
682  //
683  // NumUnsafeDependencesActive > 0 indicates this situation and in this case
684  // we just keep assigning to the same cyclic partition until
685  // NumUnsafeDependencesActive reaches 0.
686  const MemoryDepChecker &DepChecker = LAI.getDepChecker();
687  MemoryInstructionDependences MID(DepChecker.getMemoryInstructions(),
688  *InterestingDependences);
689 
690  int NumUnsafeDependencesActive = 0;
691  for (auto &InstDep : MID) {
692  Instruction *I = InstDep.Inst;
693  // We update NumUnsafeDependencesActive post-instruction, catch the
694  // start of a dependence directly via NumUnsafeDependencesStartOrEnd.
695  if (NumUnsafeDependencesActive ||
696  InstDep.NumUnsafeDependencesStartOrEnd > 0)
697  Partitions.addToCyclicPartition(I);
698  else
699  Partitions.addToNewNonCyclicPartition(I);
700  NumUnsafeDependencesActive += InstDep.NumUnsafeDependencesStartOrEnd;
701  assert(NumUnsafeDependencesActive >= 0 &&
702  "Negative number of dependences active");
703  }
704 
705  // Add partitions for values used outside. These partitions can be out of
706  // order from the original program order. This is OK because if the
707  // partition uses a load we will merge this partition with the original
708  // partition of the load that we set up in the previous loop (see
709  // mergeToAvoidDuplicatedLoads).
710  auto DefsUsedOutside = findDefsUsedOutsideOfLoop(L);
711  for (auto *Inst : DefsUsedOutside)
712  Partitions.addToNewNonCyclicPartition(Inst);
713 
714  DEBUG(dbgs() << "Seeded partitions:\n" << Partitions);
715  if (Partitions.getSize() < 2)
716  return false;
717 
718  // Run the merge heuristics: Merge non-cyclic adjacent partitions since we
719  // should be able to vectorize these together.
720  Partitions.mergeBeforePopulating();
721  DEBUG(dbgs() << "\nMerged partitions:\n" << Partitions);
722  if (Partitions.getSize() < 2)
723  return false;
724 
725  // Now, populate the partitions with non-memory operations.
726  Partitions.populateUsedSet();
727  DEBUG(dbgs() << "\nPopulated partitions:\n" << Partitions);
728 
729  // In order to preserve original lexical order for loads, keep them in the
730  // partition that we set up in the MemoryInstructionDependences loop.
731  if (Partitions.mergeToAvoidDuplicatedLoads()) {
732  DEBUG(dbgs() << "\nPartitions merged to ensure unique loads:\n"
733  << Partitions);
734  if (Partitions.getSize() < 2)
735  return false;
736  }
737 
738  DEBUG(dbgs() << "\nDistributing loop: " << *L << "\n");
739  // We're done forming the partitions set up the reverse mapping from
740  // instructions to partitions.
741  Partitions.setupPartitionIdOnInstructions();
742 
743  // To keep things simple have an empty preheader before we version or clone
744  // the loop. (Also split if this has no predecessor, i.e. entry, because we
745  // rely on PH having a predecessor.)
746  if (!PH->getSinglePredecessor() || &*PH->begin() != PH->getTerminator())
747  SplitBlock(PH, PH->getTerminator(), DT, LI);
748 
749  // If we need run-time checks to disambiguate pointers are run-time, version
750  // the loop now.
751  auto PtrToPartition = Partitions.computePartitionSetForPointers(LAI);
752  LoopVersioning LVer(LAI, L, LI, DT, &PtrToPartition);
753  if (LVer.needsRuntimeChecks()) {
754  DEBUG(dbgs() << "\nPointers:\n");
755  DEBUG(LAI.getRuntimePointerChecking()->print(dbgs(), 0, &PtrToPartition));
756  LVer.versionLoop(this);
757  LVer.addPHINodes(DefsUsedOutside);
758  }
759 
760  // Create identical copies of the original loop for each partition and hook
761  // them up sequentially.
762  Partitions.cloneLoops(this);
763 
764  // Now, we remove the instruction from each loop that don't belong to that
765  // partition.
766  Partitions.removeUnusedInsts();
767  DEBUG(dbgs() << "\nAfter removing unused Instrs:\n");
768  DEBUG(Partitions.printBlocks());
769 
770  if (LDistVerify) {
771  LI->verify();
772  DT->verifyDomTree();
773  }
774 
775  ++NumLoopsDistributed;
776  return true;
777  }
778 
779  // Analyses used.
780  LoopInfo *LI;
781  LoopAccessAnalysis *LAA;
782  DominatorTree *DT;
783 };
784 } // anonymous namespace
785 
786 char LoopDistribute::ID;
787 static const char ldist_name[] = "Loop Distribition";
788 
789 INITIALIZE_PASS_BEGIN(LoopDistribute, LDIST_NAME, ldist_name, false, false)
793 INITIALIZE_PASS_END(LoopDistribute, LDIST_NAME, ldist_name, false, false)
794 
795 namespace llvm {
796 FunctionPass *createLoopDistributePass() { return new LoopDistribute(); }
797 }
Pass interface - Implemented by all 'passes'.
Definition: Pass.h:82
const_iterator end(StringRef path)
Get end iterator over path.
Definition: Path.cpp:240
AnalysisUsage & addPreserved()
Add the specified Pass class to the set of analyses preserved by this pass.
static PassRegistry * getPassRegistry()
getPassRegistry - Access the global registry object, which is automatically initialized at applicatio...
#define LDIST_NAME
STATISTIC(NumFunctions,"Total number of functions")
BasicBlock * SplitBlock(BasicBlock *Old, Instruction *SplitPt, DominatorTree *DT=nullptr, LoopInfo *LI=nullptr)
SplitBlock - Split the specified block at the specified instruction - every thing before SplitPt stay...
const SmallVectorImpl< Instruction * > & getMemoryInstructions() const
The vector of memory access instructions.
static cl::opt< bool > DistributeNonIfConvertible("loop-distribute-non-if-convertible", cl::Hidden, cl::desc("Whether to distribute into a loop that may not be ""if-convertible by the loop vectorizer"), cl::init(false))
const_iterator begin(StringRef path)
Get begin iterator over path.
Definition: Path.cpp:232
F(f)
Checks memory dependences among accesses to the same underlying object to determine whether there vec...
iv Induction Variable Users
Definition: IVUsers.cpp:43
BlockT * getExitBlock() const
getExitBlock - If getExitBlocks would return exactly one block, return that block.
Definition: LoopInfoImpl.h:78
const std::vector< BlockT * > & getBlocks() const
getBlocks - Get a list of the basic blocks which make up this loop.
Definition: LoopInfo.h:139
BlockT * getHeader() const
Definition: LoopInfo.h:96
void remapInstructionsInBlocks(const SmallVectorImpl< BasicBlock * > &Blocks, ValueToValueMapTy &VMap)
Remaps instructions in Blocks using the mapping in VMap.
DenseMap< const Value *, Value * > ValueToValueMap
StringRef getName() const
Return a constant reference to the value's name.
Definition: Value.cpp:188
iterator begin()
Instruction iterator methods.
Definition: BasicBlock.h:231
AnalysisUsage & addRequired()
#define INITIALIZE_PASS_DEPENDENCY(depName)
Definition: PassSupport.h:70
Twine - A lightweight data structure for efficiently representing the concatenation of temporary valu...
Definition: Twine.h:79
A Use represents the edge between a Value definition and its users.
Definition: Use.h:69
#define INITIALIZE_PASS_END(passName, arg, name, cfg, analysis)
Definition: PassSupport.h:75
This class consists of common code factored out of the SmallVector class to reduce code duplication b...
Definition: APInt.h:33
const SmallVectorImpl< Dependence > * getInterestingDependences() const
Returns the interesting dependences.
#define false
Definition: ConvertUTF.c:65
ELFYAML::ELF_STO Other
Definition: ELFYAML.cpp:591
FunctionPass * createLoopDistributePass()
void print(raw_ostream &OS, unsigned Depth=0, const SmallVectorImpl< int > *PtrPartition=nullptr) const
Print the list run-time memory checks necessary.
const RuntimePointerChecking * getRuntimePointerChecking() const
static bool add(uint64_t *dest, const uint64_t *x, const uint64_t *y, unsigned len)
This function adds the integer array x to the integer array Y and places the result in dest...
Definition: APInt.cpp:238
Concrete subclass of DominatorTreeBase that is used to compute a normal dominator tree...
Definition: Dominators.h:67
void replaceUsesOfWith(Value *From, Value *To)
Replace uses of one Value with another.
Definition: User.cpp:24
#define P(N)
initializer< Ty > init(const Ty &Val)
Definition: CommandLine.h:325
friend const_iterator end(StringRef path)
Get end iterator over path.
Definition: Path.cpp:240
void dump(const SparseBitVector< ElementSize > &LHS, raw_ostream &out)
BlockT * getLoopPreheader() const
getLoopPreheader - If there is a preheader for this loop, return it.
Definition: LoopInfoImpl.h:108
LLVM Basic Block Representation.
Definition: BasicBlock.h:65
SmallVector< Instruction *, 4 > getInstructionsForAccess(Value *Ptr, bool isWrite) const
Return the list of instructions that use Ptr to read or write memory.
EquivalenceClasses - This represents a collection of equivalence classes and supports three efficient...
Represent the analysis usage information of a pass.
bool contains(const LoopT *L) const
contains - Return true if the specified loop is contained within this loop.
Definition: LoopInfo.h:105
FunctionPass class - This class is used to implement most global optimizations.
Definition: Pass.h:294
void initializeLoopDistributePass(PassRegistry &)
static UndefValue * get(Type *T)
get() - Static factory methods - Return an 'undef' object of the specified type.
Definition: Constants.cpp:1473
iterator_range< T > make_range(T x, T y)
Convenience function for iterating over sub-ranges.
SmallPtrSet - This class implements a set which is optimized for holding SmallSize or less elements...
Definition: SmallPtrSet.h:299
friend const_iterator begin(StringRef path)
Get begin iterator over path.
Definition: Path.cpp:232
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small...
Definition: SmallVector.h:861
#define INITIALIZE_PASS_BEGIN(passName, arg, name, cfg, analysis)
Definition: PassSupport.h:67
const MemoryDepChecker & getDepChecker() const
The Memory Dependence Checker which can determine the loop-independent and loop-carried dependences b...
Drive the analysis of memory accesses in the loop.
This class emits a version of the loop where run-time checks ensure that may-alias pointers can't ove...
raw_ostream & dbgs()
dbgs() - This returns a reference to a raw_ostream for debugging messages.
Definition: Debug.cpp:123
iterator_range< user_iterator > users()
Definition: Value.h:300
BasicBlock * getSinglePredecessor()
Return the predecessor of this block if it has a single predecessor block.
Definition: BasicBlock.cpp:211
static const char ldist_name[]
LLVM_ATTRIBUTE_UNUSED_RESULT std::enable_if< !is_simple_type< Y >::value, typename cast_retty< X, const Y >::ret_type >::type dyn_cast(const Y &Val)
Definition: Casting.h:285
Loop * cloneLoopWithPreheader(BasicBlock *Before, BasicBlock *LoopDomBB, Loop *OrigLoop, ValueToValueMapTy &VMap, const Twine &NameSuffix, LoopInfo *LI, DominatorTree *DT, SmallVectorImpl< BasicBlock * > &Blocks)
Clones a loop OrigLoop.
Holds information about the memory runtime legality checks to verify that a group of pointers do not ...
This analysis provides dependence information for the memory accesses of a loop.
Dependence between memory access instructions.
#define I(x, y, z)
Definition: MD5.cpp:54
#define N
TerminatorInst * getTerminator()
Returns the terminator instruction if the block is well formed or null if the block is not well forme...
Definition: BasicBlock.cpp:124
SmallVector< PointerInfo, 2 > Pointers
Information about the pointers that may require checking.
iterator_range< value_op_iterator > operand_values()
Definition: User.h:215
raw_ostream & operator<<(raw_ostream &OS, const APInt &I)
Definition: APInt.h:1738
void changeImmediateDominator(DomTreeNodeBase< NodeT > *N, DomTreeNodeBase< NodeT > *NewIDom)
changeImmediateDominator - This method is used to update the dominator tree information when a node's...
iterator_range< df_iterator< T > > depth_first(const T &G)
static cl::opt< bool > LDistVerify("loop-distribute-verify", cl::Hidden, cl::desc("Turn on DominatorTree and LoopInfo verification ""after Loop Distribution"), cl::init(false))
LLVM Value Representation.
Definition: Value.h:69
static bool blockNeedsPredication(BasicBlock *BB, Loop *TheLoop, DominatorTree *DT)
Return true if the block BB needs to be predicated in order for the loop to be vectorized.
bool canVectorizeMemory() const
Return true if we can analyze the memory accesses in the loop and there are no memory dependence cycles...
This class implements an extremely fast bulk output stream that can only output to a stream...
Definition: raw_ostream.h:38
bool empty() const
Definition: LoopInfo.h:135
#define DEBUG(X)
Definition: Debug.h:92
The legacy pass manager's analysis pass to compute loop information.
Definition: LoopInfo.h:737
virtual void print(raw_ostream &O, const Module *M) const
print - Print out the internal state of the pass.
Definition: Pass.cpp:111
Legacy analysis pass which computes a DominatorTree.
Definition: Dominators.h:203
Dependence - This class represents a dependence between two memory references in a function...
const BasicBlock * getParent() const
Definition: Instruction.h:72