LLVM  10.0.0svn
LoopUnrollPass.cpp
Go to the documentation of this file.
1 //===- LoopUnroll.cpp - Loop unroller pass --------------------------------===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 // This pass implements a simple loop unroller. It works best when loops have
10 // been canonicalized by the -indvars pass, allowing it to determine the trip
11 // counts of loops easily.
12 //===----------------------------------------------------------------------===//
13 
15 #include "llvm/ADT/DenseMap.h"
16 #include "llvm/ADT/DenseMapInfo.h"
17 #include "llvm/ADT/DenseSet.h"
18 #include "llvm/ADT/None.h"
19 #include "llvm/ADT/Optional.h"
20 #include "llvm/ADT/STLExtras.h"
21 #include "llvm/ADT/SetVector.h"
22 #include "llvm/ADT/SmallPtrSet.h"
23 #include "llvm/ADT/SmallVector.h"
24 #include "llvm/ADT/StringRef.h"
30 #include "llvm/Analysis/LoopInfo.h"
31 #include "llvm/Analysis/LoopPass.h"
37 #include "llvm/IR/BasicBlock.h"
38 #include "llvm/IR/CFG.h"
39 #include "llvm/IR/Constant.h"
40 #include "llvm/IR/Constants.h"
41 #include "llvm/IR/DiagnosticInfo.h"
42 #include "llvm/IR/Dominators.h"
43 #include "llvm/IR/Function.h"
44 #include "llvm/IR/Instruction.h"
45 #include "llvm/IR/Instructions.h"
46 #include "llvm/IR/IntrinsicInst.h"
47 #include "llvm/IR/Metadata.h"
48 #include "llvm/IR/PassManager.h"
49 #include "llvm/Pass.h"
50 #include "llvm/Support/Casting.h"
52 #include "llvm/Support/Debug.h"
55 #include "llvm/Transforms/Scalar.h"
57 #include "llvm/Transforms/Utils.h"
62 #include <algorithm>
63 #include <cassert>
64 #include <cstdint>
65 #include <limits>
66 #include <string>
67 #include <tuple>
68 #include <utility>
69 
70 using namespace llvm;
71 
72 #define DEBUG_TYPE "loop-unroll"
73 
75  "forget-scev-loop-unroll", cl::init(false), cl::Hidden,
76  cl::desc("Forget everything in SCEV when doing LoopUnroll, instead of just"
77  " the current top-most loop. This is somtimes preferred to reduce"
78  " compile time."));
79 
80 static cl::opt<unsigned>
81  UnrollThreshold("unroll-threshold", cl::Hidden,
82  cl::desc("The cost threshold for loop unrolling"));
83 
85  "unroll-partial-threshold", cl::Hidden,
86  cl::desc("The cost threshold for partial loop unrolling"));
87 
89  "unroll-max-percent-threshold-boost", cl::init(400), cl::Hidden,
90  cl::desc("The maximum 'boost' (represented as a percentage >= 100) applied "
91  "to the threshold when aggressively unrolling a loop due to the "
92  "dynamic cost savings. If completely unrolling a loop will reduce "
93  "the total runtime from X to Y, we boost the loop unroll "
94  "threshold to DefaultThreshold*std::min(MaxPercentThresholdBoost, "
95  "X/Y). This limit avoids excessive code bloat."));
96 
98  "unroll-max-iteration-count-to-analyze", cl::init(10), cl::Hidden,
99  cl::desc("Don't allow loop unrolling to simulate more than this number of"
100  "iterations when checking full unroll profitability"));
101 
103  "unroll-count", cl::Hidden,
104  cl::desc("Use this unroll count for all loops including those with "
105  "unroll_count pragma values, for testing purposes"));
106 
108  "unroll-max-count", cl::Hidden,
109  cl::desc("Set the max unroll count for partial and runtime unrolling, for"
110  "testing purposes"));
111 
113  "unroll-full-max-count", cl::Hidden,
114  cl::desc(
115  "Set the max unroll count for full unrolling, for testing purposes"));
116 
118  "unroll-peel-count", cl::Hidden,
119  cl::desc("Set the unroll peeling count, for testing purposes"));
120 
121 static cl::opt<bool>
122  UnrollAllowPartial("unroll-allow-partial", cl::Hidden,
123  cl::desc("Allows loops to be partially unrolled until "
124  "-unroll-threshold loop size is reached."));
125 
127  "unroll-allow-remainder", cl::Hidden,
128  cl::desc("Allow generation of a loop remainder (extra iterations) "
129  "when unrolling a loop."));
130 
131 static cl::opt<bool>
132  UnrollRuntime("unroll-runtime", cl::ZeroOrMore, cl::Hidden,
133  cl::desc("Unroll loops with run-time trip counts"));
134 
136  "unroll-max-upperbound", cl::init(8), cl::Hidden,
137  cl::desc(
138  "The max of trip count upper bound that is considered in unrolling"));
139 
141  "pragma-unroll-threshold", cl::init(16 * 1024), cl::Hidden,
142  cl::desc("Unrolled size limit for loops with an unroll(full) or "
143  "unroll_count pragma."));
144 
146  "flat-loop-tripcount-threshold", cl::init(5), cl::Hidden,
147  cl::desc("If the runtime tripcount for the loop is lower than the "
148  "threshold, the loop is considered as flat and will be less "
149  "aggressively unrolled."));
150 
151 static cl::opt<bool>
152  UnrollAllowPeeling("unroll-allow-peeling", cl::init(true), cl::Hidden,
153  cl::desc("Allows loops to be peeled when the dynamic "
154  "trip count is known to be low."));
155 
157  "unroll-remainder", cl::Hidden,
158  cl::desc("Allow the loop remainder to be unrolled."));
159 
160 // This option isn't ever intended to be enabled, it serves to allow
161 // experiments to check the assumptions about when this kind of revisit is
162 // necessary.
164  "unroll-revisit-child-loops", cl::Hidden,
165  cl::desc("Enqueue and re-visit child loops in the loop PM after unrolling. "
166  "This shouldn't typically be needed as child loops (or their "
167  "clones) were already visited."));
168 
169 /// A magic value for use with the Threshold parameter to indicate
170 /// that the loop unroll should be performed regardless of how much
171 /// code expansion would result.
173 
174 /// Gather the various unrolling parameters based on the defaults, compiler
175 /// flags, TTI overrides and user specified parameters.
177  Loop *L, ScalarEvolution &SE, const TargetTransformInfo &TTI,
178  BlockFrequencyInfo *BFI, ProfileSummaryInfo *PSI, int OptLevel,
179  Optional<unsigned> UserThreshold, Optional<unsigned> UserCount,
180  Optional<bool> UserAllowPartial, Optional<bool> UserRuntime,
181  Optional<bool> UserUpperBound, Optional<bool> UserAllowPeeling,
182  Optional<bool> UserAllowProfileBasedPeeling,
183  Optional<unsigned> UserFullUnrollMaxCount) {
185 
186  // Set up the defaults
187  UP.Threshold = OptLevel > 2 ? 300 : 150;
188  UP.MaxPercentThresholdBoost = 400;
189  UP.OptSizeThreshold = 0;
190  UP.PartialThreshold = 150;
192  UP.Count = 0;
193  UP.PeelCount = 0;
197  UP.BEInsns = 2;
198  UP.Partial = false;
199  UP.Runtime = false;
200  UP.AllowRemainder = true;
201  UP.UnrollRemainder = false;
202  UP.AllowExpensiveTripCount = false;
203  UP.Force = false;
204  UP.UpperBound = false;
205  UP.AllowPeeling = true;
206  UP.UnrollAndJam = false;
207  UP.PeelProfiledIterations = true;
209 
210  // Override with any target specific settings
211  TTI.getUnrollingPreferences(L, SE, UP);
212 
213  // Apply size attributes
214  bool OptForSize = L->getHeader()->getParent()->hasOptSize() ||
216  if (OptForSize) {
217  UP.Threshold = UP.OptSizeThreshold;
219  UP.MaxPercentThresholdBoost = 100;
220  }
221 
222  // Apply any user values specified by cl::opt
223  if (UnrollThreshold.getNumOccurrences() > 0)
225  if (UnrollPartialThreshold.getNumOccurrences() > 0)
227  if (UnrollMaxPercentThresholdBoost.getNumOccurrences() > 0)
229  if (UnrollMaxCount.getNumOccurrences() > 0)
231  if (UnrollFullMaxCount.getNumOccurrences() > 0)
233  if (UnrollPeelCount.getNumOccurrences() > 0)
235  if (UnrollAllowPartial.getNumOccurrences() > 0)
237  if (UnrollAllowRemainder.getNumOccurrences() > 0)
239  if (UnrollRuntime.getNumOccurrences() > 0)
240  UP.Runtime = UnrollRuntime;
241  if (UnrollMaxUpperBound == 0)
242  UP.UpperBound = false;
243  if (UnrollAllowPeeling.getNumOccurrences() > 0)
245  if (UnrollUnrollRemainder.getNumOccurrences() > 0)
247 
248  // Apply user values provided by argument
249  if (UserThreshold.hasValue()) {
250  UP.Threshold = *UserThreshold;
251  UP.PartialThreshold = *UserThreshold;
252  }
253  if (UserCount.hasValue())
254  UP.Count = *UserCount;
255  if (UserAllowPartial.hasValue())
256  UP.Partial = *UserAllowPartial;
257  if (UserRuntime.hasValue())
258  UP.Runtime = *UserRuntime;
259  if (UserUpperBound.hasValue())
260  UP.UpperBound = *UserUpperBound;
261  if (UserAllowPeeling.hasValue())
262  UP.AllowPeeling = *UserAllowPeeling;
263  if (UserAllowProfileBasedPeeling.hasValue())
264  UP.PeelProfiledIterations = *UserAllowProfileBasedPeeling;
265  if (UserFullUnrollMaxCount.hasValue())
266  UP.FullUnrollMaxCount = *UserFullUnrollMaxCount;
267 
268  return UP;
269 }
270 
271 namespace {
272 
273 /// A struct to densely store the state of an instruction after unrolling at
274 /// each iteration.
275 ///
276 /// This is designed to work like a tuple of <Instruction *, int> for the
277 /// purposes of hashing and lookup, but to be able to associate two boolean
278 /// states with each key.
279 struct UnrolledInstState {
280  Instruction *I;
281  int Iteration : 30;
282  unsigned IsFree : 1;
283  unsigned IsCounted : 1;
284 };
285 
286 /// Hashing and equality testing for a set of the instruction states.
287 struct UnrolledInstStateKeyInfo {
288  using PtrInfo = DenseMapInfo<Instruction *>;
290 
291  static inline UnrolledInstState getEmptyKey() {
292  return {PtrInfo::getEmptyKey(), 0, 0, 0};
293  }
294 
295  static inline UnrolledInstState getTombstoneKey() {
296  return {PtrInfo::getTombstoneKey(), 0, 0, 0};
297  }
298 
299  static inline unsigned getHashValue(const UnrolledInstState &S) {
300  return PairInfo::getHashValue({S.I, S.Iteration});
301  }
302 
303  static inline bool isEqual(const UnrolledInstState &LHS,
304  const UnrolledInstState &RHS) {
305  return PairInfo::isEqual({LHS.I, LHS.Iteration}, {RHS.I, RHS.Iteration});
306  }
307 };
308 
309 struct EstimatedUnrollCost {
310  /// The estimated cost after unrolling.
311  unsigned UnrolledCost;
312 
313  /// The estimated dynamic cost of executing the instructions in the
314  /// rolled form.
315  unsigned RolledDynamicCost;
316 };
317 
318 } // end anonymous namespace
319 
320 /// Figure out if the loop is worth full unrolling.
321 ///
322 /// Complete loop unrolling can make some loads constant, and we need to know
323 /// if that would expose any further optimization opportunities. This routine
324 /// estimates this optimization. It computes cost of unrolled loop
325 /// (UnrolledCost) and dynamic cost of the original loop (RolledDynamicCost). By
326 /// dynamic cost we mean that we won't count costs of blocks that are known not
327 /// to be executed (i.e. if we have a branch in the loop and we know that at the
328 /// given iteration its condition would be resolved to true, we won't add up the
329 /// cost of the 'false'-block).
330 /// \returns Optional value, holding the RolledDynamicCost and UnrolledCost. If
331 /// the analysis failed (no benefits expected from the unrolling, or the loop is
332 /// too big to analyze), the returned value is None.
334  const Loop *L, unsigned TripCount, DominatorTree &DT, ScalarEvolution &SE,
335  const SmallPtrSetImpl<const Value *> &EphValues,
336  const TargetTransformInfo &TTI, unsigned MaxUnrolledLoopSize) {
337  // We want to be able to scale offsets by the trip count and add more offsets
338  // to them without checking for overflows, and we already don't want to
339  // analyze *massive* trip counts, so we force the max to be reasonably small.
341  (unsigned)(std::numeric_limits<int>::max() / 2) &&
342  "The unroll iterations max is too large!");
343 
344  // Only analyze inner loops. We can't properly estimate cost of nested loops
345  // and we won't visit inner loops again anyway.
346  if (!L->empty())
347  return None;
348 
349  // Don't simulate loops with a big or unknown tripcount
350  if (!UnrollMaxIterationsCountToAnalyze || !TripCount ||
352  return None;
353 
356  DenseMap<Value *, Constant *> SimplifiedValues;
357  SmallVector<std::pair<Value *, Constant *>, 4> SimplifiedInputValues;
358 
359  // The estimated cost of the unrolled form of the loop. We try to estimate
360  // this by simplifying as much as we can while computing the estimate.
361  unsigned UnrolledCost = 0;
362 
363  // We also track the estimated dynamic (that is, actually executed) cost in
364  // the rolled form. This helps identify cases when the savings from unrolling
365  // aren't just exposing dead control flows, but actual reduced dynamic
366  // instructions due to the simplifications which we expect to occur after
367  // unrolling.
368  unsigned RolledDynamicCost = 0;
369 
370  // We track the simplification of each instruction in each iteration. We use
371  // this to recursively merge costs into the unrolled cost on-demand so that
372  // we don't count the cost of any dead code. This is essentially a map from
373  // <instruction, int> to <bool, bool>, but stored as a densely packed struct.
375 
376  // A small worklist used to accumulate cost of instructions from each
377  // observable and reached root in the loop.
378  SmallVector<Instruction *, 16> CostWorklist;
379 
380  // PHI-used worklist used between iterations while accumulating cost.
381  SmallVector<Instruction *, 4> PHIUsedList;
382 
383  // Helper function to accumulate cost for instructions in the loop.
384  auto AddCostRecursively = [&](Instruction &RootI, int Iteration) {
385  assert(Iteration >= 0 && "Cannot have a negative iteration!");
386  assert(CostWorklist.empty() && "Must start with an empty cost list");
387  assert(PHIUsedList.empty() && "Must start with an empty phi used list");
388  CostWorklist.push_back(&RootI);
389  for (;; --Iteration) {
390  do {
391  Instruction *I = CostWorklist.pop_back_val();
392 
393  // InstCostMap only uses I and Iteration as a key, the other two values
394  // don't matter here.
395  auto CostIter = InstCostMap.find({I, Iteration, 0, 0});
396  if (CostIter == InstCostMap.end())
397  // If an input to a PHI node comes from a dead path through the loop
398  // we may have no cost data for it here. What that actually means is
399  // that it is free.
400  continue;
401  auto &Cost = *CostIter;
402  if (Cost.IsCounted)
403  // Already counted this instruction.
404  continue;
405 
406  // Mark that we are counting the cost of this instruction now.
407  Cost.IsCounted = true;
408 
409  // If this is a PHI node in the loop header, just add it to the PHI set.
410  if (auto *PhiI = dyn_cast<PHINode>(I))
411  if (PhiI->getParent() == L->getHeader()) {
412  assert(Cost.IsFree && "Loop PHIs shouldn't be evaluated as they "
413  "inherently simplify during unrolling.");
414  if (Iteration == 0)
415  continue;
416 
417  // Push the incoming value from the backedge into the PHI used list
418  // if it is an in-loop instruction. We'll use this to populate the
419  // cost worklist for the next iteration (as we count backwards).
420  if (auto *OpI = dyn_cast<Instruction>(
421  PhiI->getIncomingValueForBlock(L->getLoopLatch())))
422  if (L->contains(OpI))
423  PHIUsedList.push_back(OpI);
424  continue;
425  }
426 
427  // First accumulate the cost of this instruction.
428  if (!Cost.IsFree) {
429  UnrolledCost += TTI.getUserCost(I);
430  LLVM_DEBUG(dbgs() << "Adding cost of instruction (iteration "
431  << Iteration << "): ");
432  LLVM_DEBUG(I->dump());
433  }
434 
435  // We must count the cost of every operand which is not free,
436  // recursively. If we reach a loop PHI node, simply add it to the set
437  // to be considered on the next iteration (backwards!).
438  for (Value *Op : I->operands()) {
439  // Check whether this operand is free due to being a constant or
440  // outside the loop.
441  auto *OpI = dyn_cast<Instruction>(Op);
442  if (!OpI || !L->contains(OpI))
443  continue;
444 
445  // Otherwise accumulate its cost.
446  CostWorklist.push_back(OpI);
447  }
448  } while (!CostWorklist.empty());
449 
450  if (PHIUsedList.empty())
451  // We've exhausted the search.
452  break;
453 
454  assert(Iteration > 0 &&
455  "Cannot track PHI-used values past the first iteration!");
456  CostWorklist.append(PHIUsedList.begin(), PHIUsedList.end());
457  PHIUsedList.clear();
458  }
459  };
460 
461  // Ensure that we don't violate the loop structure invariants relied on by
462  // this analysis.
463  assert(L->isLoopSimplifyForm() && "Must put loop into normal form first.");
464  assert(L->isLCSSAForm(DT) &&
465  "Must have loops in LCSSA form to track live-out values.");
466 
467  LLVM_DEBUG(dbgs() << "Starting LoopUnroll profitability analysis...\n");
468 
469  // Simulate execution of each iteration of the loop counting instructions,
470  // which would be simplified.
471  // Since the same load will take different values on different iterations,
472  // we literally have to go through all loop's iterations.
473  for (unsigned Iteration = 0; Iteration < TripCount; ++Iteration) {
474  LLVM_DEBUG(dbgs() << " Analyzing iteration " << Iteration << "\n");
475 
476  // Prepare for the iteration by collecting any simplified entry or backedge
477  // inputs.
478  for (Instruction &I : *L->getHeader()) {
479  auto *PHI = dyn_cast<PHINode>(&I);
480  if (!PHI)
481  break;
482 
483  // The loop header PHI nodes must have exactly two input: one from the
484  // loop preheader and one from the loop latch.
485  assert(
486  PHI->getNumIncomingValues() == 2 &&
487  "Must have an incoming value only for the preheader and the latch.");
488 
489  Value *V = PHI->getIncomingValueForBlock(
490  Iteration == 0 ? L->getLoopPreheader() : L->getLoopLatch());
491  Constant *C = dyn_cast<Constant>(V);
492  if (Iteration != 0 && !C)
493  C = SimplifiedValues.lookup(V);
494  if (C)
495  SimplifiedInputValues.push_back({PHI, C});
496  }
497 
498  // Now clear and re-populate the map for the next iteration.
499  SimplifiedValues.clear();
500  while (!SimplifiedInputValues.empty())
501  SimplifiedValues.insert(SimplifiedInputValues.pop_back_val());
502 
503  UnrolledInstAnalyzer Analyzer(Iteration, SimplifiedValues, SE, L);
504 
505  BBWorklist.clear();
506  BBWorklist.insert(L->getHeader());
507  // Note that we *must not* cache the size, this loop grows the worklist.
508  for (unsigned Idx = 0; Idx != BBWorklist.size(); ++Idx) {
509  BasicBlock *BB = BBWorklist[Idx];
510 
511  // Visit all instructions in the given basic block and try to simplify
512  // it. We don't change the actual IR, just count optimization
513  // opportunities.
514  for (Instruction &I : *BB) {
515  // These won't get into the final code - don't even try calculating the
516  // cost for them.
517  if (isa<DbgInfoIntrinsic>(I) || EphValues.count(&I))
518  continue;
519 
520  // Track this instruction's expected baseline cost when executing the
521  // rolled loop form.
522  RolledDynamicCost += TTI.getUserCost(&I);
523 
524  // Visit the instruction to analyze its loop cost after unrolling,
525  // and if the visitor returns true, mark the instruction as free after
526  // unrolling and continue.
527  bool IsFree = Analyzer.visit(I);
528  bool Inserted = InstCostMap.insert({&I, (int)Iteration,
529  (unsigned)IsFree,
530  /*IsCounted*/ false}).second;
531  (void)Inserted;
532  assert(Inserted && "Cannot have a state for an unvisited instruction!");
533 
534  if (IsFree)
535  continue;
536 
537  // Can't properly model a cost of a call.
538  // FIXME: With a proper cost model we should be able to do it.
539  if (auto *CI = dyn_cast<CallInst>(&I)) {
540  const Function *Callee = CI->getCalledFunction();
541  if (!Callee || TTI.isLoweredToCall(Callee)) {
542  LLVM_DEBUG(dbgs() << "Can't analyze cost of loop with call\n");
543  return None;
544  }
545  }
546 
547  // If the instruction might have a side-effect recursively account for
548  // the cost of it and all the instructions leading up to it.
549  if (I.mayHaveSideEffects())
550  AddCostRecursively(I, Iteration);
551 
552  // If unrolled body turns out to be too big, bail out.
553  if (UnrolledCost > MaxUnrolledLoopSize) {
554  LLVM_DEBUG(dbgs() << " Exceeded threshold.. exiting.\n"
555  << " UnrolledCost: " << UnrolledCost
556  << ", MaxUnrolledLoopSize: " << MaxUnrolledLoopSize
557  << "\n");
558  return None;
559  }
560  }
561 
562  Instruction *TI = BB->getTerminator();
563 
564  // Add in the live successors by first checking whether we have terminator
565  // that may be simplified based on the values simplified by this call.
566  BasicBlock *KnownSucc = nullptr;
567  if (BranchInst *BI = dyn_cast<BranchInst>(TI)) {
568  if (BI->isConditional()) {
569  if (Constant *SimpleCond =
570  SimplifiedValues.lookup(BI->getCondition())) {
571  // Just take the first successor if condition is undef
572  if (isa<UndefValue>(SimpleCond))
573  KnownSucc = BI->getSuccessor(0);
574  else if (ConstantInt *SimpleCondVal =
575  dyn_cast<ConstantInt>(SimpleCond))
576  KnownSucc = BI->getSuccessor(SimpleCondVal->isZero() ? 1 : 0);
577  }
578  }
579  } else if (SwitchInst *SI = dyn_cast<SwitchInst>(TI)) {
580  if (Constant *SimpleCond =
581  SimplifiedValues.lookup(SI->getCondition())) {
582  // Just take the first successor if condition is undef
583  if (isa<UndefValue>(SimpleCond))
584  KnownSucc = SI->getSuccessor(0);
585  else if (ConstantInt *SimpleCondVal =
586  dyn_cast<ConstantInt>(SimpleCond))
587  KnownSucc = SI->findCaseValue(SimpleCondVal)->getCaseSuccessor();
588  }
589  }
590  if (KnownSucc) {
591  if (L->contains(KnownSucc))
592  BBWorklist.insert(KnownSucc);
593  else
594  ExitWorklist.insert({BB, KnownSucc});
595  continue;
596  }
597 
598  // Add BB's successors to the worklist.
599  for (BasicBlock *Succ : successors(BB))
600  if (L->contains(Succ))
601  BBWorklist.insert(Succ);
602  else
603  ExitWorklist.insert({BB, Succ});
604  AddCostRecursively(*TI, Iteration);
605  }
606 
607  // If we found no optimization opportunities on the first iteration, we
608  // won't find them on later ones too.
609  if (UnrolledCost == RolledDynamicCost) {
610  LLVM_DEBUG(dbgs() << " No opportunities found.. exiting.\n"
611  << " UnrolledCost: " << UnrolledCost << "\n");
612  return None;
613  }
614  }
615 
616  while (!ExitWorklist.empty()) {
617  BasicBlock *ExitingBB, *ExitBB;
618  std::tie(ExitingBB, ExitBB) = ExitWorklist.pop_back_val();
619 
620  for (Instruction &I : *ExitBB) {
621  auto *PN = dyn_cast<PHINode>(&I);
622  if (!PN)
623  break;
624 
625  Value *Op = PN->getIncomingValueForBlock(ExitingBB);
626  if (auto *OpI = dyn_cast<Instruction>(Op))
627  if (L->contains(OpI))
628  AddCostRecursively(*OpI, TripCount - 1);
629  }
630  }
631 
632  LLVM_DEBUG(dbgs() << "Analysis finished:\n"
633  << "UnrolledCost: " << UnrolledCost << ", "
634  << "RolledDynamicCost: " << RolledDynamicCost << "\n");
635  return {{UnrolledCost, RolledDynamicCost}};
636 }
637 
638 /// ApproximateLoopSize - Approximate the size of the loop.
640  const Loop *L, unsigned &NumCalls, bool &NotDuplicatable, bool &Convergent,
641  const TargetTransformInfo &TTI,
642  const SmallPtrSetImpl<const Value *> &EphValues, unsigned BEInsns) {
644  for (BasicBlock *BB : L->blocks())
645  Metrics.analyzeBasicBlock(BB, TTI, EphValues);
646  NumCalls = Metrics.NumInlineCandidates;
647  NotDuplicatable = Metrics.notDuplicatable;
648  Convergent = Metrics.convergent;
649 
650  unsigned LoopSize = Metrics.NumInsts;
651 
652  // Don't allow an estimate of size zero. This would allows unrolling of loops
653  // with huge iteration counts, which is a compile time problem even if it's
654  // not a problem for code quality. Also, the code using this size may assume
655  // that each loop has at least three instructions (likely a conditional
656  // branch, a comparison feeding that branch, and some kind of loop increment
657  // feeding that comparison instruction).
658  LoopSize = std::max(LoopSize, BEInsns + 1);
659 
660  return LoopSize;
661 }
662 
663 // Returns the loop hint metadata node with the given name (for example,
664 // "llvm.loop.unroll.count"). If no such metadata node exists, then nullptr is
665 // returned.
667  if (MDNode *LoopID = L->getLoopID())
668  return GetUnrollMetadata(LoopID, Name);
669  return nullptr;
670 }
671 
672 // Returns true if the loop has an unroll(full) pragma.
673 static bool HasUnrollFullPragma(const Loop *L) {
674  return GetUnrollMetadataForLoop(L, "llvm.loop.unroll.full");
675 }
676 
677 // Returns true if the loop has an unroll(enable) pragma. This metadata is used
678 // for both "#pragma unroll" and "#pragma clang loop unroll(enable)" directives.
679 static bool HasUnrollEnablePragma(const Loop *L) {
680  return GetUnrollMetadataForLoop(L, "llvm.loop.unroll.enable");
681 }
682 
683 // Returns true if the loop has an runtime unroll(disable) pragma.
684 static bool HasRuntimeUnrollDisablePragma(const Loop *L) {
685  return GetUnrollMetadataForLoop(L, "llvm.loop.unroll.runtime.disable");
686 }
687 
688 // If loop has an unroll_count pragma return the (necessarily
689 // positive) value from the pragma. Otherwise return 0.
690 static unsigned UnrollCountPragmaValue(const Loop *L) {
691  MDNode *MD = GetUnrollMetadataForLoop(L, "llvm.loop.unroll.count");
692  if (MD) {
693  assert(MD->getNumOperands() == 2 &&
694  "Unroll count hint metadata should have two operands.");
695  unsigned Count =
696  mdconst::extract<ConstantInt>(MD->getOperand(1))->getZExtValue();
697  assert(Count >= 1 && "Unroll count must be positive.");
698  return Count;
699  }
700  return 0;
701 }
702 
703 // Computes the boosting factor for complete unrolling.
704 // If fully unrolling the loop would save a lot of RolledDynamicCost, it would
705 // be beneficial to fully unroll the loop even if unrolledcost is large. We
706 // use (RolledDynamicCost / UnrolledCost) to model the unroll benefits to adjust
707 // the unroll threshold.
708 static unsigned getFullUnrollBoostingFactor(const EstimatedUnrollCost &Cost,
709  unsigned MaxPercentThresholdBoost) {
710  if (Cost.RolledDynamicCost >= std::numeric_limits<unsigned>::max() / 100)
711  return 100;
712  else if (Cost.UnrolledCost != 0)
713  // The boosting factor is RolledDynamicCost / UnrolledCost
714  return std::min(100 * Cost.RolledDynamicCost / Cost.UnrolledCost,
715  MaxPercentThresholdBoost);
716  else
717  return MaxPercentThresholdBoost;
718 }
719 
720 // Returns loop size estimation for unrolled loop.
721 static uint64_t getUnrolledLoopSize(
722  unsigned LoopSize,
724  assert(LoopSize >= UP.BEInsns && "LoopSize should not be less than BEInsns!");
725  return (uint64_t)(LoopSize - UP.BEInsns) * UP.Count + UP.BEInsns;
726 }
727 
728 // Returns true if unroll count was set explicitly.
729 // Calculates unroll count and writes it to UP.Count.
730 // Unless IgnoreUser is true, will also use metadata and command-line options
731 // that are specific to to the LoopUnroll pass (which, for instance, are
732 // irrelevant for the LoopUnrollAndJam pass).
733 // FIXME: This function is used by LoopUnroll and LoopUnrollAndJam, but consumes
734 // many LoopUnroll-specific options. The shared functionality should be
735 // refactored into it own function.
737  Loop *L, const TargetTransformInfo &TTI, DominatorTree &DT, LoopInfo *LI,
738  ScalarEvolution &SE, const SmallPtrSetImpl<const Value *> &EphValues,
739  OptimizationRemarkEmitter *ORE, unsigned &TripCount, unsigned MaxTripCount,
740  bool MaxOrZero, unsigned &TripMultiple, unsigned LoopSize,
741  TargetTransformInfo::UnrollingPreferences &UP, bool &UseUpperBound) {
742 
743  // Check for explicit Count.
744  // 1st priority is unroll count set by "unroll-count" option.
745  bool UserUnrollCount = UnrollCount.getNumOccurrences() > 0;
746  if (UserUnrollCount) {
747  UP.Count = UnrollCount;
748  UP.AllowExpensiveTripCount = true;
749  UP.Force = true;
750  if (UP.AllowRemainder && getUnrolledLoopSize(LoopSize, UP) < UP.Threshold)
751  return true;
752  }
753 
754  // 2nd priority is unroll count set by pragma.
755  unsigned PragmaCount = UnrollCountPragmaValue(L);
756  if (PragmaCount > 0) {
757  UP.Count = PragmaCount;
758  UP.Runtime = true;
759  UP.AllowExpensiveTripCount = true;
760  UP.Force = true;
761  if ((UP.AllowRemainder || (TripMultiple % PragmaCount == 0)) &&
763  return true;
764  }
765  bool PragmaFullUnroll = HasUnrollFullPragma(L);
766  if (PragmaFullUnroll && TripCount != 0) {
767  UP.Count = TripCount;
768  if (getUnrolledLoopSize(LoopSize, UP) < PragmaUnrollThreshold)
769  return false;
770  }
771 
772  bool PragmaEnableUnroll = HasUnrollEnablePragma(L);
773  bool ExplicitUnroll = PragmaCount > 0 || PragmaFullUnroll ||
774  PragmaEnableUnroll || UserUnrollCount;
775 
776  if (ExplicitUnroll && TripCount != 0) {
777  // If the loop has an unrolling pragma, we want to be more aggressive with
778  // unrolling limits. Set thresholds to at least the PragmaUnrollThreshold
779  // value which is larger than the default limits.
780  UP.Threshold = std::max<unsigned>(UP.Threshold, PragmaUnrollThreshold);
781  UP.PartialThreshold =
782  std::max<unsigned>(UP.PartialThreshold, PragmaUnrollThreshold);
783  }
784 
785  // 3rd priority is full unroll count.
786  // Full unroll makes sense only when TripCount or its upper bound could be
787  // statically calculated.
788  // Also we need to check if we exceed FullUnrollMaxCount.
789  // If using the upper bound to unroll, TripMultiple should be set to 1 because
790  // we do not know when loop may exit.
791 
792  // We can unroll by the upper bound amount if it's generally allowed or if
793  // we know that the loop is executed either the upper bound or zero times.
794  // (MaxOrZero unrolling keeps only the first loop test, so the number of
795  // loop tests remains the same compared to the non-unrolled version, whereas
796  // the generic upper bound unrolling keeps all but the last loop test so the
797  // number of loop tests goes up which may end up being worse on targets with
798  // constrained branch predictor resources so is controlled by an option.)
799  // In addition we only unroll small upper bounds.
800  unsigned FullUnrollMaxTripCount = MaxTripCount;
801  if (!(UP.UpperBound || MaxOrZero) ||
802  FullUnrollMaxTripCount > UnrollMaxUpperBound)
803  FullUnrollMaxTripCount = 0;
804 
805  // UnrollByMaxCount and ExactTripCount cannot both be non zero since we only
806  // compute the former when the latter is zero.
807  unsigned ExactTripCount = TripCount;
808  assert((ExactTripCount == 0 || FullUnrollMaxTripCount == 0) &&
809  "ExtractTripCount and UnrollByMaxCount cannot both be non zero.");
810 
811  unsigned FullUnrollTripCount =
812  ExactTripCount ? ExactTripCount : FullUnrollMaxTripCount;
813  UP.Count = FullUnrollTripCount;
814  if (FullUnrollTripCount && FullUnrollTripCount <= UP.FullUnrollMaxCount) {
815  // When computing the unrolled size, note that BEInsns are not replicated
816  // like the rest of the loop body.
817  if (getUnrolledLoopSize(LoopSize, UP) < UP.Threshold) {
818  UseUpperBound = (FullUnrollMaxTripCount == FullUnrollTripCount);
819  TripCount = FullUnrollTripCount;
820  TripMultiple = UP.UpperBound ? 1 : TripMultiple;
821  return ExplicitUnroll;
822  } else {
823  // The loop isn't that small, but we still can fully unroll it if that
824  // helps to remove a significant number of instructions.
825  // To check that, run additional analysis on the loop.
827  L, FullUnrollTripCount, DT, SE, EphValues, TTI,
828  UP.Threshold * UP.MaxPercentThresholdBoost / 100)) {
829  unsigned Boost =
831  if (Cost->UnrolledCost < UP.Threshold * Boost / 100) {
832  UseUpperBound = (FullUnrollMaxTripCount == FullUnrollTripCount);
833  TripCount = FullUnrollTripCount;
834  TripMultiple = UP.UpperBound ? 1 : TripMultiple;
835  return ExplicitUnroll;
836  }
837  }
838  }
839  }
840 
841  // 4th priority is loop peeling.
842  computePeelCount(L, LoopSize, UP, TripCount, SE);
843  if (UP.PeelCount) {
844  UP.Runtime = false;
845  UP.Count = 1;
846  return ExplicitUnroll;
847  }
848 
849  // 5th priority is partial unrolling.
850  // Try partial unroll only when TripCount could be statically calculated.
851  if (TripCount) {
852  UP.Partial |= ExplicitUnroll;
853  if (!UP.Partial) {
854  LLVM_DEBUG(dbgs() << " will not try to unroll partially because "
855  << "-unroll-allow-partial not given\n");
856  UP.Count = 0;
857  return false;
858  }
859  if (UP.Count == 0)
860  UP.Count = TripCount;
861  if (UP.PartialThreshold != NoThreshold) {
862  // Reduce unroll count to be modulo of TripCount for partial unrolling.
863  if (getUnrolledLoopSize(LoopSize, UP) > UP.PartialThreshold)
864  UP.Count =
865  (std::max(UP.PartialThreshold, UP.BEInsns + 1) - UP.BEInsns) /
866  (LoopSize - UP.BEInsns);
867  if (UP.Count > UP.MaxCount)
868  UP.Count = UP.MaxCount;
869  while (UP.Count != 0 && TripCount % UP.Count != 0)
870  UP.Count--;
871  if (UP.AllowRemainder && UP.Count <= 1) {
872  // If there is no Count that is modulo of TripCount, set Count to
873  // largest power-of-two factor that satisfies the threshold limit.
874  // As we'll create fixup loop, do the type of unrolling only if
875  // remainder loop is allowed.
877  while (UP.Count != 0 &&
878  getUnrolledLoopSize(LoopSize, UP) > UP.PartialThreshold)
879  UP.Count >>= 1;
880  }
881  if (UP.Count < 2) {
882  if (PragmaEnableUnroll)
883  ORE->emit([&]() {
885  "UnrollAsDirectedTooLarge",
886  L->getStartLoc(), L->getHeader())
887  << "Unable to unroll loop as directed by unroll(enable) "
888  "pragma "
889  "because unrolled size is too large.";
890  });
891  UP.Count = 0;
892  }
893  } else {
894  UP.Count = TripCount;
895  }
896  if (UP.Count > UP.MaxCount)
897  UP.Count = UP.MaxCount;
898  if ((PragmaFullUnroll || PragmaEnableUnroll) && TripCount &&
899  UP.Count != TripCount)
900  ORE->emit([&]() {
901  return OptimizationRemarkMissed(DEBUG_TYPE,
902  "FullUnrollAsDirectedTooLarge",
903  L->getStartLoc(), L->getHeader())
904  << "Unable to fully unroll loop as directed by unroll pragma "
905  "because "
906  "unrolled size is too large.";
907  });
908  LLVM_DEBUG(dbgs() << " partially unrolling with count: " << UP.Count
909  << "\n");
910  return ExplicitUnroll;
911  }
912  assert(TripCount == 0 &&
913  "All cases when TripCount is constant should be covered here.");
914  if (PragmaFullUnroll)
915  ORE->emit([&]() {
917  DEBUG_TYPE, "CantFullUnrollAsDirectedRuntimeTripCount",
918  L->getStartLoc(), L->getHeader())
919  << "Unable to fully unroll loop as directed by unroll(full) "
920  "pragma "
921  "because loop has a runtime trip count.";
922  });
923 
924  // 6th priority is runtime unrolling.
925  // Don't unroll a runtime trip count loop when it is disabled.
927  UP.Count = 0;
928  return false;
929  }
930 
931  // Don't unroll a small upper bound loop unless user or TTI asked to do so.
932  if (MaxTripCount && !UP.Force && MaxTripCount < UnrollMaxUpperBound) {
933  UP.Count = 0;
934  return false;
935  }
936 
937  // Check if the runtime trip count is too small when profile is available.
938  if (L->getHeader()->getParent()->hasProfileData()) {
939  if (auto ProfileTripCount = getLoopEstimatedTripCount(L)) {
940  if (*ProfileTripCount < FlatLoopTripCountThreshold)
941  return false;
942  else
943  UP.AllowExpensiveTripCount = true;
944  }
945  }
946 
947  // Reduce count based on the type of unrolling and the threshold values.
948  UP.Runtime |= PragmaEnableUnroll || PragmaCount > 0 || UserUnrollCount;
949  if (!UP.Runtime) {
950  LLVM_DEBUG(
951  dbgs() << " will not try to unroll loop with runtime trip count "
952  << "-unroll-runtime not given\n");
953  UP.Count = 0;
954  return false;
955  }
956  if (UP.Count == 0)
958 
959  // Reduce unroll count to be the largest power-of-two factor of
960  // the original count which satisfies the threshold limit.
961  while (UP.Count != 0 &&
962  getUnrolledLoopSize(LoopSize, UP) > UP.PartialThreshold)
963  UP.Count >>= 1;
964 
965 #ifndef NDEBUG
966  unsigned OrigCount = UP.Count;
967 #endif
968 
969  if (!UP.AllowRemainder && UP.Count != 0 && (TripMultiple % UP.Count) != 0) {
970  while (UP.Count != 0 && TripMultiple % UP.Count != 0)
971  UP.Count >>= 1;
972  LLVM_DEBUG(
973  dbgs() << "Remainder loop is restricted (that could architecture "
974  "specific or because the loop contains a convergent "
975  "instruction), so unroll count must divide the trip "
976  "multiple, "
977  << TripMultiple << ". Reducing unroll count from " << OrigCount
978  << " to " << UP.Count << ".\n");
979 
980  using namespace ore;
981 
982  if (PragmaCount > 0 && !UP.AllowRemainder)
983  ORE->emit([&]() {
985  "DifferentUnrollCountFromDirected",
986  L->getStartLoc(), L->getHeader())
987  << "Unable to unroll loop the number of times directed by "
988  "unroll_count pragma because remainder loop is restricted "
989  "(that could architecture specific or because the loop "
990  "contains a convergent instruction) and so must have an "
991  "unroll "
992  "count that divides the loop trip multiple of "
993  << NV("TripMultiple", TripMultiple) << ". Unrolling instead "
994  << NV("UnrollCount", UP.Count) << " time(s).";
995  });
996  }
997 
998  if (UP.Count > UP.MaxCount)
999  UP.Count = UP.MaxCount;
1000 
1001  if (MaxTripCount && UP.Count > MaxTripCount)
1002  UP.Count = MaxTripCount;
1003 
1004  LLVM_DEBUG(dbgs() << " runtime unrolling with count: " << UP.Count
1005  << "\n");
1006  if (UP.Count < 2)
1007  UP.Count = 0;
1008  return ExplicitUnroll;
1009 }
1010 
1012  Loop *L, DominatorTree &DT, LoopInfo *LI, ScalarEvolution &SE,
1013  const TargetTransformInfo &TTI, AssumptionCache &AC,
1015  ProfileSummaryInfo *PSI, bool PreserveLCSSA, int OptLevel,
1016  bool OnlyWhenForced, bool ForgetAllSCEV, Optional<unsigned> ProvidedCount,
1017  Optional<unsigned> ProvidedThreshold, Optional<bool> ProvidedAllowPartial,
1018  Optional<bool> ProvidedRuntime, Optional<bool> ProvidedUpperBound,
1019  Optional<bool> ProvidedAllowPeeling,
1020  Optional<bool> ProvidedAllowProfileBasedPeeling,
1021  Optional<unsigned> ProvidedFullUnrollMaxCount) {
1022  LLVM_DEBUG(dbgs() << "Loop Unroll: F["
1023  << L->getHeader()->getParent()->getName() << "] Loop %"
1024  << L->getHeader()->getName() << "\n");
1026  if (TM & TM_Disable)
1028  if (!L->isLoopSimplifyForm()) {
1029  LLVM_DEBUG(
1030  dbgs() << " Not unrolling loop which is not in loop-simplify form.\n");
1032  }
1033 
1034  // When automtatic unrolling is disabled, do not unroll unless overridden for
1035  // this loop.
1036  if (OnlyWhenForced && !(TM & TM_Enable))
1038 
1039  bool OptForSize = L->getHeader()->getParent()->hasOptSize();
1040  unsigned NumInlineCandidates;
1041  bool NotDuplicatable;
1042  bool Convergent;
1044  L, SE, TTI, BFI, PSI, OptLevel, ProvidedThreshold, ProvidedCount,
1045  ProvidedAllowPartial, ProvidedRuntime, ProvidedUpperBound,
1046  ProvidedAllowPeeling, ProvidedAllowProfileBasedPeeling,
1047  ProvidedFullUnrollMaxCount);
1048 
1049  // Exit early if unrolling is disabled. For OptForSize, we pick the loop size
1050  // as threshold later on.
1051  if (UP.Threshold == 0 && (!UP.Partial || UP.PartialThreshold == 0) &&
1052  !OptForSize)
1054 
1056  CodeMetrics::collectEphemeralValues(L, &AC, EphValues);
1057 
1058  unsigned LoopSize =
1059  ApproximateLoopSize(L, NumInlineCandidates, NotDuplicatable, Convergent,
1060  TTI, EphValues, UP.BEInsns);
1061  LLVM_DEBUG(dbgs() << " Loop Size = " << LoopSize << "\n");
1062  if (NotDuplicatable) {
1063  LLVM_DEBUG(dbgs() << " Not unrolling loop which contains non-duplicatable"
1064  << " instructions.\n");
1066  }
1067 
1068  // When optimizing for size, use LoopSize + 1 as threshold (we use < Threshold
1069  // later), to (fully) unroll loops, if it does not increase code size.
1070  if (OptForSize)
1071  UP.Threshold = std::max(UP.Threshold, LoopSize + 1);
1072 
1073  if (NumInlineCandidates != 0) {
1074  LLVM_DEBUG(dbgs() << " Not unrolling loop with inlinable calls.\n");
1076  }
1077 
1078  // Find trip count and trip multiple if count is not available
1079  unsigned TripCount = 0;
1080  unsigned TripMultiple = 1;
1081  // If there are multiple exiting blocks but one of them is the latch, use the
1082  // latch for the trip count estimation. Otherwise insist on a single exiting
1083  // block for the trip count estimation.
1084  BasicBlock *ExitingBlock = L->getLoopLatch();
1085  if (!ExitingBlock || !L->isLoopExiting(ExitingBlock))
1086  ExitingBlock = L->getExitingBlock();
1087  if (ExitingBlock) {
1088  TripCount = SE.getSmallConstantTripCount(L, ExitingBlock);
1089  TripMultiple = SE.getSmallConstantTripMultiple(L, ExitingBlock);
1090  }
1091 
1092  // If the loop contains a convergent operation, the prelude we'd add
1093  // to do the first few instructions before we hit the unrolled loop
1094  // is unsafe -- it adds a control-flow dependency to the convergent
1095  // operation. Therefore restrict remainder loop (try unrollig without).
1096  //
1097  // TODO: This is quite conservative. In practice, convergent_op()
1098  // is likely to be called unconditionally in the loop. In this
1099  // case, the program would be ill-formed (on most architectures)
1100  // unless n were the same on all threads in a thread group.
1101  // Assuming n is the same on all threads, any kind of unrolling is
1102  // safe. But currently llvm's notion of convergence isn't powerful
1103  // enough to express this.
1104  if (Convergent)
1105  UP.AllowRemainder = false;
1106 
1107  // Try to find the trip count upper bound if we cannot find the exact trip
1108  // count.
1109  unsigned MaxTripCount = 0;
1110  bool MaxOrZero = false;
1111  if (!TripCount) {
1112  MaxTripCount = SE.getSmallConstantMaxTripCount(L);
1113  MaxOrZero = SE.isBackedgeTakenCountMaxOrZero(L);
1114  }
1115 
1116  // computeUnrollCount() decides whether it is beneficial to use upper bound to
1117  // fully unroll the loop.
1118  bool UseUpperBound = false;
1119  bool IsCountSetExplicitly = computeUnrollCount(
1120  L, TTI, DT, LI, SE, EphValues, &ORE, TripCount, MaxTripCount, MaxOrZero,
1121  TripMultiple, LoopSize, UP, UseUpperBound);
1122  if (!UP.Count)
1124  // Unroll factor (Count) must be less or equal to TripCount.
1125  if (TripCount && UP.Count > TripCount)
1126  UP.Count = TripCount;
1127 
1128  // Save loop properties before it is transformed.
1129  MDNode *OrigLoopID = L->getLoopID();
1130 
1131  // Unroll the loop.
1132  Loop *RemainderLoop = nullptr;
1133  LoopUnrollResult UnrollResult = UnrollLoop(
1134  L,
1135  {UP.Count, TripCount, UP.Force, UP.Runtime, UP.AllowExpensiveTripCount,
1136  UseUpperBound, MaxOrZero, TripMultiple, UP.PeelCount, UP.UnrollRemainder,
1137  ForgetAllSCEV},
1138  LI, &SE, &DT, &AC, &ORE, PreserveLCSSA, &RemainderLoop);
1139  if (UnrollResult == LoopUnrollResult::Unmodified)
1141 
1142  if (RemainderLoop) {
1143  Optional<MDNode *> RemainderLoopID =
1146  if (RemainderLoopID.hasValue())
1147  RemainderLoop->setLoopID(RemainderLoopID.getValue());
1148  }
1149 
1150  if (UnrollResult != LoopUnrollResult::FullyUnrolled) {
1151  Optional<MDNode *> NewLoopID =
1154  if (NewLoopID.hasValue()) {
1155  L->setLoopID(NewLoopID.getValue());
1156 
1157  // Do not setLoopAlreadyUnrolled if loop attributes have been specified
1158  // explicitly.
1159  return UnrollResult;
1160  }
1161  }
1162 
1163  // If loop has an unroll count pragma or unrolled by explicitly set count
1164  // mark loop as unrolled to prevent unrolling beyond that requested.
1165  // If the loop was peeled, we already "used up" the profile information
1166  // we had, so we don't want to unroll or peel again.
1167  if (UnrollResult != LoopUnrollResult::FullyUnrolled &&
1168  (IsCountSetExplicitly || (UP.PeelProfiledIterations && UP.PeelCount)))
1170 
1171  return UnrollResult;
1172 }
1173 
1174 namespace {
1175 
1176 class LoopUnroll : public LoopPass {
1177 public:
1178  static char ID; // Pass ID, replacement for typeid
1179 
1180  int OptLevel;
1181 
1182  /// If false, use a cost model to determine whether unrolling of a loop is
1183  /// profitable. If true, only loops that explicitly request unrolling via
1184  /// metadata are considered. All other loops are skipped.
1185  bool OnlyWhenForced;
1186 
1187  /// If false, when SCEV is invalidated, only forget everything in the
1188  /// top-most loop (call forgetTopMostLoop), of the loop being processed.
1189  /// Otherwise, forgetAllLoops and rebuild when needed next.
1190  bool ForgetAllSCEV;
1191 
1192  Optional<unsigned> ProvidedCount;
1193  Optional<unsigned> ProvidedThreshold;
1194  Optional<bool> ProvidedAllowPartial;
1195  Optional<bool> ProvidedRuntime;
1196  Optional<bool> ProvidedUpperBound;
1197  Optional<bool> ProvidedAllowPeeling;
1198  Optional<bool> ProvidedAllowProfileBasedPeeling;
1199  Optional<unsigned> ProvidedFullUnrollMaxCount;
1200 
1201  LoopUnroll(int OptLevel = 2, bool OnlyWhenForced = false,
1202  bool ForgetAllSCEV = false, Optional<unsigned> Threshold = None,
1203  Optional<unsigned> Count = None,
1204  Optional<bool> AllowPartial = None, Optional<bool> Runtime = None,
1205  Optional<bool> UpperBound = None,
1206  Optional<bool> AllowPeeling = None,
1207  Optional<bool> AllowProfileBasedPeeling = None,
1208  Optional<unsigned> ProvidedFullUnrollMaxCount = None)
1209  : LoopPass(ID), OptLevel(OptLevel), OnlyWhenForced(OnlyWhenForced),
1210  ForgetAllSCEV(ForgetAllSCEV), ProvidedCount(std::move(Count)),
1211  ProvidedThreshold(Threshold), ProvidedAllowPartial(AllowPartial),
1212  ProvidedRuntime(Runtime), ProvidedUpperBound(UpperBound),
1213  ProvidedAllowPeeling(AllowPeeling),
1214  ProvidedAllowProfileBasedPeeling(AllowProfileBasedPeeling),
1215  ProvidedFullUnrollMaxCount(ProvidedFullUnrollMaxCount) {
1217  }
1218 
1219  bool runOnLoop(Loop *L, LPPassManager &LPM) override {
1220  if (skipLoop(L))
1221  return false;
1222 
1223  Function &F = *L->getHeader()->getParent();
1224 
1225  auto &DT = getAnalysis<DominatorTreeWrapperPass>().getDomTree();
1226  LoopInfo *LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
1227  ScalarEvolution &SE = getAnalysis<ScalarEvolutionWrapperPass>().getSE();
1228  const TargetTransformInfo &TTI =
1229  getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F);
1230  auto &AC = getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F);
1231  // For the old PM, we can't use OptimizationRemarkEmitter as an analysis
1232  // pass. Function analyses need to be preserved across loop transformations
1233  // but ORE cannot be preserved (see comment before the pass definition).
1234  OptimizationRemarkEmitter ORE(&F);
1235  bool PreserveLCSSA = mustPreserveAnalysisID(LCSSAID);
1236 
1238  L, DT, LI, SE, TTI, AC, ORE, nullptr, nullptr, PreserveLCSSA, OptLevel,
1239  OnlyWhenForced, ForgetAllSCEV, ProvidedCount, ProvidedThreshold,
1240  ProvidedAllowPartial, ProvidedRuntime, ProvidedUpperBound,
1241  ProvidedAllowPeeling, ProvidedAllowProfileBasedPeeling,
1242  ProvidedFullUnrollMaxCount);
1243 
1244  if (Result == LoopUnrollResult::FullyUnrolled)
1245  LPM.markLoopAsDeleted(*L);
1246 
1247  return Result != LoopUnrollResult::Unmodified;
1248  }
1249 
1250  /// This transformation requires natural loop information & requires that
1251  /// loop preheaders be inserted into the CFG...
1252  void getAnalysisUsage(AnalysisUsage &AU) const override {
1255  // FIXME: Loop passes are required to preserve domtree, and for now we just
1256  // recreate dom info if anything gets unrolled.
1258  }
1259 };
1260 
1261 } // end anonymous namespace
1262 
1263 char LoopUnroll::ID = 0;
1264 
1265 INITIALIZE_PASS_BEGIN(LoopUnroll, "loop-unroll", "Unroll loops", false, false)
1269 INITIALIZE_PASS_END(LoopUnroll, "loop-unroll", "Unroll loops", false, false)
1270 
1271 Pass *llvm::createLoopUnrollPass(int OptLevel, bool OnlyWhenForced,
1272  bool ForgetAllSCEV, int Threshold, int Count,
1273  int AllowPartial, int Runtime, int UpperBound,
1274  int AllowPeeling) {
1275  // TODO: It would make more sense for this function to take the optionals
1276  // directly, but that's dangerous since it would silently break out of tree
1277  // callers.
1278  return new LoopUnroll(
1279  OptLevel, OnlyWhenForced, ForgetAllSCEV,
1280  Threshold == -1 ? None : Optional<unsigned>(Threshold),
1281  Count == -1 ? None : Optional<unsigned>(Count),
1282  AllowPartial == -1 ? None : Optional<bool>(AllowPartial),
1283  Runtime == -1 ? None : Optional<bool>(Runtime),
1284  UpperBound == -1 ? None : Optional<bool>(UpperBound),
1285  AllowPeeling == -1 ? None : Optional<bool>(AllowPeeling));
1286 }
1287 
1288 Pass *llvm::createSimpleLoopUnrollPass(int OptLevel, bool OnlyWhenForced,
1289  bool ForgetAllSCEV) {
1290  return createLoopUnrollPass(OptLevel, OnlyWhenForced, ForgetAllSCEV, -1, -1,
1291  0, 0, 0, 0);
1292 }
1293 
1296  LPMUpdater &Updater) {
1297  const auto &FAM =
1298  AM.getResult<FunctionAnalysisManagerLoopProxy>(L, AR).getManager();
1299  Function *F = L.getHeader()->getParent();
1300 
1301  auto *ORE = FAM.getCachedResult<OptimizationRemarkEmitterAnalysis>(*F);
1302  // FIXME: This should probably be optional rather than required.
1303  if (!ORE)
1305  "LoopFullUnrollPass: OptimizationRemarkEmitterAnalysis not "
1306  "cached at a higher level");
1307 
1308  // Keep track of the previous loop structure so we can identify new loops
1309  // created by unrolling.
1310  Loop *ParentL = L.getParentLoop();
1311  SmallPtrSet<Loop *, 4> OldLoops;
1312  if (ParentL)
1313  OldLoops.insert(ParentL->begin(), ParentL->end());
1314  else
1315  OldLoops.insert(AR.LI.begin(), AR.LI.end());
1316 
1317  std::string LoopName = L.getName();
1318 
1319  bool Changed = tryToUnrollLoop(&L, AR.DT, &AR.LI, AR.SE, AR.TTI, AR.AC, *ORE,
1320  /*BFI*/ nullptr, /*PSI*/ nullptr,
1321  /*PreserveLCSSA*/ true, OptLevel,
1322  OnlyWhenForced, ForgetSCEV, /*Count*/ None,
1323  /*Threshold*/ None, /*AllowPartial*/ false,
1324  /*Runtime*/ false, /*UpperBound*/ false,
1325  /*AllowPeeling*/ false,
1326  /*AllowProfileBasedPeeling*/ false,
1327  /*FullUnrollMaxCount*/ None) !=
1329  if (!Changed)
1330  return PreservedAnalyses::all();
1331 
1332  // The parent must not be damaged by unrolling!
1333 #ifndef NDEBUG
1334  if (ParentL)
1335  ParentL->verifyLoop();
1336 #endif
1337 
1338  // Unrolling can do several things to introduce new loops into a loop nest:
1339  // - Full unrolling clones child loops within the current loop but then
1340  // removes the current loop making all of the children appear to be new
1341  // sibling loops.
1342  //
1343  // When a new loop appears as a sibling loop after fully unrolling,
1344  // its nesting structure has fundamentally changed and we want to revisit
1345  // it to reflect that.
1346  //
1347  // When unrolling has removed the current loop, we need to tell the
1348  // infrastructure that it is gone.
1349  //
1350  // Finally, we support a debugging/testing mode where we revisit child loops
1351  // as well. These are not expected to require further optimizations as either
1352  // they or the loop they were cloned from have been directly visited already.
1353  // But the debugging mode allows us to check this assumption.
1354  bool IsCurrentLoopValid = false;
1355  SmallVector<Loop *, 4> SibLoops;
1356  if (ParentL)
1357  SibLoops.append(ParentL->begin(), ParentL->end());
1358  else
1359  SibLoops.append(AR.LI.begin(), AR.LI.end());
1360  erase_if(SibLoops, [&](Loop *SibLoop) {
1361  if (SibLoop == &L) {
1362  IsCurrentLoopValid = true;
1363  return true;
1364  }
1365 
1366  // Otherwise erase the loop from the list if it was in the old loops.
1367  return OldLoops.count(SibLoop) != 0;
1368  });
1369  Updater.addSiblingLoops(SibLoops);
1370 
1371  if (!IsCurrentLoopValid) {
1372  Updater.markLoopAsDeleted(L, LoopName);
1373  } else {
1374  // We can only walk child loops if the current loop remained valid.
1376  // Walk *all* of the child loops.
1377  SmallVector<Loop *, 4> ChildLoops(L.begin(), L.end());
1378  Updater.addChildLoops(ChildLoops);
1379  }
1380  }
1381 
1383 }
1384 
1385 template <typename RangeT>
1387  SmallVector<Loop *, 8> Worklist;
1388  // We use an internal worklist to build up the preorder traversal without
1389  // recursion.
1390  SmallVector<Loop *, 4> PreOrderLoops, PreOrderWorklist;
1391 
1392  for (Loop *RootL : Loops) {
1393  assert(PreOrderLoops.empty() && "Must start with an empty preorder walk.");
1394  assert(PreOrderWorklist.empty() &&
1395  "Must start with an empty preorder walk worklist.");
1396  PreOrderWorklist.push_back(RootL);
1397  do {
1398  Loop *L = PreOrderWorklist.pop_back_val();
1399  PreOrderWorklist.append(L->begin(), L->end());
1400  PreOrderLoops.push_back(L);
1401  } while (!PreOrderWorklist.empty());
1402 
1403  Worklist.append(PreOrderLoops.begin(), PreOrderLoops.end());
1404  PreOrderLoops.clear();
1405  }
1406  return Worklist;
1407 }
1408 
1411  auto &SE = AM.getResult<ScalarEvolutionAnalysis>(F);
1412  auto &LI = AM.getResult<LoopAnalysis>(F);
1413  auto &TTI = AM.getResult<TargetIRAnalysis>(F);
1414  auto &DT = AM.getResult<DominatorTreeAnalysis>(F);
1415  auto &AC = AM.getResult<AssumptionAnalysis>(F);
1417 
1418  LoopAnalysisManager *LAM = nullptr;
1419  if (auto *LAMProxy = AM.getCachedResult<LoopAnalysisManagerFunctionProxy>(F))
1420  LAM = &LAMProxy->getManager();
1421 
1422  const ModuleAnalysisManager &MAM =
1424  ProfileSummaryInfo *PSI =
1426  auto *BFI = (PSI && PSI->hasProfileSummary()) ?
1427  &AM.getResult<BlockFrequencyAnalysis>(F) : nullptr;
1428 
1429  bool Changed = false;
1430 
1431  // The unroller requires loops to be in simplified form, and also needs LCSSA.
1432  // Since simplification may add new inner loops, it has to run before the
1433  // legality and profitability checks. This means running the loop unroller
1434  // will simplify all loops, regardless of whether anything end up being
1435  // unrolled.
1436  for (auto &L : LI) {
1437  Changed |=
1438  simplifyLoop(L, &DT, &LI, &SE, &AC, nullptr, false /* PreserveLCSSA */);
1439  Changed |= formLCSSARecursively(*L, DT, &LI, &SE);
1440  }
1441 
1443 
1444  while (!Worklist.empty()) {
1445  // Because the LoopInfo stores the loops in RPO, we walk the worklist
1446  // from back to front so that we work forward across the CFG, which
1447  // for unrolling is only needed to get optimization remarks emitted in
1448  // a forward order.
1449  Loop &L = *Worklist.pop_back_val();
1450 #ifndef NDEBUG
1451  Loop *ParentL = L.getParentLoop();
1452 #endif
1453 
1454  // Check if the profile summary indicates that the profiled application
1455  // has a huge working set size, in which case we disable peeling to avoid
1456  // bloating it further.
1457  Optional<bool> LocalAllowPeeling = UnrollOpts.AllowPeeling;
1458  if (PSI && PSI->hasHugeWorkingSetSize())
1459  LocalAllowPeeling = false;
1460  std::string LoopName = L.getName();
1461  // The API here is quite complex to call and we allow to select some
1462  // flavors of unrolling during construction time (by setting UnrollOpts).
1464  &L, DT, &LI, SE, TTI, AC, ORE, BFI, PSI,
1465  /*PreserveLCSSA*/ true, UnrollOpts.OptLevel, UnrollOpts.OnlyWhenForced,
1466  UnrollOpts.ForgetSCEV, /*Count*/ None,
1467  /*Threshold*/ None, UnrollOpts.AllowPartial, UnrollOpts.AllowRuntime,
1468  UnrollOpts.AllowUpperBound, LocalAllowPeeling,
1469  UnrollOpts.AllowProfileBasedPeeling, UnrollOpts.FullUnrollMaxCount);
1470  Changed |= Result != LoopUnrollResult::Unmodified;
1471 
1472  // The parent must not be damaged by unrolling!
1473 #ifndef NDEBUG
1474  if (Result != LoopUnrollResult::Unmodified && ParentL)
1475  ParentL->verifyLoop();
1476 #endif
1477 
1478  // Clear any cached analysis results for L if we removed it completely.
1479  if (LAM && Result == LoopUnrollResult::FullyUnrolled)
1480  LAM->clear(L, LoopName);
1481  }
1482 
1483  if (!Changed)
1484  return PreservedAnalyses::all();
1485 
1487 }
Pass interface - Implemented by all &#39;passes&#39;.
Definition: Pass.h:80
static void collectEphemeralValues(const Loop *L, AssumptionCache *AC, SmallPtrSetImpl< const Value *> &EphValues)
Collect a loop&#39;s ephemeral values (those used only by an assume or similar intrinsics in the loop)...
Definition: CodeMetrics.cpp:70
uint64_t CallInst * C
bool PeelProfiledIterations
Allow peeling basing on profile.
unsigned getSmallConstantTripCount(const Loop *L)
Returns the maximum trip count of the loop if it is a single-exit loop and we can compute a small max...
bool Partial
Allow partial unrolling (unrolling of loops to expand the size of the loop body, not only to eliminat...
Pass * createSimpleLoopUnrollPass(int OptLevel=2, bool OnlyWhenForced=false, bool ForgetAllSCEV=false)
bool simplifyLoop(Loop *L, DominatorTree *DT, LoopInfo *LI, ScalarEvolution *SE, AssumptionCache *AC, MemorySSAUpdater *MSSAU, bool PreserveLCSSA)
Simplify each loop in a loop nest recursively.
Diagnostic information for missed-optimization remarks.
BlockT * getLoopLatch() const
If there is a single latch block for this loop, return it.
Definition: LoopInfoImpl.h:209
static PassRegistry * getPassRegistry()
getPassRegistry - Access the global registry object, which is automatically initialized at applicatio...
PreservedAnalyses getLoopPassPreservedAnalyses()
Returns the minimum set of Analyses that all loop passes must preserve.
unsigned getSmallConstantTripMultiple(const Loop *L)
Returns the largest constant divisor of the trip count of the loop if it is a single-exit loop and we...
DiagnosticInfoOptimizationBase::Argument NV
PassT::Result & getResult(IRUnitT &IR, ExtraArgTs... ExtraArgs)
Get the result of an analysis pass for a given IR unit.
Definition: PassManager.h:776
LLVM_ATTRIBUTE_NORETURN void report_fatal_error(Error Err, bool gen_crash_diag=true)
Report a serious error, calling any installed error handler.
Definition: Error.cpp:139
This class represents lattice values for constants.
Definition: AllocatorList.h:23
size_type size() const
Determine the number of elements in the SetVector.
Definition: SetVector.h:77
This header provides classes for managing a pipeline of passes over loops in LLVM IR...
cl::opt< bool > ForgetSCEVInLoopUnroll
bool convergent
True if this function contains a call to a convergent function.
Definition: CodeMetrics.h:47
bool hasOptSize() const
Optimize this function for size (-Os) or minimum size (-Oz).
Definition: Function.h:622
amdgpu Simplify well known AMD library false FunctionCallee Value const Twine & Name
static bool HasUnrollEnablePragma(const Loop *L)
bool isLCSSAForm(DominatorTree &DT) const
Return true if the Loop is in LCSSA form.
Definition: LoopInfo.cpp:448
Implements a dense probed hash-table based set.
Definition: DenseSet.h:249
Analysis providing profile information.
The main scalar evolution driver.
#define DEBUG_TYPE
unsigned PartialOptSizeThreshold
The cost threshold for the unrolled loop when optimizing for size, like OptSizeThreshold, but used for partial/runtime unrolling (set to UINT_MAX to disable).
static unsigned getFullUnrollBoostingFactor(const EstimatedUnrollCost &Cost, unsigned MaxPercentThresholdBoost)
This file contains the declarations for metadata subclasses.
BlockT * getLoopPreheader() const
If there is a preheader for this loop, return it.
Definition: LoopInfoImpl.h:160
An immutable pass that tracks lazily created AssumptionCache objects.
unsigned PartialThreshold
The cost threshold for the unrolled loop, like Threshold, but used for partial/runtime unrolling (set...
A cache of @llvm.assume calls within a function.
Analysis pass providing the TargetTransformInfo.
bool Force
Apply loop unroll on any kind of loop (mainly to loops that fail runtime unrolling).
unsigned second
unsigned NumInlineCandidates
The number of calls to internal functions with a single caller.
Definition: CodeMetrics.h:68
Metadata node.
Definition: Metadata.h:863
The adaptor from a function pass to a loop pass computes these analyses and makes them available to t...
Analysis pass which computes a DominatorTree.
Definition: Dominators.h:230
F(f)
const MDOperand & getOperand(unsigned I) const
Definition: Metadata.h:1068
bool notDuplicatable
True if this function cannot be duplicated.
Definition: CodeMetrics.h:44
static cl::opt< unsigned > UnrollFullMaxCount("unroll-full-max-count", cl::Hidden, cl::desc("Set the max unroll count for full unrolling, for testing purposes"))
bool shouldOptimizeForSize(Function *F, ProfileSummaryInfo *PSI, BlockFrequencyInfo *BFI)
Returns true if function F is suggested to be size-optimized base on the profile. ...
Definition: SizeOpts.cpp:23
bool UnrollAndJam
Allow unroll and jam. Used to enable unroll and jam for the target.
unsigned getSmallConstantMaxTripCount(const Loop *L)
Returns the upper bound of the loop trip count as a normal unsigned value.
const char *const LLVMLoopUnrollFollowupUnrolled
Definition: UnrollLoop.h:42
void addChildLoops(ArrayRef< Loop *> NewChildLoops)
Loop passes should use this method to indicate they have added new child loops of the current loop...
static cl::opt< unsigned > UnrollPartialThreshold("unroll-partial-threshold", cl::Hidden, cl::desc("The cost threshold for partial loop unrolling"))
bool formLCSSARecursively(Loop &L, DominatorTree &DT, LoopInfo *LI, ScalarEvolution *SE)
Put a loop nest into LCSSA form.
Definition: LCSSA.cpp:385
std::pair< iterator, bool > insert(const std::pair< KeyT, ValueT > &KV)
Definition: DenseMap.h:195
void dump() const
Support for debugging, callable in GDB: V->dump()
Definition: AsmWriter.cpp:4429
static cl::opt< bool > UnrollRevisitChildLoops("unroll-revisit-child-loops", cl::Hidden, cl::desc("Enqueue and re-visit child loops in the loop PM after unrolling. " "This shouldn't typically be needed as child loops (or their " "clones) were already visited."))
AnalysisUsage & addRequired()
#define INITIALIZE_PASS_DEPENDENCY(depName)
Definition: PassSupport.h:50
Hexagon Hardware Loops
static cl::opt< unsigned > FlatLoopTripCountThreshold("flat-loop-tripcount-threshold", cl::init(5), cl::Hidden, cl::desc("If the runtime tripcount for the loop is lower than the " "threshold, the loop is considered as flat and will be less " "aggressively unrolled."))
static bool HasUnrollFullPragma(const Loop *L)
TargetTransformInfo::UnrollingPreferences gatherUnrollingPreferences(Loop *L, ScalarEvolution &SE, const TargetTransformInfo &TTI, BlockFrequencyInfo *BFI, ProfileSummaryInfo *PSI, int OptLevel, Optional< unsigned > UserThreshold, Optional< unsigned > UserCount, Optional< bool > UserAllowPartial, Optional< bool > UserRuntime, Optional< bool > UserUpperBound, Optional< bool > UserAllowPeeling, Optional< bool > UserAllowProfileBasedPeeling, Optional< unsigned > UserFullUnrollMaxCount)
Gather the various unrolling parameters based on the defaults, compiler flags, TTI overrides and user...
bool AllowPeeling
Allow peeling off loop iterations.
static cl::opt< unsigned > UnrollCount("unroll-count", cl::Hidden, cl::desc("Use this unroll count for all loops including those with " "unroll_count pragma values, for testing purposes"))
bool AllowExpensiveTripCount
Allow emitting expensive instructions (such as divisions) when computing the trip count of a loop for...
unsigned FullUnrollMaxCount
Set the maximum unrolling factor for full unrolling.
Analysis pass that exposes the LoopInfo for a function.
Definition: LoopInfo.h:1183
static cl::opt< bool > UnrollRuntime("unroll-runtime", cl::ZeroOrMore, cl::Hidden, cl::desc("Unroll loops with run-time trip counts"))
static Optional< EstimatedUnrollCost > analyzeLoopUnrollCost(const Loop *L, unsigned TripCount, DominatorTree &DT, ScalarEvolution &SE, const SmallPtrSetImpl< const Value *> &EphValues, const TargetTransformInfo &TTI, unsigned MaxUnrolledLoopSize)
Figure out if the loop is worth full unrolling.
BlockT * getHeader() const
Definition: LoopInfo.h:105
static MDNode * GetUnrollMetadataForLoop(const Loop *L, StringRef Name)
The transformation should be applied without considering a cost model.
Definition: LoopUtils.h:228
void computePeelCount(Loop *L, unsigned LoopSize, TargetTransformInfo::UnrollingPreferences &UP, unsigned &TripCount, ScalarEvolution &SE)
void visit(Iterator Start, Iterator End)
Definition: InstVisitor.h:89
bool insert(const value_type &X)
Insert a new element into the SetVector.
Definition: SetVector.h:141
static cl::opt< unsigned > UnrollMaxUpperBound("unroll-max-upperbound", cl::init(8), cl::Hidden, cl::desc("The max of trip count upper bound that is considered in unrolling"))
static bool isEqual(const Function &Caller, const Function &Callee)
void setLoopID(MDNode *LoopID) const
Set the llvm.loop loop id metadata for this loop.
Definition: LoopInfo.cpp:513
const T & getValue() const LLVM_LVALUE_FUNCTION
Definition: Optional.h:255
const char *const LLVMLoopUnrollFollowupRemainder
Definition: UnrollLoop.h:44
This header provides classes for managing per-loop analyses.
static cl::opt< bool > UnrollAllowRemainder("unroll-allow-remainder", cl::Hidden, cl::desc("Allow generation of a loop remainder (extra iterations) " "when unrolling a loop."))
bool computeUnrollCount(Loop *L, const TargetTransformInfo &TTI, DominatorTree &DT, LoopInfo *LI, ScalarEvolution &SE, const SmallPtrSetImpl< const Value *> &EphValues, OptimizationRemarkEmitter *ORE, unsigned &TripCount, unsigned MaxTripCount, bool MaxOrZero, unsigned &TripMultiple, unsigned LoopSize, TargetTransformInfo::UnrollingPreferences &UP, bool &UseUpperBound)
static cl::opt< unsigned > UnrollMaxPercentThresholdBoost("unroll-max-percent-threshold-boost", cl::init(400), cl::Hidden, cl::desc("The maximum 'boost' (represented as a percentage >= 100) applied " "to the threshold when aggressively unrolling a loop due to the " "dynamic cost savings. If completely unrolling a loop will reduce " "the total runtime from X to Y, we boost the loop unroll " "threshold to DefaultThreshold*std::min(MaxPercentThresholdBoost, " "X/Y). This limit avoids excessive code bloat."))
void initializeLoopUnrollPass(PassRegistry &)
static LoopUnrollResult tryToUnrollLoop(Loop *L, DominatorTree &DT, LoopInfo *LI, ScalarEvolution &SE, const TargetTransformInfo &TTI, AssumptionCache &AC, OptimizationRemarkEmitter &ORE, BlockFrequencyInfo *BFI, ProfileSummaryInfo *PSI, bool PreserveLCSSA, int OptLevel, bool OnlyWhenForced, bool ForgetAllSCEV, Optional< unsigned > ProvidedCount, Optional< unsigned > ProvidedThreshold, Optional< bool > ProvidedAllowPartial, Optional< bool > ProvidedRuntime, Optional< bool > ProvidedUpperBound, Optional< bool > ProvidedAllowPeeling, Optional< bool > ProvidedAllowProfileBasedPeeling, Optional< unsigned > ProvidedFullUnrollMaxCount)
Concrete subclass of DominatorTreeBase that is used to compute a normal dominator tree...
Definition: Dominators.h:144
bool AllowRemainder
Allow generation of a loop remainder (extra iterations after unroll).
bool hasProfileSummary()
Returns true if profile summary is available.
The loop was fully unrolled into straight-line code.
BlockFrequencyInfo pass uses BlockFrequencyInfoImpl implementation to estimate IR basic block frequen...
initializer< Ty > init(const Ty &Val)
Definition: CommandLine.h:432
static cl::opt< bool > UnrollAllowPeeling("unroll-allow-peeling", cl::init(true), cl::Hidden, cl::desc("Allows loops to be peeled when the dynamic " "trip count is known to be low."))
Wrapper pass for TargetTransformInfo.
A set of analyses that are preserved following a run of a transformation pass.
Definition: PassManager.h:153
* if(!EatIfPresent(lltok::kw_thread_local)) return false
ParseOptionalThreadLocal := /*empty.
LLVM Basic Block Representation.
Definition: BasicBlock.h:57
bool isLoopExiting(const BlockT *BB) const
True if terminator in the block can branch to another block that is outside of the current loop...
Definition: LoopInfo.h:208
Conditional or Unconditional Branch instruction.
An analysis pass based on the new PM to deliver ProfileSummaryInfo.
unsigned UnrollAndJamInnerLoopThreshold
Threshold for unroll and jam, for inner loop size.
This is an important base class in LLVM.
Definition: Constant.h:41
static cl::opt< unsigned > UnrollMaxCount("unroll-max-count", cl::Hidden, cl::desc("Set the max unroll count for partial and runtime unrolling, for" "testing purposes"))
This file contains the declarations for the subclasses of Constant, which represent the different fla...
iterator end() const
Definition: LoopInfo.h:904
void setLoopAlreadyUnrolled()
Add llvm.loop.unroll.disable to this loop&#39;s loop id metadata.
Definition: LoopInfo.cpp:525
Machine Trace Metrics
char & LCSSAID
Definition: LCSSA.cpp:467
std::pair< iterator, bool > insert(PtrType Ptr)
Inserts Ptr if and only if there is no element in the container equal to Ptr.
Definition: SmallPtrSet.h:370
Represent the analysis usage information of a pass.
void analyzeBasicBlock(const BasicBlock *BB, const TargetTransformInfo &TTI, const SmallPtrSetImpl< const Value *> &EphValues)
Add information about a block to the current state.
static cl::opt< unsigned > UnrollThreshold("unroll-threshold", cl::Hidden, cl::desc("The cost threshold for loop unrolling"))
op_range operands()
Definition: User.h:237
unsigned Count
A forced unrolling factor (the number of concatenated bodies of the original loop in the unrolled loo...
size_type count(ConstPtrType Ptr) const
count - Return 1 if the specified pointer is in the set, 0 otherwise.
Definition: SmallPtrSet.h:381
Optional< unsigned > getLoopEstimatedTripCount(Loop *L)
Get a loop&#39;s estimated trip count based on branch weight metadata.
Definition: LoopUtils.cpp:679
bool hasProfileData(bool IncludeSynthetic=false) const
Return true if the function is annotated with profile data.
Definition: Function.h:308
static cl::opt< unsigned > UnrollPeelCount("unroll-peel-count", cl::Hidden, cl::desc("Set the unroll peeling count, for testing purposes"))
void addSiblingLoops(ArrayRef< Loop *> NewSibLoops)
Loop passes should use this method to indicate they have added new sibling loops to the current loop...
PreservedAnalyses run(Loop &L, LoopAnalysisManager &AM, LoopStandardAnalysisResults &AR, LPMUpdater &U)
static PreservedAnalyses all()
Construct a special preserved set that preserves all passes.
Definition: PassManager.h:159
DebugLoc getStartLoc() const
Return the debug location of the start of this loop.
Definition: LoopInfo.cpp:611
void markLoopAsDeleted(Loop &L, llvm::StringRef Name)
Loop passes should use this method to indicate they have deleted a loop from the nest.
INITIALIZE_PASS_END(RegBankSelect, DEBUG_TYPE, "Assign register bank of generic virtual registers", false, false) RegBankSelect
const char *const LLVMLoopUnrollFollowupAll
Definition: UnrollLoop.h:41
The transformation should not be applied.
Definition: LoopUtils.h:231
A function analysis which provides an AssumptionCache.
unsigned ApproximateLoopSize(const Loop *L, unsigned &NumCalls, bool &NotDuplicatable, bool &Convergent, const TargetTransformInfo &TTI, const SmallPtrSetImpl< const Value *> &EphValues, unsigned BEInsns)
ApproximateLoopSize - Approximate the size of the loop.
This class provides an interface for updating the loop pass manager based on mutations to the loop ne...
bool contains(const LoopT *L) const
Return true if the specified loop is contained within in this loop.
Definition: LoopInfo.h:115
A SetVector that performs no allocations if smaller than a certain size.
Definition: SetVector.h:297
Analysis pass which computes BlockFrequencyInfo.
SmallPtrSet - This class implements a set which is optimized for holding SmallSize or less elements...
Definition: SmallPtrSet.h:417
This is the shared class of boolean and integer constants.
Definition: Constants.h:83
void emit(DiagnosticInfoOptimizationBase &OptDiag)
Output the remark via the diagnostic handler and to the optimization record file. ...
Align max(MaybeAlign Lhs, Align Rhs)
Definition: Alignment.h:389
This pass provides access to the codegen interfaces that are needed for IR-level transformations.
static cl::opt< bool > UnrollAllowPartial("unroll-allow-partial", cl::Hidden, cl::desc("Allows loops to be partially unrolled until " "-unroll-threshold loop size is reached."))
An analysis over an "inner" IR unit that provides access to an analysis manager over a "outer" IR uni...
Definition: PassManager.h:1160
static cl::opt< unsigned > PragmaUnrollThreshold("pragma-unroll-threshold", cl::init(16 *1024), cl::Hidden, cl::desc("Unrolled size limit for loops with an unroll(full) or " "unroll_count pragma."))
This is a &#39;vector&#39; (really, a variable-sized array), optimized for the case when the array is small...
Definition: SmallVector.h:837
iterator begin() const
Definition: LoopInfo.h:147
Utility to calculate the size and a few similar metrics for a set of basic blocks.
Definition: CodeMetrics.h:32
#define INITIALIZE_PASS_BEGIN(passName, arg, name, cfg, analysis)
Definition: PassSupport.h:47
static unsigned UnrollCountPragmaValue(const Loop *L)
unsigned DefaultUnrollRuntimeCount
Default unroll count for loops with run-time trip count.
LLVM_NODISCARD T pop_back_val()
Definition: SmallVector.h:374
void markLoopAsDeleted(Loop &L)
Definition: LoopPass.cpp:143
bool Runtime
Allow runtime unrolling (unrolling of loops to expand the size of the loop body even when the number ...
static uint64_t getUnrolledLoopSize(unsigned LoopSize, TargetTransformInfo::UnrollingPreferences &UP)
void erase_if(Container &C, UnaryPredicate P)
Provide a container algorithm similar to C++ Library Fundamentals v2&#39;s erase_if which is equivalent t...
Definition: STLExtras.h:1332
raw_ostream & dbgs()
dbgs() - This returns a reference to a raw_ostream for debugging messages.
Definition: Debug.cpp:132
Optional< MDNode * > makeFollowupLoopID(MDNode *OrigLoopID, ArrayRef< StringRef > FollowupAttrs, const char *InheritOptionsAttrsPrefix="", bool AlwaysNew=false)
Create a new loop identifier for a loop created from a loop transformation.
Definition: LoopUtils.cpp:302
iterator begin() const
Definition: LoopInfo.h:903
amdgpu Simplify well known AMD library false FunctionCallee Callee
bool UnrollRemainder
Allow unrolling of all the iterations of the runtime loop remainder.
static cl::opt< bool > UnrollUnrollRemainder("unroll-remainder", cl::Hidden, cl::desc("Allow the loop remainder to be unrolled."))
static cl::opt< unsigned > Threshold("loop-unswitch-threshold", cl::desc("Max loop size to unswitch"), cl::init(100), cl::Hidden)
void append(in_iter in_start, in_iter in_end)
Add the specified range to the end of the SmallVector.
Definition: SmallVector.h:387
MDNode * GetUnrollMetadata(MDNode *LoopID, StringRef Name)
Given an llvm.loop loop id metadata node, returns the loop hint metadata node with the given name (fo...
Definition: LoopUnroll.cpp:962
Analysis pass that exposes the ScalarEvolution for a function.
static const unsigned NoThreshold
A magic value for use with the Threshold parameter to indicate that the loop unroll should be perform...
LoopT * getParentLoop() const
Definition: LoopInfo.h:106
This file provides various utilities for inspecting and working with the control flow graph in LLVM I...
bool hasValue() const
Definition: Optional.h:259
bool isLoopSimplifyForm() const
Return true if the Loop is in the form that the LoopSimplify form transforms loops to...
Definition: LoopInfo.cpp:464
MDNode * getLoopID() const
Return the llvm.loop loop id metadata node for this loop if it is present.
Definition: LoopInfo.cpp:489
unsigned PeelCount
A forced peeling factor (the number of bodied of the original loop that should be peeled off before t...
unsigned Threshold
The cost threshold for the unrolled loop.
int getUserCost(const User *U, ArrayRef< const Value *> Operands) const
Estimate the cost of a given IR user when lowered.
LLVM_NODISCARD bool empty() const
Definition: SmallVector.h:55
StringRef getName() const
Definition: LoopInfo.h:827
Represents a single loop in the control flow graph.
Definition: LoopInfo.h:509
Parameters that control the generic loop unrolling transformation.
unsigned OptSizeThreshold
The cost threshold for the unrolled loop when optimizing for size (set to UINT_MAX to disable)...
StringRef getName() const
Return a constant reference to the value&#39;s name.
Definition: Value.cpp:214
const Function * getParent() const
Return the enclosing method, or null if none.
Definition: BasicBlock.h:106
#define I(x, y, z)
Definition: MD5.cpp:58
bool empty() const
Determine if the SetVector is empty or not.
Definition: SetVector.h:72
LoopUnrollResult UnrollLoop(Loop *L, UnrollLoopOptions ULO, LoopInfo *LI, ScalarEvolution *SE, DominatorTree *DT, AssumptionCache *AC, OptimizationRemarkEmitter *ORE, bool PreserveLCSSA, Loop **RemainderLoop=nullptr)
Unroll the given loop by Count.
Definition: LoopUnroll.cpp:276
TransformationMode hasUnrollTransformation(Loop *L)
Definition: LoopUtils.cpp:391
PreservedAnalyses run(Function &F, FunctionAnalysisManager &AM)
void getLoopAnalysisUsage(AnalysisUsage &AU)
Helper to consistently add the set of standard passes to a loop pass&#39;s AnalysisUsage.
Definition: LoopUtils.cpp:138
PassT::Result * getCachedResult(IRUnitT &IR) const
Get the cached result of an analysis pass for a given IR unit.
Definition: PassManager.h:795
iterator end() const
Definition: LoopInfo.h:148
LLVM_NODISCARD std::enable_if<!is_simple_type< Y >::value, typename cast_retty< X, const Y >::ret_type >::type dyn_cast(const Y &Val)
Definition: Casting.h:332
TransformationMode
The mode sets how eager a transformation should be applied.
Definition: LoopUtils.h:222
static SmallVector< Loop *, 8 > appendLoopsToWorklist(RangeT &&Loops)
ValueT lookup(const_arg_type_t< KeyT > Val) const
lookup - Return the entry for the specified key, or a default constructed value if no such entry exis...
Definition: DenseMap.h:185
Pass * createLoopUnrollPass(int OptLevel=2, bool OnlyWhenForced=false, bool ForgetAllSCEV=false, int Threshold=-1, int Count=-1, int AllowPartial=-1, int Runtime=-1, int UpperBound=-1, int AllowPeeling=-1)
bool empty() const
Definition: LoopInfo.h:151
Multiway switch.
static bool HasRuntimeUnrollDisablePragma(const Loop *L)
assert(ImpDefSCC.getReg()==AMDGPU::SCC &&ImpDefSCC.isDef())
bool hasHugeWorkingSetSize()
Returns true if the working set size of the code is considered huge.
Module * getParent()
Get the module that this global value is contained inside of...
Definition: GlobalValue.h:575
LLVM Value Representation.
Definition: Value.h:74
succ_range successors(Instruction *I)
Definition: CFG.h:259
unsigned MaxPercentThresholdBoost
If complete unrolling will reduce the cost of the loop, we will boost the Threshold by a certain perc...
bool isLoweredToCall(const Function *F) const
Test whether calls to a function lower to actual program function calls.
The loop was not modified.
static cl::opt< unsigned > UnrollMaxIterationsCountToAnalyze("unroll-max-iteration-count-to-analyze", cl::init(10), cl::Hidden, cl::desc("Don't allow loop unrolling to simulate more than this number of" "iterations when checking full unroll profitability"))
bool isBackedgeTakenCountMaxOrZero(const Loop *L)
Return true if the backedge taken count is either the value returned by getConstantMaxBackedgeTakenCo...
bool isEqual(const GCNRPTracker::LiveRegSet &S1, const GCNRPTracker::LiveRegSet &S2)
bool UpperBound
Allow using trip count upper bound to unroll loops.
void verifyLoop() const
Verify loop structure.
Definition: LoopInfoImpl.h:279
void getUnrollingPreferences(Loop *L, ScalarEvolution &, UnrollingPreferences &UP) const
Get target-customized preferences for the generic loop unrolling transformation.
StringRef - Represent a constant reference to a string, i.e.
Definition: StringRef.h:48
A container for analyses that lazily runs them and caches their results.
This pass exposes codegen information to IR-level passes.
This header defines various interfaces for pass management in LLVM.
unsigned getNumOperands() const
Return number of MDNode operands.
Definition: Metadata.h:1074
#define LLVM_DEBUG(X)
Definition: Debug.h:122
unsigned NumInsts
Number of instructions in the analyzed blocks.
Definition: CodeMetrics.h:53
iterator_range< block_iterator > blocks() const
Definition: LoopInfo.h:161
BlockT * getExitingBlock() const
If getExitingBlocks would return exactly one block, return that block.
Definition: LoopInfoImpl.h:49
The optimization diagnostic interface.
An analysis over an "outer" IR unit that provides access to an analysis manager over an "inner" IR un...
Definition: PassManager.h:1044
LoopUnrollResult
Represents the result of a UnrollLoop invocation.
Definition: UnrollLoop.h:53