LLVM  4.0.0
LoopSink.cpp
Go to the documentation of this file.
1 //===-- LoopSink.cpp - Loop Sink Pass ------------------------===//
2 //
3 // The LLVM Compiler Infrastructure
4 //
5 // This file is distributed under the University of Illinois Open Source
6 // License. See LICENSE.TXT for details.
7 //
8 //===----------------------------------------------------------------------===//
9 //
10 // This pass does the inverse transformation of what LICM does.
11 // It traverses all of the instructions in the loop's preheader and sinks
12 // them to the loop body where frequency is lower than the loop's preheader.
13 // This pass is a reverse-transformation of LICM. It differs from the Sink
14 // pass in the following ways:
15 //
16 // * It only handles sinking of instructions from the loop's preheader to the
17 // loop's body
18 // * It uses alias set tracker to get more accurate alias info
19 // * It uses block frequency info to find the optimal sinking locations
20 //
21 // Overall algorithm:
22 //
23 // For I in Preheader:
24 //   InsertBBs = BBs that use I
25 //   For BB in sorted(LoopBBs):
26 //     DomBBs = BBs in InsertBBs that are dominated by BB
27 //     if freq(DomBBs) > freq(BB)
28 //       InsertBBs = InsertBBs - DomBBs + BB
29 //   For BB in InsertBBs:
30 //     Insert I at BB's beginning
31 //===----------------------------------------------------------------------===//
32 
#include "llvm/ADT/Statistic.h"
#include "llvm/Analysis/AliasAnalysis.h"
#include "llvm/Analysis/AliasSetTracker.h"
#include "llvm/Analysis/BasicAliasAnalysis.h"
#include "llvm/Analysis/BlockFrequencyInfo.h"
#include "llvm/Analysis/Loads.h"
#include "llvm/Analysis/LoopInfo.h"
#include "llvm/Analysis/LoopPass.h"
#include "llvm/Analysis/ScalarEvolution.h"
#include "llvm/Analysis/ScalarEvolutionAliasAnalysis.h"
#include "llvm/IR/Dominators.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/LLVMContext.h"
#include "llvm/IR/Metadata.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Transforms/Scalar.h"
#include "llvm/Transforms/Scalar/LoopPassManager.h"
#include "llvm/Transforms/Utils/Local.h"
#include "llvm/Transforms/Utils/LoopUtils.h"
52 using namespace llvm;
53 
54 #define DEBUG_TYPE "loopsink"
55 
56 STATISTIC(NumLoopSunk, "Number of instructions sunk into loop");
57 STATISTIC(NumLoopSunkCloned, "Number of cloned instructions sunk into loop");
58 
60  "sink-freq-percent-threshold", cl::Hidden, cl::init(90),
61  cl::desc("Do not sink instructions that require cloning unless they "
62  "execute less than this percent of the time."));
63 
65  "max-uses-for-sinking", cl::Hidden, cl::init(30),
66  cl::desc("Do not sink instructions that have too many uses."));
67 
68 /// Return adjusted total frequency of \p BBs.
69 ///
70 /// * If there is only one BB, sinking instruction will not introduce code
71 /// size increase. Thus there is no need to adjust the frequency.
72 /// * If there are more than one BB, sinking would lead to code size increase.
73 /// In this case, we add some "tax" to the total frequency to make it harder
74 /// to sink. E.g.
75 /// Freq(Preheader) = 100
76 /// Freq(BBs) = sum(50, 49) = 99
77 /// Even if Freq(BBs) < Freq(Preheader), we will not sink from Preheade to
78 /// BBs as the difference is too small to justify the code size increase.
79 /// To model this, The adjusted Freq(BBs) will be:
80 /// AdjustedFreq(BBs) = 99 / SinkFrequencyPercentThreshold%
83  BlockFrequency T = 0;
84  for (BasicBlock *B : BBs)
85  T += BFI.getBlockFreq(B);
86  if (BBs.size() > 1)
88  return T;
89 }
90 
91 /// Return a set of basic blocks to insert sinked instructions.
92 ///
93 /// The returned set of basic blocks (BBsToSinkInto) should satisfy:
94 ///
95 /// * Inside the loop \p L
96 /// * For each UseBB in \p UseBBs, there is at least one BB in BBsToSinkInto
97 /// that domintates the UseBB
98 /// * Has minimum total frequency that is no greater than preheader frequency
99 ///
100 /// The purpose of the function is to find the optimal sinking points to
101 /// minimize execution cost, which is defined as "sum of frequency of
102 /// BBsToSinkInto".
103 /// As a result, the returned BBsToSinkInto needs to have minimum total
104 /// frequency.
105 /// Additionally, if the total frequency of BBsToSinkInto exceeds preheader
106 /// frequency, the optimal solution is not sinking (return empty set).
107 ///
108 /// \p ColdLoopBBs is used to help find the optimal sinking locations.
109 /// It stores a list of BBs that is:
110 ///
111 /// * Inside the loop \p L
112 /// * Has a frequency no larger than the loop's preheader
113 /// * Sorted by BB frequency
114 ///
115 /// The complexity of the function is O(UseBBs.size() * ColdLoopBBs.size()).
116 /// To avoid expensive computation, we cap the maximum UseBBs.size() in its
117 /// caller.
120  const SmallVectorImpl<BasicBlock *> &ColdLoopBBs,
122  SmallPtrSet<BasicBlock *, 2> BBsToSinkInto;
123  if (UseBBs.size() == 0)
124  return BBsToSinkInto;
125 
126  BBsToSinkInto.insert(UseBBs.begin(), UseBBs.end());
127  SmallPtrSet<BasicBlock *, 2> BBsDominatedByColdestBB;
128 
129  // For every iteration:
130  // * Pick the ColdestBB from ColdLoopBBs
131  // * Find the set BBsDominatedByColdestBB that satisfy:
132  // - BBsDominatedByColdestBB is a subset of BBsToSinkInto
133  // - Every BB in BBsDominatedByColdestBB is dominated by ColdestBB
134  // * If Freq(ColdestBB) < Freq(BBsDominatedByColdestBB), remove
135  // BBsDominatedByColdestBB from BBsToSinkInto, add ColdestBB to
136  // BBsToSinkInto
137  for (BasicBlock *ColdestBB : ColdLoopBBs) {
138  BBsDominatedByColdestBB.clear();
139  for (BasicBlock *SinkedBB : BBsToSinkInto)
140  if (DT.dominates(ColdestBB, SinkedBB))
141  BBsDominatedByColdestBB.insert(SinkedBB);
142  if (BBsDominatedByColdestBB.size() == 0)
143  continue;
144  if (adjustedSumFreq(BBsDominatedByColdestBB, BFI) >
145  BFI.getBlockFreq(ColdestBB)) {
146  for (BasicBlock *DominatedBB : BBsDominatedByColdestBB) {
147  BBsToSinkInto.erase(DominatedBB);
148  }
149  BBsToSinkInto.insert(ColdestBB);
150  }
151  }
152 
153  // If the total frequency of BBsToSinkInto is larger than preheader frequency,
154  // do not sink.
155  if (adjustedSumFreq(BBsToSinkInto, BFI) >
157  BBsToSinkInto.clear();
158  return BBsToSinkInto;
159 }
160 
161 // Sinks \p I from the loop \p L's preheader to its uses. Returns true if
162 // sinking is successful.
163 // \p LoopBlockNumber is used to sort the insertion blocks to ensure
164 // determinism.
166  const SmallVectorImpl<BasicBlock *> &ColdLoopBBs,
167  const SmallDenseMap<BasicBlock *, int, 16> &LoopBlockNumber,
168  LoopInfo &LI, DominatorTree &DT,
170  // Compute the set of blocks in loop L which contain a use of I.
172  for (auto &U : I.uses()) {
173  Instruction *UI = cast<Instruction>(U.getUser());
174  // We cannot sink I to PHI-uses.
175  if (dyn_cast<PHINode>(UI))
176  return false;
177  // We cannot sink I if it has uses outside of the loop.
178  if (!L.contains(LI.getLoopFor(UI->getParent())))
179  return false;
180  BBs.insert(UI->getParent());
181  }
182 
183  // findBBsToSinkInto is O(BBs.size() * ColdLoopBBs.size()). We cap the max
184  // BBs.size() to avoid expensive computation.
185  // FIXME: Handle code size growth for min_size and opt_size.
186  if (BBs.size() > MaxNumberOfUseBBsForSinking)
187  return false;
188 
189  // Find the set of BBs that we should insert a copy of I.
190  SmallPtrSet<BasicBlock *, 2> BBsToSinkInto =
191  findBBsToSinkInto(L, BBs, ColdLoopBBs, DT, BFI);
192  if (BBsToSinkInto.empty())
193  return false;
194 
195  // Copy the final BBs into a vector and sort them using the total ordering
196  // of the loop block numbers as iterating the set doesn't give a useful
197  // order. No need to stable sort as the block numbers are a total ordering.
198  SmallVector<BasicBlock *, 2> SortedBBsToSinkInto;
199  SortedBBsToSinkInto.insert(SortedBBsToSinkInto.begin(), BBsToSinkInto.begin(),
200  BBsToSinkInto.end());
201  std::sort(SortedBBsToSinkInto.begin(), SortedBBsToSinkInto.end(),
202  [&](BasicBlock *A, BasicBlock *B) {
203  return *LoopBlockNumber.find(A) < *LoopBlockNumber.find(B);
204  });
205 
206  BasicBlock *MoveBB = *SortedBBsToSinkInto.begin();
207  // FIXME: Optimize the efficiency for cloned value replacement. The current
208  // implementation is O(SortedBBsToSinkInto.size() * I.num_uses()).
209  for (BasicBlock *N : SortedBBsToSinkInto) {
210  if (N == MoveBB)
211  continue;
212  // Clone I and replace its uses.
213  Instruction *IC = I.clone();
214  IC->setName(I.getName());
215  IC->insertBefore(&*N->getFirstInsertionPt());
216  // Replaces uses of I with IC in N
217  for (Value::use_iterator UI = I.use_begin(), UE = I.use_end(); UI != UE;) {
218  Use &U = *UI++;
219  auto *I = cast<Instruction>(U.getUser());
220  if (I->getParent() == N)
221  U.set(IC);
222  }
223  // Replaces uses of I with IC in blocks dominated by N
224  replaceDominatedUsesWith(&I, IC, DT, N);
225  DEBUG(dbgs() << "Sinking a clone of " << I << " To: " << N->getName()
226  << '\n');
227  NumLoopSunkCloned++;
228  }
229  DEBUG(dbgs() << "Sinking " << I << " To: " << MoveBB->getName() << '\n');
230  NumLoopSunk++;
231  I.moveBefore(&*MoveBB->getFirstInsertionPt());
232 
233  return true;
234 }
235 
236 /// Sinks instructions from loop's preheader to the loop body if the
237 /// sum frequency of inserted copy is smaller than preheader's frequency.
239  DominatorTree &DT,
241  ScalarEvolution *SE) {
242  BasicBlock *Preheader = L.getLoopPreheader();
243  if (!Preheader)
244  return false;
245 
246  // Enable LoopSink only when runtime profile is available.
247  // With static profile, the sinking decision may be sub-optimal.
248  if (!Preheader->getParent()->getEntryCount())
249  return false;
250 
251  const BlockFrequency PreheaderFreq = BFI.getBlockFreq(Preheader);
252  // If there are no basic blocks with lower frequency than the preheader then
253  // we can avoid the detailed analysis as we will never find profitable sinking
254  // opportunities.
255  if (all_of(L.blocks(), [&](const BasicBlock *BB) {
256  return BFI.getBlockFreq(BB) > PreheaderFreq;
257  }))
258  return false;
259 
260  bool Changed = false;
261  AliasSetTracker CurAST(AA);
262 
263  // Compute alias set.
264  for (BasicBlock *BB : L.blocks())
265  CurAST.add(*BB);
266 
267  // Sort loop's basic blocks by frequency
268  SmallVector<BasicBlock *, 10> ColdLoopBBs;
269  SmallDenseMap<BasicBlock *, int, 16> LoopBlockNumber;
270  int i = 0;
271  for (BasicBlock *B : L.blocks())
272  if (BFI.getBlockFreq(B) < BFI.getBlockFreq(L.getLoopPreheader())) {
273  ColdLoopBBs.push_back(B);
274  LoopBlockNumber[B] = ++i;
275  }
276  std::stable_sort(ColdLoopBBs.begin(), ColdLoopBBs.end(),
277  [&](BasicBlock *A, BasicBlock *B) {
278  return BFI.getBlockFreq(A) < BFI.getBlockFreq(B);
279  });
280 
281  // Traverse preheader's instructions in reverse order becaue if A depends
282  // on B (A appears after B), A needs to be sinked first before B can be
283  // sinked.
284  for (auto II = Preheader->rbegin(), E = Preheader->rend(); II != E;) {
285  Instruction *I = &*II++;
286  // No need to check for instruction's operands are loop invariant.
288  "Insts in a loop's preheader should have loop invariant operands!");
289  if (!canSinkOrHoistInst(*I, &AA, &DT, &L, &CurAST, nullptr))
290  continue;
291  if (sinkInstruction(L, *I, ColdLoopBBs, LoopBlockNumber, LI, DT, BFI))
292  Changed = true;
293  }
294 
295  if (Changed && SE)
296  SE->forgetLoopDispositions(&L);
297  return Changed;
298 }
299 
300 namespace {
301 struct LegacyLoopSinkPass : public LoopPass {
302  static char ID;
303  LegacyLoopSinkPass() : LoopPass(ID) {
305  }
306 
307  bool runOnLoop(Loop *L, LPPassManager &LPM) override {
308  if (skipLoop(L))
309  return false;
310 
311  auto *SE = getAnalysisIfAvailable<ScalarEvolutionWrapperPass>();
313  *L, getAnalysis<AAResultsWrapperPass>().getAAResults(),
314  getAnalysis<LoopInfoWrapperPass>().getLoopInfo(),
315  getAnalysis<DominatorTreeWrapperPass>().getDomTree(),
316  getAnalysis<BlockFrequencyInfoWrapperPass>().getBFI(),
317  SE ? &SE->getSE() : nullptr);
318  }
319 
320  void getAnalysisUsage(AnalysisUsage &AU) const override {
321  AU.setPreservesCFG();
324  }
325 };
326 }
327 
328 char LegacyLoopSinkPass::ID = 0;
329 INITIALIZE_PASS_BEGIN(LegacyLoopSinkPass, "loop-sink", "Loop Sink", false,
330  false)
333 INITIALIZE_PASS_END(LegacyLoopSinkPass, "loop-sink", "Loop Sink", false, false)
334 
335 Pass *llvm::createLoopSinkPass() { return new LegacyLoopSinkPass(); }
MachineLoop * L
Pass interface - Implemented by all 'passes'.
Definition: Pass.h:81
use_iterator use_end()
Definition: Value.h:318
use_iterator_impl< Use > use_iterator
Definition: Value.h:304
iterator_range< use_iterator > uses()
Definition: Value.h:326
static PassRegistry * getPassRegistry()
getPassRegistry - Access the global registry object, which is automatically initialized at applicatio...
STATISTIC(NumFunctions,"Total number of functions")
size_t i
This header provides classes for managing a pipeline of passes over loops in LLVM IR...
static cl::opt< unsigned > SinkFrequencyPercentThreshold("sink-freq-percent-threshold", cl::Hidden, cl::init(90), cl::desc("Do not sink instructions that require cloning unless they ""execute less than this percent of the time."))
The main scalar evolution driver.
This file contains the declarations for metadata subclasses.
bool all_of(R &&range, UnaryPredicate P)
Provide wrappers to std::all_of which take ranges instead of having to pass begin/end explicitly...
Definition: STLExtras.h:736
const Function * getParent() const
Return the enclosing method, or null if none.
Definition: BasicBlock.h:100
bool hasLoopInvariantOperands(const Instruction *I) const
Return true if all the operands of the specified instruction are loop invariant.
Definition: LoopInfo.cpp:61
reverse_iterator rend()
Definition: BasicBlock.h:235
reverse_iterator rbegin()
Definition: BasicBlock.h:233
LoopT * getLoopFor(const BlockT *BB) const
Return the inner most loop that BB lives in.
Definition: LoopInfo.h:575
unsigned replaceDominatedUsesWith(Value *From, Value *To, DominatorTree &DT, const BasicBlockEdge &Edge)
Replace each use of 'From' with 'To' if that use is dominated by the given edge.
Definition: Local.cpp:1758
StringRef getName() const
Return a constant reference to the value's name.
Definition: Value.cpp:191
A templated base class for SmallPtrSet which provides the typesafe interface that is common across al...
Definition: SmallPtrSet.h:345
AnalysisUsage & addRequired()
#define INITIALIZE_PASS_DEPENDENCY(depName)
Definition: PassSupport.h:53
This is the interface for a SCEV-based alias analysis.
A Use represents the edge between a Value definition and its users.
Definition: Use.h:56
This class consists of common code factored out of the SmallVector class to reduce code duplication b...
Definition: APFloat.h:32
Legacy analysis pass which computes BlockFrequencyInfo.
void setName(const Twine &Name)
Change the name of the value.
Definition: Value.cpp:257
Instruction * clone() const
Create a copy of 'this' instruction that is identical in all ways except the following: ...
#define T
Function Alias Analysis false
static GCRegistry::Add< OcamlGC > B("ocaml","ocaml 3.10-compatible GC")
Optional< uint64_t > getEntryCount() const
Get the entry count for this function.
Definition: Function.cpp:1287
Concrete subclass of DominatorTreeBase that is used to compute a normal dominator tree...
Definition: Dominators.h:96
void forgetLoopDispositions(const Loop *L)
Called when the client has changed the disposition of values in this loop.
static GCRegistry::Add< CoreCLRGC > E("coreclr","CoreCLR-compatible GC")
iterator_range< block_iterator > blocks() const
Definition: LoopInfo.h:143
BlockFrequencyInfo pass uses BlockFrequencyInfoImpl implementation to estimate IR basic block frequen...
initializer< Ty > init(const Ty &Val)
Definition: CommandLine.h:395
BlockT * getLoopPreheader() const
If there is a preheader for this loop, return it.
Definition: LoopInfoImpl.h:109
void set(Value *Val)
Definition: Value.h:624
void insertBefore(Instruction *InsertPos)
Insert an unlinked instruction into a basic block immediately before the specified instruction...
Definition: Instruction.cpp:82
LLVM Basic Block Representation.
Definition: BasicBlock.h:51
size_type size() const
Definition: SmallPtrSet.h:99
LLVM_ATTRIBUTE_ALWAYS_INLINE iterator begin()
Definition: SmallVector.h:115
std::pair< iterator, bool > insert(PtrType Ptr)
Inserts Ptr if and only if there is no element in the container equal to Ptr.
Definition: SmallPtrSet.h:368
Represent the analysis usage information of a pass.
bool contains(const LoopT *L) const
Return true if the specified loop is contained within in this loop.
Definition: LoopInfo.h:109
INITIALIZE_PASS_END(RegBankSelect, DEBUG_TYPE,"Assign register bank of generic virtual registers", false, false) RegBankSelect
User * getUser() const
Returns the User that contains this Use.
Definition: Use.cpp:41
iterator begin() const
Definition: SmallPtrSet.h:398
static bool sinkLoopInvariantInstructions(Loop &L, AAResults &AA, LoopInfo &LI, DominatorTree &DT, BlockFrequencyInfo &BFI, ScalarEvolution *SE)
Sinks instructions from loop's preheader to the loop body if the sum frequency of inserted copy is sm...
Definition: LoopSink.cpp:238
Pass * createLoopSinkPass()
LLVM_NODISCARD bool empty() const
Definition: SmallPtrSet.h:98
bool dominates(const Instruction *Def, const Use &U) const
Return true if Def dominates a use in User.
Definition: Dominators.cpp:218
SmallPtrSet - This class implements a set which is optimized for holding SmallSize or less elements...
Definition: SmallPtrSet.h:425
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small...
Definition: SmallVector.h:843
static SmallPtrSet< BasicBlock *, 2 > findBBsToSinkInto(const Loop &L, const SmallPtrSetImpl< BasicBlock * > &UseBBs, const SmallVectorImpl< BasicBlock * > &ColdLoopBBs, DominatorTree &DT, BlockFrequencyInfo &BFI)
Return a set of basic blocks to insert sinked instructions.
Definition: LoopSink.cpp:119
void setPreservesCFG()
This function should be called by the pass, iff they do not:
Definition: Pass.cpp:276
void add(Value *Ptr, uint64_t Size, const AAMDNodes &AAInfo)
These methods are used to add different types of instructions to the alias sets.
raw_ostream & dbgs()
dbgs() - This returns a reference to a raw_ostream for debugging messages.
Definition: Debug.cpp:132
iterator insert(iterator I, T &&Elt)
Definition: SmallVector.h:464
use_iterator use_begin()
Definition: Value.h:310
iterator end() const
Definition: SmallPtrSet.h:405
LLVM_ATTRIBUTE_ALWAYS_INLINE iterator end()
Definition: SmallVector.h:119
Represents a single loop in the control flow graph.
Definition: LoopInfo.h:368
#define I(x, y, z)
Definition: MD5.cpp:54
#define N
void getLoopAnalysisUsage(AnalysisUsage &AU)
Helper to consistently add the set of standard passes to a loop pass's AnalysisUsage.
Definition: LoopUtils.cpp:938
static cl::opt< unsigned > MaxNumberOfUseBBsForSinking("max-uses-for-sinking", cl::Hidden, cl::init(30), cl::desc("Do not sink instructions that have too many uses."))
INITIALIZE_PASS_BEGIN(LegacyLoopSinkPass,"loop-sink","Loop Sink", false, false) Pass *llvm
Definition: LoopSink.cpp:329
assert(ImpDefSCC.getReg()==AMDGPU::SCC &&ImpDefSCC.isDef())
void initializeLegacyLoopSinkPassPass(PassRegistry &)
void moveBefore(Instruction *MovePos)
Unlink this instruction from its current basic block and insert it into the basic block that MovePos ...
Definition: Instruction.cpp:95
#define DEBUG(X)
Definition: Debug.h:100
This is the interface for LLVM's primary stateless and local alias analysis.
bool canSinkOrHoistInst(Instruction &I, AAResults *AA, DominatorTree *DT, Loop *CurLoop, AliasSetTracker *CurAST, LoopSafetyInfo *SafetyInfo, OptimizationRemarkEmitter *ORE=nullptr)
Returns true if the hoister and sinker can handle this instruction.
Definition: LICM.cpp:480
machine sink
When an instruction is found to only be used outside of the loop, this function moves it to the exit ...
static GCRegistry::Add< ErlangGC > A("erlang","erlang-compatible garbage collector")
static BlockFrequency adjustedSumFreq(SmallPtrSetImpl< BasicBlock * > &BBs, BlockFrequencyInfo &BFI)
Return adjusted total frequency of BBs.
Definition: LoopSink.cpp:81
const BasicBlock * getParent() const
Definition: Instruction.h:62
static bool sinkInstruction(Loop &L, Instruction &I, const SmallVectorImpl< BasicBlock * > &ColdLoopBBs, const SmallDenseMap< BasicBlock *, int, 16 > &LoopBlockNumber, LoopInfo &LI, DominatorTree &DT, BlockFrequencyInfo &BFI)
Definition: LoopSink.cpp:165
BlockFrequency getBlockFreq(const BasicBlock *BB) const
getblockFreq - Return block frequency.