LLVM  14.0.0git
PartialInlining.cpp
Go to the documentation of this file.
1 //===- PartialInlining.cpp - Inline parts of functions --------------------===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 // This pass performs partial inlining, typically by inlining an if statement
10 // that surrounds the body of the function.
11 //
12 //===----------------------------------------------------------------------===//
13 
15 #include "llvm/ADT/DenseMap.h"
16 #include "llvm/ADT/DenseSet.h"
17 #include "llvm/ADT/None.h"
18 #include "llvm/ADT/Optional.h"
19 #include "llvm/ADT/STLExtras.h"
20 #include "llvm/ADT/SmallVector.h"
21 #include "llvm/ADT/Statistic.h"
25 #include "llvm/Analysis/LoopInfo.h"
30 #include "llvm/IR/Attributes.h"
31 #include "llvm/IR/BasicBlock.h"
32 #include "llvm/IR/CFG.h"
33 #include "llvm/IR/DebugLoc.h"
34 #include "llvm/IR/DiagnosticInfo.h"
35 #include "llvm/IR/Dominators.h"
36 #include "llvm/IR/Function.h"
37 #include "llvm/IR/InstrTypes.h"
38 #include "llvm/IR/Instruction.h"
39 #include "llvm/IR/Instructions.h"
40 #include "llvm/IR/IntrinsicInst.h"
41 #include "llvm/IR/Intrinsics.h"
42 #include "llvm/IR/Module.h"
43 #include "llvm/IR/User.h"
44 #include "llvm/InitializePasses.h"
45 #include "llvm/Pass.h"
48 #include "llvm/Support/Casting.h"
51 #include "llvm/Transforms/IPO.h"
55 #include <algorithm>
56 #include <cassert>
57 #include <cstdint>
58 #include <functional>
59 #include <iterator>
60 #include <memory>
61 #include <tuple>
62 #include <vector>
63 
64 using namespace llvm;
65 
66 #define DEBUG_TYPE "partial-inlining"
67 
68 STATISTIC(NumPartialInlined,
69  "Number of callsites functions partially inlined into.");
70 STATISTIC(NumColdOutlinePartialInlined, "Number of times functions with "
71  "cold outlined regions were partially "
72  "inlined into its caller(s).");
73 STATISTIC(NumColdRegionsFound,
74  "Number of cold single entry/exit regions found.");
75 STATISTIC(NumColdRegionsOutlined,
76  "Number of cold single entry/exit regions outlined.");
77 
78 // Command line option to disable partial-inlining. The default is false:
79 static cl::opt<bool>
80  DisablePartialInlining("disable-partial-inlining", cl::init(false),
81  cl::Hidden, cl::desc("Disable partial inlining"));
82 // Command line option to disable multi-region partial-inlining. The default is
83 // false:
85  "disable-mr-partial-inlining", cl::init(false), cl::Hidden,
86  cl::desc("Disable multi-region partial inlining"));
87 
88 // Command line option to force outlining in regions with live exit variables.
89 // The default is false:
90 static cl::opt<bool>
91  ForceLiveExit("pi-force-live-exit-outline", cl::init(false), cl::Hidden,
92  cl::desc("Force outline regions with live exits"));
93 
94 // Command line option to enable marking outline functions with Cold Calling
95 // Convention. The default is false:
96 static cl::opt<bool>
97  MarkOutlinedColdCC("pi-mark-coldcc", cl::init(false), cl::Hidden,
98  cl::desc("Mark outline function calls with ColdCC"));
99 
100 // This is an option used by testing:
101 static cl::opt<bool> SkipCostAnalysis("skip-partial-inlining-cost-analysis",
102  cl::init(false), cl::ZeroOrMore,
104  cl::desc("Skip Cost Analysis"));
105 // Used to determine if a cold region is worth outlining based on
106 // its inlining cost compared to the original function. Default is set at 10%.
107 // ie. if the cold region reduces the inlining cost of the original function by
108 // at least 10%.
110  "min-region-size-ratio", cl::init(0.1), cl::Hidden,
111  cl::desc("Minimum ratio comparing relative sizes of each "
112  "outline candidate and original function"));
113 // Used to tune the minimum number of execution counts needed in the predecessor
114 // block to the cold edge. ie. confidence interval.
115 static cl::opt<unsigned>
116  MinBlockCounterExecution("min-block-execution", cl::init(100), cl::Hidden,
117  cl::desc("Minimum block executions to consider "
118  "its BranchProbabilityInfo valid"));
119 // Used to determine when an edge is considered cold. Default is set to 10%. ie.
120 // if the branch probability is 10% or less, then it is deemed as 'cold'.
122  "cold-branch-ratio", cl::init(0.1), cl::Hidden,
123  cl::desc("Minimum BranchProbability to consider a region cold."));
124 
126  "max-num-inline-blocks", cl::init(5), cl::Hidden,
127  cl::desc("Max number of blocks to be partially inlined"));
128 
129 // Command line option to set the maximum number of partial inlining allowed
130 // for the module. The default value of -1 means no limit.
132  "max-partial-inlining", cl::init(-1), cl::Hidden, cl::ZeroOrMore,
133  cl::desc("Max number of partial inlining. The default is unlimited"));
134 
135 // Used only when PGO or user annotated branch data is absent. It is
136 // the least value that is used to weigh the outline region. If BFI
137 // produces larger value, the BFI value will be used.
138 static cl::opt<int>
139  OutlineRegionFreqPercent("outline-region-freq-percent", cl::init(75),
141  cl::desc("Relative frequency of outline region to "
142  "the entry block"));
143 
145  "partial-inlining-extra-penalty", cl::init(0), cl::Hidden,
146  cl::desc("A debug option to add additional penalty to the computed one."));
147 
148 namespace {
149 
150 struct FunctionOutliningInfo {
151  FunctionOutliningInfo() = default;
152 
153  // Returns the number of blocks to be inlined including all blocks
154  // in Entries and one return block.
155  unsigned getNumInlinedBlocks() const { return Entries.size() + 1; }
156 
157  // A set of blocks including the function entry that guard
158  // the region to be outlined.
160 
161  // The return block that is not included in the outlined region.
162  BasicBlock *ReturnBlock = nullptr;
163 
164  // The dominating block of the region to be outlined.
165  BasicBlock *NonReturnBlock = nullptr;
166 
167  // The set of blocks in Entries that that are predecessors to ReturnBlock
168  SmallVector<BasicBlock *, 4> ReturnBlockPreds;
169 };
170 
171 struct FunctionOutliningMultiRegionInfo {
172  FunctionOutliningMultiRegionInfo()
173  : ORI() {}
174 
175  // Container for outline regions
176  struct OutlineRegionInfo {
177  OutlineRegionInfo(ArrayRef<BasicBlock *> Region,
178  BasicBlock *EntryBlock, BasicBlock *ExitBlock,
179  BasicBlock *ReturnBlock)
180  : Region(Region.begin(), Region.end()), EntryBlock(EntryBlock),
181  ExitBlock(ExitBlock), ReturnBlock(ReturnBlock) {}
183  BasicBlock *EntryBlock;
184  BasicBlock *ExitBlock;
185  BasicBlock *ReturnBlock;
186  };
187 
189 };
190 
191 struct PartialInlinerImpl {
192 
193  PartialInlinerImpl(
195  function_ref<AssumptionCache *(Function &)> LookupAC,
197  function_ref<const TargetLibraryInfo &(Function &)> GTLI,
198  ProfileSummaryInfo &ProfSI,
199  function_ref<BlockFrequencyInfo &(Function &)> GBFI = nullptr)
200  : GetAssumptionCache(GetAC), LookupAssumptionCache(LookupAC),
201  GetTTI(GTTI), GetBFI(GBFI), GetTLI(GTLI), PSI(ProfSI) {}
202 
203  bool run(Module &M);
204  // Main part of the transformation that calls helper functions to find
205  // outlining candidates, clone & outline the function, and attempt to
206  // partially inline the resulting function. Returns true if
207  // inlining was successful, false otherwise. Also returns the outline
208  // function (only if we partially inlined early returns) as there is a
209  // possibility to further "peel" early return statements that were left in the
210  // outline function due to code size.
211  std::pair<bool, Function *> unswitchFunction(Function &F);
212 
213  // This class speculatively clones the function to be partial inlined.
214  // At the end of partial inlining, the remaining callsites to the cloned
215  // function that are not partially inlined will be fixed up to reference
216  // the original function, and the cloned function will be erased.
217  struct FunctionCloner {
218  // Two constructors, one for single region outlining, the other for
219  // multi-region outlining.
220  FunctionCloner(Function *F, FunctionOutliningInfo *OI,
222  function_ref<AssumptionCache *(Function &)> LookupAC,
224  FunctionCloner(Function *F, FunctionOutliningMultiRegionInfo *OMRI,
226  function_ref<AssumptionCache *(Function &)> LookupAC,
228 
229  ~FunctionCloner();
230 
231  // Prepare for function outlining: making sure there is only
232  // one incoming edge from the extracted/outlined region to
233  // the return block.
234  void normalizeReturnBlock() const;
235 
236  // Do function outlining for cold regions.
237  bool doMultiRegionFunctionOutlining();
238  // Do function outlining for region after early return block(s).
239  // NOTE: For vararg functions that do the vararg handling in the outlined
240  // function, we temporarily generate IR that does not properly
241  // forward varargs to the outlined function. Calling InlineFunction
242  // will update calls to the outlined functions to properly forward
243  // the varargs.
244  Function *doSingleRegionFunctionOutlining();
245 
246  Function *OrigFunc = nullptr;
247  Function *ClonedFunc = nullptr;
248 
249  typedef std::pair<Function *, BasicBlock *> FuncBodyCallerPair;
250  // Keep track of Outlined Functions and the basic block they're called from.
251  SmallVector<FuncBodyCallerPair, 4> OutlinedFunctions;
252 
253  // ClonedFunc is inlined in one of its callers after function
254  // outlining.
255  bool IsFunctionInlined = false;
256  // The cost of the region to be outlined.
257  InstructionCost OutlinedRegionCost = 0;
258  // ClonedOI is specific to outlining non-early return blocks.
259  std::unique_ptr<FunctionOutliningInfo> ClonedOI = nullptr;
260  // ClonedOMRI is specific to outlining cold regions.
261  std::unique_ptr<FunctionOutliningMultiRegionInfo> ClonedOMRI = nullptr;
262  std::unique_ptr<BlockFrequencyInfo> ClonedFuncBFI = nullptr;
264  function_ref<AssumptionCache *(Function &)> LookupAC;
266  };
267 
268 private:
269  int NumPartialInlining = 0;
270  function_ref<AssumptionCache &(Function &)> GetAssumptionCache;
271  function_ref<AssumptionCache *(Function &)> LookupAssumptionCache;
274  function_ref<const TargetLibraryInfo &(Function &)> GetTLI;
275  ProfileSummaryInfo &PSI;
276 
277  // Return the frequency of the OutlininingBB relative to F's entry point.
278  // The result is no larger than 1 and is represented using BP.
279  // (Note that the outlined region's 'head' block can only have incoming
280  // edges from the guarding entry blocks).
282  getOutliningCallBBRelativeFreq(FunctionCloner &Cloner) const;
283 
284  // Return true if the callee of CB should be partially inlined with
285  // profit.
286  bool shouldPartialInline(CallBase &CB, FunctionCloner &Cloner,
287  BlockFrequency WeightedOutliningRcost,
288  OptimizationRemarkEmitter &ORE) const;
289 
290  // Try to inline DuplicateFunction (cloned from F with call to
291  // the OutlinedFunction into its callers. Return true
292  // if there is any successful inlining.
293  bool tryPartialInline(FunctionCloner &Cloner);
294 
295  // Compute the mapping from use site of DuplicationFunction to the enclosing
296  // BB's profile count.
297  void
298  computeCallsiteToProfCountMap(Function *DuplicateFunction,
299  DenseMap<User *, uint64_t> &SiteCountMap) const;
300 
301  bool isLimitReached() const {
302  return (MaxNumPartialInlining != -1 &&
303  NumPartialInlining >= MaxNumPartialInlining);
304  }
305 
306  static CallBase *getSupportedCallBase(User *U) {
307  if (isa<CallInst>(U) || isa<InvokeInst>(U))
308  return cast<CallBase>(U);
309  llvm_unreachable("All uses must be calls");
310  return nullptr;
311  }
312 
313  static CallBase *getOneCallSiteTo(Function &F) {
314  User *User = *F.user_begin();
315  return getSupportedCallBase(User);
316  }
317 
318  std::tuple<DebugLoc, BasicBlock *> getOneDebugLoc(Function &F) const {
319  CallBase *CB = getOneCallSiteTo(F);
320  DebugLoc DLoc = CB->getDebugLoc();
321  BasicBlock *Block = CB->getParent();
322  return std::make_tuple(DLoc, Block);
323  }
324 
325  // Returns the costs associated with function outlining:
326  // - The first value is the non-weighted runtime cost for making the call
327  // to the outlined function, including the addtional setup cost in the
328  // outlined function itself;
329  // - The second value is the estimated size of the new call sequence in
330  // basic block Cloner.OutliningCallBB;
331  std::tuple<InstructionCost, InstructionCost>
332  computeOutliningCosts(FunctionCloner &Cloner) const;
333 
334  // Compute the 'InlineCost' of block BB. InlineCost is a proxy used to
335  // approximate both the size and runtime cost (Note that in the current
336  // inline cost analysis, there is no clear distinction there either).
337  static InstructionCost computeBBInlineCost(BasicBlock *BB,
339 
340  std::unique_ptr<FunctionOutliningInfo>
341  computeOutliningInfo(Function &F) const;
342 
343  std::unique_ptr<FunctionOutliningMultiRegionInfo>
344  computeOutliningColdRegionsInfo(Function &F,
345  OptimizationRemarkEmitter &ORE) const;
346 };
347 
348 struct PartialInlinerLegacyPass : public ModulePass {
349  static char ID; // Pass identification, replacement for typeid
350 
351  PartialInlinerLegacyPass() : ModulePass(ID) {
353  }
354 
355  void getAnalysisUsage(AnalysisUsage &AU) const override {
360  }
361 
362  bool runOnModule(Module &M) override {
363  if (skipModule(M))
364  return false;
365 
366  AssumptionCacheTracker *ACT = &getAnalysis<AssumptionCacheTracker>();
368  &getAnalysis<TargetTransformInfoWrapperPass>();
369  ProfileSummaryInfo &PSI =
370  getAnalysis<ProfileSummaryInfoWrapperPass>().getPSI();
371 
372  auto GetAssumptionCache = [&ACT](Function &F) -> AssumptionCache & {
373  return ACT->getAssumptionCache(F);
374  };
375 
376  auto LookupAssumptionCache = [ACT](Function &F) -> AssumptionCache * {
377  return ACT->lookupAssumptionCache(F);
378  };
379 
380  auto GetTTI = [&TTIWP](Function &F) -> TargetTransformInfo & {
381  return TTIWP->getTTI(F);
382  };
383 
384  auto GetTLI = [this](Function &F) -> TargetLibraryInfo & {
385  return this->getAnalysis<TargetLibraryInfoWrapperPass>().getTLI(F);
386  };
387 
388  return PartialInlinerImpl(GetAssumptionCache, LookupAssumptionCache, GetTTI,
389  GetTLI, PSI)
390  .run(M);
391  }
392 };
393 
394 } // end anonymous namespace
395 
396 std::unique_ptr<FunctionOutliningMultiRegionInfo>
397 PartialInlinerImpl::computeOutliningColdRegionsInfo(
398  Function &F, OptimizationRemarkEmitter &ORE) const {
399  BasicBlock *EntryBlock = &F.front();
400 
401  DominatorTree DT(F);
402  LoopInfo LI(DT);
403  BranchProbabilityInfo BPI(F, LI);
404  std::unique_ptr<BlockFrequencyInfo> ScopedBFI;
406  if (!GetBFI) {
407  ScopedBFI.reset(new BlockFrequencyInfo(F, BPI, LI));
408  BFI = ScopedBFI.get();
409  } else
410  BFI = &(GetBFI(F));
411 
412  // Return if we don't have profiling information.
413  if (!PSI.hasInstrumentationProfile())
414  return std::unique_ptr<FunctionOutliningMultiRegionInfo>();
415 
416  std::unique_ptr<FunctionOutliningMultiRegionInfo> OutliningInfo =
417  std::make_unique<FunctionOutliningMultiRegionInfo>();
418 
419  auto IsSingleExit =
420  [&ORE](SmallVectorImpl<BasicBlock *> &BlockList) -> BasicBlock * {
421  BasicBlock *ExitBlock = nullptr;
422  for (auto *Block : BlockList) {
423  for (BasicBlock *Succ : successors(Block)) {
424  if (!is_contained(BlockList, Succ)) {
425  if (ExitBlock) {
426  ORE.emit([&]() {
427  return OptimizationRemarkMissed(DEBUG_TYPE, "MultiExitRegion",
428  &Succ->front())
429  << "Region dominated by "
430  << ore::NV("Block", BlockList.front()->getName())
431  << " has more than one region exit edge.";
432  });
433  return nullptr;
434  }
435 
436  ExitBlock = Block;
437  }
438  }
439  }
440  return ExitBlock;
441  };
442 
443  auto BBProfileCount = [BFI](BasicBlock *BB) {
444  return BFI->getBlockProfileCount(BB).getValueOr(0);
445  };
446 
447  // Use the same computeBBInlineCost function to compute the cost savings of
448  // the outlining the candidate region.
449  TargetTransformInfo *FTTI = &GetTTI(F);
450  InstructionCost OverallFunctionCost = 0;
451  for (auto &BB : F)
452  OverallFunctionCost += computeBBInlineCost(&BB, FTTI);
453 
454  LLVM_DEBUG(dbgs() << "OverallFunctionCost = " << OverallFunctionCost
455  << "\n";);
456 
457  InstructionCost MinOutlineRegionCost = OverallFunctionCost.map(
458  [&](auto Cost) { return Cost * MinRegionSizeRatio; });
459 
460  BranchProbability MinBranchProbability(
461  static_cast<int>(ColdBranchRatio * MinBlockCounterExecution),
463  bool ColdCandidateFound = false;
464  BasicBlock *CurrEntry = EntryBlock;
465  std::vector<BasicBlock *> DFS;
466  DenseMap<BasicBlock *, bool> VisitedMap;
467  DFS.push_back(CurrEntry);
468  VisitedMap[CurrEntry] = true;
469 
470  // Use Depth First Search on the basic blocks to find CFG edges that are
471  // considered cold.
472  // Cold regions considered must also have its inline cost compared to the
473  // overall inline cost of the original function. The region is outlined only
474  // if it reduced the inline cost of the function by 'MinOutlineRegionCost' or
475  // more.
476  while (!DFS.empty()) {
477  auto *ThisBB = DFS.back();
478  DFS.pop_back();
479  // Only consider regions with predecessor blocks that are considered
480  // not-cold (default: part of the top 99.99% of all block counters)
481  // AND greater than our minimum block execution count (default: 100).
482  if (PSI.isColdBlock(ThisBB, BFI) ||
483  BBProfileCount(ThisBB) < MinBlockCounterExecution)
484  continue;
485  for (auto SI = succ_begin(ThisBB); SI != succ_end(ThisBB); ++SI) {
486  if (VisitedMap[*SI])
487  continue;
488  VisitedMap[*SI] = true;
489  DFS.push_back(*SI);
490  // If branch isn't cold, we skip to the next one.
491  BranchProbability SuccProb = BPI.getEdgeProbability(ThisBB, *SI);
492  if (SuccProb > MinBranchProbability)
493  continue;
494 
495  LLVM_DEBUG(dbgs() << "Found cold edge: " << ThisBB->getName() << "->"
496  << SI->getName()
497  << "\nBranch Probability = " << SuccProb << "\n";);
498 
499  SmallVector<BasicBlock *, 8> DominateVector;
500  DT.getDescendants(*SI, DominateVector);
501  assert(!DominateVector.empty() &&
502  "SI should be reachable and have at least itself as descendant");
503 
504  // We can only outline single entry regions (for now).
505  if (!DominateVector.front()->hasNPredecessors(1)) {
506  LLVM_DEBUG(dbgs() << "ABORT: Block " << SI->getName()
507  << " doesn't have a single predecessor in the "
508  "dominator tree\n";);
509  continue;
510  }
511 
512  BasicBlock *ExitBlock = nullptr;
513  // We can only outline single exit regions (for now).
514  if (!(ExitBlock = IsSingleExit(DominateVector))) {
515  LLVM_DEBUG(dbgs() << "ABORT: Block " << SI->getName()
516  << " doesn't have a unique successor\n";);
517  continue;
518  }
519 
520  InstructionCost OutlineRegionCost = 0;
521  for (auto *BB : DominateVector)
522  OutlineRegionCost += computeBBInlineCost(BB, &GetTTI(*BB->getParent()));
523 
524  LLVM_DEBUG(dbgs() << "OutlineRegionCost = " << OutlineRegionCost
525  << "\n";);
526 
527  if (!SkipCostAnalysis && OutlineRegionCost < MinOutlineRegionCost) {
528  ORE.emit([&]() {
529  return OptimizationRemarkAnalysis(DEBUG_TYPE, "TooCostly",
530  &SI->front())
531  << ore::NV("Callee", &F)
532  << " inline cost-savings smaller than "
533  << ore::NV("Cost", MinOutlineRegionCost);
534  });
535 
536  LLVM_DEBUG(dbgs() << "ABORT: Outline region cost is smaller than "
537  << MinOutlineRegionCost << "\n";);
538  continue;
539  }
540 
541  // For now, ignore blocks that belong to a SISE region that is a
542  // candidate for outlining. In the future, we may want to look
543  // at inner regions because the outer region may have live-exit
544  // variables.
545  for (auto *BB : DominateVector)
546  VisitedMap[BB] = true;
547 
548  // ReturnBlock here means the block after the outline call
549  BasicBlock *ReturnBlock = ExitBlock->getSingleSuccessor();
550  FunctionOutliningMultiRegionInfo::OutlineRegionInfo RegInfo(
551  DominateVector, DominateVector.front(), ExitBlock, ReturnBlock);
552  OutliningInfo->ORI.push_back(RegInfo);
553  LLVM_DEBUG(dbgs() << "Found Cold Candidate starting at block: "
554  << DominateVector.front()->getName() << "\n";);
555  ColdCandidateFound = true;
556  NumColdRegionsFound++;
557  }
558  }
559 
560  if (ColdCandidateFound)
561  return OutliningInfo;
562 
563  return std::unique_ptr<FunctionOutliningMultiRegionInfo>();
564 }
565 
566 std::unique_ptr<FunctionOutliningInfo>
567 PartialInlinerImpl::computeOutliningInfo(Function &F) const {
568  BasicBlock *EntryBlock = &F.front();
569  BranchInst *BR = dyn_cast<BranchInst>(EntryBlock->getTerminator());
570  if (!BR || BR->isUnconditional())
571  return std::unique_ptr<FunctionOutliningInfo>();
572 
573  // Returns true if Succ is BB's successor
574  auto IsSuccessor = [](BasicBlock *Succ, BasicBlock *BB) {
575  return is_contained(successors(BB), Succ);
576  };
577 
578  auto IsReturnBlock = [](BasicBlock *BB) {
579  Instruction *TI = BB->getTerminator();
580  return isa<ReturnInst>(TI);
581  };
582 
583  auto GetReturnBlock = [&](BasicBlock *Succ1, BasicBlock *Succ2) {
584  if (IsReturnBlock(Succ1))
585  return std::make_tuple(Succ1, Succ2);
586  if (IsReturnBlock(Succ2))
587  return std::make_tuple(Succ2, Succ1);
588 
589  return std::make_tuple<BasicBlock *, BasicBlock *>(nullptr, nullptr);
590  };
591 
592  // Detect a triangular shape:
593  auto GetCommonSucc = [&](BasicBlock *Succ1, BasicBlock *Succ2) {
594  if (IsSuccessor(Succ1, Succ2))
595  return std::make_tuple(Succ1, Succ2);
596  if (IsSuccessor(Succ2, Succ1))
597  return std::make_tuple(Succ2, Succ1);
598 
599  return std::make_tuple<BasicBlock *, BasicBlock *>(nullptr, nullptr);
600  };
601 
602  std::unique_ptr<FunctionOutliningInfo> OutliningInfo =
603  std::make_unique<FunctionOutliningInfo>();
604 
605  BasicBlock *CurrEntry = EntryBlock;
606  bool CandidateFound = false;
607  do {
608  // The number of blocks to be inlined has already reached
609  // the limit. When MaxNumInlineBlocks is set to 0 or 1, this
610  // disables partial inlining for the function.
611  if (OutliningInfo->getNumInlinedBlocks() >= MaxNumInlineBlocks)
612  break;
613 
614  if (succ_size(CurrEntry) != 2)
615  break;
616 
617  BasicBlock *Succ1 = *succ_begin(CurrEntry);
618  BasicBlock *Succ2 = *(succ_begin(CurrEntry) + 1);
619 
620  BasicBlock *ReturnBlock, *NonReturnBlock;
621  std::tie(ReturnBlock, NonReturnBlock) = GetReturnBlock(Succ1, Succ2);
622 
623  if (ReturnBlock) {
624  OutliningInfo->Entries.push_back(CurrEntry);
625  OutliningInfo->ReturnBlock = ReturnBlock;
626  OutliningInfo->NonReturnBlock = NonReturnBlock;
627  CandidateFound = true;
628  break;
629  }
630 
631  BasicBlock *CommSucc, *OtherSucc;
632  std::tie(CommSucc, OtherSucc) = GetCommonSucc(Succ1, Succ2);
633 
634  if (!CommSucc)
635  break;
636 
637  OutliningInfo->Entries.push_back(CurrEntry);
638  CurrEntry = OtherSucc;
639  } while (true);
640 
641  if (!CandidateFound)
642  return std::unique_ptr<FunctionOutliningInfo>();
643 
644  // There should not be any successors (not in the entry set) other than
645  // {ReturnBlock, NonReturnBlock}
646  assert(OutliningInfo->Entries[0] == &F.front() &&
647  "Function Entry must be the first in Entries vector");
648  DenseSet<BasicBlock *> Entries;
649  for (BasicBlock *E : OutliningInfo->Entries)
650  Entries.insert(E);
651 
652  // Returns true of BB has Predecessor which is not
653  // in Entries set.
654  auto HasNonEntryPred = [Entries](BasicBlock *BB) {
655  for (auto *Pred : predecessors(BB)) {
656  if (!Entries.count(Pred))
657  return true;
658  }
659  return false;
660  };
661  auto CheckAndNormalizeCandidate =
662  [Entries, HasNonEntryPred](FunctionOutliningInfo *OutliningInfo) {
663  for (BasicBlock *E : OutliningInfo->Entries) {
664  for (auto *Succ : successors(E)) {
665  if (Entries.count(Succ))
666  continue;
667  if (Succ == OutliningInfo->ReturnBlock)
668  OutliningInfo->ReturnBlockPreds.push_back(E);
669  else if (Succ != OutliningInfo->NonReturnBlock)
670  return false;
671  }
672  // There should not be any outside incoming edges either:
673  if (HasNonEntryPred(E))
674  return false;
675  }
676  return true;
677  };
678 
679  if (!CheckAndNormalizeCandidate(OutliningInfo.get()))
680  return std::unique_ptr<FunctionOutliningInfo>();
681 
682  // Now further growing the candidate's inlining region by
683  // peeling off dominating blocks from the outlining region:
684  while (OutliningInfo->getNumInlinedBlocks() < MaxNumInlineBlocks) {
685  BasicBlock *Cand = OutliningInfo->NonReturnBlock;
686  if (succ_size(Cand) != 2)
687  break;
688 
689  if (HasNonEntryPred(Cand))
690  break;
691 
692  BasicBlock *Succ1 = *succ_begin(Cand);
693  BasicBlock *Succ2 = *(succ_begin(Cand) + 1);
694 
695  BasicBlock *ReturnBlock, *NonReturnBlock;
696  std::tie(ReturnBlock, NonReturnBlock) = GetReturnBlock(Succ1, Succ2);
697  if (!ReturnBlock || ReturnBlock != OutliningInfo->ReturnBlock)
698  break;
699 
700  if (NonReturnBlock->getSinglePredecessor() != Cand)
701  break;
702 
703  // Now grow and update OutlininigInfo:
704  OutliningInfo->Entries.push_back(Cand);
705  OutliningInfo->NonReturnBlock = NonReturnBlock;
706  OutliningInfo->ReturnBlockPreds.push_back(Cand);
707  Entries.insert(Cand);
708  }
709 
710  return OutliningInfo;
711 }
712 
713 // Check if there is PGO data or user annotated branch data:
714 static bool hasProfileData(const Function &F, const FunctionOutliningInfo &OI) {
715  if (F.hasProfileData())
716  return true;
717  // Now check if any of the entry block has MD_prof data:
718  for (auto *E : OI.Entries) {
719  BranchInst *BR = dyn_cast<BranchInst>(E->getTerminator());
720  if (!BR || BR->isUnconditional())
721  continue;
722  uint64_t T, F;
723  if (BR->extractProfMetadata(T, F))
724  return true;
725  }
726  return false;
727 }
728 
729 BranchProbability PartialInlinerImpl::getOutliningCallBBRelativeFreq(
730  FunctionCloner &Cloner) const {
731  BasicBlock *OutliningCallBB = Cloner.OutlinedFunctions.back().second;
732  auto EntryFreq =
733  Cloner.ClonedFuncBFI->getBlockFreq(&Cloner.ClonedFunc->getEntryBlock());
734  auto OutliningCallFreq =
735  Cloner.ClonedFuncBFI->getBlockFreq(OutliningCallBB);
736  // FIXME Hackery needed because ClonedFuncBFI is based on the function BEFORE
737  // we outlined any regions, so we may encounter situations where the
738  // OutliningCallFreq is *slightly* bigger than the EntryFreq.
739  if (OutliningCallFreq.getFrequency() > EntryFreq.getFrequency())
740  OutliningCallFreq = EntryFreq;
741 
742  auto OutlineRegionRelFreq = BranchProbability::getBranchProbability(
743  OutliningCallFreq.getFrequency(), EntryFreq.getFrequency());
744 
745  if (hasProfileData(*Cloner.OrigFunc, *Cloner.ClonedOI.get()))
746  return OutlineRegionRelFreq;
747 
748  // When profile data is not available, we need to be conservative in
749  // estimating the overall savings. Static branch prediction can usually
750  // guess the branch direction right (taken/non-taken), but the guessed
751  // branch probability is usually not biased enough. In case when the
752  // outlined region is predicted to be likely, its probability needs
753  // to be made higher (more biased) to not under-estimate the cost of
754  // function outlining. On the other hand, if the outlined region
755  // is predicted to be less likely, the predicted probablity is usually
756  // higher than the actual. For instance, the actual probability of the
757  // less likely target is only 5%, but the guessed probablity can be
758  // 40%. In the latter case, there is no need for further adjustement.
759  // FIXME: add an option for this.
760  if (OutlineRegionRelFreq < BranchProbability(45, 100))
761  return OutlineRegionRelFreq;
762 
763  OutlineRegionRelFreq = std::max(
764  OutlineRegionRelFreq, BranchProbability(OutlineRegionFreqPercent, 100));
765 
766  return OutlineRegionRelFreq;
767 }
768 
769 bool PartialInlinerImpl::shouldPartialInline(
770  CallBase &CB, FunctionCloner &Cloner, BlockFrequency WeightedOutliningRcost,
771  OptimizationRemarkEmitter &ORE) const {
772  using namespace ore;
773 
775  assert(Callee == Cloner.ClonedFunc);
776 
777  if (SkipCostAnalysis)
778  return isInlineViable(*Callee).isSuccess();
779 
780  Function *Caller = CB.getCaller();
781  auto &CalleeTTI = GetTTI(*Callee);
782  bool RemarksEnabled =
783  Callee->getContext().getDiagHandlerPtr()->isMissedOptRemarkEnabled(
784  DEBUG_TYPE);
785  InlineCost IC =
786  getInlineCost(CB, getInlineParams(), CalleeTTI, GetAssumptionCache,
787  GetTLI, GetBFI, &PSI, RemarksEnabled ? &ORE : nullptr);
788 
789  if (IC.isAlways()) {
790  ORE.emit([&]() {
791  return OptimizationRemarkAnalysis(DEBUG_TYPE, "AlwaysInline", &CB)
792  << NV("Callee", Cloner.OrigFunc)
793  << " should always be fully inlined, not partially";
794  });
795  return false;
796  }
797 
798  if (IC.isNever()) {
799  ORE.emit([&]() {
800  return OptimizationRemarkMissed(DEBUG_TYPE, "NeverInline", &CB)
801  << NV("Callee", Cloner.OrigFunc) << " not partially inlined into "
802  << NV("Caller", Caller)
803  << " because it should never be inlined (cost=never)";
804  });
805  return false;
806  }
807 
808  if (!IC) {
809  ORE.emit([&]() {
810  return OptimizationRemarkAnalysis(DEBUG_TYPE, "TooCostly", &CB)
811  << NV("Callee", Cloner.OrigFunc) << " not partially inlined into "
812  << NV("Caller", Caller) << " because too costly to inline (cost="
813  << NV("Cost", IC.getCost()) << ", threshold="
814  << NV("Threshold", IC.getCostDelta() + IC.getCost()) << ")";
815  });
816  return false;
817  }
818  const DataLayout &DL = Caller->getParent()->getDataLayout();
819 
820  // The savings of eliminating the call:
821  int NonWeightedSavings = getCallsiteCost(CB, DL);
822  BlockFrequency NormWeightedSavings(NonWeightedSavings);
823 
824  // Weighted saving is smaller than weighted cost, return false
825  if (NormWeightedSavings < WeightedOutliningRcost) {
826  ORE.emit([&]() {
827  return OptimizationRemarkAnalysis(DEBUG_TYPE, "OutliningCallcostTooHigh",
828  &CB)
829  << NV("Callee", Cloner.OrigFunc) << " not partially inlined into "
830  << NV("Caller", Caller) << " runtime overhead (overhead="
831  << NV("Overhead", (unsigned)WeightedOutliningRcost.getFrequency())
832  << ", savings="
833  << NV("Savings", (unsigned)NormWeightedSavings.getFrequency())
834  << ")"
835  << " of making the outlined call is too high";
836  });
837 
838  return false;
839  }
840 
841  ORE.emit([&]() {
842  return OptimizationRemarkAnalysis(DEBUG_TYPE, "CanBePartiallyInlined", &CB)
843  << NV("Callee", Cloner.OrigFunc) << " can be partially inlined into "
844  << NV("Caller", Caller) << " with cost=" << NV("Cost", IC.getCost())
845  << " (threshold="
846  << NV("Threshold", IC.getCostDelta() + IC.getCost()) << ")";
847  });
848  return true;
849 }
850 
851 // TODO: Ideally we should share Inliner's InlineCost Analysis code.
852 // For now use a simplified version. The returned 'InlineCost' will be used
853 // to esimate the size cost as well as runtime cost of the BB.
855 PartialInlinerImpl::computeBBInlineCost(BasicBlock *BB,
858  const DataLayout &DL = BB->getParent()->getParent()->getDataLayout();
859  for (Instruction &I : BB->instructionsWithoutDebug()) {
860  // Skip free instructions.
861  switch (I.getOpcode()) {
862  case Instruction::BitCast:
863  case Instruction::PtrToInt:
864  case Instruction::IntToPtr:
865  case Instruction::Alloca:
866  case Instruction::PHI:
867  continue;
868  case Instruction::GetElementPtr:
869  if (cast<GetElementPtrInst>(&I)->hasAllZeroIndices())
870  continue;
871  break;
872  default:
873  break;
874  }
875 
876  if (I.isLifetimeStartOrEnd())
877  continue;
878 
879  if (auto *II = dyn_cast<IntrinsicInst>(&I)) {
880  Intrinsic::ID IID = II->getIntrinsicID();
882  FastMathFlags FMF;
883  for (Value *Val : II->args())
884  Tys.push_back(Val->getType());
885 
886  if (auto *FPMO = dyn_cast<FPMathOperator>(II))
887  FMF = FPMO->getFastMathFlags();
888 
889  IntrinsicCostAttributes ICA(IID, II->getType(), Tys, FMF);
891  continue;
892  }
893 
894  if (CallInst *CI = dyn_cast<CallInst>(&I)) {
895  InlineCost += getCallsiteCost(*CI, DL);
896  continue;
897  }
898 
899  if (InvokeInst *II = dyn_cast<InvokeInst>(&I)) {
900  InlineCost += getCallsiteCost(*II, DL);
901  continue;
902  }
903 
904  if (SwitchInst *SI = dyn_cast<SwitchInst>(&I)) {
905  InlineCost += (SI->getNumCases() + 1) * InlineConstants::InstrCost;
906  continue;
907  }
909  }
910 
911  return InlineCost;
912 }
913 
914 std::tuple<InstructionCost, InstructionCost>
915 PartialInlinerImpl::computeOutliningCosts(FunctionCloner &Cloner) const {
916  InstructionCost OutliningFuncCallCost = 0, OutlinedFunctionCost = 0;
917  for (auto FuncBBPair : Cloner.OutlinedFunctions) {
918  Function *OutlinedFunc = FuncBBPair.first;
919  BasicBlock* OutliningCallBB = FuncBBPair.second;
920  // Now compute the cost of the call sequence to the outlined function
921  // 'OutlinedFunction' in BB 'OutliningCallBB':
922  auto *OutlinedFuncTTI = &GetTTI(*OutlinedFunc);
923  OutliningFuncCallCost +=
924  computeBBInlineCost(OutliningCallBB, OutlinedFuncTTI);
925 
926  // Now compute the cost of the extracted/outlined function itself:
927  for (BasicBlock &BB : *OutlinedFunc)
928  OutlinedFunctionCost += computeBBInlineCost(&BB, OutlinedFuncTTI);
929  }
930  assert(OutlinedFunctionCost >= Cloner.OutlinedRegionCost &&
931  "Outlined function cost should be no less than the outlined region");
932 
933  // The code extractor introduces a new root and exit stub blocks with
934  // additional unconditional branches. Those branches will be eliminated
935  // later with bb layout. The cost should be adjusted accordingly:
936  OutlinedFunctionCost -=
937  2 * InlineConstants::InstrCost * Cloner.OutlinedFunctions.size();
938 
939  InstructionCost OutliningRuntimeOverhead =
940  OutliningFuncCallCost +
941  (OutlinedFunctionCost - Cloner.OutlinedRegionCost) +
942  ExtraOutliningPenalty.getValue();
943 
944  return std::make_tuple(OutliningFuncCallCost, OutliningRuntimeOverhead);
945 }
946 
947 // Create the callsite to profile count map which is
948 // used to update the original function's entry count,
949 // after the function is partially inlined into the callsite.
950 void PartialInlinerImpl::computeCallsiteToProfCountMap(
951  Function *DuplicateFunction,
952  DenseMap<User *, uint64_t> &CallSiteToProfCountMap) const {
953  std::vector<User *> Users(DuplicateFunction->user_begin(),
954  DuplicateFunction->user_end());
955  Function *CurrentCaller = nullptr;
956  std::unique_ptr<BlockFrequencyInfo> TempBFI;
957  BlockFrequencyInfo *CurrentCallerBFI = nullptr;
958 
959  auto ComputeCurrBFI = [&,this](Function *Caller) {
960  // For the old pass manager:
961  if (!GetBFI) {
962  DominatorTree DT(*Caller);
963  LoopInfo LI(DT);
964  BranchProbabilityInfo BPI(*Caller, LI);
965  TempBFI.reset(new BlockFrequencyInfo(*Caller, BPI, LI));
966  CurrentCallerBFI = TempBFI.get();
967  } else {
968  // New pass manager:
969  CurrentCallerBFI = &(GetBFI(*Caller));
970  }
971  };
972 
973  for (User *User : Users) {
974  CallBase *CB = getSupportedCallBase(User);
975  Function *Caller = CB->getCaller();
976  if (CurrentCaller != Caller) {
977  CurrentCaller = Caller;
978  ComputeCurrBFI(Caller);
979  } else {
980  assert(CurrentCallerBFI && "CallerBFI is not set");
981  }
982  BasicBlock *CallBB = CB->getParent();
983  auto Count = CurrentCallerBFI->getBlockProfileCount(CallBB);
984  if (Count)
985  CallSiteToProfCountMap[User] = *Count;
986  else
987  CallSiteToProfCountMap[User] = 0;
988  }
989 }
990 
991 PartialInlinerImpl::FunctionCloner::FunctionCloner(
992  Function *F, FunctionOutliningInfo *OI, OptimizationRemarkEmitter &ORE,
993  function_ref<AssumptionCache *(Function &)> LookupAC,
995  : OrigFunc(F), ORE(ORE), LookupAC(LookupAC), GetTTI(GetTTI) {
996  ClonedOI = std::make_unique<FunctionOutliningInfo>();
997 
998  // Clone the function, so that we can hack away on it.
999  ValueToValueMapTy VMap;
1000  ClonedFunc = CloneFunction(F, VMap);
1001 
1002  ClonedOI->ReturnBlock = cast<BasicBlock>(VMap[OI->ReturnBlock]);
1003  ClonedOI->NonReturnBlock = cast<BasicBlock>(VMap[OI->NonReturnBlock]);
1004  for (BasicBlock *BB : OI->Entries)
1005  ClonedOI->Entries.push_back(cast<BasicBlock>(VMap[BB]));
1006 
1007  for (BasicBlock *E : OI->ReturnBlockPreds) {
1008  BasicBlock *NewE = cast<BasicBlock>(VMap[E]);
1009  ClonedOI->ReturnBlockPreds.push_back(NewE);
1010  }
1011  // Go ahead and update all uses to the duplicate, so that we can just
1012  // use the inliner functionality when we're done hacking.
1013  F->replaceAllUsesWith(ClonedFunc);
1014 }
1015 
1016 PartialInlinerImpl::FunctionCloner::FunctionCloner(
1017  Function *F, FunctionOutliningMultiRegionInfo *OI,
1019  function_ref<AssumptionCache *(Function &)> LookupAC,
1021  : OrigFunc(F), ORE(ORE), LookupAC(LookupAC), GetTTI(GetTTI) {
1022  ClonedOMRI = std::make_unique<FunctionOutliningMultiRegionInfo>();
1023 
1024  // Clone the function, so that we can hack away on it.
1025  ValueToValueMapTy VMap;
1026  ClonedFunc = CloneFunction(F, VMap);
1027 
1028  // Go through all Outline Candidate Regions and update all BasicBlock
1029  // information.
1030  for (FunctionOutliningMultiRegionInfo::OutlineRegionInfo RegionInfo :
1031  OI->ORI) {
1033  for (BasicBlock *BB : RegionInfo.Region)
1034  Region.push_back(cast<BasicBlock>(VMap[BB]));
1035 
1036  BasicBlock *NewEntryBlock = cast<BasicBlock>(VMap[RegionInfo.EntryBlock]);
1037  BasicBlock *NewExitBlock = cast<BasicBlock>(VMap[RegionInfo.ExitBlock]);
1038  BasicBlock *NewReturnBlock = nullptr;
1039  if (RegionInfo.ReturnBlock)
1040  NewReturnBlock = cast<BasicBlock>(VMap[RegionInfo.ReturnBlock]);
1041  FunctionOutliningMultiRegionInfo::OutlineRegionInfo MappedRegionInfo(
1042  Region, NewEntryBlock, NewExitBlock, NewReturnBlock);
1043  ClonedOMRI->ORI.push_back(MappedRegionInfo);
1044  }
1045  // Go ahead and update all uses to the duplicate, so that we can just
1046  // use the inliner functionality when we're done hacking.
1047  F->replaceAllUsesWith(ClonedFunc);
1048 }
1049 
1050 void PartialInlinerImpl::FunctionCloner::normalizeReturnBlock() const {
1051  auto GetFirstPHI = [](BasicBlock *BB) {
1052  BasicBlock::iterator I = BB->begin();
1053  PHINode *FirstPhi = nullptr;
1054  while (I != BB->end()) {
1055  PHINode *Phi = dyn_cast<PHINode>(I);
1056  if (!Phi)
1057  break;
1058  if (!FirstPhi) {
1059  FirstPhi = Phi;
1060  break;
1061  }
1062  }
1063  return FirstPhi;
1064  };
1065 
1066  // Shouldn't need to normalize PHIs if we're not outlining non-early return
1067  // blocks.
1068  if (!ClonedOI)
1069  return;
1070 
1071  // Special hackery is needed with PHI nodes that have inputs from more than
1072  // one extracted block. For simplicity, just split the PHIs into a two-level
1073  // sequence of PHIs, some of which will go in the extracted region, and some
1074  // of which will go outside.
1075  BasicBlock *PreReturn = ClonedOI->ReturnBlock;
1076  // only split block when necessary:
1077  PHINode *FirstPhi = GetFirstPHI(PreReturn);
1078  unsigned NumPredsFromEntries = ClonedOI->ReturnBlockPreds.size();
1079 
1080  if (!FirstPhi || FirstPhi->getNumIncomingValues() <= NumPredsFromEntries + 1)
1081  return;
1082 
1083  auto IsTrivialPhi = [](PHINode *PN) -> Value * {
1084  Value *CommonValue = PN->getIncomingValue(0);
1085  if (all_of(PN->incoming_values(),
1086  [&](Value *V) { return V == CommonValue; }))
1087  return CommonValue;
1088  return nullptr;
1089  };
1090 
1091  ClonedOI->ReturnBlock = ClonedOI->ReturnBlock->splitBasicBlock(
1092  ClonedOI->ReturnBlock->getFirstNonPHI()->getIterator());
1093  BasicBlock::iterator I = PreReturn->begin();
1094  Instruction *Ins = &ClonedOI->ReturnBlock->front();
1096  while (I != PreReturn->end()) {
1097  PHINode *OldPhi = dyn_cast<PHINode>(I);
1098  if (!OldPhi)
1099  break;
1100 
1101  PHINode *RetPhi =
1102  PHINode::Create(OldPhi->getType(), NumPredsFromEntries + 1, "", Ins);
1103  OldPhi->replaceAllUsesWith(RetPhi);
1104  Ins = ClonedOI->ReturnBlock->getFirstNonPHI();
1105 
1106  RetPhi->addIncoming(&*I, PreReturn);
1107  for (BasicBlock *E : ClonedOI->ReturnBlockPreds) {
1108  RetPhi->addIncoming(OldPhi->getIncomingValueForBlock(E), E);
1109  OldPhi->removeIncomingValue(E);
1110  }
1111 
1112  // After incoming values splitting, the old phi may become trivial.
1113  // Keeping the trivial phi can introduce definition inside the outline
1114  // region which is live-out, causing necessary overhead (load, store
1115  // arg passing etc).
1116  if (auto *OldPhiVal = IsTrivialPhi(OldPhi)) {
1117  OldPhi->replaceAllUsesWith(OldPhiVal);
1118  DeadPhis.push_back(OldPhi);
1119  }
1120  ++I;
1121  }
1122  for (auto *DP : DeadPhis)
1123  DP->eraseFromParent();
1124 
1125  for (auto *E : ClonedOI->ReturnBlockPreds)
1126  E->getTerminator()->replaceUsesOfWith(PreReturn, ClonedOI->ReturnBlock);
1127 }
1128 
1129 bool PartialInlinerImpl::FunctionCloner::doMultiRegionFunctionOutlining() {
1130 
1131  auto ComputeRegionCost =
1133  InstructionCost Cost = 0;
1134  for (BasicBlock* BB : Region)
1135  Cost += computeBBInlineCost(BB, &GetTTI(*BB->getParent()));
1136  return Cost;
1137  };
1138 
1139  assert(ClonedOMRI && "Expecting OutlineInfo for multi region outline");
1140 
1141  if (ClonedOMRI->ORI.empty())
1142  return false;
1143 
1144  // The CodeExtractor needs a dominator tree.
1145  DominatorTree DT;
1146  DT.recalculate(*ClonedFunc);
1147 
1148  // Manually calculate a BlockFrequencyInfo and BranchProbabilityInfo.
1149  LoopInfo LI(DT);
1150  BranchProbabilityInfo BPI(*ClonedFunc, LI);
1151  ClonedFuncBFI.reset(new BlockFrequencyInfo(*ClonedFunc, BPI, LI));
1152 
1153  // Cache and recycle the CodeExtractor analysis to avoid O(n^2) compile-time.
1154  CodeExtractorAnalysisCache CEAC(*ClonedFunc);
1155 
1156  SetVector<Value *> Inputs, Outputs, Sinks;
1157  for (FunctionOutliningMultiRegionInfo::OutlineRegionInfo RegionInfo :
1158  ClonedOMRI->ORI) {
1159  InstructionCost CurrentOutlinedRegionCost =
1160  ComputeRegionCost(RegionInfo.Region);
1161 
1162  CodeExtractor CE(RegionInfo.Region, &DT, /*AggregateArgs*/ false,
1163  ClonedFuncBFI.get(), &BPI,
1164  LookupAC(*RegionInfo.EntryBlock->getParent()),
1165  /* AllowVarargs */ false);
1166 
1167  CE.findInputsOutputs(Inputs, Outputs, Sinks);
1168 
1169  LLVM_DEBUG({
1170  dbgs() << "inputs: " << Inputs.size() << "\n";
1171  dbgs() << "outputs: " << Outputs.size() << "\n";
1172  for (Value *value : Inputs)
1173  dbgs() << "value used in func: " << *value << "\n";
1174  for (Value *output : Outputs)
1175  dbgs() << "instr used in func: " << *output << "\n";
1176  });
1177 
1178  // Do not extract regions that have live exit variables.
1179  if (Outputs.size() > 0 && !ForceLiveExit)
1180  continue;
1181 
1182  if (Function *OutlinedFunc = CE.extractCodeRegion(CEAC)) {
1183  CallBase *OCS = PartialInlinerImpl::getOneCallSiteTo(*OutlinedFunc);
1184  BasicBlock *OutliningCallBB = OCS->getParent();
1185  assert(OutliningCallBB->getParent() == ClonedFunc);
1186  OutlinedFunctions.push_back(std::make_pair(OutlinedFunc,OutliningCallBB));
1187  NumColdRegionsOutlined++;
1188  OutlinedRegionCost += CurrentOutlinedRegionCost;
1189 
1190  if (MarkOutlinedColdCC) {
1191  OutlinedFunc->setCallingConv(CallingConv::Cold);
1193  }
1194  } else
1195  ORE.emit([&]() {
1196  return OptimizationRemarkMissed(DEBUG_TYPE, "ExtractFailed",
1197  &RegionInfo.Region.front()->front())
1198  << "Failed to extract region at block "
1199  << ore::NV("Block", RegionInfo.Region.front());
1200  });
1201  }
1202 
1203  return !OutlinedFunctions.empty();
1204 }
1205 
1206 Function *
1207 PartialInlinerImpl::FunctionCloner::doSingleRegionFunctionOutlining() {
1208  // Returns true if the block is to be partial inlined into the caller
1209  // (i.e. not to be extracted to the out of line function)
1210  auto ToBeInlined = [&, this](BasicBlock *BB) {
1211  return BB == ClonedOI->ReturnBlock ||
1212  llvm::is_contained(ClonedOI->Entries, BB);
1213  };
1214 
1215  assert(ClonedOI && "Expecting OutlineInfo for single region outline");
1216  // The CodeExtractor needs a dominator tree.
1217  DominatorTree DT;
1218  DT.recalculate(*ClonedFunc);
1219 
1220  // Manually calculate a BlockFrequencyInfo and BranchProbabilityInfo.
1221  LoopInfo LI(DT);
1222  BranchProbabilityInfo BPI(*ClonedFunc, LI);
1223  ClonedFuncBFI.reset(new BlockFrequencyInfo(*ClonedFunc, BPI, LI));
1224 
1225  // Gather up the blocks that we're going to extract.
1226  std::vector<BasicBlock *> ToExtract;
1227  auto *ClonedFuncTTI = &GetTTI(*ClonedFunc);
1228  ToExtract.push_back(ClonedOI->NonReturnBlock);
1229  OutlinedRegionCost += PartialInlinerImpl::computeBBInlineCost(
1230  ClonedOI->NonReturnBlock, ClonedFuncTTI);
1231  for (BasicBlock &BB : *ClonedFunc)
1232  if (!ToBeInlined(&BB) && &BB != ClonedOI->NonReturnBlock) {
1233  ToExtract.push_back(&BB);
1234  // FIXME: the code extractor may hoist/sink more code
1235  // into the outlined function which may make the outlining
1236  // overhead (the difference of the outlined function cost
1237  // and OutliningRegionCost) look larger.
1238  OutlinedRegionCost += computeBBInlineCost(&BB, ClonedFuncTTI);
1239  }
1240 
1241  // Extract the body of the if.
1242  CodeExtractorAnalysisCache CEAC(*ClonedFunc);
1243  Function *OutlinedFunc =
1244  CodeExtractor(ToExtract, &DT, /*AggregateArgs*/ false,
1245  ClonedFuncBFI.get(), &BPI, LookupAC(*ClonedFunc),
1246  /* AllowVarargs */ true)
1247  .extractCodeRegion(CEAC);
1248 
1249  if (OutlinedFunc) {
1250  BasicBlock *OutliningCallBB =
1251  PartialInlinerImpl::getOneCallSiteTo(*OutlinedFunc)->getParent();
1252  assert(OutliningCallBB->getParent() == ClonedFunc);
1253  OutlinedFunctions.push_back(std::make_pair(OutlinedFunc, OutliningCallBB));
1254  } else
1255  ORE.emit([&]() {
1256  return OptimizationRemarkMissed(DEBUG_TYPE, "ExtractFailed",
1257  &ToExtract.front()->front())
1258  << "Failed to extract region at block "
1259  << ore::NV("Block", ToExtract.front());
1260  });
1261 
1262  return OutlinedFunc;
1263 }
1264 
1265 PartialInlinerImpl::FunctionCloner::~FunctionCloner() {
1266  // Ditch the duplicate, since we're done with it, and rewrite all remaining
1267  // users (function pointers, etc.) back to the original function.
1268  ClonedFunc->replaceAllUsesWith(OrigFunc);
1269  ClonedFunc->eraseFromParent();
1270  if (!IsFunctionInlined) {
1271  // Remove each function that was speculatively created if there is no
1272  // reference.
1273  for (auto FuncBBPair : OutlinedFunctions) {
1274  Function *Func = FuncBBPair.first;
1275  Func->eraseFromParent();
1276  }
1277  }
1278 }
1279 
1280 std::pair<bool, Function *> PartialInlinerImpl::unswitchFunction(Function &F) {
1281  if (F.hasAddressTaken())
1282  return {false, nullptr};
1283 
1284  // Let inliner handle it
1285  if (F.hasFnAttribute(Attribute::AlwaysInline))
1286  return {false, nullptr};
1287 
1288  if (F.hasFnAttribute(Attribute::NoInline))
1289  return {false, nullptr};
1290 
1291  if (PSI.isFunctionEntryCold(&F))
1292  return {false, nullptr};
1293 
1294  if (F.users().empty())
1295  return {false, nullptr};
1296 
1298 
1299  // Only try to outline cold regions if we have a profile summary, which
1300  // implies we have profiling information.
1301  if (PSI.hasProfileSummary() && F.hasProfileData() &&
1303  std::unique_ptr<FunctionOutliningMultiRegionInfo> OMRI =
1304  computeOutliningColdRegionsInfo(F, ORE);
1305  if (OMRI) {
1306  FunctionCloner Cloner(&F, OMRI.get(), ORE, LookupAssumptionCache, GetTTI);
1307 
1308  LLVM_DEBUG({
1309  dbgs() << "HotCountThreshold = " << PSI.getHotCountThreshold() << "\n";
1310  dbgs() << "ColdCountThreshold = " << PSI.getColdCountThreshold()
1311  << "\n";
1312  });
1313 
1314  bool DidOutline = Cloner.doMultiRegionFunctionOutlining();
1315 
1316  if (DidOutline) {
1317  LLVM_DEBUG({
1318  dbgs() << ">>>>>> Outlined (Cloned) Function >>>>>>\n";
1319  Cloner.ClonedFunc->print(dbgs());
1320  dbgs() << "<<<<<< Outlined (Cloned) Function <<<<<<\n";
1321  });
1322 
1323  if (tryPartialInline(Cloner))
1324  return {true, nullptr};
1325  }
1326  }
1327  }
1328 
1329  // Fall-thru to regular partial inlining if we:
1330  // i) can't find any cold regions to outline, or
1331  // ii) can't inline the outlined function anywhere.
1332  std::unique_ptr<FunctionOutliningInfo> OI = computeOutliningInfo(F);
1333  if (!OI)
1334  return {false, nullptr};
1335 
1336  FunctionCloner Cloner(&F, OI.get(), ORE, LookupAssumptionCache, GetTTI);
1337  Cloner.normalizeReturnBlock();
1338 
1339  Function *OutlinedFunction = Cloner.doSingleRegionFunctionOutlining();
1340 
1341  if (!OutlinedFunction)
1342  return {false, nullptr};
1343 
1344  if (tryPartialInline(Cloner))
1345  return {true, OutlinedFunction};
1346 
1347  return {false, nullptr};
1348 }
1349 
1350 bool PartialInlinerImpl::tryPartialInline(FunctionCloner &Cloner) {
1351  if (Cloner.OutlinedFunctions.empty())
1352  return false;
1353 
1354  int SizeCost = 0;
1355  BlockFrequency WeightedRcost;
1356  int NonWeightedRcost;
1357 
1358  auto OutliningCosts = computeOutliningCosts(Cloner);
1359  assert(std::get<0>(OutliningCosts).isValid() &&
1360  std::get<1>(OutliningCosts).isValid() && "Expected valid costs");
1361 
1362  SizeCost = *std::get<0>(OutliningCosts).getValue();
1363  NonWeightedRcost = *std::get<1>(OutliningCosts).getValue();
1364 
1365  // Only calculate RelativeToEntryFreq when we are doing single region
1366  // outlining.
1367  BranchProbability RelativeToEntryFreq;
1368  if (Cloner.ClonedOI)
1369  RelativeToEntryFreq = getOutliningCallBBRelativeFreq(Cloner);
1370  else
1371  // RelativeToEntryFreq doesn't make sense when we have more than one
1372  // outlined call because each call will have a different relative frequency
1373  // to the entry block. We can consider using the average, but the
1374  // usefulness of that information is questionable. For now, assume we never
1375  // execute the calls to outlined functions.
1376  RelativeToEntryFreq = BranchProbability(0, 1);
1377 
1378  WeightedRcost = BlockFrequency(NonWeightedRcost) * RelativeToEntryFreq;
1379 
1380  // The call sequence(s) to the outlined function(s) are larger than the sum of
1381  // the original outlined region size(s), it does not increase the chances of
1382  // inlining the function with outlining (The inliner uses the size increase to
1383  // model the cost of inlining a callee).
1384  if (!SkipCostAnalysis && Cloner.OutlinedRegionCost < SizeCost) {
1385  OptimizationRemarkEmitter OrigFuncORE(Cloner.OrigFunc);
1386  DebugLoc DLoc;
1387  BasicBlock *Block;
1388  std::tie(DLoc, Block) = getOneDebugLoc(*Cloner.ClonedFunc);
1389  OrigFuncORE.emit([&]() {
1390  return OptimizationRemarkAnalysis(DEBUG_TYPE, "OutlineRegionTooSmall",
1391  DLoc, Block)
1392  << ore::NV("Function", Cloner.OrigFunc)
1393  << " not partially inlined into callers (Original Size = "
1394  << ore::NV("OutlinedRegionOriginalSize", Cloner.OutlinedRegionCost)
1395  << ", Size of call sequence to outlined function = "
1396  << ore::NV("NewSize", SizeCost) << ")";
1397  });
1398  return false;
1399  }
1400 
1401  assert(Cloner.OrigFunc->users().empty() &&
1402  "F's users should all be replaced!");
1403 
1404  std::vector<User *> Users(Cloner.ClonedFunc->user_begin(),
1405  Cloner.ClonedFunc->user_end());
1406 
1407  DenseMap<User *, uint64_t> CallSiteToProfCountMap;
1408  auto CalleeEntryCount = Cloner.OrigFunc->getEntryCount();
1409  if (CalleeEntryCount)
1410  computeCallsiteToProfCountMap(Cloner.ClonedFunc, CallSiteToProfCountMap);
1411 
1412  uint64_t CalleeEntryCountV =
1413  (CalleeEntryCount ? CalleeEntryCount->getCount() : 0);
1414 
1415  bool AnyInline = false;
1416  for (User *User : Users) {
1417  CallBase *CB = getSupportedCallBase(User);
1418 
1419  if (isLimitReached())
1420  continue;
1421 
1422  OptimizationRemarkEmitter CallerORE(CB->getCaller());
1423  if (!shouldPartialInline(*CB, Cloner, WeightedRcost, CallerORE))
1424  continue;
1425 
1426  // Construct remark before doing the inlining, as after successful inlining
1427  // the callsite is removed.
1428  OptimizationRemark OR(DEBUG_TYPE, "PartiallyInlined", CB);
1429  OR << ore::NV("Callee", Cloner.OrigFunc) << " partially inlined into "
1430  << ore::NV("Caller", CB->getCaller());
1431 
1432  InlineFunctionInfo IFI(nullptr, GetAssumptionCache, &PSI);
1433  // We can only forward varargs when we outlined a single region, else we
1434  // bail on vararg functions.
1435  if (!InlineFunction(*CB, IFI, nullptr, true,
1436  (Cloner.ClonedOI ? Cloner.OutlinedFunctions.back().first
1437  : nullptr))
1438  .isSuccess())
1439  continue;
1440 
1441  CallerORE.emit(OR);
1442 
1443  // Now update the entry count:
1444  if (CalleeEntryCountV && CallSiteToProfCountMap.count(User)) {
1445  uint64_t CallSiteCount = CallSiteToProfCountMap[User];
1446  CalleeEntryCountV -= std::min(CalleeEntryCountV, CallSiteCount);
1447  }
1448 
1449  AnyInline = true;
1450  NumPartialInlining++;
1451  // Update the stats
1452  if (Cloner.ClonedOI)
1453  NumPartialInlined++;
1454  else
1455  NumColdOutlinePartialInlined++;
1456  }
1457 
1458  if (AnyInline) {
1459  Cloner.IsFunctionInlined = true;
1460  if (CalleeEntryCount)
1461  Cloner.OrigFunc->setEntryCount(Function::ProfileCount(
1462  CalleeEntryCountV, CalleeEntryCount->getType()));
1463  OptimizationRemarkEmitter OrigFuncORE(Cloner.OrigFunc);
1464  OrigFuncORE.emit([&]() {
1465  return OptimizationRemark(DEBUG_TYPE, "PartiallyInlined", Cloner.OrigFunc)
1466  << "Partially inlined into at least one caller";
1467  });
1468  }
1469 
1470  return AnyInline;
1471 }
1472 
1473 bool PartialInlinerImpl::run(Module &M) {
1475  return false;
1476 
1477  std::vector<Function *> Worklist;
1478  Worklist.reserve(M.size());
1479  for (Function &F : M)
1480  if (!F.use_empty() && !F.isDeclaration())
1481  Worklist.push_back(&F);
1482 
1483  bool Changed = false;
1484  while (!Worklist.empty()) {
1485  Function *CurrFunc = Worklist.back();
1486  Worklist.pop_back();
1487 
1488  if (CurrFunc->use_empty())
1489  continue;
1490 
1491  bool Recursive = false;
1492  for (User *U : CurrFunc->users())
1493  if (Instruction *I = dyn_cast<Instruction>(U))
1494  if (I->getParent()->getParent() == CurrFunc) {
1495  Recursive = true;
1496  break;
1497  }
1498  if (Recursive)
1499  continue;
1500 
1501  std::pair<bool, Function *> Result = unswitchFunction(*CurrFunc);
1502  if (Result.second)
1503  Worklist.push_back(Result.second);
1504  Changed |= Result.first;
1505  }
1506 
1507  return Changed;
1508 }
1509 
1511 
1512 INITIALIZE_PASS_BEGIN(PartialInlinerLegacyPass, "partial-inliner",
1513  "Partial Inliner", false, false)
1518 INITIALIZE_PASS_END(PartialInlinerLegacyPass, "partial-inliner",
1519  "Partial Inliner", false, false)
1520 
1522  return new PartialInlinerLegacyPass();
1523 }
1524 
1525 PreservedAnalyses PartialInlinerPass::run(Module &M,
1526  ModuleAnalysisManager &AM) {
1527  auto &FAM = AM.getResult<FunctionAnalysisManagerModuleProxy>(M).getManager();
1528 
1529  auto GetAssumptionCache = [&FAM](Function &F) -> AssumptionCache & {
1530  return FAM.getResult<AssumptionAnalysis>(F);
1531  };
1532 
1533  auto LookupAssumptionCache = [&FAM](Function &F) -> AssumptionCache * {
1535  };
1536 
1537  auto GetBFI = [&FAM](Function &F) -> BlockFrequencyInfo & {
1539  };
1540 
1541  auto GetTTI = [&FAM](Function &F) -> TargetTransformInfo & {
1542  return FAM.getResult<TargetIRAnalysis>(F);
1543  };
1544 
1545  auto GetTLI = [&FAM](Function &F) -> TargetLibraryInfo & {
1547  };
1548 
1550 
1551  if (PartialInlinerImpl(GetAssumptionCache, LookupAssumptionCache, GetTTI,
1552  GetTLI, PSI, GetBFI)
1553  .run(M))
1554  return PreservedAnalyses::none();
1555  return PreservedAnalyses::all();
1556 }
llvm::InstructionCost
Definition: InstructionCost.h:29
llvm::PreservedAnalyses
A set of analyses that are preserved following a run of a transformation pass.
Definition: PassManager.h:155
llvm::InlineCost::isAlways
bool isAlways() const
Definition: InlineCost.h:124
SkipCostAnalysis
static cl::opt< bool > SkipCostAnalysis("skip-partial-inlining-cost-analysis", cl::init(false), cl::ZeroOrMore, cl::ReallyHidden, cl::desc("Skip Cost Analysis"))
llvm::InlineCost::getCost
int getCost() const
Get the inline cost estimate.
Definition: InlineCost.h:130
llvm::BasicBlock::end
iterator end()
Definition: BasicBlock.h:298
llvm::TargetIRAnalysis
Analysis pass providing the TargetTransformInfo.
Definition: TargetTransformInfo.h:2372
llvm::OptimizationRemarkMissed
Diagnostic information for missed-optimization remarks.
Definition: DiagnosticInfo.h:730
llvm::predecessors
pred_range predecessors(BasicBlock *BB)
Definition: CFG.h:127
OutlineRegionFreqPercent
static cl::opt< int > OutlineRegionFreqPercent("outline-region-freq-percent", cl::init(75), cl::Hidden, cl::ZeroOrMore, cl::desc("Relative frequency of outline region to " "the entry block"))
llvm
This is an optimization pass for GlobalISel generic memory operations.
Definition: AllocatorList.h:23
M
We currently emits eax Perhaps this is what we really should generate is Is imull three or four cycles eax eax The current instruction priority is based on pattern complexity The former is more complex because it folds a load so the latter will not be emitted Perhaps we should use AddedComplexity to give LEA32r a higher priority We should always try to match LEA first since the LEA matching code does some estimate to determine whether the match is profitable if we care more about code then imull is better It s two bytes shorter than movl leal On a Pentium M
Definition: README.txt:252
Optional.h
ValueMapper.h
llvm::DataLayout
A parsed version of the target data layout string in and methods for querying it.
Definition: DataLayout.h:113
llvm::ISD::OR
@ OR
Definition: ISDOpcodes.h:633
llvm::BasicBlock::iterator
InstListType::iterator iterator
Instruction iterators...
Definition: BasicBlock.h:90
llvm::ModulePass
ModulePass class - This class is used to implement unstructured interprocedural optimizations and ana...
Definition: Pass.h:238
llvm::InstructionCost::map
auto map(const Function &F) const -> InstructionCost
Definition: InstructionCost.h:245
llvm::BasicBlock::getParent
const Function * getParent() const
Return the enclosing method, or null if none.
Definition: BasicBlock.h:107
IntrinsicInst.h
llvm::AnalysisManager::getResult
PassT::Result & getResult(IRUnitT &IR, ExtraArgTs... ExtraArgs)
Get the result of an analysis pass for a given IR unit.
Definition: PassManager.h:783
PartialInlining.h
llvm::succ_end
Interval::succ_iterator succ_end(Interval *I)
Definition: Interval.h:102
T
llvm::Function
Definition: Function.h:62
DEBUG_TYPE
#define DEBUG_TYPE
Definition: PartialInlining.cpp:66
Pass.h
llvm::CodeExtractor::extractCodeRegion
Function * extractCodeRegion(const CodeExtractorAnalysisCache &CEAC)
Perform the extraction, returning the new function.
Definition: CodeExtractor.cpp:1589
llvm::SetVector::size
size_type size() const
Determine the number of elements in the SetVector.
Definition: SetVector.h:77
llvm::TargetTransformInfoWrapperPass::getTTI
TargetTransformInfo & getTTI(const Function &F)
Definition: TargetTransformInfo.cpp:1182
llvm::ProfileSummaryInfo::hasProfileSummary
bool hasProfileSummary() const
Returns true if profile summary is available.
Definition: ProfileSummaryInfo.h:68
llvm::SmallVector
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small.
Definition: SmallVector.h:1168
Statistic.h
output
Current output
Definition: README.txt:1350
ErrorHandling.h
llvm::TargetTransformInfo
This pass provides access to the codegen interfaces that are needed for IR-level transformations.
Definition: TargetTransformInfo.h:169
llvm::TargetTransformInfo::getIntrinsicInstrCost
InstructionCost getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA, TTI::TargetCostKind CostKind) const
Definition: TargetTransformInfo.cpp:886
llvm::PHINode::removeIncomingValue
Value * removeIncomingValue(unsigned Idx, bool DeletePHIIfEmpty=true)
Remove an incoming value.
Definition: Instructions.cpp:113
OptimizationRemarkEmitter.h
llvm::DominatorTree
Concrete subclass of DominatorTreeBase that is used to compute a normal dominator tree.
Definition: Dominators.h:151
FAM
FunctionAnalysisManager FAM
Definition: PassBuilderBindings.cpp:59
llvm::cl::Hidden
@ Hidden
Definition: CommandLine.h:143
MinBlockCounterExecution
static cl::opt< unsigned > MinBlockCounterExecution("min-block-execution", cl::init(100), cl::Hidden, cl::desc("Minimum block executions to consider " "its BranchProbabilityInfo valid"))
llvm::PreservedAnalyses::none
static PreservedAnalyses none()
Convenience factory function for the empty preserved set.
Definition: PassManager.h:158
BlockFrequency.h
DenseMap.h
Module.h
llvm::sys::path::end
const_iterator end(StringRef path)
Get end iterator over path.
Definition: Path.cpp:235
llvm::sys::path::begin
const_iterator begin(StringRef path, Style style=Style::native)
Get begin iterator over path.
Definition: Path.cpp:226
llvm::BasicBlock::getSingleSuccessor
const BasicBlock * getSingleSuccessor() const
Return the successor of this block if it has a single successor.
Definition: BasicBlock.cpp:298
llvm::ProfileSummaryInfo::isFunctionEntryCold
bool isFunctionEntryCold(const Function *F) const
Returns true if F has cold function entry.
Definition: ProfileSummaryInfo.cpp:222
llvm::isInlineViable
InlineResult isInlineViable(Function &Callee)
Minimal filter to detect invalid constructs for inlining.
Definition: InlineCost.cpp:2960
llvm::DenseMapBase< DenseMap< KeyT, ValueT, DenseMapInfo< KeyT >, llvm::detail::DenseMapPair< KeyT, ValueT > >, KeyT, ValueT, DenseMapInfo< KeyT >, llvm::detail::DenseMapPair< KeyT, ValueT > >::count
size_type count(const_arg_type_t< KeyT > Val) const
Return 1 if the specified key is in the map, 0 otherwise.
Definition: DenseMap.h:145
llvm::BranchProbability::getBranchProbability
static BranchProbability getBranchProbability(uint64_t Numerator, uint64_t Denominator)
Definition: BranchProbability.cpp:53
llvm::ore::NV
DiagnosticInfoOptimizationBase::Argument NV
Definition: OptimizationRemarkEmitter.h:136
llvm::Value::user_begin
user_iterator user_begin()
Definition: Value.h:397
CodeExtractor.h
llvm::BasicBlock::getSinglePredecessor
const BasicBlock * getSinglePredecessor() const
Return the predecessor of this block if it has a single predecessor block.
Definition: BasicBlock.cpp:268
STLExtras.h
llvm::InlineCost::isNever
bool isNever() const
Definition: InlineCost.h:125
llvm::successors
succ_range successors(Instruction *I)
Definition: CFG.h:262
llvm::FastMathFlags
Convenience struct for specifying and reasoning about fast-math flags.
Definition: Operator.h:165
llvm::cl::ReallyHidden
@ ReallyHidden
Definition: CommandLine.h:144
LLVM_DEBUG
#define LLVM_DEBUG(X)
Definition: Debug.h:101
F
#define F(x, y, z)
Definition: MD5.cpp:56
ColdBranchRatio
static cl::opt< float > ColdBranchRatio("cold-branch-ratio", cl::init(0.1), cl::Hidden, cl::desc("Minimum BranchProbability to consider a region cold."))
llvm::BasicBlock
LLVM Basic Block Representation.
Definition: BasicBlock.h:58
llvm::dbgs
raw_ostream & dbgs()
dbgs() - This returns a reference to a raw_ostream for debugging messages.
Definition: Debug.cpp:163
Instruction.h
CommandLine.h
llvm::all_of
bool all_of(R &&range, UnaryPredicate P)
Provide wrappers to std::all_of which take ranges instead of having to pass begin/end explicitly.
Definition: STLExtras.h:1600
llvm::BlockFrequencyInfo
BlockFrequencyInfo pass uses BlockFrequencyInfoImpl implementation to estimate IR basic block frequen...
Definition: BlockFrequencyInfo.h:37
llvm::PassRegistry::getPassRegistry
static PassRegistry * getPassRegistry()
getPassRegistry - Access the global registry object, which is automatically initialized at applicatio...
Definition: PassRegistry.cpp:31
llvm::getInlineCost
InlineCost getInlineCost(CallBase &Call, const InlineParams &Params, TargetTransformInfo &CalleeTTI, function_ref< AssumptionCache &(Function &)> GetAssumptionCache, function_ref< const TargetLibraryInfo &(Function &)> GetTLI, function_ref< BlockFrequencyInfo &(Function &)> GetBFI=nullptr, ProfileSummaryInfo *PSI=nullptr, OptimizationRemarkEmitter *ORE=nullptr)
Get an InlineCost object representing the cost of inlining this callsite.
Definition: InlineCost.cpp:2788
E
static GCRegistry::Add< CoreCLRGC > E("coreclr", "CoreCLR-compatible GC")
llvm::User
Definition: User.h:44
llvm::succ_size
unsigned succ_size(const Instruction *I)
Definition: CFG.h:259
Intrinsics.h
InstrTypes.h
llvm::CallBase::getCalledFunction
Function * getCalledFunction() const
Returns the function called, or null if this is an indirect function invocation.
Definition: InstrTypes.h:1398
llvm::BasicBlock::begin
iterator begin()
Instruction iterator methods.
Definition: BasicBlock.h:296
llvm::BranchProbabilityInfo
Analysis providing branch probability information.
Definition: BranchProbabilityInfo.h:115
llvm::AnalysisUsage
Represent the analysis usage information of a pass.
Definition: PassAnalysisSupport.h:47
llvm::InlineCost
Represents the cost of inlining a function.
Definition: InlineCost.h:82
llvm::ms_demangle::QualifierMangleMode::Result
@ Result
TargetLibraryInfo.h
DenseSet.h
false
Definition: StackSlotColoring.cpp:142
llvm::AssumptionCacheTracker::lookupAssumptionCache
AssumptionCache * lookupAssumptionCache(Function &F)
Return the cached assumptions for a function if it has already been scanned.
Definition: AssumptionCache.cpp:307
MaxNumPartialInlining
static cl::opt< int > MaxNumPartialInlining("max-partial-inlining", cl::init(-1), cl::Hidden, cl::ZeroOrMore, cl::desc("Max number of partial inlining. The default is unlimited"))
llvm::PHINode::getIncomingValueForBlock
Value * getIncomingValueForBlock(const BasicBlock *BB) const
Definition: Instructions.h:2833
llvm::pdb::PDB_SymType::Caller
@ Caller
llvm::Instruction
Definition: Instruction.h:45
llvm::STATISTIC
STATISTIC(NumFunctions, "Total number of functions")
llvm::InlineResult::isSuccess
bool isSuccess() const
Definition: InlineCost.h:168
DebugLoc.h
llvm::getInlineParams
InlineParams getInlineParams()
Generate the parameters to tune the inline cost analysis based only on the commandline options.
Definition: InlineCost.cpp:3072
llvm::InlineConstants::InstrCost
const int InstrCost
Definition: InlineCost.h:45
llvm::BlockFrequencyAnalysis
Analysis pass which computes BlockFrequencyInfo.
Definition: BlockFrequencyInfo.h:112
llvm::PHINode::getNumIncomingValues
unsigned getNumIncomingValues() const
Return the number of incoming edges.
Definition: Instructions.h:2740
MinRegionSizeRatio
static cl::opt< float > MinRegionSizeRatio("min-region-size-ratio", cl::init(0.1), cl::Hidden, cl::desc("Minimum ratio comparing relative sizes of each " "outline candidate and original function"))
llvm::Value::use_empty
bool use_empty() const
Definition: Value.h:344
llvm::IntrinsicCostAttributes
Definition: TargetTransformInfo.h:119
BranchProbability.h
llvm::CallBase::getCaller
Function * getCaller()
Helper to get the caller (the parent function).
Definition: Instructions.cpp:282
INITIALIZE_PASS_END
#define INITIALIZE_PASS_END(passName, arg, name, cfg, analysis)
Definition: PassSupport.h:58
CFG.h
LoopInfo.h
llvm::ProfileSummaryInfo
Analysis providing profile information.
Definition: ProfileSummaryInfo.h:39
llvm::InvokeInst
Invoke instruction.
Definition: Instructions.h:3760
llvm::cl::ZeroOrMore
@ ZeroOrMore
Definition: CommandLine.h:120
llvm::function_ref
An efficient, type-erasing, non-owning reference to a callable.
Definition: STLExtras.h:168
llvm::Value::user_end
user_iterator user_end()
Definition: Value.h:405
llvm::DenseSet
Implements a dense probed hash-table based set.
Definition: DenseSet.h:268
BasicBlock.h
llvm::cl::opt< bool >
llvm::BlockFrequency
Definition: BlockFrequency.h:24
BranchProbabilityInfo.h
llvm::TargetLibraryInfoWrapperPass
Definition: TargetLibraryInfo.h:465
uint64_t
ProfileSummaryInfo.h
llvm::TargetTransformInfoWrapperPass
Wrapper pass for TargetTransformInfo.
Definition: TargetTransformInfo.h:2428
llvm::AssumptionAnalysis
A function analysis which provides an AssumptionCache.
Definition: AssumptionCache.h:173
INITIALIZE_PASS_DEPENDENCY
INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
IPO.h
llvm::PHINode::addIncoming
void addIncoming(Value *V, BasicBlock *BB)
Add an incoming value to the end of the PHI list.
Definition: Instructions.h:2798
llvm::DenseMap
Definition: DenseMap.h:714
I
#define I(x, y, z)
Definition: MD5.cpp:59
Cloning.h
llvm::succ_begin
Interval::succ_iterator succ_begin(Interval *I)
succ_begin/succ_end - define methods so that Intervals may be used just like BasicBlocks can with the...
Definition: Interval.h:99
llvm::BlockFrequency::getFrequency
uint64_t getFrequency() const
Returns the frequency as a fixpoint number scaled by the entry frequency.
Definition: BlockFrequency.h:35
MarkOutlinedColdCC
static cl::opt< bool > MarkOutlinedColdCC("pi-mark-coldcc", cl::init(false), cl::Hidden, cl::desc("Mark outline function calls with ColdCC"))
llvm::cl::init
initializer< Ty > init(const Ty &Val)
Definition: CommandLine.h:441
llvm::ProfileSummaryInfo::getColdCountThreshold
uint64_t getColdCountThreshold() const
Returns ColdCountThreshold if set.
Definition: ProfileSummaryInfo.h:176
llvm::ProfileSummaryInfoWrapperPass
An analysis pass based on legacy pass manager to deliver ProfileSummaryInfo.
Definition: ProfileSummaryInfo.h:193
llvm::InlineCost::getCostDelta
int getCostDelta() const
Get the cost delta from the threshold for inlining.
Definition: InlineCost.h:154
llvm::is_contained
bool is_contained(R &&Range, const E &Element)
Wrapper function around std::find to detect if an element exists in a container.
Definition: STLExtras.h:1665
assert
assert(ImpDefSCC.getReg()==AMDGPU::SCC &&ImpDefSCC.isDef())
SI
StandardInstrumentations SI(Debug, VerifyEach)
llvm::OptimizationRemarkEmitter::emit
void emit(DiagnosticInfoOptimizationBase &OptDiag)
Output the remark via the diagnostic handler and to the optimization record file.
Definition: OptimizationRemarkEmitter.cpp:77
InlineCost.h
llvm::ProfileSummaryInfo::hasInstrumentationProfile
bool hasInstrumentationProfile() const
Returns true if module M has instrumentation profile.
Definition: ProfileSummaryInfo.h:77
llvm::WinEH::EncodingType::CE
@ CE
Windows NT (Windows on ARM)
inliner
partial inliner
Definition: PartialInlining.cpp:1518
llvm::Module
A Module instance is used to store all the information related to an LLVM module.
Definition: Module.h:67
OtherSucc
static MachineBasicBlock * OtherSucc(MachineBasicBlock *MBB, MachineBasicBlock *Succ)
Definition: ARMISelLowering.cpp:10955
DisablePartialInlining
static cl::opt< bool > DisablePartialInlining("disable-partial-inlining", cl::init(false), cl::Hidden, cl::desc("Disable partial inlining"))
hasProfileData
static bool hasProfileData(const Function &F, const FunctionOutliningInfo &OI)
Definition: PartialInlining.cpp:714
llvm::ProfileSummaryAnalysis
An analysis pass based on the new PM to deliver ProfileSummaryInfo.
Definition: ProfileSummaryInfo.h:211
llvm::initializePartialInlinerLegacyPassPass
void initializePartialInlinerLegacyPassPass(PassRegistry &)
MaxNumInlineBlocks
static cl::opt< unsigned > MaxNumInlineBlocks("max-num-inline-blocks", cl::init(5), cl::Hidden, cl::desc("Max number of blocks to be partially inlined"))
llvm::DominatorTreeBase::recalculate
void recalculate(ParentType &Func)
recalculate - compute a dominator tree for the given function
Definition: GenericDomTree.h:778
llvm::AssumptionCacheTracker
An immutable pass that tracks lazily created AssumptionCache objects.
Definition: AssumptionCache.h:202
llvm::ArrayRef
ArrayRef - Represent a constant reference to an array (0 or more elements consecutively in memory),...
Definition: APInt.h:32
llvm::LoopInfo
Definition: LoopInfo.h:1083
None.h
llvm::OptimizationRemarkEmitter
The optimization diagnostic interface.
Definition: OptimizationRemarkEmitter.h:33
llvm::min
Expected< ExpressionValue > min(const ExpressionValue &Lhs, const ExpressionValue &Rhs)
Definition: FileCheck.cpp:357
llvm::Function::setCallingConv
void setCallingConv(CallingConv::ID CC)
Definition: Function.h:244
llvm::AssumptionCache
A cache of @llvm.assume calls within a function.
Definition: AssumptionCache.h:42
llvm::TargetTransformInfo::TCK_SizeAndLatency
@ TCK_SizeAndLatency
The weighted sum of size and latency.
Definition: TargetTransformInfo.h:216
llvm_unreachable
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
Definition: ErrorHandling.h:134
llvm::Value::getType
Type * getType() const
All values are typed, get the type of this value.
Definition: Value.h:255
llvm::Value::replaceAllUsesWith
void replaceAllUsesWith(Value *V)
Change all uses of this to point to a new Value.
Definition: Value.cpp:532
llvm::BranchProbability
Definition: BranchProbability.h:30
DL
MachineBasicBlock MachineBasicBlock::iterator DebugLoc DL
Definition: AArch64SLSHardening.cpp:76
llvm::CodeExtractor
Utility class for extracting code into a new function.
Definition: CodeExtractor.h:85
llvm::RegionInfo
Definition: RegionInfo.h:900
llvm::CallingConv::Cold
@ Cold
Definition: CallingConv.h:48
llvm::CloneFunction
Function * CloneFunction(Function *F, ValueToValueMapTy &VMap, ClonedCodeInfo *CodeInfo=nullptr)
Return a copy of the specified function and add it to that function's module.
Definition: CloneFunction.cpp:283
BlockFrequencyInfo.h
llvm::AMDGPUISD::BFI
@ BFI
Definition: AMDGPUISelLowering.h:431
llvm::OptimizationRemarkAnalysis
Diagnostic information for optimization analysis remarks.
Definition: DiagnosticInfo.h:776
llvm::ifs::IFSSymbolType::Func
@ Func
llvm::ValueMap< const Value *, WeakTrackingVH >
llvm::BasicBlock::getTerminator
const Instruction * getTerminator() const LLVM_READONLY
Returns the terminator instruction if the block is well formed or null if the block is not well forme...
Definition: BasicBlock.cpp:152
Callee
amdgpu Simplify well known AMD library false FunctionCallee Callee
Definition: AMDGPULibCalls.cpp:206
ForceLiveExit
static cl::opt< bool > ForceLiveExit("pi-force-live-exit-outline", cl::init(false), cl::Hidden, cl::desc("Force outline regions with live exits"))
Attributes.h
llvm::AssumptionCacheTracker::getAssumptionCache
AssumptionCache & getAssumptionCache(Function &F)
Get the cached assumptions for a function.
Definition: AssumptionCache.cpp:287
llvm::PreservedAnalyses::all
static PreservedAnalyses all()
Construct a special preserved set that preserves all passes.
Definition: PassManager.h:161
llvm::ISD::BR
@ BR
Control flow instructions. These all have token chains.
Definition: ISDOpcodes.h:937
llvm::Function::back
const BasicBlock & back() const
Definition: Function.h:732
llvm::AnalysisManager::getCachedResult
PassT::Result * getCachedResult(IRUnitT &IR) const
Get the cached result of an analysis pass for a given IR unit.
Definition: PassManager.h:802
Casting.h
DiagnosticInfo.h
Function.h
llvm::TargetLibraryInfo
Provides information about what library functions are available for the current target.
Definition: TargetLibraryInfo.h:221
llvm::InlineFunctionInfo
This class captures the data input to the InlineFunction call, and records the auxiliary results prod...
Definition: Cloning.h:201
DP
So we should use XX3Form_Rcr to implement instrinsic Convert DP outs ins xscvdpsp No builtin are required Round &Convert QP DP(dword[1] is set to zero) No builtin are required Round to Quad Precision because you need to assign rounding mode in instruction Provide builtin(set f128:$vT,(int_ppc_vsx_xsrqpi f128:$vB))(set f128 yields< n x< ty > >< result > yields< ty >< result > No builtin are required Load Store load store see def memrix16 in PPCInstrInfo td Load Store Vector load store outs ins lxsdx set load store with conversion from to DP
Definition: README_P9.txt:520
Inliner
partial Partial Inliner
Definition: PartialInlining.cpp:1519
llvm::CodeExtractorAnalysisCache
A cache for the CodeExtractor analysis.
Definition: CodeExtractor.h:46
llvm::ProfileSummaryInfo::isColdBlock
bool isColdBlock(const BasicBlock *BB, BlockFrequencyInfo *BFI) const
Returns true if BasicBlock BB is considered cold.
Definition: ProfileSummaryInfo.cpp:332
DisableMultiRegionPartialInline
static cl::opt< bool > DisableMultiRegionPartialInline("disable-mr-partial-inlining", cl::init(false), cl::Hidden, cl::desc("Disable multi-region partial inlining"))
llvm::OptimizationRemark
Diagnostic information for applied optimization remarks.
Definition: DiagnosticInfo.h:685
isValid
static bool isValid(const char C)
Returns true if C is a valid mangled character: <0-9a-zA-Z_>.
Definition: RustDemangle.cpp:217
Instructions.h
llvm::getCallsiteCost
int getCallsiteCost(CallBase &Call, const DataLayout &DL)
Return the cost associated with a callsite, including parameter passing and the call/return instructi...
Definition: InlineCost.cpp:2755
SmallVector.h
llvm::Instruction::getDebugLoc
const DebugLoc & getDebugLoc() const
Return the debug location for this node as a DebugLoc.
Definition: Instruction.h:370
User.h
llvm::BlockFrequencyInfo::getBlockProfileCount
Optional< uint64_t > getBlockProfileCount(const BasicBlock *BB, bool AllowSynthetic=false) const
Returns the estimated profile count of BB.
Definition: BlockFrequencyInfo.cpp:209
Dominators.h
llvm::Instruction::getParent
const BasicBlock * getParent() const
Definition: Instruction.h:94
llvm::max
Align max(MaybeAlign Lhs, Align Rhs)
Definition: Alignment.h:340
Users
iv Induction Variable Users
Definition: IVUsers.cpp:52
TargetTransformInfo.h
llvm::PHINode
Definition: Instructions.h:2648
llvm::ProfileSummaryInfo::getHotCountThreshold
uint64_t getHotCountThreshold() const
Returns HotCountThreshold if set.
Definition: ProfileSummaryInfo.h:172
llvm::MipsISD::Ins
@ Ins
Definition: MipsISelLowering.h:157
llvm::SmallVectorImpl< BasicBlock * >
llvm::InlineFunction
InlineResult InlineFunction(CallBase &CB, InlineFunctionInfo &IFI, AAResults *CalleeAAR=nullptr, bool InsertLifetime=true, Function *ForwardVarArgsTo=nullptr)
This function inlines the called function into the basic block of the caller.
Definition: InlineFunction.cpp:1751
llvm::CallBase
Base class for all callable instructions (InvokeInst and CallInst) Holds everything related to callin...
Definition: InstrTypes.h:1176
llvm::AnalysisManager
A container for analyses that lazily runs them and caches their results.
Definition: InstructionSimplify.h:44
llvm::createPartialInliningPass
ModulePass * createPartialInliningPass()
createPartialInliningPass - This pass inlines parts of functions.
Definition: PartialInlining.cpp:1521
llvm::InnerAnalysisManagerProxy
An analysis over an "outer" IR unit that provides access to an analysis manager over an "inner" IR un...
Definition: PassManager.h:940
llvm::CallInst
This class represents a function call, abstracting a target machine's calling convention.
Definition: Instructions.h:1469
BB
Common register allocation spilling lr str ldr sxth r3 ldr mla r4 can lr mov lr str ldr sxth r3 mla r4 and then merge mul and lr str ldr sxth r3 mla r4 It also increase the likelihood the store may become dead bb27 Successors according to LLVM BB
Definition: README.txt:39
INITIALIZE_PASS_BEGIN
INITIALIZE_PASS_BEGIN(PartialInlinerLegacyPass, "partial-inliner", "Partial Inliner", false, false) INITIALIZE_PASS_END(PartialInlinerLegacyPass
ExtraOutliningPenalty
static cl::opt< unsigned > ExtraOutliningPenalty("partial-inlining-extra-penalty", cl::init(0), cl::Hidden, cl::desc("A debug option to add additional penalty to the computed one."))
llvm::AnalysisUsage::addRequired
AnalysisUsage & addRequired()
Definition: PassAnalysisSupport.h:75
llvm::SwitchInst
Multiway switch.
Definition: Instructions.h:3227
llvm::DebugLoc
A debug info location.
Definition: DebugLoc.h:33
llvm::Function::ProfileCount
Class to represent profile counts.
Definition: Function.h:255
llvm::cl::desc
Definition: CommandLine.h:412
llvm::Region
Definition: RegionInfo.h:889
llvm::BranchInst
Conditional or Unconditional Branch instruction.
Definition: Instructions.h:3083
llvm::SetVector< Value * >
llvm::pdb::PDB_SymType::Block
@ Block
InitializePasses.h
llvm::Value
LLVM Value Representation.
Definition: Value.h:74
llvm::TargetLibraryAnalysis
Analysis pass providing the TargetLibraryInfo.
Definition: TargetLibraryInfo.h:440
llvm::Value::users
iterator_range< user_iterator > users()
Definition: Value.h:421
llvm::CallBase::setCallingConv
void setCallingConv(CallingConv::ID CC)
Definition: InstrTypes.h:1458
llvm::Intrinsic::ID
unsigned ID
Definition: TargetTransformInfo.h:38