LLVM  15.0.0git
PartialInlining.cpp
Go to the documentation of this file.
1 //===- PartialInlining.cpp - Inline parts of functions --------------------===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 // This pass performs partial inlining, typically by inlining an if statement
10 // that surrounds the body of the function.
11 //
12 //===----------------------------------------------------------------------===//
13 
15 #include "llvm/ADT/DenseMap.h"
16 #include "llvm/ADT/DenseSet.h"
17 #include "llvm/ADT/Optional.h"
18 #include "llvm/ADT/STLExtras.h"
19 #include "llvm/ADT/SmallVector.h"
20 #include "llvm/ADT/Statistic.h"
24 #include "llvm/Analysis/LoopInfo.h"
29 #include "llvm/IR/Attributes.h"
30 #include "llvm/IR/BasicBlock.h"
31 #include "llvm/IR/CFG.h"
32 #include "llvm/IR/DebugLoc.h"
33 #include "llvm/IR/DiagnosticInfo.h"
34 #include "llvm/IR/Dominators.h"
35 #include "llvm/IR/Function.h"
36 #include "llvm/IR/InstrTypes.h"
37 #include "llvm/IR/Instruction.h"
38 #include "llvm/IR/Instructions.h"
39 #include "llvm/IR/IntrinsicInst.h"
40 #include "llvm/IR/Intrinsics.h"
41 #include "llvm/IR/Module.h"
42 #include "llvm/IR/Operator.h"
43 #include "llvm/IR/User.h"
44 #include "llvm/InitializePasses.h"
45 #include "llvm/Pass.h"
48 #include "llvm/Support/Casting.h"
51 #include "llvm/Transforms/IPO.h"
55 #include <algorithm>
56 #include <cassert>
57 #include <cstdint>
58 #include <memory>
59 #include <tuple>
60 #include <vector>
61 
62 using namespace llvm;
63 
64 #define DEBUG_TYPE "partial-inlining"
65 
66 STATISTIC(NumPartialInlined,
67  "Number of callsites functions partially inlined into.");
68 STATISTIC(NumColdOutlinePartialInlined, "Number of times functions with "
69  "cold outlined regions were partially "
70  "inlined into its caller(s).");
71 STATISTIC(NumColdRegionsFound,
72  "Number of cold single entry/exit regions found.");
73 STATISTIC(NumColdRegionsOutlined,
74  "Number of cold single entry/exit regions outlined.");
75 
76 // Command line option to disable partial-inlining. The default is false:
77 static cl::opt<bool>
78  DisablePartialInlining("disable-partial-inlining", cl::init(false),
79  cl::Hidden, cl::desc("Disable partial inlining"));
80 // Command line option to disable multi-region partial-inlining. The default is
81 // false:
83  "disable-mr-partial-inlining", cl::init(false), cl::Hidden,
84  cl::desc("Disable multi-region partial inlining"));
85 
86 // Command line option to force outlining in regions with live exit variables.
87 // The default is false:
88 static cl::opt<bool>
89  ForceLiveExit("pi-force-live-exit-outline", cl::init(false), cl::Hidden,
90  cl::desc("Force outline regions with live exits"));
91 
92 // Command line option to enable marking outline functions with Cold Calling
93 // Convention. The default is false:
94 static cl::opt<bool>
95  MarkOutlinedColdCC("pi-mark-coldcc", cl::init(false), cl::Hidden,
96  cl::desc("Mark outline function calls with ColdCC"));
97 
98 // This is an option used by testing:
99 static cl::opt<bool> SkipCostAnalysis("skip-partial-inlining-cost-analysis",
100 
102  cl::desc("Skip Cost Analysis"));
103 // Used to determine if a cold region is worth outlining based on
104 // its inlining cost compared to the original function. Default is set at 10%.
105 // ie. if the cold region reduces the inlining cost of the original function by
106 // at least 10%.
108  "min-region-size-ratio", cl::init(0.1), cl::Hidden,
109  cl::desc("Minimum ratio comparing relative sizes of each "
110  "outline candidate and original function"));
111 // Used to tune the minimum number of execution counts needed in the predecessor
112 // block to the cold edge. ie. confidence interval.
113 static cl::opt<unsigned>
114  MinBlockCounterExecution("min-block-execution", cl::init(100), cl::Hidden,
115  cl::desc("Minimum block executions to consider "
116  "its BranchProbabilityInfo valid"));
117 // Used to determine when an edge is considered cold. Default is set to 10%. ie.
118 // if the branch probability is 10% or less, then it is deemed as 'cold'.
120  "cold-branch-ratio", cl::init(0.1), cl::Hidden,
121  cl::desc("Minimum BranchProbability to consider a region cold."));
122 
124  "max-num-inline-blocks", cl::init(5), cl::Hidden,
125  cl::desc("Max number of blocks to be partially inlined"));
126 
127 // Command line option to set the maximum number of partial inlining allowed
128 // for the module. The default value of -1 means no limit.
130  "max-partial-inlining", cl::init(-1), cl::Hidden,
131  cl::desc("Max number of partial inlining. The default is unlimited"));
132 
133 // Used only when PGO or user annotated branch data is absent. It is
134 // the least value that is used to weigh the outline region. If BFI
135 // produces larger value, the BFI value will be used.
136 static cl::opt<int>
137  OutlineRegionFreqPercent("outline-region-freq-percent", cl::init(75),
138  cl::Hidden,
139  cl::desc("Relative frequency of outline region to "
140  "the entry block"));
141 
143  "partial-inlining-extra-penalty", cl::init(0), cl::Hidden,
144  cl::desc("A debug option to add additional penalty to the computed one."));
145 
146 namespace {
147 
148 struct FunctionOutliningInfo {
149  FunctionOutliningInfo() = default;
150 
151  // Returns the number of blocks to be inlined including all blocks
152  // in Entries and one return block.
153  unsigned getNumInlinedBlocks() const { return Entries.size() + 1; }
154 
155  // A set of blocks including the function entry that guard
156  // the region to be outlined.
158 
159  // The return block that is not included in the outlined region.
160  BasicBlock *ReturnBlock = nullptr;
161 
162  // The dominating block of the region to be outlined.
163  BasicBlock *NonReturnBlock = nullptr;
164 
165  // The set of blocks in Entries that that are predecessors to ReturnBlock
166  SmallVector<BasicBlock *, 4> ReturnBlockPreds;
167 };
168 
169 struct FunctionOutliningMultiRegionInfo {
170  FunctionOutliningMultiRegionInfo() = default;
171 
172  // Container for outline regions
173  struct OutlineRegionInfo {
174  OutlineRegionInfo(ArrayRef<BasicBlock *> Region,
175  BasicBlock *EntryBlock, BasicBlock *ExitBlock,
176  BasicBlock *ReturnBlock)
177  : Region(Region.begin(), Region.end()), EntryBlock(EntryBlock),
178  ExitBlock(ExitBlock), ReturnBlock(ReturnBlock) {}
180  BasicBlock *EntryBlock;
181  BasicBlock *ExitBlock;
182  BasicBlock *ReturnBlock;
183  };
184 
186 };
187 
188 struct PartialInlinerImpl {
189 
190  PartialInlinerImpl(
192  function_ref<AssumptionCache *(Function &)> LookupAC,
194  function_ref<const TargetLibraryInfo &(Function &)> GTLI,
195  ProfileSummaryInfo &ProfSI,
196  function_ref<BlockFrequencyInfo &(Function &)> GBFI = nullptr)
197  : GetAssumptionCache(GetAC), LookupAssumptionCache(LookupAC),
198  GetTTI(GTTI), GetBFI(GBFI), GetTLI(GTLI), PSI(ProfSI) {}
199 
200  bool run(Module &M);
201  // Main part of the transformation that calls helper functions to find
202  // outlining candidates, clone & outline the function, and attempt to
203  // partially inline the resulting function. Returns true if
204  // inlining was successful, false otherwise. Also returns the outline
205  // function (only if we partially inlined early returns) as there is a
206  // possibility to further "peel" early return statements that were left in the
207  // outline function due to code size.
208  std::pair<bool, Function *> unswitchFunction(Function &F);
209 
210  // This class speculatively clones the function to be partial inlined.
211  // At the end of partial inlining, the remaining callsites to the cloned
212  // function that are not partially inlined will be fixed up to reference
213  // the original function, and the cloned function will be erased.
214  struct FunctionCloner {
215  // Two constructors, one for single region outlining, the other for
216  // multi-region outlining.
217  FunctionCloner(Function *F, FunctionOutliningInfo *OI,
219  function_ref<AssumptionCache *(Function &)> LookupAC,
221  FunctionCloner(Function *F, FunctionOutliningMultiRegionInfo *OMRI,
223  function_ref<AssumptionCache *(Function &)> LookupAC,
225 
226  ~FunctionCloner();
227 
228  // Prepare for function outlining: making sure there is only
229  // one incoming edge from the extracted/outlined region to
230  // the return block.
231  void normalizeReturnBlock() const;
232 
233  // Do function outlining for cold regions.
234  bool doMultiRegionFunctionOutlining();
235  // Do function outlining for region after early return block(s).
236  // NOTE: For vararg functions that do the vararg handling in the outlined
237  // function, we temporarily generate IR that does not properly
238  // forward varargs to the outlined function. Calling InlineFunction
239  // will update calls to the outlined functions to properly forward
240  // the varargs.
241  Function *doSingleRegionFunctionOutlining();
242 
243  Function *OrigFunc = nullptr;
244  Function *ClonedFunc = nullptr;
245 
246  typedef std::pair<Function *, BasicBlock *> FuncBodyCallerPair;
247  // Keep track of Outlined Functions and the basic block they're called from.
248  SmallVector<FuncBodyCallerPair, 4> OutlinedFunctions;
249 
250  // ClonedFunc is inlined in one of its callers after function
251  // outlining.
252  bool IsFunctionInlined = false;
253  // The cost of the region to be outlined.
254  InstructionCost OutlinedRegionCost = 0;
255  // ClonedOI is specific to outlining non-early return blocks.
256  std::unique_ptr<FunctionOutliningInfo> ClonedOI = nullptr;
257  // ClonedOMRI is specific to outlining cold regions.
258  std::unique_ptr<FunctionOutliningMultiRegionInfo> ClonedOMRI = nullptr;
259  std::unique_ptr<BlockFrequencyInfo> ClonedFuncBFI = nullptr;
261  function_ref<AssumptionCache *(Function &)> LookupAC;
263  };
264 
265 private:
266  int NumPartialInlining = 0;
267  function_ref<AssumptionCache &(Function &)> GetAssumptionCache;
268  function_ref<AssumptionCache *(Function &)> LookupAssumptionCache;
271  function_ref<const TargetLibraryInfo &(Function &)> GetTLI;
272  ProfileSummaryInfo &PSI;
273 
274  // Return the frequency of the OutlininingBB relative to F's entry point.
275  // The result is no larger than 1 and is represented using BP.
276  // (Note that the outlined region's 'head' block can only have incoming
277  // edges from the guarding entry blocks).
279  getOutliningCallBBRelativeFreq(FunctionCloner &Cloner) const;
280 
281  // Return true if the callee of CB should be partially inlined with
282  // profit.
283  bool shouldPartialInline(CallBase &CB, FunctionCloner &Cloner,
284  BlockFrequency WeightedOutliningRcost,
285  OptimizationRemarkEmitter &ORE) const;
286 
287  // Try to inline DuplicateFunction (cloned from F with call to
288  // the OutlinedFunction into its callers. Return true
289  // if there is any successful inlining.
290  bool tryPartialInline(FunctionCloner &Cloner);
291 
292  // Compute the mapping from use site of DuplicationFunction to the enclosing
293  // BB's profile count.
294  void
295  computeCallsiteToProfCountMap(Function *DuplicateFunction,
296  DenseMap<User *, uint64_t> &SiteCountMap) const;
297 
298  bool isLimitReached() const {
299  return (MaxNumPartialInlining != -1 &&
300  NumPartialInlining >= MaxNumPartialInlining);
301  }
302 
303  static CallBase *getSupportedCallBase(User *U) {
304  if (isa<CallInst>(U) || isa<InvokeInst>(U))
305  return cast<CallBase>(U);
306  llvm_unreachable("All uses must be calls");
307  return nullptr;
308  }
309 
310  static CallBase *getOneCallSiteTo(Function &F) {
311  User *User = *F.user_begin();
312  return getSupportedCallBase(User);
313  }
314 
315  std::tuple<DebugLoc, BasicBlock *> getOneDebugLoc(Function &F) const {
316  CallBase *CB = getOneCallSiteTo(F);
317  DebugLoc DLoc = CB->getDebugLoc();
318  BasicBlock *Block = CB->getParent();
319  return std::make_tuple(DLoc, Block);
320  }
321 
322  // Returns the costs associated with function outlining:
323  // - The first value is the non-weighted runtime cost for making the call
324  // to the outlined function, including the addtional setup cost in the
325  // outlined function itself;
326  // - The second value is the estimated size of the new call sequence in
327  // basic block Cloner.OutliningCallBB;
328  std::tuple<InstructionCost, InstructionCost>
329  computeOutliningCosts(FunctionCloner &Cloner) const;
330 
331  // Compute the 'InlineCost' of block BB. InlineCost is a proxy used to
332  // approximate both the size and runtime cost (Note that in the current
333  // inline cost analysis, there is no clear distinction there either).
334  static InstructionCost computeBBInlineCost(BasicBlock *BB,
336 
337  std::unique_ptr<FunctionOutliningInfo>
338  computeOutliningInfo(Function &F) const;
339 
340  std::unique_ptr<FunctionOutliningMultiRegionInfo>
341  computeOutliningColdRegionsInfo(Function &F,
342  OptimizationRemarkEmitter &ORE) const;
343 };
344 
345 struct PartialInlinerLegacyPass : public ModulePass {
346  static char ID; // Pass identification, replacement for typeid
347 
348  PartialInlinerLegacyPass() : ModulePass(ID) {
350  }
351 
352  void getAnalysisUsage(AnalysisUsage &AU) const override {
357  }
358 
359  bool runOnModule(Module &M) override {
360  if (skipModule(M))
361  return false;
362 
363  AssumptionCacheTracker *ACT = &getAnalysis<AssumptionCacheTracker>();
365  &getAnalysis<TargetTransformInfoWrapperPass>();
366  ProfileSummaryInfo &PSI =
367  getAnalysis<ProfileSummaryInfoWrapperPass>().getPSI();
368 
369  auto GetAssumptionCache = [&ACT](Function &F) -> AssumptionCache & {
370  return ACT->getAssumptionCache(F);
371  };
372 
373  auto LookupAssumptionCache = [ACT](Function &F) -> AssumptionCache * {
374  return ACT->lookupAssumptionCache(F);
375  };
376 
377  auto GetTTI = [&TTIWP](Function &F) -> TargetTransformInfo & {
378  return TTIWP->getTTI(F);
379  };
380 
381  auto GetTLI = [this](Function &F) -> TargetLibraryInfo & {
382  return this->getAnalysis<TargetLibraryInfoWrapperPass>().getTLI(F);
383  };
384 
385  return PartialInlinerImpl(GetAssumptionCache, LookupAssumptionCache, GetTTI,
386  GetTLI, PSI)
387  .run(M);
388  }
389 };
390 
391 } // end anonymous namespace
392 
393 std::unique_ptr<FunctionOutliningMultiRegionInfo>
394 PartialInlinerImpl::computeOutliningColdRegionsInfo(
395  Function &F, OptimizationRemarkEmitter &ORE) const {
396  BasicBlock *EntryBlock = &F.front();
397 
398  DominatorTree DT(F);
399  LoopInfo LI(DT);
400  BranchProbabilityInfo BPI(F, LI);
401  std::unique_ptr<BlockFrequencyInfo> ScopedBFI;
403  if (!GetBFI) {
404  ScopedBFI.reset(new BlockFrequencyInfo(F, BPI, LI));
405  BFI = ScopedBFI.get();
406  } else
407  BFI = &(GetBFI(F));
408 
409  // Return if we don't have profiling information.
410  if (!PSI.hasInstrumentationProfile())
411  return std::unique_ptr<FunctionOutliningMultiRegionInfo>();
412 
413  std::unique_ptr<FunctionOutliningMultiRegionInfo> OutliningInfo =
414  std::make_unique<FunctionOutliningMultiRegionInfo>();
415 
416  auto IsSingleExit =
417  [&ORE](SmallVectorImpl<BasicBlock *> &BlockList) -> BasicBlock * {
418  BasicBlock *ExitBlock = nullptr;
419  for (auto *Block : BlockList) {
420  for (BasicBlock *Succ : successors(Block)) {
421  if (!is_contained(BlockList, Succ)) {
422  if (ExitBlock) {
423  ORE.emit([&]() {
424  return OptimizationRemarkMissed(DEBUG_TYPE, "MultiExitRegion",
425  &Succ->front())
426  << "Region dominated by "
427  << ore::NV("Block", BlockList.front()->getName())
428  << " has more than one region exit edge.";
429  });
430  return nullptr;
431  }
432 
433  ExitBlock = Block;
434  }
435  }
436  }
437  return ExitBlock;
438  };
439 
440  auto BBProfileCount = [BFI](BasicBlock *BB) {
441  return BFI->getBlockProfileCount(BB).value_or(0);
442  };
443 
444  // Use the same computeBBInlineCost function to compute the cost savings of
445  // the outlining the candidate region.
446  TargetTransformInfo *FTTI = &GetTTI(F);
447  InstructionCost OverallFunctionCost = 0;
448  for (auto &BB : F)
449  OverallFunctionCost += computeBBInlineCost(&BB, FTTI);
450 
451  LLVM_DEBUG(dbgs() << "OverallFunctionCost = " << OverallFunctionCost
452  << "\n";);
453 
454  InstructionCost MinOutlineRegionCost = OverallFunctionCost.map(
455  [&](auto Cost) { return Cost * MinRegionSizeRatio; });
456 
457  BranchProbability MinBranchProbability(
458  static_cast<int>(ColdBranchRatio * MinBlockCounterExecution),
460  bool ColdCandidateFound = false;
461  BasicBlock *CurrEntry = EntryBlock;
462  std::vector<BasicBlock *> DFS;
463  DenseMap<BasicBlock *, bool> VisitedMap;
464  DFS.push_back(CurrEntry);
465  VisitedMap[CurrEntry] = true;
466 
467  // Use Depth First Search on the basic blocks to find CFG edges that are
468  // considered cold.
469  // Cold regions considered must also have its inline cost compared to the
470  // overall inline cost of the original function. The region is outlined only
471  // if it reduced the inline cost of the function by 'MinOutlineRegionCost' or
472  // more.
473  while (!DFS.empty()) {
474  auto *ThisBB = DFS.back();
475  DFS.pop_back();
476  // Only consider regions with predecessor blocks that are considered
477  // not-cold (default: part of the top 99.99% of all block counters)
478  // AND greater than our minimum block execution count (default: 100).
479  if (PSI.isColdBlock(ThisBB, BFI) ||
480  BBProfileCount(ThisBB) < MinBlockCounterExecution)
481  continue;
482  for (auto SI = succ_begin(ThisBB); SI != succ_end(ThisBB); ++SI) {
483  if (VisitedMap[*SI])
484  continue;
485  VisitedMap[*SI] = true;
486  DFS.push_back(*SI);
487  // If branch isn't cold, we skip to the next one.
488  BranchProbability SuccProb = BPI.getEdgeProbability(ThisBB, *SI);
489  if (SuccProb > MinBranchProbability)
490  continue;
491 
492  LLVM_DEBUG(dbgs() << "Found cold edge: " << ThisBB->getName() << "->"
493  << SI->getName()
494  << "\nBranch Probability = " << SuccProb << "\n";);
495 
496  SmallVector<BasicBlock *, 8> DominateVector;
497  DT.getDescendants(*SI, DominateVector);
498  assert(!DominateVector.empty() &&
499  "SI should be reachable and have at least itself as descendant");
500 
501  // We can only outline single entry regions (for now).
502  if (!DominateVector.front()->hasNPredecessors(1)) {
503  LLVM_DEBUG(dbgs() << "ABORT: Block " << SI->getName()
504  << " doesn't have a single predecessor in the "
505  "dominator tree\n";);
506  continue;
507  }
508 
509  BasicBlock *ExitBlock = nullptr;
510  // We can only outline single exit regions (for now).
511  if (!(ExitBlock = IsSingleExit(DominateVector))) {
512  LLVM_DEBUG(dbgs() << "ABORT: Block " << SI->getName()
513  << " doesn't have a unique successor\n";);
514  continue;
515  }
516 
517  InstructionCost OutlineRegionCost = 0;
518  for (auto *BB : DominateVector)
519  OutlineRegionCost += computeBBInlineCost(BB, &GetTTI(*BB->getParent()));
520 
521  LLVM_DEBUG(dbgs() << "OutlineRegionCost = " << OutlineRegionCost
522  << "\n";);
523 
524  if (!SkipCostAnalysis && OutlineRegionCost < MinOutlineRegionCost) {
525  ORE.emit([&]() {
526  return OptimizationRemarkAnalysis(DEBUG_TYPE, "TooCostly",
527  &SI->front())
528  << ore::NV("Callee", &F)
529  << " inline cost-savings smaller than "
530  << ore::NV("Cost", MinOutlineRegionCost);
531  });
532 
533  LLVM_DEBUG(dbgs() << "ABORT: Outline region cost is smaller than "
534  << MinOutlineRegionCost << "\n";);
535  continue;
536  }
537 
538  // For now, ignore blocks that belong to a SISE region that is a
539  // candidate for outlining. In the future, we may want to look
540  // at inner regions because the outer region may have live-exit
541  // variables.
542  for (auto *BB : DominateVector)
543  VisitedMap[BB] = true;
544 
545  // ReturnBlock here means the block after the outline call
546  BasicBlock *ReturnBlock = ExitBlock->getSingleSuccessor();
547  FunctionOutliningMultiRegionInfo::OutlineRegionInfo RegInfo(
548  DominateVector, DominateVector.front(), ExitBlock, ReturnBlock);
549  OutliningInfo->ORI.push_back(RegInfo);
550  LLVM_DEBUG(dbgs() << "Found Cold Candidate starting at block: "
551  << DominateVector.front()->getName() << "\n";);
552  ColdCandidateFound = true;
553  NumColdRegionsFound++;
554  }
555  }
556 
557  if (ColdCandidateFound)
558  return OutliningInfo;
559 
560  return std::unique_ptr<FunctionOutliningMultiRegionInfo>();
561 }
562 
563 std::unique_ptr<FunctionOutliningInfo>
564 PartialInlinerImpl::computeOutliningInfo(Function &F) const {
565  BasicBlock *EntryBlock = &F.front();
566  BranchInst *BR = dyn_cast<BranchInst>(EntryBlock->getTerminator());
567  if (!BR || BR->isUnconditional())
568  return std::unique_ptr<FunctionOutliningInfo>();
569 
570  // Returns true if Succ is BB's successor
571  auto IsSuccessor = [](BasicBlock *Succ, BasicBlock *BB) {
572  return is_contained(successors(BB), Succ);
573  };
574 
575  auto IsReturnBlock = [](BasicBlock *BB) {
576  Instruction *TI = BB->getTerminator();
577  return isa<ReturnInst>(TI);
578  };
579 
580  auto GetReturnBlock = [&](BasicBlock *Succ1, BasicBlock *Succ2) {
581  if (IsReturnBlock(Succ1))
582  return std::make_tuple(Succ1, Succ2);
583  if (IsReturnBlock(Succ2))
584  return std::make_tuple(Succ2, Succ1);
585 
586  return std::make_tuple<BasicBlock *, BasicBlock *>(nullptr, nullptr);
587  };
588 
589  // Detect a triangular shape:
590  auto GetCommonSucc = [&](BasicBlock *Succ1, BasicBlock *Succ2) {
591  if (IsSuccessor(Succ1, Succ2))
592  return std::make_tuple(Succ1, Succ2);
593  if (IsSuccessor(Succ2, Succ1))
594  return std::make_tuple(Succ2, Succ1);
595 
596  return std::make_tuple<BasicBlock *, BasicBlock *>(nullptr, nullptr);
597  };
598 
599  std::unique_ptr<FunctionOutliningInfo> OutliningInfo =
600  std::make_unique<FunctionOutliningInfo>();
601 
602  BasicBlock *CurrEntry = EntryBlock;
603  bool CandidateFound = false;
604  do {
605  // The number of blocks to be inlined has already reached
606  // the limit. When MaxNumInlineBlocks is set to 0 or 1, this
607  // disables partial inlining for the function.
608  if (OutliningInfo->getNumInlinedBlocks() >= MaxNumInlineBlocks)
609  break;
610 
611  if (succ_size(CurrEntry) != 2)
612  break;
613 
614  BasicBlock *Succ1 = *succ_begin(CurrEntry);
615  BasicBlock *Succ2 = *(succ_begin(CurrEntry) + 1);
616 
617  BasicBlock *ReturnBlock, *NonReturnBlock;
618  std::tie(ReturnBlock, NonReturnBlock) = GetReturnBlock(Succ1, Succ2);
619 
620  if (ReturnBlock) {
621  OutliningInfo->Entries.push_back(CurrEntry);
622  OutliningInfo->ReturnBlock = ReturnBlock;
623  OutliningInfo->NonReturnBlock = NonReturnBlock;
624  CandidateFound = true;
625  break;
626  }
627 
628  BasicBlock *CommSucc, *OtherSucc;
629  std::tie(CommSucc, OtherSucc) = GetCommonSucc(Succ1, Succ2);
630 
631  if (!CommSucc)
632  break;
633 
634  OutliningInfo->Entries.push_back(CurrEntry);
635  CurrEntry = OtherSucc;
636  } while (true);
637 
638  if (!CandidateFound)
639  return std::unique_ptr<FunctionOutliningInfo>();
640 
641  // There should not be any successors (not in the entry set) other than
642  // {ReturnBlock, NonReturnBlock}
643  assert(OutliningInfo->Entries[0] == &F.front() &&
644  "Function Entry must be the first in Entries vector");
645  DenseSet<BasicBlock *> Entries;
646  for (BasicBlock *E : OutliningInfo->Entries)
647  Entries.insert(E);
648 
649  // Returns true of BB has Predecessor which is not
650  // in Entries set.
651  auto HasNonEntryPred = [Entries](BasicBlock *BB) {
652  for (auto *Pred : predecessors(BB)) {
653  if (!Entries.count(Pred))
654  return true;
655  }
656  return false;
657  };
658  auto CheckAndNormalizeCandidate =
659  [Entries, HasNonEntryPred](FunctionOutliningInfo *OutliningInfo) {
660  for (BasicBlock *E : OutliningInfo->Entries) {
661  for (auto *Succ : successors(E)) {
662  if (Entries.count(Succ))
663  continue;
664  if (Succ == OutliningInfo->ReturnBlock)
665  OutliningInfo->ReturnBlockPreds.push_back(E);
666  else if (Succ != OutliningInfo->NonReturnBlock)
667  return false;
668  }
669  // There should not be any outside incoming edges either:
670  if (HasNonEntryPred(E))
671  return false;
672  }
673  return true;
674  };
675 
676  if (!CheckAndNormalizeCandidate(OutliningInfo.get()))
677  return std::unique_ptr<FunctionOutliningInfo>();
678 
679  // Now further growing the candidate's inlining region by
680  // peeling off dominating blocks from the outlining region:
681  while (OutliningInfo->getNumInlinedBlocks() < MaxNumInlineBlocks) {
682  BasicBlock *Cand = OutliningInfo->NonReturnBlock;
683  if (succ_size(Cand) != 2)
684  break;
685 
686  if (HasNonEntryPred(Cand))
687  break;
688 
689  BasicBlock *Succ1 = *succ_begin(Cand);
690  BasicBlock *Succ2 = *(succ_begin(Cand) + 1);
691 
692  BasicBlock *ReturnBlock, *NonReturnBlock;
693  std::tie(ReturnBlock, NonReturnBlock) = GetReturnBlock(Succ1, Succ2);
694  if (!ReturnBlock || ReturnBlock != OutliningInfo->ReturnBlock)
695  break;
696 
697  if (NonReturnBlock->getSinglePredecessor() != Cand)
698  break;
699 
700  // Now grow and update OutlininigInfo:
701  OutliningInfo->Entries.push_back(Cand);
702  OutliningInfo->NonReturnBlock = NonReturnBlock;
703  OutliningInfo->ReturnBlockPreds.push_back(Cand);
704  Entries.insert(Cand);
705  }
706 
707  return OutliningInfo;
708 }
709 
710 // Check if there is PGO data or user annotated branch data:
711 static bool hasProfileData(const Function &F, const FunctionOutliningInfo &OI) {
712  if (F.hasProfileData())
713  return true;
714  // Now check if any of the entry block has MD_prof data:
715  for (auto *E : OI.Entries) {
716  BranchInst *BR = dyn_cast<BranchInst>(E->getTerminator());
717  if (!BR || BR->isUnconditional())
718  continue;
719  uint64_t T, F;
720  if (BR->extractProfMetadata(T, F))
721  return true;
722  }
723  return false;
724 }
725 
726 BranchProbability PartialInlinerImpl::getOutliningCallBBRelativeFreq(
727  FunctionCloner &Cloner) const {
728  BasicBlock *OutliningCallBB = Cloner.OutlinedFunctions.back().second;
729  auto EntryFreq =
730  Cloner.ClonedFuncBFI->getBlockFreq(&Cloner.ClonedFunc->getEntryBlock());
731  auto OutliningCallFreq =
732  Cloner.ClonedFuncBFI->getBlockFreq(OutliningCallBB);
733  // FIXME Hackery needed because ClonedFuncBFI is based on the function BEFORE
734  // we outlined any regions, so we may encounter situations where the
735  // OutliningCallFreq is *slightly* bigger than the EntryFreq.
736  if (OutliningCallFreq.getFrequency() > EntryFreq.getFrequency())
737  OutliningCallFreq = EntryFreq;
738 
739  auto OutlineRegionRelFreq = BranchProbability::getBranchProbability(
740  OutliningCallFreq.getFrequency(), EntryFreq.getFrequency());
741 
742  if (hasProfileData(*Cloner.OrigFunc, *Cloner.ClonedOI))
743  return OutlineRegionRelFreq;
744 
745  // When profile data is not available, we need to be conservative in
746  // estimating the overall savings. Static branch prediction can usually
747  // guess the branch direction right (taken/non-taken), but the guessed
748  // branch probability is usually not biased enough. In case when the
749  // outlined region is predicted to be likely, its probability needs
750  // to be made higher (more biased) to not under-estimate the cost of
751  // function outlining. On the other hand, if the outlined region
752  // is predicted to be less likely, the predicted probablity is usually
753  // higher than the actual. For instance, the actual probability of the
754  // less likely target is only 5%, but the guessed probablity can be
755  // 40%. In the latter case, there is no need for further adjustement.
756  // FIXME: add an option for this.
757  if (OutlineRegionRelFreq < BranchProbability(45, 100))
758  return OutlineRegionRelFreq;
759 
760  OutlineRegionRelFreq = std::max(
761  OutlineRegionRelFreq, BranchProbability(OutlineRegionFreqPercent, 100));
762 
763  return OutlineRegionRelFreq;
764 }
765 
766 bool PartialInlinerImpl::shouldPartialInline(
767  CallBase &CB, FunctionCloner &Cloner, BlockFrequency WeightedOutliningRcost,
768  OptimizationRemarkEmitter &ORE) const {
769  using namespace ore;
770 
772  assert(Callee == Cloner.ClonedFunc);
773 
774  if (SkipCostAnalysis)
775  return isInlineViable(*Callee).isSuccess();
776 
777  Function *Caller = CB.getCaller();
778  auto &CalleeTTI = GetTTI(*Callee);
779  bool RemarksEnabled =
780  Callee->getContext().getDiagHandlerPtr()->isMissedOptRemarkEnabled(
781  DEBUG_TYPE);
782  InlineCost IC =
783  getInlineCost(CB, getInlineParams(), CalleeTTI, GetAssumptionCache,
784  GetTLI, GetBFI, &PSI, RemarksEnabled ? &ORE : nullptr);
785 
786  if (IC.isAlways()) {
787  ORE.emit([&]() {
788  return OptimizationRemarkAnalysis(DEBUG_TYPE, "AlwaysInline", &CB)
789  << NV("Callee", Cloner.OrigFunc)
790  << " should always be fully inlined, not partially";
791  });
792  return false;
793  }
794 
795  if (IC.isNever()) {
796  ORE.emit([&]() {
797  return OptimizationRemarkMissed(DEBUG_TYPE, "NeverInline", &CB)
798  << NV("Callee", Cloner.OrigFunc) << " not partially inlined into "
799  << NV("Caller", Caller)
800  << " because it should never be inlined (cost=never)";
801  });
802  return false;
803  }
804 
805  if (!IC) {
806  ORE.emit([&]() {
807  return OptimizationRemarkAnalysis(DEBUG_TYPE, "TooCostly", &CB)
808  << NV("Callee", Cloner.OrigFunc) << " not partially inlined into "
809  << NV("Caller", Caller) << " because too costly to inline (cost="
810  << NV("Cost", IC.getCost()) << ", threshold="
811  << NV("Threshold", IC.getCostDelta() + IC.getCost()) << ")";
812  });
813  return false;
814  }
815  const DataLayout &DL = Caller->getParent()->getDataLayout();
816 
817  // The savings of eliminating the call:
818  int NonWeightedSavings = getCallsiteCost(CB, DL);
819  BlockFrequency NormWeightedSavings(NonWeightedSavings);
820 
821  // Weighted saving is smaller than weighted cost, return false
822  if (NormWeightedSavings < WeightedOutliningRcost) {
823  ORE.emit([&]() {
824  return OptimizationRemarkAnalysis(DEBUG_TYPE, "OutliningCallcostTooHigh",
825  &CB)
826  << NV("Callee", Cloner.OrigFunc) << " not partially inlined into "
827  << NV("Caller", Caller) << " runtime overhead (overhead="
828  << NV("Overhead", (unsigned)WeightedOutliningRcost.getFrequency())
829  << ", savings="
830  << NV("Savings", (unsigned)NormWeightedSavings.getFrequency())
831  << ")"
832  << " of making the outlined call is too high";
833  });
834 
835  return false;
836  }
837 
838  ORE.emit([&]() {
839  return OptimizationRemarkAnalysis(DEBUG_TYPE, "CanBePartiallyInlined", &CB)
840  << NV("Callee", Cloner.OrigFunc) << " can be partially inlined into "
841  << NV("Caller", Caller) << " with cost=" << NV("Cost", IC.getCost())
842  << " (threshold="
843  << NV("Threshold", IC.getCostDelta() + IC.getCost()) << ")";
844  });
845  return true;
846 }
847 
848 // TODO: Ideally we should share Inliner's InlineCost Analysis code.
849 // For now use a simplified version. The returned 'InlineCost' will be used
850 // to esimate the size cost as well as runtime cost of the BB.
852 PartialInlinerImpl::computeBBInlineCost(BasicBlock *BB,
855  const DataLayout &DL = BB->getParent()->getParent()->getDataLayout();
856  for (Instruction &I : BB->instructionsWithoutDebug()) {
857  // Skip free instructions.
858  switch (I.getOpcode()) {
859  case Instruction::BitCast:
860  case Instruction::PtrToInt:
861  case Instruction::IntToPtr:
862  case Instruction::Alloca:
863  case Instruction::PHI:
864  continue;
865  case Instruction::GetElementPtr:
866  if (cast<GetElementPtrInst>(&I)->hasAllZeroIndices())
867  continue;
868  break;
869  default:
870  break;
871  }
872 
873  if (I.isLifetimeStartOrEnd())
874  continue;
875 
876  if (auto *II = dyn_cast<IntrinsicInst>(&I)) {
877  Intrinsic::ID IID = II->getIntrinsicID();
879  FastMathFlags FMF;
880  for (Value *Val : II->args())
881  Tys.push_back(Val->getType());
882 
883  if (auto *FPMO = dyn_cast<FPMathOperator>(II))
884  FMF = FPMO->getFastMathFlags();
885 
886  IntrinsicCostAttributes ICA(IID, II->getType(), Tys, FMF);
888  continue;
889  }
890 
891  if (CallInst *CI = dyn_cast<CallInst>(&I)) {
892  InlineCost += getCallsiteCost(*CI, DL);
893  continue;
894  }
895 
896  if (InvokeInst *II = dyn_cast<InvokeInst>(&I)) {
897  InlineCost += getCallsiteCost(*II, DL);
898  continue;
899  }
900 
901  if (SwitchInst *SI = dyn_cast<SwitchInst>(&I)) {
902  InlineCost += (SI->getNumCases() + 1) * InlineConstants::InstrCost;
903  continue;
904  }
906  }
907 
908  return InlineCost;
909 }
910 
911 std::tuple<InstructionCost, InstructionCost>
912 PartialInlinerImpl::computeOutliningCosts(FunctionCloner &Cloner) const {
913  InstructionCost OutliningFuncCallCost = 0, OutlinedFunctionCost = 0;
914  for (auto FuncBBPair : Cloner.OutlinedFunctions) {
915  Function *OutlinedFunc = FuncBBPair.first;
916  BasicBlock* OutliningCallBB = FuncBBPair.second;
917  // Now compute the cost of the call sequence to the outlined function
918  // 'OutlinedFunction' in BB 'OutliningCallBB':
919  auto *OutlinedFuncTTI = &GetTTI(*OutlinedFunc);
920  OutliningFuncCallCost +=
921  computeBBInlineCost(OutliningCallBB, OutlinedFuncTTI);
922 
923  // Now compute the cost of the extracted/outlined function itself:
924  for (BasicBlock &BB : *OutlinedFunc)
925  OutlinedFunctionCost += computeBBInlineCost(&BB, OutlinedFuncTTI);
926  }
927  assert(OutlinedFunctionCost >= Cloner.OutlinedRegionCost &&
928  "Outlined function cost should be no less than the outlined region");
929 
930  // The code extractor introduces a new root and exit stub blocks with
931  // additional unconditional branches. Those branches will be eliminated
932  // later with bb layout. The cost should be adjusted accordingly:
933  OutlinedFunctionCost -=
934  2 * InlineConstants::InstrCost * Cloner.OutlinedFunctions.size();
935 
936  InstructionCost OutliningRuntimeOverhead =
937  OutliningFuncCallCost +
938  (OutlinedFunctionCost - Cloner.OutlinedRegionCost) +
939  ExtraOutliningPenalty.getValue();
940 
941  return std::make_tuple(OutliningFuncCallCost, OutliningRuntimeOverhead);
942 }
943 
944 // Create the callsite to profile count map which is
945 // used to update the original function's entry count,
946 // after the function is partially inlined into the callsite.
947 void PartialInlinerImpl::computeCallsiteToProfCountMap(
948  Function *DuplicateFunction,
949  DenseMap<User *, uint64_t> &CallSiteToProfCountMap) const {
950  std::vector<User *> Users(DuplicateFunction->user_begin(),
951  DuplicateFunction->user_end());
952  Function *CurrentCaller = nullptr;
953  std::unique_ptr<BlockFrequencyInfo> TempBFI;
954  BlockFrequencyInfo *CurrentCallerBFI = nullptr;
955 
956  auto ComputeCurrBFI = [&,this](Function *Caller) {
957  // For the old pass manager:
958  if (!GetBFI) {
959  DominatorTree DT(*Caller);
960  LoopInfo LI(DT);
961  BranchProbabilityInfo BPI(*Caller, LI);
962  TempBFI.reset(new BlockFrequencyInfo(*Caller, BPI, LI));
963  CurrentCallerBFI = TempBFI.get();
964  } else {
965  // New pass manager:
966  CurrentCallerBFI = &(GetBFI(*Caller));
967  }
968  };
969 
970  for (User *User : Users) {
971  // Don't bother with BlockAddress used by CallBr for asm goto.
972  if (isa<BlockAddress>(User))
973  continue;
974  CallBase *CB = getSupportedCallBase(User);
975  Function *Caller = CB->getCaller();
976  if (CurrentCaller != Caller) {
977  CurrentCaller = Caller;
978  ComputeCurrBFI(Caller);
979  } else {
980  assert(CurrentCallerBFI && "CallerBFI is not set");
981  }
982  BasicBlock *CallBB = CB->getParent();
983  auto Count = CurrentCallerBFI->getBlockProfileCount(CallBB);
984  if (Count)
985  CallSiteToProfCountMap[User] = *Count;
986  else
987  CallSiteToProfCountMap[User] = 0;
988  }
989 }
990 
991 PartialInlinerImpl::FunctionCloner::FunctionCloner(
992  Function *F, FunctionOutliningInfo *OI, OptimizationRemarkEmitter &ORE,
993  function_ref<AssumptionCache *(Function &)> LookupAC,
995  : OrigFunc(F), ORE(ORE), LookupAC(LookupAC), GetTTI(GetTTI) {
996  ClonedOI = std::make_unique<FunctionOutliningInfo>();
997 
998  // Clone the function, so that we can hack away on it.
999  ValueToValueMapTy VMap;
1000  ClonedFunc = CloneFunction(F, VMap);
1001 
1002  ClonedOI->ReturnBlock = cast<BasicBlock>(VMap[OI->ReturnBlock]);
1003  ClonedOI->NonReturnBlock = cast<BasicBlock>(VMap[OI->NonReturnBlock]);
1004  for (BasicBlock *BB : OI->Entries)
1005  ClonedOI->Entries.push_back(cast<BasicBlock>(VMap[BB]));
1006 
1007  for (BasicBlock *E : OI->ReturnBlockPreds) {
1008  BasicBlock *NewE = cast<BasicBlock>(VMap[E]);
1009  ClonedOI->ReturnBlockPreds.push_back(NewE);
1010  }
1011  // Go ahead and update all uses to the duplicate, so that we can just
1012  // use the inliner functionality when we're done hacking.
1013  F->replaceAllUsesWith(ClonedFunc);
1014 }
1015 
1016 PartialInlinerImpl::FunctionCloner::FunctionCloner(
1017  Function *F, FunctionOutliningMultiRegionInfo *OI,
1019  function_ref<AssumptionCache *(Function &)> LookupAC,
1021  : OrigFunc(F), ORE(ORE), LookupAC(LookupAC), GetTTI(GetTTI) {
1022  ClonedOMRI = std::make_unique<FunctionOutliningMultiRegionInfo>();
1023 
1024  // Clone the function, so that we can hack away on it.
1025  ValueToValueMapTy VMap;
1026  ClonedFunc = CloneFunction(F, VMap);
1027 
1028  // Go through all Outline Candidate Regions and update all BasicBlock
1029  // information.
1030  for (FunctionOutliningMultiRegionInfo::OutlineRegionInfo RegionInfo :
1031  OI->ORI) {
1033  for (BasicBlock *BB : RegionInfo.Region)
1034  Region.push_back(cast<BasicBlock>(VMap[BB]));
1035 
1036  BasicBlock *NewEntryBlock = cast<BasicBlock>(VMap[RegionInfo.EntryBlock]);
1037  BasicBlock *NewExitBlock = cast<BasicBlock>(VMap[RegionInfo.ExitBlock]);
1038  BasicBlock *NewReturnBlock = nullptr;
1039  if (RegionInfo.ReturnBlock)
1040  NewReturnBlock = cast<BasicBlock>(VMap[RegionInfo.ReturnBlock]);
1041  FunctionOutliningMultiRegionInfo::OutlineRegionInfo MappedRegionInfo(
1042  Region, NewEntryBlock, NewExitBlock, NewReturnBlock);
1043  ClonedOMRI->ORI.push_back(MappedRegionInfo);
1044  }
1045  // Go ahead and update all uses to the duplicate, so that we can just
1046  // use the inliner functionality when we're done hacking.
1047  F->replaceAllUsesWith(ClonedFunc);
1048 }
1049 
1050 void PartialInlinerImpl::FunctionCloner::normalizeReturnBlock() const {
1051  auto GetFirstPHI = [](BasicBlock *BB) {
1052  BasicBlock::iterator I = BB->begin();
1053  PHINode *FirstPhi = nullptr;
1054  while (I != BB->end()) {
1055  PHINode *Phi = dyn_cast<PHINode>(I);
1056  if (!Phi)
1057  break;
1058  if (!FirstPhi) {
1059  FirstPhi = Phi;
1060  break;
1061  }
1062  }
1063  return FirstPhi;
1064  };
1065 
1066  // Shouldn't need to normalize PHIs if we're not outlining non-early return
1067  // blocks.
1068  if (!ClonedOI)
1069  return;
1070 
1071  // Special hackery is needed with PHI nodes that have inputs from more than
1072  // one extracted block. For simplicity, just split the PHIs into a two-level
1073  // sequence of PHIs, some of which will go in the extracted region, and some
1074  // of which will go outside.
1075  BasicBlock *PreReturn = ClonedOI->ReturnBlock;
1076  // only split block when necessary:
1077  PHINode *FirstPhi = GetFirstPHI(PreReturn);
1078  unsigned NumPredsFromEntries = ClonedOI->ReturnBlockPreds.size();
1079 
1080  if (!FirstPhi || FirstPhi->getNumIncomingValues() <= NumPredsFromEntries + 1)
1081  return;
1082 
1083  auto IsTrivialPhi = [](PHINode *PN) -> Value * {
1084  Value *CommonValue = PN->getIncomingValue(0);
1085  if (all_of(PN->incoming_values(),
1086  [&](Value *V) { return V == CommonValue; }))
1087  return CommonValue;
1088  return nullptr;
1089  };
1090 
1091  ClonedOI->ReturnBlock = ClonedOI->ReturnBlock->splitBasicBlock(
1092  ClonedOI->ReturnBlock->getFirstNonPHI()->getIterator());
1093  BasicBlock::iterator I = PreReturn->begin();
1094  Instruction *Ins = &ClonedOI->ReturnBlock->front();
1096  while (I != PreReturn->end()) {
1097  PHINode *OldPhi = dyn_cast<PHINode>(I);
1098  if (!OldPhi)
1099  break;
1100 
1101  PHINode *RetPhi =
1102  PHINode::Create(OldPhi->getType(), NumPredsFromEntries + 1, "", Ins);
1103  OldPhi->replaceAllUsesWith(RetPhi);
1104  Ins = ClonedOI->ReturnBlock->getFirstNonPHI();
1105 
1106  RetPhi->addIncoming(&*I, PreReturn);
1107  for (BasicBlock *E : ClonedOI->ReturnBlockPreds) {
1108  RetPhi->addIncoming(OldPhi->getIncomingValueForBlock(E), E);
1109  OldPhi->removeIncomingValue(E);
1110  }
1111 
1112  // After incoming values splitting, the old phi may become trivial.
1113  // Keeping the trivial phi can introduce definition inside the outline
1114  // region which is live-out, causing necessary overhead (load, store
1115  // arg passing etc).
1116  if (auto *OldPhiVal = IsTrivialPhi(OldPhi)) {
1117  OldPhi->replaceAllUsesWith(OldPhiVal);
1118  DeadPhis.push_back(OldPhi);
1119  }
1120  ++I;
1121  }
1122  for (auto *DP : DeadPhis)
1123  DP->eraseFromParent();
1124 
1125  for (auto *E : ClonedOI->ReturnBlockPreds)
1126  E->getTerminator()->replaceUsesOfWith(PreReturn, ClonedOI->ReturnBlock);
1127 }
1128 
1129 bool PartialInlinerImpl::FunctionCloner::doMultiRegionFunctionOutlining() {
1130 
1131  auto ComputeRegionCost =
1133  InstructionCost Cost = 0;
1134  for (BasicBlock* BB : Region)
1135  Cost += computeBBInlineCost(BB, &GetTTI(*BB->getParent()));
1136  return Cost;
1137  };
1138 
1139  assert(ClonedOMRI && "Expecting OutlineInfo for multi region outline");
1140 
1141  if (ClonedOMRI->ORI.empty())
1142  return false;
1143 
1144  // The CodeExtractor needs a dominator tree.
1145  DominatorTree DT;
1146  DT.recalculate(*ClonedFunc);
1147 
1148  // Manually calculate a BlockFrequencyInfo and BranchProbabilityInfo.
1149  LoopInfo LI(DT);
1150  BranchProbabilityInfo BPI(*ClonedFunc, LI);
1151  ClonedFuncBFI.reset(new BlockFrequencyInfo(*ClonedFunc, BPI, LI));
1152 
1153  // Cache and recycle the CodeExtractor analysis to avoid O(n^2) compile-time.
1154  CodeExtractorAnalysisCache CEAC(*ClonedFunc);
1155 
1156  SetVector<Value *> Inputs, Outputs, Sinks;
1157  for (FunctionOutliningMultiRegionInfo::OutlineRegionInfo RegionInfo :
1158  ClonedOMRI->ORI) {
1159  InstructionCost CurrentOutlinedRegionCost =
1160  ComputeRegionCost(RegionInfo.Region);
1161 
1162  CodeExtractor CE(RegionInfo.Region, &DT, /*AggregateArgs*/ false,
1163  ClonedFuncBFI.get(), &BPI,
1164  LookupAC(*RegionInfo.EntryBlock->getParent()),
1165  /* AllowVarargs */ false);
1166 
1167  CE.findInputsOutputs(Inputs, Outputs, Sinks);
1168 
1169  LLVM_DEBUG({
1170  dbgs() << "inputs: " << Inputs.size() << "\n";
1171  dbgs() << "outputs: " << Outputs.size() << "\n";
1172  for (Value *value : Inputs)
1173  dbgs() << "value used in func: " << *value << "\n";
1174  for (Value *output : Outputs)
1175  dbgs() << "instr used in func: " << *output << "\n";
1176  });
1177 
1178  // Do not extract regions that have live exit variables.
1179  if (Outputs.size() > 0 && !ForceLiveExit)
1180  continue;
1181 
1182  if (Function *OutlinedFunc = CE.extractCodeRegion(CEAC)) {
1183  CallBase *OCS = PartialInlinerImpl::getOneCallSiteTo(*OutlinedFunc);
1184  BasicBlock *OutliningCallBB = OCS->getParent();
1185  assert(OutliningCallBB->getParent() == ClonedFunc);
1186  OutlinedFunctions.push_back(std::make_pair(OutlinedFunc,OutliningCallBB));
1187  NumColdRegionsOutlined++;
1188  OutlinedRegionCost += CurrentOutlinedRegionCost;
1189 
1190  if (MarkOutlinedColdCC) {
1191  OutlinedFunc->setCallingConv(CallingConv::Cold);
1193  }
1194  } else
1195  ORE.emit([&]() {
1196  return OptimizationRemarkMissed(DEBUG_TYPE, "ExtractFailed",
1197  &RegionInfo.Region.front()->front())
1198  << "Failed to extract region at block "
1199  << ore::NV("Block", RegionInfo.Region.front());
1200  });
1201  }
1202 
1203  return !OutlinedFunctions.empty();
1204 }
1205 
1206 Function *
1207 PartialInlinerImpl::FunctionCloner::doSingleRegionFunctionOutlining() {
1208  // Returns true if the block is to be partial inlined into the caller
1209  // (i.e. not to be extracted to the out of line function)
1210  auto ToBeInlined = [&, this](BasicBlock *BB) {
1211  return BB == ClonedOI->ReturnBlock ||
1212  llvm::is_contained(ClonedOI->Entries, BB);
1213  };
1214 
1215  assert(ClonedOI && "Expecting OutlineInfo for single region outline");
1216  // The CodeExtractor needs a dominator tree.
1217  DominatorTree DT;
1218  DT.recalculate(*ClonedFunc);
1219 
1220  // Manually calculate a BlockFrequencyInfo and BranchProbabilityInfo.
1221  LoopInfo LI(DT);
1222  BranchProbabilityInfo BPI(*ClonedFunc, LI);
1223  ClonedFuncBFI.reset(new BlockFrequencyInfo(*ClonedFunc, BPI, LI));
1224 
1225  // Gather up the blocks that we're going to extract.
1226  std::vector<BasicBlock *> ToExtract;
1227  auto *ClonedFuncTTI = &GetTTI(*ClonedFunc);
1228  ToExtract.push_back(ClonedOI->NonReturnBlock);
1229  OutlinedRegionCost += PartialInlinerImpl::computeBBInlineCost(
1230  ClonedOI->NonReturnBlock, ClonedFuncTTI);
1231  for (BasicBlock &BB : *ClonedFunc)
1232  if (!ToBeInlined(&BB) && &BB != ClonedOI->NonReturnBlock) {
1233  ToExtract.push_back(&BB);
1234  // FIXME: the code extractor may hoist/sink more code
1235  // into the outlined function which may make the outlining
1236  // overhead (the difference of the outlined function cost
1237  // and OutliningRegionCost) look larger.
1238  OutlinedRegionCost += computeBBInlineCost(&BB, ClonedFuncTTI);
1239  }
1240 
1241  // Extract the body of the if.
1242  CodeExtractorAnalysisCache CEAC(*ClonedFunc);
1243  Function *OutlinedFunc =
1244  CodeExtractor(ToExtract, &DT, /*AggregateArgs*/ false,
1245  ClonedFuncBFI.get(), &BPI, LookupAC(*ClonedFunc),
1246  /* AllowVarargs */ true)
1247  .extractCodeRegion(CEAC);
1248 
1249  if (OutlinedFunc) {
1250  BasicBlock *OutliningCallBB =
1251  PartialInlinerImpl::getOneCallSiteTo(*OutlinedFunc)->getParent();
1252  assert(OutliningCallBB->getParent() == ClonedFunc);
1253  OutlinedFunctions.push_back(std::make_pair(OutlinedFunc, OutliningCallBB));
1254  } else
1255  ORE.emit([&]() {
1256  return OptimizationRemarkMissed(DEBUG_TYPE, "ExtractFailed",
1257  &ToExtract.front()->front())
1258  << "Failed to extract region at block "
1259  << ore::NV("Block", ToExtract.front());
1260  });
1261 
1262  return OutlinedFunc;
1263 }
1264 
1265 PartialInlinerImpl::FunctionCloner::~FunctionCloner() {
1266  // Ditch the duplicate, since we're done with it, and rewrite all remaining
1267  // users (function pointers, etc.) back to the original function.
1268  ClonedFunc->replaceAllUsesWith(OrigFunc);
1269  ClonedFunc->eraseFromParent();
1270  if (!IsFunctionInlined) {
1271  // Remove each function that was speculatively created if there is no
1272  // reference.
1273  for (auto FuncBBPair : OutlinedFunctions) {
1274  Function *Func = FuncBBPair.first;
1275  Func->eraseFromParent();
1276  }
1277  }
1278 }
1279 
1280 std::pair<bool, Function *> PartialInlinerImpl::unswitchFunction(Function &F) {
1281  if (F.hasAddressTaken())
1282  return {false, nullptr};
1283 
1284  // Let inliner handle it
1285  if (F.hasFnAttribute(Attribute::AlwaysInline))
1286  return {false, nullptr};
1287 
1288  if (F.hasFnAttribute(Attribute::NoInline))
1289  return {false, nullptr};
1290 
1291  if (PSI.isFunctionEntryCold(&F))
1292  return {false, nullptr};
1293 
1294  if (F.users().empty())
1295  return {false, nullptr};
1296 
1298 
1299  // Only try to outline cold regions if we have a profile summary, which
1300  // implies we have profiling information.
1301  if (PSI.hasProfileSummary() && F.hasProfileData() &&
1303  std::unique_ptr<FunctionOutliningMultiRegionInfo> OMRI =
1304  computeOutliningColdRegionsInfo(F, ORE);
1305  if (OMRI) {
1306  FunctionCloner Cloner(&F, OMRI.get(), ORE, LookupAssumptionCache, GetTTI);
1307 
1308  LLVM_DEBUG({
1309  dbgs() << "HotCountThreshold = " << PSI.getHotCountThreshold() << "\n";
1310  dbgs() << "ColdCountThreshold = " << PSI.getColdCountThreshold()
1311  << "\n";
1312  });
1313 
1314  bool DidOutline = Cloner.doMultiRegionFunctionOutlining();
1315 
1316  if (DidOutline) {
1317  LLVM_DEBUG({
1318  dbgs() << ">>>>>> Outlined (Cloned) Function >>>>>>\n";
1319  Cloner.ClonedFunc->print(dbgs());
1320  dbgs() << "<<<<<< Outlined (Cloned) Function <<<<<<\n";
1321  });
1322 
1323  if (tryPartialInline(Cloner))
1324  return {true, nullptr};
1325  }
1326  }
1327  }
1328 
1329  // Fall-thru to regular partial inlining if we:
1330  // i) can't find any cold regions to outline, or
1331  // ii) can't inline the outlined function anywhere.
1332  std::unique_ptr<FunctionOutliningInfo> OI = computeOutliningInfo(F);
1333  if (!OI)
1334  return {false, nullptr};
1335 
1336  FunctionCloner Cloner(&F, OI.get(), ORE, LookupAssumptionCache, GetTTI);
1337  Cloner.normalizeReturnBlock();
1338 
1339  Function *OutlinedFunction = Cloner.doSingleRegionFunctionOutlining();
1340 
1341  if (!OutlinedFunction)
1342  return {false, nullptr};
1343 
1344  if (tryPartialInline(Cloner))
1345  return {true, OutlinedFunction};
1346 
1347  return {false, nullptr};
1348 }
1349 
1350 bool PartialInlinerImpl::tryPartialInline(FunctionCloner &Cloner) {
1351  if (Cloner.OutlinedFunctions.empty())
1352  return false;
1353 
1354  int SizeCost = 0;
1355  BlockFrequency WeightedRcost;
1356  int NonWeightedRcost;
1357 
1358  auto OutliningCosts = computeOutliningCosts(Cloner);
1359  assert(std::get<0>(OutliningCosts).isValid() &&
1360  std::get<1>(OutliningCosts).isValid() && "Expected valid costs");
1361 
1362  SizeCost = *std::get<0>(OutliningCosts).getValue();
1363  NonWeightedRcost = *std::get<1>(OutliningCosts).getValue();
1364 
1365  // Only calculate RelativeToEntryFreq when we are doing single region
1366  // outlining.
1367  BranchProbability RelativeToEntryFreq;
1368  if (Cloner.ClonedOI)
1369  RelativeToEntryFreq = getOutliningCallBBRelativeFreq(Cloner);
1370  else
1371  // RelativeToEntryFreq doesn't make sense when we have more than one
1372  // outlined call because each call will have a different relative frequency
1373  // to the entry block. We can consider using the average, but the
1374  // usefulness of that information is questionable. For now, assume we never
1375  // execute the calls to outlined functions.
1376  RelativeToEntryFreq = BranchProbability(0, 1);
1377 
1378  WeightedRcost = BlockFrequency(NonWeightedRcost) * RelativeToEntryFreq;
1379 
1380  // The call sequence(s) to the outlined function(s) are larger than the sum of
1381  // the original outlined region size(s), it does not increase the chances of
1382  // inlining the function with outlining (The inliner uses the size increase to
1383  // model the cost of inlining a callee).
1384  if (!SkipCostAnalysis && Cloner.OutlinedRegionCost < SizeCost) {
1385  OptimizationRemarkEmitter OrigFuncORE(Cloner.OrigFunc);
1386  DebugLoc DLoc;
1387  BasicBlock *Block;
1388  std::tie(DLoc, Block) = getOneDebugLoc(*Cloner.ClonedFunc);
1389  OrigFuncORE.emit([&]() {
1390  return OptimizationRemarkAnalysis(DEBUG_TYPE, "OutlineRegionTooSmall",
1391  DLoc, Block)
1392  << ore::NV("Function", Cloner.OrigFunc)
1393  << " not partially inlined into callers (Original Size = "
1394  << ore::NV("OutlinedRegionOriginalSize", Cloner.OutlinedRegionCost)
1395  << ", Size of call sequence to outlined function = "
1396  << ore::NV("NewSize", SizeCost) << ")";
1397  });
1398  return false;
1399  }
1400 
1401  assert(Cloner.OrigFunc->users().empty() &&
1402  "F's users should all be replaced!");
1403 
1404  std::vector<User *> Users(Cloner.ClonedFunc->user_begin(),
1405  Cloner.ClonedFunc->user_end());
1406 
1407  DenseMap<User *, uint64_t> CallSiteToProfCountMap;
1408  auto CalleeEntryCount = Cloner.OrigFunc->getEntryCount();
1409  if (CalleeEntryCount)
1410  computeCallsiteToProfCountMap(Cloner.ClonedFunc, CallSiteToProfCountMap);
1411 
1412  uint64_t CalleeEntryCountV =
1413  (CalleeEntryCount ? CalleeEntryCount->getCount() : 0);
1414 
1415  bool AnyInline = false;
1416  for (User *User : Users) {
1417  // Don't bother with BlockAddress used by CallBr for asm goto.
1418  if (isa<BlockAddress>(User))
1419  continue;
1420 
1421  CallBase *CB = getSupportedCallBase(User);
1422 
1423  if (isLimitReached())
1424  continue;
1425 
1426  OptimizationRemarkEmitter CallerORE(CB->getCaller());
1427  if (!shouldPartialInline(*CB, Cloner, WeightedRcost, CallerORE))
1428  continue;
1429 
1430  // Construct remark before doing the inlining, as after successful inlining
1431  // the callsite is removed.
1432  OptimizationRemark OR(DEBUG_TYPE, "PartiallyInlined", CB);
1433  OR << ore::NV("Callee", Cloner.OrigFunc) << " partially inlined into "
1434  << ore::NV("Caller", CB->getCaller());
1435 
1436  InlineFunctionInfo IFI(nullptr, GetAssumptionCache, &PSI);
1437  // We can only forward varargs when we outlined a single region, else we
1438  // bail on vararg functions.
1439  if (!InlineFunction(*CB, IFI, nullptr, true,
1440  (Cloner.ClonedOI ? Cloner.OutlinedFunctions.back().first
1441  : nullptr))
1442  .isSuccess())
1443  continue;
1444 
1445  CallerORE.emit(OR);
1446 
1447  // Now update the entry count:
1448  if (CalleeEntryCountV && CallSiteToProfCountMap.count(User)) {
1449  uint64_t CallSiteCount = CallSiteToProfCountMap[User];
1450  CalleeEntryCountV -= std::min(CalleeEntryCountV, CallSiteCount);
1451  }
1452 
1453  AnyInline = true;
1454  NumPartialInlining++;
1455  // Update the stats
1456  if (Cloner.ClonedOI)
1457  NumPartialInlined++;
1458  else
1459  NumColdOutlinePartialInlined++;
1460  }
1461 
1462  if (AnyInline) {
1463  Cloner.IsFunctionInlined = true;
1464  if (CalleeEntryCount)
1465  Cloner.OrigFunc->setEntryCount(Function::ProfileCount(
1466  CalleeEntryCountV, CalleeEntryCount->getType()));
1467  OptimizationRemarkEmitter OrigFuncORE(Cloner.OrigFunc);
1468  OrigFuncORE.emit([&]() {
1469  return OptimizationRemark(DEBUG_TYPE, "PartiallyInlined", Cloner.OrigFunc)
1470  << "Partially inlined into at least one caller";
1471  });
1472  }
1473 
1474  return AnyInline;
1475 }
1476 
1479  return false;
1480 
1481  std::vector<Function *> Worklist;
1482  Worklist.reserve(M.size());
1483  for (Function &F : M)
1484  if (!F.use_empty() && !F.isDeclaration())
1485  Worklist.push_back(&F);
1486 
1487  bool Changed = false;
1488  while (!Worklist.empty()) {
1489  Function *CurrFunc = Worklist.back();
1490  Worklist.pop_back();
1491 
1492  if (CurrFunc->use_empty())
1493  continue;
1494 
1495  bool Recursive = false;
1496  for (User *U : CurrFunc->users())
1497  if (Instruction *I = dyn_cast<Instruction>(U))
1498  if (I->getParent()->getParent() == CurrFunc) {
1499  Recursive = true;
1500  break;
1501  }
1502  if (Recursive)
1503  continue;
1504 
1505  std::pair<bool, Function *> Result = unswitchFunction(*CurrFunc);
1506  if (Result.second)
1507  Worklist.push_back(Result.second);
1508  Changed |= Result.first;
1509  }
1510 
1511  return Changed;
1512 }
1513 
1515 
1516 INITIALIZE_PASS_BEGIN(PartialInlinerLegacyPass, "partial-inliner",
1517  "Partial Inliner", false, false)
1522 INITIALIZE_PASS_END(PartialInlinerLegacyPass, "partial-inliner",
1523  "Partial Inliner", false, false)
1524 
1526  return new PartialInlinerLegacyPass();
1527 }
1528 
1530  ModuleAnalysisManager &AM) {
1531  auto &FAM = AM.getResult<FunctionAnalysisManagerModuleProxy>(M).getManager();
1532 
1533  auto GetAssumptionCache = [&FAM](Function &F) -> AssumptionCache & {
1534  return FAM.getResult<AssumptionAnalysis>(F);
1535  };
1536 
1537  auto LookupAssumptionCache = [&FAM](Function &F) -> AssumptionCache * {
1539  };
1540 
1541  auto GetBFI = [&FAM](Function &F) -> BlockFrequencyInfo & {
1543  };
1544 
1545  auto GetTTI = [&FAM](Function &F) -> TargetTransformInfo & {
1546  return FAM.getResult<TargetIRAnalysis>(F);
1547  };
1548 
1549  auto GetTLI = [&FAM](Function &F) -> TargetLibraryInfo & {
1551  };
1552 
1554 
1555  if (PartialInlinerImpl(GetAssumptionCache, LookupAssumptionCache, GetTTI,
1556  GetTLI, PSI, GetBFI)
1557  .run(M))
1558  return PreservedAnalyses::none();
1559  return PreservedAnalyses::all();
1560 }
llvm::InstructionCost
Definition: InstructionCost.h:29
llvm::PreservedAnalyses
A set of analyses that are preserved following a run of a transformation pass.
Definition: PassManager.h:152
llvm::InlineCost::isAlways
bool isAlways() const
Definition: InlineCost.h:129
llvm::InlineCost::getCost
int getCost() const
Get the inline cost estimate.
Definition: InlineCost.h:135
llvm::BasicBlock::end
iterator end()
Definition: BasicBlock.h:299
llvm::TargetIRAnalysis
Analysis pass providing the TargetTransformInfo.
Definition: TargetTransformInfo.h:2479
llvm::OptimizationRemarkMissed
Diagnostic information for missed-optimization remarks.
Definition: DiagnosticInfo.h:735
llvm
This is an optimization pass for GlobalISel generic memory operations.
Definition: AddressRanges.h:17
M
We currently emits eax Perhaps this is what we really should generate is Is imull three or four cycles eax eax The current instruction priority is based on pattern complexity The former is more complex because it folds a load so the latter will not be emitted Perhaps we should use AddedComplexity to give LEA32r a higher priority We should always try to match LEA first since the LEA matching code does some estimate to determine whether the match is profitable if we care more about code then imull is better It s two bytes shorter than movl leal On a Pentium M
Definition: README.txt:252
Optional.h
ValueMapper.h
llvm::DataLayout
A parsed version of the target data layout string in and methods for querying it.
Definition: DataLayout.h:113
llvm::ISD::OR
@ OR
Definition: ISDOpcodes.h:667
llvm::BasicBlock::iterator
InstListType::iterator iterator
Instruction iterators...
Definition: BasicBlock.h:87
llvm::ModulePass
ModulePass class - This class is used to implement unstructured interprocedural optimizations and ana...
Definition: Pass.h:248
llvm::InstructionCost::map
auto map(const Function &F) const -> InstructionCost
Definition: InstructionCost.h:245
llvm::BasicBlock::getParent
const Function * getParent() const
Return the enclosing method, or null if none.
Definition: BasicBlock.h:104
IntrinsicInst.h
llvm::AnalysisManager::getResult
PassT::Result & getResult(IRUnitT &IR, ExtraArgTs... ExtraArgs)
Get the result of an analysis pass for a given IR unit.
Definition: PassManager.h:780
PartialInlining.h
llvm::succ_end
Interval::succ_iterator succ_end(Interval *I)
Definition: Interval.h:102
T
llvm::Function
Definition: Function.h:60
DEBUG_TYPE
#define DEBUG_TYPE
Definition: PartialInlining.cpp:64
Pass.h
llvm::CodeExtractor::extractCodeRegion
Function * extractCodeRegion(const CodeExtractorAnalysisCache &CEAC)
Perform the extraction, returning the new function.
Definition: CodeExtractor.cpp:1626
llvm::SetVector::size
size_type size() const
Determine the number of elements in the SetVector.
Definition: SetVector.h:77
llvm::TargetTransformInfoWrapperPass::getTTI
TargetTransformInfo & getTTI(const Function &F)
Definition: TargetTransformInfo.cpp:1221
llvm::ProfileSummaryInfo::hasProfileSummary
bool hasProfileSummary() const
Returns true if profile summary is available.
Definition: ProfileSummaryInfo.h:68
llvm::SmallVector
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small.
Definition: SmallVector.h:1185
Statistic.h
output
Current output
Definition: README.txt:1350
ErrorHandling.h
llvm::TargetTransformInfo
This pass provides access to the codegen interfaces that are needed for IR-level transformations.
Definition: TargetTransformInfo.h:168
llvm::TargetTransformInfo::getIntrinsicInstrCost
InstructionCost getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA, TTI::TargetCostKind CostKind) const
Definition: TargetTransformInfo.cpp:917
llvm::PHINode::removeIncomingValue
Value * removeIncomingValue(unsigned Idx, bool DeletePHIIfEmpty=true)
Remove an incoming value.
Definition: Instructions.cpp:113
OptimizationRemarkEmitter.h
llvm::DominatorTree
Concrete subclass of DominatorTreeBase that is used to compute a normal dominator tree.
Definition: Dominators.h:166
FAM
FunctionAnalysisManager FAM
Definition: PassBuilderBindings.cpp:59
llvm::cl::Hidden
@ Hidden
Definition: CommandLine.h:139
MinBlockCounterExecution
static cl::opt< unsigned > MinBlockCounterExecution("min-block-execution", cl::init(100), cl::Hidden, cl::desc("Minimum block executions to consider " "its BranchProbabilityInfo valid"))
llvm::PreservedAnalyses::none
static PreservedAnalyses none()
Convenience factory function for the empty preserved set.
Definition: PassManager.h:155
BlockFrequency.h
DenseMap.h
Module.h
llvm::sys::path::end
const_iterator end(StringRef path)
Get end iterator over path.
Definition: Path.cpp:235
llvm::sys::path::begin
const_iterator begin(StringRef path, Style style=Style::native)
Get begin iterator over path.
Definition: Path.cpp:226
llvm::BasicBlock::getSingleSuccessor
const BasicBlock * getSingleSuccessor() const
Return the successor of this block if it has a single successor.
Definition: BasicBlock.cpp:291
llvm::ProfileSummaryInfo::isFunctionEntryCold
bool isFunctionEntryCold(const Function *F) const
Returns true if F has cold function entry.
Definition: ProfileSummaryInfo.cpp:221
llvm::isInlineViable
InlineResult isInlineViable(Function &Callee)
Minimal filter to detect invalid constructs for inlining.
Definition: InlineCost.cpp:2969
llvm::DenseMapBase< DenseMap< KeyT, ValueT, DenseMapInfo< KeyT >, llvm::detail::DenseMapPair< KeyT, ValueT > >, KeyT, ValueT, DenseMapInfo< KeyT >, llvm::detail::DenseMapPair< KeyT, ValueT > >::count
size_type count(const_arg_type_t< KeyT > Val) const
Return 1 if the specified key is in the map, 0 otherwise.
Definition: DenseMap.h:147
llvm::BranchProbability::getBranchProbability
static BranchProbability getBranchProbability(uint64_t Numerator, uint64_t Denominator)
Definition: BranchProbability.cpp:53
llvm::ore::NV
DiagnosticInfoOptimizationBase::Argument NV
Definition: OptimizationRemarkEmitter.h:136
Operator.h
llvm::Value::user_begin
user_iterator user_begin()
Definition: Value.h:397
CodeExtractor.h
llvm::successors
auto successors(MachineBasicBlock *BB)
Definition: MachineSSAContext.h:29
llvm::max
Expected< ExpressionValue > max(const ExpressionValue &Lhs, const ExpressionValue &Rhs)
Definition: FileCheck.cpp:337
llvm::BasicBlock::getSinglePredecessor
const BasicBlock * getSinglePredecessor() const
Return the predecessor of this block if it has a single predecessor block.
Definition: BasicBlock.cpp:261
STLExtras.h
llvm::InlineCost::isNever
bool isNever() const
Definition: InlineCost.h:130
llvm::FastMathFlags
Convenience struct for specifying and reasoning about fast-math flags.
Definition: FMF.h:21
llvm::cl::ReallyHidden
@ ReallyHidden
Definition: CommandLine.h:140
LLVM_DEBUG
#define LLVM_DEBUG(X)
Definition: Debug.h:101
F
#define F(x, y, z)
Definition: MD5.cpp:55
ColdBranchRatio
static cl::opt< float > ColdBranchRatio("cold-branch-ratio", cl::init(0.1), cl::Hidden, cl::desc("Minimum BranchProbability to consider a region cold."))
llvm::BasicBlock
LLVM Basic Block Representation.
Definition: BasicBlock.h:55
llvm::dbgs
raw_ostream & dbgs()
dbgs() - This returns a reference to a raw_ostream for debugging messages.
Definition: Debug.cpp:163
Instruction.h
CommandLine.h
llvm::all_of
bool all_of(R &&range, UnaryPredicate P)
Provide wrappers to std::all_of which take ranges instead of having to pass begin/end explicitly.
Definition: STLExtras.h:1617
llvm::BlockFrequencyInfo
BlockFrequencyInfo pass uses BlockFrequencyInfoImpl implementation to estimate IR basic block frequen...
Definition: BlockFrequencyInfo.h:37
llvm::PassRegistry::getPassRegistry
static PassRegistry * getPassRegistry()
getPassRegistry - Access the global registry object, which is automatically initialized at applicatio...
Definition: PassRegistry.cpp:31
llvm::getInlineCost
InlineCost getInlineCost(CallBase &Call, const InlineParams &Params, TargetTransformInfo &CalleeTTI, function_ref< AssumptionCache &(Function &)> GetAssumptionCache, function_ref< const TargetLibraryInfo &(Function &)> GetTLI, function_ref< BlockFrequencyInfo &(Function &)> GetBFI=nullptr, ProfileSummaryInfo *PSI=nullptr, OptimizationRemarkEmitter *ORE=nullptr)
Get an InlineCost object representing the cost of inlining this callsite.
Definition: InlineCost.cpp:2803
E
static GCRegistry::Add< CoreCLRGC > E("coreclr", "CoreCLR-compatible GC")
llvm::User
Definition: User.h:44
Intrinsics.h
InstrTypes.h
llvm::CallBase::getCalledFunction
Function * getCalledFunction() const
Returns the function called, or null if this is an indirect function invocation or the function signa...
Definition: InstrTypes.h:1396
llvm::BasicBlock::begin
iterator begin()
Instruction iterator methods.
Definition: BasicBlock.h:297
llvm::BranchProbabilityInfo
Analysis providing branch probability information.
Definition: BranchProbabilityInfo.h:113
llvm::AnalysisUsage
Represent the analysis usage information of a pass.
Definition: PassAnalysisSupport.h:47
llvm::InlineCost
Represents the cost of inlining a function.
Definition: InlineCost.h:87
llvm::ms_demangle::QualifierMangleMode::Result
@ Result
TargetLibraryInfo.h
DenseSet.h
false
Definition: StackSlotColoring.cpp:141
llvm::succ_size
unsigned succ_size(MachineBasicBlock *BB)
Definition: MachineSSAContext.h:31
llvm::AssumptionCacheTracker::lookupAssumptionCache
AssumptionCache * lookupAssumptionCache(Function &F)
Return the cached assumptions for a function if it has already been scanned.
Definition: AssumptionCache.cpp:305
llvm::PHINode::getIncomingValueForBlock
Value * getIncomingValueForBlock(const BasicBlock *BB) const
Definition: Instructions.h:2836
llvm::pdb::PDB_SymType::Caller
@ Caller
llvm::Instruction
Definition: Instruction.h:42
llvm::STATISTIC
STATISTIC(NumFunctions, "Total number of functions")
llvm::predecessors
auto predecessors(MachineBasicBlock *BB)
Definition: MachineSSAContext.h:30
llvm::InlineResult::isSuccess
bool isSuccess() const
Definition: InlineCost.h:173
DebugLoc.h
llvm::getInlineParams
InlineParams getInlineParams()
Generate the parameters to tune the inline cost analysis based only on the commandline options.
Definition: InlineCost.cpp:3081
llvm::InlineConstants::InstrCost
const int InstrCost
Definition: InlineCost.h:47
llvm::BlockFrequencyAnalysis
Analysis pass which computes BlockFrequencyInfo.
Definition: BlockFrequencyInfo.h:112
llvm::PHINode::getNumIncomingValues
unsigned getNumIncomingValues() const
Return the number of incoming edges.
Definition: Instructions.h:2743
MinRegionSizeRatio
static cl::opt< float > MinRegionSizeRatio("min-region-size-ratio", cl::init(0.1), cl::Hidden, cl::desc("Minimum ratio comparing relative sizes of each " "outline candidate and original function"))
llvm::Value::use_empty
bool use_empty() const
Definition: Value.h:344
llvm::CallingConv::ID
unsigned ID
LLVM IR allows to use arbitrary numbers as calling convention identifiers.
Definition: CallingConv.h:24
llvm::IntrinsicCostAttributes
Definition: TargetTransformInfo.h:118
BranchProbability.h
llvm::CallBase::getCaller
Function * getCaller()
Helper to get the caller (the parent function).
Definition: Instructions.cpp:282
INITIALIZE_PASS_END
#define INITIALIZE_PASS_END(passName, arg, name, cfg, analysis)
Definition: PassSupport.h:58
CFG.h
llvm::dxil::PointerTypeAnalysis::run
PointerTypeMap run(const Module &M)
Compute the PointerTypeMap for the module M.
Definition: PointerTypeAnalysis.cpp:101
LoopInfo.h
llvm::ProfileSummaryInfo
Analysis providing profile information.
Definition: ProfileSummaryInfo.h:39
llvm::InvokeInst
Invoke instruction.
Definition: Instructions.h:3763
OutlineRegionFreqPercent
static cl::opt< int > OutlineRegionFreqPercent("outline-region-freq-percent", cl::init(75), cl::Hidden, cl::desc("Relative frequency of outline region to " "the entry block"))
llvm::function_ref
An efficient, type-erasing, non-owning reference to a callable.
Definition: STLFunctionalExtras.h:36
llvm::Value::user_end
user_iterator user_end()
Definition: Value.h:405
llvm::DenseSet
Implements a dense probed hash-table based set.
Definition: DenseSet.h:268
BasicBlock.h
llvm::cl::opt< bool >
llvm::BlockFrequency
Definition: BlockFrequency.h:23
BranchProbabilityInfo.h
llvm::TargetLibraryInfoWrapperPass
Definition: TargetLibraryInfo.h:468
uint64_t
ProfileSummaryInfo.h
llvm::TargetTransformInfoWrapperPass
Wrapper pass for TargetTransformInfo.
Definition: TargetTransformInfo.h:2535
llvm::AssumptionAnalysis
A function analysis which provides an AssumptionCache.
Definition: AssumptionCache.h:173
INITIALIZE_PASS_DEPENDENCY
INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
IPO.h
llvm::PHINode::addIncoming
void addIncoming(Value *V, BasicBlock *BB)
Add an incoming value to the end of the PHI list.
Definition: Instructions.h:2801
llvm::DenseMap
Definition: DenseMap.h:716
MaxNumPartialInlining
static cl::opt< int > MaxNumPartialInlining("max-partial-inlining", cl::init(-1), cl::Hidden, cl::desc("Max number of partial inlining. The default is unlimited"))
I
#define I(x, y, z)
Definition: MD5.cpp:58
Cloning.h
llvm::succ_begin
Interval::succ_iterator succ_begin(Interval *I)
succ_begin/succ_end - define methods so that Intervals may be used just like BasicBlocks can with the...
Definition: Interval.h:99
llvm::BlockFrequency::getFrequency
uint64_t getFrequency() const
Returns the frequency as a fixpoint number scaled by the entry frequency.
Definition: BlockFrequency.h:34
MarkOutlinedColdCC
static cl::opt< bool > MarkOutlinedColdCC("pi-mark-coldcc", cl::init(false), cl::Hidden, cl::desc("Mark outline function calls with ColdCC"))
llvm::cl::init
initializer< Ty > init(const Ty &Val)
Definition: CommandLine.h:432
llvm::ProfileSummaryInfo::getColdCountThreshold
uint64_t getColdCountThreshold() const
Returns ColdCountThreshold if set.
Definition: ProfileSummaryInfo.h:176
llvm::ProfileSummaryInfoWrapperPass
An analysis pass based on legacy pass manager to deliver ProfileSummaryInfo.
Definition: ProfileSummaryInfo.h:193
llvm::InlineCost::getCostDelta
int getCostDelta() const
Get the cost delta from the threshold for inlining.
Definition: InlineCost.h:159
llvm::is_contained
bool is_contained(R &&Range, const E &Element)
Wrapper function around std::find to detect if an element exists in a container.
Definition: STLExtras.h:1682
assert
assert(ImpDefSCC.getReg()==AMDGPU::SCC &&ImpDefSCC.isDef())
SI
StandardInstrumentations SI(Debug, VerifyEach)
llvm::OptimizationRemarkEmitter::emit
void emit(DiagnosticInfoOptimizationBase &OptDiag)
Output the remark via the diagnostic handler and to the optimization record file.
Definition: OptimizationRemarkEmitter.cpp:77
InlineCost.h
llvm::ProfileSummaryInfo::hasInstrumentationProfile
bool hasInstrumentationProfile() const
Returns true if module M has instrumentation profile.
Definition: ProfileSummaryInfo.h:77
llvm::WinEH::EncodingType::CE
@ CE
Windows NT (Windows on ARM)
inliner
partial inliner
Definition: PartialInlining.cpp:1522
llvm::Module
A Module instance is used to store all the information related to an LLVM module.
Definition: Module.h:65
OtherSucc
static MachineBasicBlock * OtherSucc(MachineBasicBlock *MBB, MachineBasicBlock *Succ)
Definition: ARMISelLowering.cpp:11086
DisablePartialInlining
static cl::opt< bool > DisablePartialInlining("disable-partial-inlining", cl::init(false), cl::Hidden, cl::desc("Disable partial inlining"))
hasProfileData
static bool hasProfileData(const Function &F, const FunctionOutliningInfo &OI)
Definition: PartialInlining.cpp:711
llvm::ProfileSummaryAnalysis
An analysis pass based on the new PM to deliver ProfileSummaryInfo.
Definition: ProfileSummaryInfo.h:211
llvm::initializePartialInlinerLegacyPassPass
void initializePartialInlinerLegacyPassPass(PassRegistry &)
MaxNumInlineBlocks
static cl::opt< unsigned > MaxNumInlineBlocks("max-num-inline-blocks", cl::init(5), cl::Hidden, cl::desc("Max number of blocks to be partially inlined"))
llvm::DominatorTreeBase::recalculate
void recalculate(ParentType &Func)
recalculate - compute a dominator tree for the given function
Definition: GenericDomTree.h:778
llvm::AssumptionCacheTracker
An immutable pass that tracks lazily created AssumptionCache objects.
Definition: AssumptionCache.h:202
llvm::ArrayRef
ArrayRef - Represent a constant reference to an array (0 or more elements consecutively in memory),...
Definition: APInt.h:32
llvm::LoopInfo
Definition: LoopInfo.h:1102
llvm::OptimizationRemarkEmitter
The optimization diagnostic interface.
Definition: OptimizationRemarkEmitter.h:33
llvm::min
Expected< ExpressionValue > min(const ExpressionValue &Lhs, const ExpressionValue &Rhs)
Definition: FileCheck.cpp:357
llvm::Function::setCallingConv
void setCallingConv(CallingConv::ID CC)
Definition: Function.h:242
llvm::AssumptionCache
A cache of @llvm.assume calls within a function.
Definition: AssumptionCache.h:42
llvm::TargetTransformInfo::TCK_SizeAndLatency
@ TCK_SizeAndLatency
The weighted sum of size and latency.
Definition: TargetTransformInfo.h:215
DP
So we should use XX3Form_Rcr to implement intrinsic Convert DP outs ins xscvdpsp No builtin are required Round &Convert QP DP(dword[1] is set to zero) No builtin are required Round to Quad Precision because you need to assign rounding mode in instruction Provide builtin(set f128:$vT,(int_ppc_vsx_xsrqpi f128:$vB))(set f128 yields< n x< ty > >< result > yields< ty >< result > No builtin are required Load Store load store see def memrix16 in PPCInstrInfo td Load Store Vector load store outs ins lxsdx set load store with conversion from to DP
Definition: README_P9.txt:520
llvm_unreachable
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
Definition: ErrorHandling.h:143
llvm::Value::getType
Type * getType() const
All values are typed, get the type of this value.
Definition: Value.h:255
llvm::Value::replaceAllUsesWith
void replaceAllUsesWith(Value *V)
Change all uses of this to point to a new Value.
Definition: Value.cpp:529
llvm::BranchProbability
Definition: BranchProbability.h:30
DL
MachineBasicBlock MachineBasicBlock::iterator DebugLoc DL
Definition: AArch64SLSHardening.cpp:76
llvm::CodeExtractor
Utility class for extracting code into a new function.
Definition: CodeExtractor.h:85
llvm::RegionInfo
Definition: RegionInfo.h:900
llvm::CloneFunction
Function * CloneFunction(Function *F, ValueToValueMapTy &VMap, ClonedCodeInfo *CodeInfo=nullptr)
Return a copy of the specified function and add it to that function's module.
Definition: CloneFunction.cpp:281
BlockFrequencyInfo.h
llvm::AMDGPUISD::BFI
@ BFI
Definition: AMDGPUISelLowering.h:429
llvm::OptimizationRemarkAnalysis
Diagnostic information for optimization analysis remarks.
Definition: DiagnosticInfo.h:781
llvm::ifs::IFSSymbolType::Func
@ Func
llvm::ValueMap< const Value *, WeakTrackingVH >
Callee
amdgpu Simplify well known AMD library false FunctionCallee Callee
Definition: AMDGPULibCalls.cpp:186
ForceLiveExit
static cl::opt< bool > ForceLiveExit("pi-force-live-exit-outline", cl::init(false), cl::Hidden, cl::desc("Force outline regions with live exits"))
Attributes.h
llvm::AssumptionCacheTracker::getAssumptionCache
AssumptionCache & getAssumptionCache(Function &F)
Get the cached assumptions for a function.
Definition: AssumptionCache.cpp:285
llvm::PreservedAnalyses::all
static PreservedAnalyses all()
Construct a special preserved set that preserves all passes.
Definition: PassManager.h:158
llvm::ISD::BR
@ BR
Control flow instructions. These all have token chains.
Definition: ISDOpcodes.h:982
llvm::Function::back
const BasicBlock & back() const
Definition: Function.h:735
llvm::AnalysisManager::getCachedResult
PassT::Result * getCachedResult(IRUnitT &IR) const
Get the cached result of an analysis pass for a given IR unit.
Definition: PassManager.h:799
Casting.h
DiagnosticInfo.h
Function.h
SkipCostAnalysis
static cl::opt< bool > SkipCostAnalysis("skip-partial-inlining-cost-analysis", cl::ReallyHidden, cl::desc("Skip Cost Analysis"))
llvm::TargetLibraryInfo
Provides information about what library functions are available for the current target.
Definition: TargetLibraryInfo.h:222
llvm::InlineFunctionInfo
This class captures the data input to the InlineFunction call, and records the auxiliary results prod...
Definition: Cloning.h:199
Inliner
partial Partial Inliner
Definition: PartialInlining.cpp:1523
llvm::CodeExtractorAnalysisCache
A cache for the CodeExtractor analysis.
Definition: CodeExtractor.h:46
llvm::ProfileSummaryInfo::isColdBlock
bool isColdBlock(const BasicBlock *BB, BlockFrequencyInfo *BFI) const
Returns true if BasicBlock BB is considered cold.
Definition: ProfileSummaryInfo.cpp:331
DisableMultiRegionPartialInline
static cl::opt< bool > DisableMultiRegionPartialInline("disable-mr-partial-inlining", cl::init(false), cl::Hidden, cl::desc("Disable multi-region partial inlining"))
llvm::OptimizationRemark
Diagnostic information for applied optimization remarks.
Definition: DiagnosticInfo.h:690
isValid
static bool isValid(const char C)
Returns true if C is a valid mangled character: <0-9a-zA-Z_>.
Definition: RustDemangle.cpp:187
Instructions.h
llvm::getCallsiteCost
int getCallsiteCost(CallBase &Call, const DataLayout &DL)
Return the cost associated with a callsite, including parameter passing and the call/return instructi...
Definition: InlineCost.cpp:2770
SmallVector.h
llvm::Instruction::getDebugLoc
const DebugLoc & getDebugLoc() const
Return the debug location for this node as a DebugLoc.
Definition: Instruction.h:367
User.h
llvm::BlockFrequencyInfo::getBlockProfileCount
Optional< uint64_t > getBlockProfileCount(const BasicBlock *BB, bool AllowSynthetic=false) const
Returns the estimated profile count of BB.
Definition: BlockFrequencyInfo.cpp:208
Dominators.h
llvm::Instruction::getParent
const BasicBlock * getParent() const
Definition: Instruction.h:91
Users
iv Induction Variable Users
Definition: IVUsers.cpp:48
TargetTransformInfo.h
llvm::DenseMapBase::reserve
void reserve(size_type NumEntries)
Grow the densemap so that it can contain at least NumEntries items before resizing again.
Definition: DenseMap.h:105
llvm::PHINode
Definition: Instructions.h:2651
llvm::BasicBlock::getTerminator
const Instruction * getTerminator() const LLVM_READONLY
Returns the terminator instruction if the block is well formed or null if the block is not well forme...
Definition: BasicBlock.h:119
llvm::ProfileSummaryInfo::getHotCountThreshold
uint64_t getHotCountThreshold() const
Returns HotCountThreshold if set.
Definition: ProfileSummaryInfo.h:172
llvm::MipsISD::Ins
@ Ins
Definition: MipsISelLowering.h:160
llvm::SmallVectorImpl< BasicBlock * >
llvm::InlineFunction
InlineResult InlineFunction(CallBase &CB, InlineFunctionInfo &IFI, AAResults *CalleeAAR=nullptr, bool InsertLifetime=true, Function *ForwardVarArgsTo=nullptr)
This function inlines the called function into the basic block of the caller.
Definition: InlineFunction.cpp:1748
llvm::CallBase
Base class for all callable instructions (InvokeInst and CallInst) Holds everything related to callin...
Definition: InstrTypes.h:1174
llvm::AnalysisManager
A container for analyses that lazily runs them and caches their results.
Definition: InstructionSimplify.h:42
llvm::createPartialInliningPass
ModulePass * createPartialInliningPass()
createPartialInliningPass - This pass inlines parts of functions.
Definition: PartialInlining.cpp:1525
llvm::InnerAnalysisManagerProxy
An analysis over an "outer" IR unit that provides access to an analysis manager over an "inner" IR un...
Definition: PassManager.h:937
llvm::CallInst
This class represents a function call, abstracting a target machine's calling convention.
Definition: Instructions.h:1461
BB
Common register allocation spilling lr str ldr sxth r3 ldr mla r4 can lr mov lr str ldr sxth r3 mla r4 and then merge mul and lr str ldr sxth r3 mla r4 It also increase the likelihood the store may become dead bb27 Successors according to LLVM BB
Definition: README.txt:39
INITIALIZE_PASS_BEGIN
INITIALIZE_PASS_BEGIN(PartialInlinerLegacyPass, "partial-inliner", "Partial Inliner", false, false) INITIALIZE_PASS_END(PartialInlinerLegacyPass
ExtraOutliningPenalty
static cl::opt< unsigned > ExtraOutliningPenalty("partial-inlining-extra-penalty", cl::init(0), cl::Hidden, cl::desc("A debug option to add additional penalty to the computed one."))
llvm::AnalysisUsage::addRequired
AnalysisUsage & addRequired()
Definition: PassAnalysisSupport.h:75
llvm::SwitchInst
Multiway switch.
Definition: Instructions.h:3230
llvm::DebugLoc
A debug info location.
Definition: DebugLoc.h:33
llvm::Function::ProfileCount
Class to represent profile counts.
Definition: Function.h:253
llvm::cl::desc
Definition: CommandLine.h:405
llvm::Region
Definition: RegionInfo.h:889
llvm::BranchInst
Conditional or Unconditional Branch instruction.
Definition: Instructions.h:3086
llvm::SetVector< Value * >
llvm::pdb::PDB_SymType::Block
@ Block
InitializePasses.h
llvm::Value
LLVM Value Representation.
Definition: Value.h:74
llvm::TargetLibraryAnalysis
Analysis pass providing the TargetLibraryInfo.
Definition: TargetLibraryInfo.h:443
llvm::Value::users
iterator_range< user_iterator > users()
Definition: Value.h:421
llvm::CallingConv::Cold
@ Cold
Definition: CallingConv.h:48
llvm::CallBase::setCallingConv
void setCallingConv(CallingConv::ID CC)
Definition: InstrTypes.h:1459
llvm::Intrinsic::ID
unsigned ID
Definition: TargetTransformInfo.h:38