LLVM  16.0.0git
PartialInlining.cpp
Go to the documentation of this file.
1 //===- PartialInlining.cpp - Inline parts of functions --------------------===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 // This pass performs partial inlining, typically by inlining an if statement
10 // that surrounds the body of the function.
11 //
12 //===----------------------------------------------------------------------===//
13 
15 #include "llvm/ADT/DenseMap.h"
16 #include "llvm/ADT/DenseSet.h"
17 #include "llvm/ADT/Optional.h"
18 #include "llvm/ADT/STLExtras.h"
19 #include "llvm/ADT/SmallVector.h"
20 #include "llvm/ADT/Statistic.h"
24 #include "llvm/Analysis/LoopInfo.h"
29 #include "llvm/IR/Attributes.h"
30 #include "llvm/IR/BasicBlock.h"
31 #include "llvm/IR/CFG.h"
32 #include "llvm/IR/DebugLoc.h"
33 #include "llvm/IR/DiagnosticInfo.h"
34 #include "llvm/IR/Dominators.h"
35 #include "llvm/IR/Function.h"
36 #include "llvm/IR/InstrTypes.h"
37 #include "llvm/IR/Instruction.h"
38 #include "llvm/IR/Instructions.h"
39 #include "llvm/IR/IntrinsicInst.h"
40 #include "llvm/IR/Intrinsics.h"
41 #include "llvm/IR/Module.h"
42 #include "llvm/IR/Operator.h"
43 #include "llvm/IR/ProfDataUtils.h"
44 #include "llvm/IR/User.h"
45 #include "llvm/InitializePasses.h"
46 #include "llvm/Pass.h"
49 #include "llvm/Support/Casting.h"
52 #include "llvm/Transforms/IPO.h"
56 #include <algorithm>
57 #include <cassert>
58 #include <cstdint>
59 #include <memory>
60 #include <tuple>
61 #include <vector>
62 
63 using namespace llvm;
64 
65 #define DEBUG_TYPE "partial-inlining"
66 
67 STATISTIC(NumPartialInlined,
68  "Number of callsites functions partially inlined into.");
69 STATISTIC(NumColdOutlinePartialInlined, "Number of times functions with "
70  "cold outlined regions were partially "
71  "inlined into its caller(s).");
72 STATISTIC(NumColdRegionsFound,
73  "Number of cold single entry/exit regions found.");
74 STATISTIC(NumColdRegionsOutlined,
75  "Number of cold single entry/exit regions outlined.");
76 
77 // Command line option to disable partial-inlining. The default is false:
78 static cl::opt<bool>
79  DisablePartialInlining("disable-partial-inlining", cl::init(false),
80  cl::Hidden, cl::desc("Disable partial inlining"));
81 // Command line option to disable multi-region partial-inlining. The default is
82 // false:
84  "disable-mr-partial-inlining", cl::init(false), cl::Hidden,
85  cl::desc("Disable multi-region partial inlining"));
86 
87 // Command line option to force outlining in regions with live exit variables.
88 // The default is false:
89 static cl::opt<bool>
90  ForceLiveExit("pi-force-live-exit-outline", cl::init(false), cl::Hidden,
91  cl::desc("Force outline regions with live exits"));
92 
93 // Command line option to enable marking outline functions with Cold Calling
94 // Convention. The default is false:
95 static cl::opt<bool>
96  MarkOutlinedColdCC("pi-mark-coldcc", cl::init(false), cl::Hidden,
97  cl::desc("Mark outline function calls with ColdCC"));
98 
99 // This is an option used by testing:
100 static cl::opt<bool> SkipCostAnalysis("skip-partial-inlining-cost-analysis",
101 
103  cl::desc("Skip Cost Analysis"));
104 // Used to determine if a cold region is worth outlining based on
105 // its inlining cost compared to the original function. Default is set at 10%.
106 // ie. if the cold region reduces the inlining cost of the original function by
107 // at least 10%.
109  "min-region-size-ratio", cl::init(0.1), cl::Hidden,
110  cl::desc("Minimum ratio comparing relative sizes of each "
111  "outline candidate and original function"));
112 // Used to tune the minimum number of execution counts needed in the predecessor
113 // block to the cold edge. ie. confidence interval.
114 static cl::opt<unsigned>
115  MinBlockCounterExecution("min-block-execution", cl::init(100), cl::Hidden,
116  cl::desc("Minimum block executions to consider "
117  "its BranchProbabilityInfo valid"));
118 // Used to determine when an edge is considered cold. Default is set to 10%. ie.
119 // if the branch probability is 10% or less, then it is deemed as 'cold'.
121  "cold-branch-ratio", cl::init(0.1), cl::Hidden,
122  cl::desc("Minimum BranchProbability to consider a region cold."));
123 
125  "max-num-inline-blocks", cl::init(5), cl::Hidden,
126  cl::desc("Max number of blocks to be partially inlined"));
127 
128 // Command line option to set the maximum number of partial inlining allowed
129 // for the module. The default value of -1 means no limit.
131  "max-partial-inlining", cl::init(-1), cl::Hidden,
132  cl::desc("Max number of partial inlining. The default is unlimited"));
133 
134 // Used only when PGO or user annotated branch data is absent. It is
135 // the least value that is used to weigh the outline region. If BFI
136 // produces larger value, the BFI value will be used.
137 static cl::opt<int>
138  OutlineRegionFreqPercent("outline-region-freq-percent", cl::init(75),
139  cl::Hidden,
140  cl::desc("Relative frequency of outline region to "
141  "the entry block"));
142 
144  "partial-inlining-extra-penalty", cl::init(0), cl::Hidden,
145  cl::desc("A debug option to add additional penalty to the computed one."));
146 
147 namespace {
148 
149 struct FunctionOutliningInfo {
150  FunctionOutliningInfo() = default;
151 
152  // Returns the number of blocks to be inlined including all blocks
153  // in Entries and one return block.
154  unsigned getNumInlinedBlocks() const { return Entries.size() + 1; }
155 
156  // A set of blocks including the function entry that guard
157  // the region to be outlined.
159 
160  // The return block that is not included in the outlined region.
161  BasicBlock *ReturnBlock = nullptr;
162 
163  // The dominating block of the region to be outlined.
164  BasicBlock *NonReturnBlock = nullptr;
165 
166  // The set of blocks in Entries that that are predecessors to ReturnBlock
167  SmallVector<BasicBlock *, 4> ReturnBlockPreds;
168 };
169 
170 struct FunctionOutliningMultiRegionInfo {
171  FunctionOutliningMultiRegionInfo() = default;
172 
173  // Container for outline regions
174  struct OutlineRegionInfo {
175  OutlineRegionInfo(ArrayRef<BasicBlock *> Region,
176  BasicBlock *EntryBlock, BasicBlock *ExitBlock,
177  BasicBlock *ReturnBlock)
178  : Region(Region.begin(), Region.end()), EntryBlock(EntryBlock),
179  ExitBlock(ExitBlock), ReturnBlock(ReturnBlock) {}
181  BasicBlock *EntryBlock;
182  BasicBlock *ExitBlock;
183  BasicBlock *ReturnBlock;
184  };
185 
187 };
188 
189 struct PartialInlinerImpl {
190 
191  PartialInlinerImpl(
193  function_ref<AssumptionCache *(Function &)> LookupAC,
195  function_ref<const TargetLibraryInfo &(Function &)> GTLI,
196  ProfileSummaryInfo &ProfSI,
197  function_ref<BlockFrequencyInfo &(Function &)> GBFI = nullptr)
198  : GetAssumptionCache(GetAC), LookupAssumptionCache(LookupAC),
199  GetTTI(GTTI), GetBFI(GBFI), GetTLI(GTLI), PSI(ProfSI) {}
200 
201  bool run(Module &M);
202  // Main part of the transformation that calls helper functions to find
203  // outlining candidates, clone & outline the function, and attempt to
204  // partially inline the resulting function. Returns true if
205  // inlining was successful, false otherwise. Also returns the outline
206  // function (only if we partially inlined early returns) as there is a
207  // possibility to further "peel" early return statements that were left in the
208  // outline function due to code size.
209  std::pair<bool, Function *> unswitchFunction(Function &F);
210 
211  // This class speculatively clones the function to be partial inlined.
212  // At the end of partial inlining, the remaining callsites to the cloned
213  // function that are not partially inlined will be fixed up to reference
214  // the original function, and the cloned function will be erased.
215  struct FunctionCloner {
216  // Two constructors, one for single region outlining, the other for
217  // multi-region outlining.
218  FunctionCloner(Function *F, FunctionOutliningInfo *OI,
220  function_ref<AssumptionCache *(Function &)> LookupAC,
222  FunctionCloner(Function *F, FunctionOutliningMultiRegionInfo *OMRI,
224  function_ref<AssumptionCache *(Function &)> LookupAC,
226 
227  ~FunctionCloner();
228 
229  // Prepare for function outlining: making sure there is only
230  // one incoming edge from the extracted/outlined region to
231  // the return block.
232  void normalizeReturnBlock() const;
233 
234  // Do function outlining for cold regions.
235  bool doMultiRegionFunctionOutlining();
236  // Do function outlining for region after early return block(s).
237  // NOTE: For vararg functions that do the vararg handling in the outlined
238  // function, we temporarily generate IR that does not properly
239  // forward varargs to the outlined function. Calling InlineFunction
240  // will update calls to the outlined functions to properly forward
241  // the varargs.
242  Function *doSingleRegionFunctionOutlining();
243 
244  Function *OrigFunc = nullptr;
245  Function *ClonedFunc = nullptr;
246 
247  typedef std::pair<Function *, BasicBlock *> FuncBodyCallerPair;
248  // Keep track of Outlined Functions and the basic block they're called from.
249  SmallVector<FuncBodyCallerPair, 4> OutlinedFunctions;
250 
251  // ClonedFunc is inlined in one of its callers after function
252  // outlining.
253  bool IsFunctionInlined = false;
254  // The cost of the region to be outlined.
255  InstructionCost OutlinedRegionCost = 0;
256  // ClonedOI is specific to outlining non-early return blocks.
257  std::unique_ptr<FunctionOutliningInfo> ClonedOI = nullptr;
258  // ClonedOMRI is specific to outlining cold regions.
259  std::unique_ptr<FunctionOutliningMultiRegionInfo> ClonedOMRI = nullptr;
260  std::unique_ptr<BlockFrequencyInfo> ClonedFuncBFI = nullptr;
262  function_ref<AssumptionCache *(Function &)> LookupAC;
264  };
265 
266 private:
267  int NumPartialInlining = 0;
268  function_ref<AssumptionCache &(Function &)> GetAssumptionCache;
269  function_ref<AssumptionCache *(Function &)> LookupAssumptionCache;
272  function_ref<const TargetLibraryInfo &(Function &)> GetTLI;
273  ProfileSummaryInfo &PSI;
274 
275  // Return the frequency of the OutlininingBB relative to F's entry point.
276  // The result is no larger than 1 and is represented using BP.
277  // (Note that the outlined region's 'head' block can only have incoming
278  // edges from the guarding entry blocks).
280  getOutliningCallBBRelativeFreq(FunctionCloner &Cloner) const;
281 
282  // Return true if the callee of CB should be partially inlined with
283  // profit.
284  bool shouldPartialInline(CallBase &CB, FunctionCloner &Cloner,
285  BlockFrequency WeightedOutliningRcost,
286  OptimizationRemarkEmitter &ORE) const;
287 
288  // Try to inline DuplicateFunction (cloned from F with call to
289  // the OutlinedFunction into its callers. Return true
290  // if there is any successful inlining.
291  bool tryPartialInline(FunctionCloner &Cloner);
292 
293  // Compute the mapping from use site of DuplicationFunction to the enclosing
294  // BB's profile count.
295  void
296  computeCallsiteToProfCountMap(Function *DuplicateFunction,
297  DenseMap<User *, uint64_t> &SiteCountMap) const;
298 
299  bool isLimitReached() const {
300  return (MaxNumPartialInlining != -1 &&
301  NumPartialInlining >= MaxNumPartialInlining);
302  }
303 
304  static CallBase *getSupportedCallBase(User *U) {
305  if (isa<CallInst>(U) || isa<InvokeInst>(U))
306  return cast<CallBase>(U);
307  llvm_unreachable("All uses must be calls");
308  return nullptr;
309  }
310 
311  static CallBase *getOneCallSiteTo(Function &F) {
312  User *User = *F.user_begin();
313  return getSupportedCallBase(User);
314  }
315 
316  std::tuple<DebugLoc, BasicBlock *> getOneDebugLoc(Function &F) const {
317  CallBase *CB = getOneCallSiteTo(F);
318  DebugLoc DLoc = CB->getDebugLoc();
319  BasicBlock *Block = CB->getParent();
320  return std::make_tuple(DLoc, Block);
321  }
322 
323  // Returns the costs associated with function outlining:
324  // - The first value is the non-weighted runtime cost for making the call
325  // to the outlined function, including the addtional setup cost in the
326  // outlined function itself;
327  // - The second value is the estimated size of the new call sequence in
328  // basic block Cloner.OutliningCallBB;
329  std::tuple<InstructionCost, InstructionCost>
330  computeOutliningCosts(FunctionCloner &Cloner) const;
331 
332  // Compute the 'InlineCost' of block BB. InlineCost is a proxy used to
333  // approximate both the size and runtime cost (Note that in the current
334  // inline cost analysis, there is no clear distinction there either).
335  static InstructionCost computeBBInlineCost(BasicBlock *BB,
337 
338  std::unique_ptr<FunctionOutliningInfo>
339  computeOutliningInfo(Function &F) const;
340 
341  std::unique_ptr<FunctionOutliningMultiRegionInfo>
342  computeOutliningColdRegionsInfo(Function &F,
343  OptimizationRemarkEmitter &ORE) const;
344 };
345 
346 struct PartialInlinerLegacyPass : public ModulePass {
347  static char ID; // Pass identification, replacement for typeid
348 
349  PartialInlinerLegacyPass() : ModulePass(ID) {
351  }
352 
353  void getAnalysisUsage(AnalysisUsage &AU) const override {
358  }
359 
360  bool runOnModule(Module &M) override {
361  if (skipModule(M))
362  return false;
363 
364  AssumptionCacheTracker *ACT = &getAnalysis<AssumptionCacheTracker>();
366  &getAnalysis<TargetTransformInfoWrapperPass>();
367  ProfileSummaryInfo &PSI =
368  getAnalysis<ProfileSummaryInfoWrapperPass>().getPSI();
369 
370  auto GetAssumptionCache = [&ACT](Function &F) -> AssumptionCache & {
371  return ACT->getAssumptionCache(F);
372  };
373 
374  auto LookupAssumptionCache = [ACT](Function &F) -> AssumptionCache * {
375  return ACT->lookupAssumptionCache(F);
376  };
377 
378  auto GetTTI = [&TTIWP](Function &F) -> TargetTransformInfo & {
379  return TTIWP->getTTI(F);
380  };
381 
382  auto GetTLI = [this](Function &F) -> TargetLibraryInfo & {
383  return this->getAnalysis<TargetLibraryInfoWrapperPass>().getTLI(F);
384  };
385 
386  return PartialInlinerImpl(GetAssumptionCache, LookupAssumptionCache, GetTTI,
387  GetTLI, PSI)
388  .run(M);
389  }
390 };
391 
392 } // end anonymous namespace
393 
394 std::unique_ptr<FunctionOutliningMultiRegionInfo>
395 PartialInlinerImpl::computeOutliningColdRegionsInfo(
396  Function &F, OptimizationRemarkEmitter &ORE) const {
397  BasicBlock *EntryBlock = &F.front();
398 
399  DominatorTree DT(F);
400  LoopInfo LI(DT);
401  BranchProbabilityInfo BPI(F, LI);
402  std::unique_ptr<BlockFrequencyInfo> ScopedBFI;
404  if (!GetBFI) {
405  ScopedBFI.reset(new BlockFrequencyInfo(F, BPI, LI));
406  BFI = ScopedBFI.get();
407  } else
408  BFI = &(GetBFI(F));
409 
410  // Return if we don't have profiling information.
411  if (!PSI.hasInstrumentationProfile())
412  return std::unique_ptr<FunctionOutliningMultiRegionInfo>();
413 
414  std::unique_ptr<FunctionOutliningMultiRegionInfo> OutliningInfo =
415  std::make_unique<FunctionOutliningMultiRegionInfo>();
416 
417  auto IsSingleExit =
418  [&ORE](SmallVectorImpl<BasicBlock *> &BlockList) -> BasicBlock * {
419  BasicBlock *ExitBlock = nullptr;
420  for (auto *Block : BlockList) {
421  for (BasicBlock *Succ : successors(Block)) {
422  if (!is_contained(BlockList, Succ)) {
423  if (ExitBlock) {
424  ORE.emit([&]() {
425  return OptimizationRemarkMissed(DEBUG_TYPE, "MultiExitRegion",
426  &Succ->front())
427  << "Region dominated by "
428  << ore::NV("Block", BlockList.front()->getName())
429  << " has more than one region exit edge.";
430  });
431  return nullptr;
432  }
433 
434  ExitBlock = Block;
435  }
436  }
437  }
438  return ExitBlock;
439  };
440 
441  auto BBProfileCount = [BFI](BasicBlock *BB) {
442  return BFI->getBlockProfileCount(BB).value_or(0);
443  };
444 
445  // Use the same computeBBInlineCost function to compute the cost savings of
446  // the outlining the candidate region.
447  TargetTransformInfo *FTTI = &GetTTI(F);
448  InstructionCost OverallFunctionCost = 0;
449  for (auto &BB : F)
450  OverallFunctionCost += computeBBInlineCost(&BB, FTTI);
451 
452  LLVM_DEBUG(dbgs() << "OverallFunctionCost = " << OverallFunctionCost
453  << "\n";);
454 
455  InstructionCost MinOutlineRegionCost = OverallFunctionCost.map(
456  [&](auto Cost) { return Cost * MinRegionSizeRatio; });
457 
458  BranchProbability MinBranchProbability(
459  static_cast<int>(ColdBranchRatio * MinBlockCounterExecution),
461  bool ColdCandidateFound = false;
462  BasicBlock *CurrEntry = EntryBlock;
463  std::vector<BasicBlock *> DFS;
464  DenseMap<BasicBlock *, bool> VisitedMap;
465  DFS.push_back(CurrEntry);
466  VisitedMap[CurrEntry] = true;
467 
468  // Use Depth First Search on the basic blocks to find CFG edges that are
469  // considered cold.
470  // Cold regions considered must also have its inline cost compared to the
471  // overall inline cost of the original function. The region is outlined only
472  // if it reduced the inline cost of the function by 'MinOutlineRegionCost' or
473  // more.
474  while (!DFS.empty()) {
475  auto *ThisBB = DFS.back();
476  DFS.pop_back();
477  // Only consider regions with predecessor blocks that are considered
478  // not-cold (default: part of the top 99.99% of all block counters)
479  // AND greater than our minimum block execution count (default: 100).
480  if (PSI.isColdBlock(ThisBB, BFI) ||
481  BBProfileCount(ThisBB) < MinBlockCounterExecution)
482  continue;
483  for (auto SI = succ_begin(ThisBB); SI != succ_end(ThisBB); ++SI) {
484  if (VisitedMap[*SI])
485  continue;
486  VisitedMap[*SI] = true;
487  DFS.push_back(*SI);
488  // If branch isn't cold, we skip to the next one.
489  BranchProbability SuccProb = BPI.getEdgeProbability(ThisBB, *SI);
490  if (SuccProb > MinBranchProbability)
491  continue;
492 
493  LLVM_DEBUG(dbgs() << "Found cold edge: " << ThisBB->getName() << "->"
494  << SI->getName()
495  << "\nBranch Probability = " << SuccProb << "\n";);
496 
497  SmallVector<BasicBlock *, 8> DominateVector;
498  DT.getDescendants(*SI, DominateVector);
499  assert(!DominateVector.empty() &&
500  "SI should be reachable and have at least itself as descendant");
501 
502  // We can only outline single entry regions (for now).
503  if (!DominateVector.front()->hasNPredecessors(1)) {
504  LLVM_DEBUG(dbgs() << "ABORT: Block " << SI->getName()
505  << " doesn't have a single predecessor in the "
506  "dominator tree\n";);
507  continue;
508  }
509 
510  BasicBlock *ExitBlock = nullptr;
511  // We can only outline single exit regions (for now).
512  if (!(ExitBlock = IsSingleExit(DominateVector))) {
513  LLVM_DEBUG(dbgs() << "ABORT: Block " << SI->getName()
514  << " doesn't have a unique successor\n";);
515  continue;
516  }
517 
518  InstructionCost OutlineRegionCost = 0;
519  for (auto *BB : DominateVector)
520  OutlineRegionCost += computeBBInlineCost(BB, &GetTTI(*BB->getParent()));
521 
522  LLVM_DEBUG(dbgs() << "OutlineRegionCost = " << OutlineRegionCost
523  << "\n";);
524 
525  if (!SkipCostAnalysis && OutlineRegionCost < MinOutlineRegionCost) {
526  ORE.emit([&]() {
527  return OptimizationRemarkAnalysis(DEBUG_TYPE, "TooCostly",
528  &SI->front())
529  << ore::NV("Callee", &F)
530  << " inline cost-savings smaller than "
531  << ore::NV("Cost", MinOutlineRegionCost);
532  });
533 
534  LLVM_DEBUG(dbgs() << "ABORT: Outline region cost is smaller than "
535  << MinOutlineRegionCost << "\n";);
536  continue;
537  }
538 
539  // For now, ignore blocks that belong to a SISE region that is a
540  // candidate for outlining. In the future, we may want to look
541  // at inner regions because the outer region may have live-exit
542  // variables.
543  for (auto *BB : DominateVector)
544  VisitedMap[BB] = true;
545 
546  // ReturnBlock here means the block after the outline call
547  BasicBlock *ReturnBlock = ExitBlock->getSingleSuccessor();
548  FunctionOutliningMultiRegionInfo::OutlineRegionInfo RegInfo(
549  DominateVector, DominateVector.front(), ExitBlock, ReturnBlock);
550  OutliningInfo->ORI.push_back(RegInfo);
551  LLVM_DEBUG(dbgs() << "Found Cold Candidate starting at block: "
552  << DominateVector.front()->getName() << "\n";);
553  ColdCandidateFound = true;
554  NumColdRegionsFound++;
555  }
556  }
557 
558  if (ColdCandidateFound)
559  return OutliningInfo;
560 
561  return std::unique_ptr<FunctionOutliningMultiRegionInfo>();
562 }
563 
564 std::unique_ptr<FunctionOutliningInfo>
565 PartialInlinerImpl::computeOutliningInfo(Function &F) const {
566  BasicBlock *EntryBlock = &F.front();
567  BranchInst *BR = dyn_cast<BranchInst>(EntryBlock->getTerminator());
568  if (!BR || BR->isUnconditional())
569  return std::unique_ptr<FunctionOutliningInfo>();
570 
571  // Returns true if Succ is BB's successor
572  auto IsSuccessor = [](BasicBlock *Succ, BasicBlock *BB) {
573  return is_contained(successors(BB), Succ);
574  };
575 
576  auto IsReturnBlock = [](BasicBlock *BB) {
577  Instruction *TI = BB->getTerminator();
578  return isa<ReturnInst>(TI);
579  };
580 
581  auto GetReturnBlock = [&](BasicBlock *Succ1, BasicBlock *Succ2) {
582  if (IsReturnBlock(Succ1))
583  return std::make_tuple(Succ1, Succ2);
584  if (IsReturnBlock(Succ2))
585  return std::make_tuple(Succ2, Succ1);
586 
587  return std::make_tuple<BasicBlock *, BasicBlock *>(nullptr, nullptr);
588  };
589 
590  // Detect a triangular shape:
591  auto GetCommonSucc = [&](BasicBlock *Succ1, BasicBlock *Succ2) {
592  if (IsSuccessor(Succ1, Succ2))
593  return std::make_tuple(Succ1, Succ2);
594  if (IsSuccessor(Succ2, Succ1))
595  return std::make_tuple(Succ2, Succ1);
596 
597  return std::make_tuple<BasicBlock *, BasicBlock *>(nullptr, nullptr);
598  };
599 
600  std::unique_ptr<FunctionOutliningInfo> OutliningInfo =
601  std::make_unique<FunctionOutliningInfo>();
602 
603  BasicBlock *CurrEntry = EntryBlock;
604  bool CandidateFound = false;
605  do {
606  // The number of blocks to be inlined has already reached
607  // the limit. When MaxNumInlineBlocks is set to 0 or 1, this
608  // disables partial inlining for the function.
609  if (OutliningInfo->getNumInlinedBlocks() >= MaxNumInlineBlocks)
610  break;
611 
612  if (succ_size(CurrEntry) != 2)
613  break;
614 
615  BasicBlock *Succ1 = *succ_begin(CurrEntry);
616  BasicBlock *Succ2 = *(succ_begin(CurrEntry) + 1);
617 
618  BasicBlock *ReturnBlock, *NonReturnBlock;
619  std::tie(ReturnBlock, NonReturnBlock) = GetReturnBlock(Succ1, Succ2);
620 
621  if (ReturnBlock) {
622  OutliningInfo->Entries.push_back(CurrEntry);
623  OutliningInfo->ReturnBlock = ReturnBlock;
624  OutliningInfo->NonReturnBlock = NonReturnBlock;
625  CandidateFound = true;
626  break;
627  }
628 
629  BasicBlock *CommSucc, *OtherSucc;
630  std::tie(CommSucc, OtherSucc) = GetCommonSucc(Succ1, Succ2);
631 
632  if (!CommSucc)
633  break;
634 
635  OutliningInfo->Entries.push_back(CurrEntry);
636  CurrEntry = OtherSucc;
637  } while (true);
638 
639  if (!CandidateFound)
640  return std::unique_ptr<FunctionOutliningInfo>();
641 
642  // There should not be any successors (not in the entry set) other than
643  // {ReturnBlock, NonReturnBlock}
644  assert(OutliningInfo->Entries[0] == &F.front() &&
645  "Function Entry must be the first in Entries vector");
646  DenseSet<BasicBlock *> Entries;
647  for (BasicBlock *E : OutliningInfo->Entries)
648  Entries.insert(E);
649 
650  // Returns true of BB has Predecessor which is not
651  // in Entries set.
652  auto HasNonEntryPred = [Entries](BasicBlock *BB) {
653  for (auto *Pred : predecessors(BB)) {
654  if (!Entries.count(Pred))
655  return true;
656  }
657  return false;
658  };
659  auto CheckAndNormalizeCandidate =
660  [Entries, HasNonEntryPred](FunctionOutliningInfo *OutliningInfo) {
661  for (BasicBlock *E : OutliningInfo->Entries) {
662  for (auto *Succ : successors(E)) {
663  if (Entries.count(Succ))
664  continue;
665  if (Succ == OutliningInfo->ReturnBlock)
666  OutliningInfo->ReturnBlockPreds.push_back(E);
667  else if (Succ != OutliningInfo->NonReturnBlock)
668  return false;
669  }
670  // There should not be any outside incoming edges either:
671  if (HasNonEntryPred(E))
672  return false;
673  }
674  return true;
675  };
676 
677  if (!CheckAndNormalizeCandidate(OutliningInfo.get()))
678  return std::unique_ptr<FunctionOutliningInfo>();
679 
680  // Now further growing the candidate's inlining region by
681  // peeling off dominating blocks from the outlining region:
682  while (OutliningInfo->getNumInlinedBlocks() < MaxNumInlineBlocks) {
683  BasicBlock *Cand = OutliningInfo->NonReturnBlock;
684  if (succ_size(Cand) != 2)
685  break;
686 
687  if (HasNonEntryPred(Cand))
688  break;
689 
690  BasicBlock *Succ1 = *succ_begin(Cand);
691  BasicBlock *Succ2 = *(succ_begin(Cand) + 1);
692 
693  BasicBlock *ReturnBlock, *NonReturnBlock;
694  std::tie(ReturnBlock, NonReturnBlock) = GetReturnBlock(Succ1, Succ2);
695  if (!ReturnBlock || ReturnBlock != OutliningInfo->ReturnBlock)
696  break;
697 
698  if (NonReturnBlock->getSinglePredecessor() != Cand)
699  break;
700 
701  // Now grow and update OutlininigInfo:
702  OutliningInfo->Entries.push_back(Cand);
703  OutliningInfo->NonReturnBlock = NonReturnBlock;
704  OutliningInfo->ReturnBlockPreds.push_back(Cand);
705  Entries.insert(Cand);
706  }
707 
708  return OutliningInfo;
709 }
710 
711 // Check if there is PGO data or user annotated branch data:
712 static bool hasProfileData(const Function &F, const FunctionOutliningInfo &OI) {
713  if (F.hasProfileData())
714  return true;
715  // Now check if any of the entry block has MD_prof data:
716  for (auto *E : OI.Entries) {
717  BranchInst *BR = dyn_cast<BranchInst>(E->getTerminator());
718  if (!BR || BR->isUnconditional())
719  continue;
720  uint64_t T, F;
721  if (extractBranchWeights(*BR, T, F))
722  return true;
723  }
724  return false;
725 }
726 
727 BranchProbability PartialInlinerImpl::getOutliningCallBBRelativeFreq(
728  FunctionCloner &Cloner) const {
729  BasicBlock *OutliningCallBB = Cloner.OutlinedFunctions.back().second;
730  auto EntryFreq =
731  Cloner.ClonedFuncBFI->getBlockFreq(&Cloner.ClonedFunc->getEntryBlock());
732  auto OutliningCallFreq =
733  Cloner.ClonedFuncBFI->getBlockFreq(OutliningCallBB);
734  // FIXME Hackery needed because ClonedFuncBFI is based on the function BEFORE
735  // we outlined any regions, so we may encounter situations where the
736  // OutliningCallFreq is *slightly* bigger than the EntryFreq.
737  if (OutliningCallFreq.getFrequency() > EntryFreq.getFrequency())
738  OutliningCallFreq = EntryFreq;
739 
740  auto OutlineRegionRelFreq = BranchProbability::getBranchProbability(
741  OutliningCallFreq.getFrequency(), EntryFreq.getFrequency());
742 
743  if (hasProfileData(*Cloner.OrigFunc, *Cloner.ClonedOI))
744  return OutlineRegionRelFreq;
745 
746  // When profile data is not available, we need to be conservative in
747  // estimating the overall savings. Static branch prediction can usually
748  // guess the branch direction right (taken/non-taken), but the guessed
749  // branch probability is usually not biased enough. In case when the
750  // outlined region is predicted to be likely, its probability needs
751  // to be made higher (more biased) to not under-estimate the cost of
752  // function outlining. On the other hand, if the outlined region
753  // is predicted to be less likely, the predicted probablity is usually
754  // higher than the actual. For instance, the actual probability of the
755  // less likely target is only 5%, but the guessed probablity can be
756  // 40%. In the latter case, there is no need for further adjustment.
757  // FIXME: add an option for this.
758  if (OutlineRegionRelFreq < BranchProbability(45, 100))
759  return OutlineRegionRelFreq;
760 
761  OutlineRegionRelFreq = std::max(
762  OutlineRegionRelFreq, BranchProbability(OutlineRegionFreqPercent, 100));
763 
764  return OutlineRegionRelFreq;
765 }
766 
767 bool PartialInlinerImpl::shouldPartialInline(
768  CallBase &CB, FunctionCloner &Cloner, BlockFrequency WeightedOutliningRcost,
769  OptimizationRemarkEmitter &ORE) const {
770  using namespace ore;
771 
773  assert(Callee == Cloner.ClonedFunc);
774 
775  if (SkipCostAnalysis)
776  return isInlineViable(*Callee).isSuccess();
777 
778  Function *Caller = CB.getCaller();
779  auto &CalleeTTI = GetTTI(*Callee);
780  bool RemarksEnabled =
781  Callee->getContext().getDiagHandlerPtr()->isMissedOptRemarkEnabled(
782  DEBUG_TYPE);
783  InlineCost IC =
784  getInlineCost(CB, getInlineParams(), CalleeTTI, GetAssumptionCache,
785  GetTLI, GetBFI, &PSI, RemarksEnabled ? &ORE : nullptr);
786 
787  if (IC.isAlways()) {
788  ORE.emit([&]() {
789  return OptimizationRemarkAnalysis(DEBUG_TYPE, "AlwaysInline", &CB)
790  << NV("Callee", Cloner.OrigFunc)
791  << " should always be fully inlined, not partially";
792  });
793  return false;
794  }
795 
796  if (IC.isNever()) {
797  ORE.emit([&]() {
798  return OptimizationRemarkMissed(DEBUG_TYPE, "NeverInline", &CB)
799  << NV("Callee", Cloner.OrigFunc) << " not partially inlined into "
800  << NV("Caller", Caller)
801  << " because it should never be inlined (cost=never)";
802  });
803  return false;
804  }
805 
806  if (!IC) {
807  ORE.emit([&]() {
808  return OptimizationRemarkAnalysis(DEBUG_TYPE, "TooCostly", &CB)
809  << NV("Callee", Cloner.OrigFunc) << " not partially inlined into "
810  << NV("Caller", Caller) << " because too costly to inline (cost="
811  << NV("Cost", IC.getCost()) << ", threshold="
812  << NV("Threshold", IC.getCostDelta() + IC.getCost()) << ")";
813  });
814  return false;
815  }
816  const DataLayout &DL = Caller->getParent()->getDataLayout();
817 
818  // The savings of eliminating the call:
819  int NonWeightedSavings = getCallsiteCost(CB, DL);
820  BlockFrequency NormWeightedSavings(NonWeightedSavings);
821 
822  // Weighted saving is smaller than weighted cost, return false
823  if (NormWeightedSavings < WeightedOutliningRcost) {
824  ORE.emit([&]() {
825  return OptimizationRemarkAnalysis(DEBUG_TYPE, "OutliningCallcostTooHigh",
826  &CB)
827  << NV("Callee", Cloner.OrigFunc) << " not partially inlined into "
828  << NV("Caller", Caller) << " runtime overhead (overhead="
829  << NV("Overhead", (unsigned)WeightedOutliningRcost.getFrequency())
830  << ", savings="
831  << NV("Savings", (unsigned)NormWeightedSavings.getFrequency())
832  << ")"
833  << " of making the outlined call is too high";
834  });
835 
836  return false;
837  }
838 
839  ORE.emit([&]() {
840  return OptimizationRemarkAnalysis(DEBUG_TYPE, "CanBePartiallyInlined", &CB)
841  << NV("Callee", Cloner.OrigFunc) << " can be partially inlined into "
842  << NV("Caller", Caller) << " with cost=" << NV("Cost", IC.getCost())
843  << " (threshold="
844  << NV("Threshold", IC.getCostDelta() + IC.getCost()) << ")";
845  });
846  return true;
847 }
848 
849 // TODO: Ideally we should share Inliner's InlineCost Analysis code.
850 // For now use a simplified version. The returned 'InlineCost' will be used
851 // to esimate the size cost as well as runtime cost of the BB.
853 PartialInlinerImpl::computeBBInlineCost(BasicBlock *BB,
856  const DataLayout &DL = BB->getParent()->getParent()->getDataLayout();
858  for (Instruction &I : BB->instructionsWithoutDebug()) {
859  // Skip free instructions.
860  switch (I.getOpcode()) {
861  case Instruction::BitCast:
862  case Instruction::PtrToInt:
863  case Instruction::IntToPtr:
864  case Instruction::Alloca:
865  case Instruction::PHI:
866  continue;
867  case Instruction::GetElementPtr:
868  if (cast<GetElementPtrInst>(&I)->hasAllZeroIndices())
869  continue;
870  break;
871  default:
872  break;
873  }
874 
875  if (I.isLifetimeStartOrEnd())
876  continue;
877 
878  if (auto *II = dyn_cast<IntrinsicInst>(&I)) {
879  Intrinsic::ID IID = II->getIntrinsicID();
881  FastMathFlags FMF;
882  for (Value *Val : II->args())
883  Tys.push_back(Val->getType());
884 
885  if (auto *FPMO = dyn_cast<FPMathOperator>(II))
886  FMF = FPMO->getFastMathFlags();
887 
888  IntrinsicCostAttributes ICA(IID, II->getType(), Tys, FMF);
890  continue;
891  }
892 
893  if (CallInst *CI = dyn_cast<CallInst>(&I)) {
894  InlineCost += getCallsiteCost(*CI, DL);
895  continue;
896  }
897 
898  if (InvokeInst *II = dyn_cast<InvokeInst>(&I)) {
899  InlineCost += getCallsiteCost(*II, DL);
900  continue;
901  }
902 
903  if (SwitchInst *SI = dyn_cast<SwitchInst>(&I)) {
904  InlineCost += (SI->getNumCases() + 1) * InstrCost;
905  continue;
906  }
908  }
909 
910  return InlineCost;
911 }
912 
913 std::tuple<InstructionCost, InstructionCost>
914 PartialInlinerImpl::computeOutliningCosts(FunctionCloner &Cloner) const {
915  InstructionCost OutliningFuncCallCost = 0, OutlinedFunctionCost = 0;
916  for (auto FuncBBPair : Cloner.OutlinedFunctions) {
917  Function *OutlinedFunc = FuncBBPair.first;
918  BasicBlock* OutliningCallBB = FuncBBPair.second;
919  // Now compute the cost of the call sequence to the outlined function
920  // 'OutlinedFunction' in BB 'OutliningCallBB':
921  auto *OutlinedFuncTTI = &GetTTI(*OutlinedFunc);
922  OutliningFuncCallCost +=
923  computeBBInlineCost(OutliningCallBB, OutlinedFuncTTI);
924 
925  // Now compute the cost of the extracted/outlined function itself:
926  for (BasicBlock &BB : *OutlinedFunc)
927  OutlinedFunctionCost += computeBBInlineCost(&BB, OutlinedFuncTTI);
928  }
929  assert(OutlinedFunctionCost >= Cloner.OutlinedRegionCost &&
930  "Outlined function cost should be no less than the outlined region");
931 
932  // The code extractor introduces a new root and exit stub blocks with
933  // additional unconditional branches. Those branches will be eliminated
934  // later with bb layout. The cost should be adjusted accordingly:
935  OutlinedFunctionCost -=
936  2 * InlineConstants::getInstrCost() * Cloner.OutlinedFunctions.size();
937 
938  InstructionCost OutliningRuntimeOverhead =
939  OutliningFuncCallCost +
940  (OutlinedFunctionCost - Cloner.OutlinedRegionCost) +
941  ExtraOutliningPenalty.getValue();
942 
943  return std::make_tuple(OutliningFuncCallCost, OutliningRuntimeOverhead);
944 }
945 
946 // Create the callsite to profile count map which is
947 // used to update the original function's entry count,
948 // after the function is partially inlined into the callsite.
949 void PartialInlinerImpl::computeCallsiteToProfCountMap(
950  Function *DuplicateFunction,
951  DenseMap<User *, uint64_t> &CallSiteToProfCountMap) const {
952  std::vector<User *> Users(DuplicateFunction->user_begin(),
953  DuplicateFunction->user_end());
954  Function *CurrentCaller = nullptr;
955  std::unique_ptr<BlockFrequencyInfo> TempBFI;
956  BlockFrequencyInfo *CurrentCallerBFI = nullptr;
957 
958  auto ComputeCurrBFI = [&,this](Function *Caller) {
959  // For the old pass manager:
960  if (!GetBFI) {
961  DominatorTree DT(*Caller);
962  LoopInfo LI(DT);
963  BranchProbabilityInfo BPI(*Caller, LI);
964  TempBFI.reset(new BlockFrequencyInfo(*Caller, BPI, LI));
965  CurrentCallerBFI = TempBFI.get();
966  } else {
967  // New pass manager:
968  CurrentCallerBFI = &(GetBFI(*Caller));
969  }
970  };
971 
972  for (User *User : Users) {
973  // Don't bother with BlockAddress used by CallBr for asm goto.
974  if (isa<BlockAddress>(User))
975  continue;
976  CallBase *CB = getSupportedCallBase(User);
977  Function *Caller = CB->getCaller();
978  if (CurrentCaller != Caller) {
979  CurrentCaller = Caller;
980  ComputeCurrBFI(Caller);
981  } else {
982  assert(CurrentCallerBFI && "CallerBFI is not set");
983  }
984  BasicBlock *CallBB = CB->getParent();
985  auto Count = CurrentCallerBFI->getBlockProfileCount(CallBB);
986  if (Count)
987  CallSiteToProfCountMap[User] = *Count;
988  else
989  CallSiteToProfCountMap[User] = 0;
990  }
991 }
992 
993 PartialInlinerImpl::FunctionCloner::FunctionCloner(
994  Function *F, FunctionOutliningInfo *OI, OptimizationRemarkEmitter &ORE,
995  function_ref<AssumptionCache *(Function &)> LookupAC,
997  : OrigFunc(F), ORE(ORE), LookupAC(LookupAC), GetTTI(GetTTI) {
998  ClonedOI = std::make_unique<FunctionOutliningInfo>();
999 
1000  // Clone the function, so that we can hack away on it.
1001  ValueToValueMapTy VMap;
1002  ClonedFunc = CloneFunction(F, VMap);
1003 
1004  ClonedOI->ReturnBlock = cast<BasicBlock>(VMap[OI->ReturnBlock]);
1005  ClonedOI->NonReturnBlock = cast<BasicBlock>(VMap[OI->NonReturnBlock]);
1006  for (BasicBlock *BB : OI->Entries)
1007  ClonedOI->Entries.push_back(cast<BasicBlock>(VMap[BB]));
1008 
1009  for (BasicBlock *E : OI->ReturnBlockPreds) {
1010  BasicBlock *NewE = cast<BasicBlock>(VMap[E]);
1011  ClonedOI->ReturnBlockPreds.push_back(NewE);
1012  }
1013  // Go ahead and update all uses to the duplicate, so that we can just
1014  // use the inliner functionality when we're done hacking.
1015  F->replaceAllUsesWith(ClonedFunc);
1016 }
1017 
1018 PartialInlinerImpl::FunctionCloner::FunctionCloner(
1019  Function *F, FunctionOutliningMultiRegionInfo *OI,
1021  function_ref<AssumptionCache *(Function &)> LookupAC,
1023  : OrigFunc(F), ORE(ORE), LookupAC(LookupAC), GetTTI(GetTTI) {
1024  ClonedOMRI = std::make_unique<FunctionOutliningMultiRegionInfo>();
1025 
1026  // Clone the function, so that we can hack away on it.
1027  ValueToValueMapTy VMap;
1028  ClonedFunc = CloneFunction(F, VMap);
1029 
1030  // Go through all Outline Candidate Regions and update all BasicBlock
1031  // information.
1032  for (FunctionOutliningMultiRegionInfo::OutlineRegionInfo RegionInfo :
1033  OI->ORI) {
1035  for (BasicBlock *BB : RegionInfo.Region)
1036  Region.push_back(cast<BasicBlock>(VMap[BB]));
1037 
1038  BasicBlock *NewEntryBlock = cast<BasicBlock>(VMap[RegionInfo.EntryBlock]);
1039  BasicBlock *NewExitBlock = cast<BasicBlock>(VMap[RegionInfo.ExitBlock]);
1040  BasicBlock *NewReturnBlock = nullptr;
1041  if (RegionInfo.ReturnBlock)
1042  NewReturnBlock = cast<BasicBlock>(VMap[RegionInfo.ReturnBlock]);
1043  FunctionOutliningMultiRegionInfo::OutlineRegionInfo MappedRegionInfo(
1044  Region, NewEntryBlock, NewExitBlock, NewReturnBlock);
1045  ClonedOMRI->ORI.push_back(MappedRegionInfo);
1046  }
1047  // Go ahead and update all uses to the duplicate, so that we can just
1048  // use the inliner functionality when we're done hacking.
1049  F->replaceAllUsesWith(ClonedFunc);
1050 }
1051 
1052 void PartialInlinerImpl::FunctionCloner::normalizeReturnBlock() const {
1053  auto GetFirstPHI = [](BasicBlock *BB) {
1054  BasicBlock::iterator I = BB->begin();
1055  PHINode *FirstPhi = nullptr;
1056  while (I != BB->end()) {
1057  PHINode *Phi = dyn_cast<PHINode>(I);
1058  if (!Phi)
1059  break;
1060  if (!FirstPhi) {
1061  FirstPhi = Phi;
1062  break;
1063  }
1064  }
1065  return FirstPhi;
1066  };
1067 
1068  // Shouldn't need to normalize PHIs if we're not outlining non-early return
1069  // blocks.
1070  if (!ClonedOI)
1071  return;
1072 
1073  // Special hackery is needed with PHI nodes that have inputs from more than
1074  // one extracted block. For simplicity, just split the PHIs into a two-level
1075  // sequence of PHIs, some of which will go in the extracted region, and some
1076  // of which will go outside.
1077  BasicBlock *PreReturn = ClonedOI->ReturnBlock;
1078  // only split block when necessary:
1079  PHINode *FirstPhi = GetFirstPHI(PreReturn);
1080  unsigned NumPredsFromEntries = ClonedOI->ReturnBlockPreds.size();
1081 
1082  if (!FirstPhi || FirstPhi->getNumIncomingValues() <= NumPredsFromEntries + 1)
1083  return;
1084 
1085  auto IsTrivialPhi = [](PHINode *PN) -> Value * {
1086  if (llvm::all_equal(PN->incoming_values()))
1087  return PN->getIncomingValue(0);
1088  return nullptr;
1089  };
1090 
1091  ClonedOI->ReturnBlock = ClonedOI->ReturnBlock->splitBasicBlock(
1092  ClonedOI->ReturnBlock->getFirstNonPHI()->getIterator());
1093  BasicBlock::iterator I = PreReturn->begin();
1094  Instruction *Ins = &ClonedOI->ReturnBlock->front();
1096  while (I != PreReturn->end()) {
1097  PHINode *OldPhi = dyn_cast<PHINode>(I);
1098  if (!OldPhi)
1099  break;
1100 
1101  PHINode *RetPhi =
1102  PHINode::Create(OldPhi->getType(), NumPredsFromEntries + 1, "", Ins);
1103  OldPhi->replaceAllUsesWith(RetPhi);
1104  Ins = ClonedOI->ReturnBlock->getFirstNonPHI();
1105 
1106  RetPhi->addIncoming(&*I, PreReturn);
1107  for (BasicBlock *E : ClonedOI->ReturnBlockPreds) {
1108  RetPhi->addIncoming(OldPhi->getIncomingValueForBlock(E), E);
1109  OldPhi->removeIncomingValue(E);
1110  }
1111 
1112  // After incoming values splitting, the old phi may become trivial.
1113  // Keeping the trivial phi can introduce definition inside the outline
1114  // region which is live-out, causing necessary overhead (load, store
1115  // arg passing etc).
1116  if (auto *OldPhiVal = IsTrivialPhi(OldPhi)) {
1117  OldPhi->replaceAllUsesWith(OldPhiVal);
1118  DeadPhis.push_back(OldPhi);
1119  }
1120  ++I;
1121  }
1122  for (auto *DP : DeadPhis)
1123  DP->eraseFromParent();
1124 
1125  for (auto *E : ClonedOI->ReturnBlockPreds)
1126  E->getTerminator()->replaceUsesOfWith(PreReturn, ClonedOI->ReturnBlock);
1127 }
1128 
1129 bool PartialInlinerImpl::FunctionCloner::doMultiRegionFunctionOutlining() {
1130 
1131  auto ComputeRegionCost =
1133  InstructionCost Cost = 0;
1134  for (BasicBlock* BB : Region)
1135  Cost += computeBBInlineCost(BB, &GetTTI(*BB->getParent()));
1136  return Cost;
1137  };
1138 
1139  assert(ClonedOMRI && "Expecting OutlineInfo for multi region outline");
1140 
1141  if (ClonedOMRI->ORI.empty())
1142  return false;
1143 
1144  // The CodeExtractor needs a dominator tree.
1145  DominatorTree DT;
1146  DT.recalculate(*ClonedFunc);
1147 
1148  // Manually calculate a BlockFrequencyInfo and BranchProbabilityInfo.
1149  LoopInfo LI(DT);
1150  BranchProbabilityInfo BPI(*ClonedFunc, LI);
1151  ClonedFuncBFI.reset(new BlockFrequencyInfo(*ClonedFunc, BPI, LI));
1152 
1153  // Cache and recycle the CodeExtractor analysis to avoid O(n^2) compile-time.
1154  CodeExtractorAnalysisCache CEAC(*ClonedFunc);
1155 
1156  SetVector<Value *> Inputs, Outputs, Sinks;
1157  for (FunctionOutliningMultiRegionInfo::OutlineRegionInfo RegionInfo :
1158  ClonedOMRI->ORI) {
1159  InstructionCost CurrentOutlinedRegionCost =
1160  ComputeRegionCost(RegionInfo.Region);
1161 
1162  CodeExtractor CE(RegionInfo.Region, &DT, /*AggregateArgs*/ false,
1163  ClonedFuncBFI.get(), &BPI,
1164  LookupAC(*RegionInfo.EntryBlock->getParent()),
1165  /* AllowVarargs */ false);
1166 
1167  CE.findInputsOutputs(Inputs, Outputs, Sinks);
1168 
1169  LLVM_DEBUG({
1170  dbgs() << "inputs: " << Inputs.size() << "\n";
1171  dbgs() << "outputs: " << Outputs.size() << "\n";
1172  for (Value *value : Inputs)
1173  dbgs() << "value used in func: " << *value << "\n";
1174  for (Value *output : Outputs)
1175  dbgs() << "instr used in func: " << *output << "\n";
1176  });
1177 
1178  // Do not extract regions that have live exit variables.
1179  if (Outputs.size() > 0 && !ForceLiveExit)
1180  continue;
1181 
1182  if (Function *OutlinedFunc = CE.extractCodeRegion(CEAC)) {
1183  CallBase *OCS = PartialInlinerImpl::getOneCallSiteTo(*OutlinedFunc);
1184  BasicBlock *OutliningCallBB = OCS->getParent();
1185  assert(OutliningCallBB->getParent() == ClonedFunc);
1186  OutlinedFunctions.push_back(std::make_pair(OutlinedFunc,OutliningCallBB));
1187  NumColdRegionsOutlined++;
1188  OutlinedRegionCost += CurrentOutlinedRegionCost;
1189 
1190  if (MarkOutlinedColdCC) {
1191  OutlinedFunc->setCallingConv(CallingConv::Cold);
1193  }
1194  } else
1195  ORE.emit([&]() {
1196  return OptimizationRemarkMissed(DEBUG_TYPE, "ExtractFailed",
1197  &RegionInfo.Region.front()->front())
1198  << "Failed to extract region at block "
1199  << ore::NV("Block", RegionInfo.Region.front());
1200  });
1201  }
1202 
1203  return !OutlinedFunctions.empty();
1204 }
1205 
1206 Function *
1207 PartialInlinerImpl::FunctionCloner::doSingleRegionFunctionOutlining() {
1208  // Returns true if the block is to be partial inlined into the caller
1209  // (i.e. not to be extracted to the out of line function)
1210  auto ToBeInlined = [&, this](BasicBlock *BB) {
1211  return BB == ClonedOI->ReturnBlock ||
1212  llvm::is_contained(ClonedOI->Entries, BB);
1213  };
1214 
1215  assert(ClonedOI && "Expecting OutlineInfo for single region outline");
1216  // The CodeExtractor needs a dominator tree.
1217  DominatorTree DT;
1218  DT.recalculate(*ClonedFunc);
1219 
1220  // Manually calculate a BlockFrequencyInfo and BranchProbabilityInfo.
1221  LoopInfo LI(DT);
1222  BranchProbabilityInfo BPI(*ClonedFunc, LI);
1223  ClonedFuncBFI.reset(new BlockFrequencyInfo(*ClonedFunc, BPI, LI));
1224 
1225  // Gather up the blocks that we're going to extract.
1226  std::vector<BasicBlock *> ToExtract;
1227  auto *ClonedFuncTTI = &GetTTI(*ClonedFunc);
1228  ToExtract.push_back(ClonedOI->NonReturnBlock);
1229  OutlinedRegionCost += PartialInlinerImpl::computeBBInlineCost(
1230  ClonedOI->NonReturnBlock, ClonedFuncTTI);
1231  for (BasicBlock &BB : *ClonedFunc)
1232  if (!ToBeInlined(&BB) && &BB != ClonedOI->NonReturnBlock) {
1233  ToExtract.push_back(&BB);
1234  // FIXME: the code extractor may hoist/sink more code
1235  // into the outlined function which may make the outlining
1236  // overhead (the difference of the outlined function cost
1237  // and OutliningRegionCost) look larger.
1238  OutlinedRegionCost += computeBBInlineCost(&BB, ClonedFuncTTI);
1239  }
1240 
1241  // Extract the body of the if.
1242  CodeExtractorAnalysisCache CEAC(*ClonedFunc);
1243  Function *OutlinedFunc =
1244  CodeExtractor(ToExtract, &DT, /*AggregateArgs*/ false,
1245  ClonedFuncBFI.get(), &BPI, LookupAC(*ClonedFunc),
1246  /* AllowVarargs */ true)
1247  .extractCodeRegion(CEAC);
1248 
1249  if (OutlinedFunc) {
1250  BasicBlock *OutliningCallBB =
1251  PartialInlinerImpl::getOneCallSiteTo(*OutlinedFunc)->getParent();
1252  assert(OutliningCallBB->getParent() == ClonedFunc);
1253  OutlinedFunctions.push_back(std::make_pair(OutlinedFunc, OutliningCallBB));
1254  } else
1255  ORE.emit([&]() {
1256  return OptimizationRemarkMissed(DEBUG_TYPE, "ExtractFailed",
1257  &ToExtract.front()->front())
1258  << "Failed to extract region at block "
1259  << ore::NV("Block", ToExtract.front());
1260  });
1261 
1262  return OutlinedFunc;
1263 }
1264 
1265 PartialInlinerImpl::FunctionCloner::~FunctionCloner() {
1266  // Ditch the duplicate, since we're done with it, and rewrite all remaining
1267  // users (function pointers, etc.) back to the original function.
1268  ClonedFunc->replaceAllUsesWith(OrigFunc);
1269  ClonedFunc->eraseFromParent();
1270  if (!IsFunctionInlined) {
1271  // Remove each function that was speculatively created if there is no
1272  // reference.
1273  for (auto FuncBBPair : OutlinedFunctions) {
1274  Function *Func = FuncBBPair.first;
1275  Func->eraseFromParent();
1276  }
1277  }
1278 }
1279 
1280 std::pair<bool, Function *> PartialInlinerImpl::unswitchFunction(Function &F) {
1281  if (F.hasAddressTaken())
1282  return {false, nullptr};
1283 
1284  // Let inliner handle it
1285  if (F.hasFnAttribute(Attribute::AlwaysInline))
1286  return {false, nullptr};
1287 
1288  if (F.hasFnAttribute(Attribute::NoInline))
1289  return {false, nullptr};
1290 
1291  if (PSI.isFunctionEntryCold(&F))
1292  return {false, nullptr};
1293 
1294  if (F.users().empty())
1295  return {false, nullptr};
1296 
1298 
1299  // Only try to outline cold regions if we have a profile summary, which
1300  // implies we have profiling information.
1301  if (PSI.hasProfileSummary() && F.hasProfileData() &&
1303  std::unique_ptr<FunctionOutliningMultiRegionInfo> OMRI =
1304  computeOutliningColdRegionsInfo(F, ORE);
1305  if (OMRI) {
1306  FunctionCloner Cloner(&F, OMRI.get(), ORE, LookupAssumptionCache, GetTTI);
1307 
1308  LLVM_DEBUG({
1309  dbgs() << "HotCountThreshold = " << PSI.getHotCountThreshold() << "\n";
1310  dbgs() << "ColdCountThreshold = " << PSI.getColdCountThreshold()
1311  << "\n";
1312  });
1313 
1314  bool DidOutline = Cloner.doMultiRegionFunctionOutlining();
1315 
1316  if (DidOutline) {
1317  LLVM_DEBUG({
1318  dbgs() << ">>>>>> Outlined (Cloned) Function >>>>>>\n";
1319  Cloner.ClonedFunc->print(dbgs());
1320  dbgs() << "<<<<<< Outlined (Cloned) Function <<<<<<\n";
1321  });
1322 
1323  if (tryPartialInline(Cloner))
1324  return {true, nullptr};
1325  }
1326  }
1327  }
1328 
1329  // Fall-thru to regular partial inlining if we:
1330  // i) can't find any cold regions to outline, or
1331  // ii) can't inline the outlined function anywhere.
1332  std::unique_ptr<FunctionOutliningInfo> OI = computeOutliningInfo(F);
1333  if (!OI)
1334  return {false, nullptr};
1335 
1336  FunctionCloner Cloner(&F, OI.get(), ORE, LookupAssumptionCache, GetTTI);
1337  Cloner.normalizeReturnBlock();
1338 
1339  Function *OutlinedFunction = Cloner.doSingleRegionFunctionOutlining();
1340 
1341  if (!OutlinedFunction)
1342  return {false, nullptr};
1343 
1344  if (tryPartialInline(Cloner))
1345  return {true, OutlinedFunction};
1346 
1347  return {false, nullptr};
1348 }
1349 
1350 bool PartialInlinerImpl::tryPartialInline(FunctionCloner &Cloner) {
1351  if (Cloner.OutlinedFunctions.empty())
1352  return false;
1353 
1354  auto OutliningCosts = computeOutliningCosts(Cloner);
1355 
1356  InstructionCost SizeCost = std::get<0>(OutliningCosts);
1357  InstructionCost NonWeightedRcost = std::get<1>(OutliningCosts);
1358 
1359  assert(SizeCost.isValid() && NonWeightedRcost.isValid() &&
1360  "Expected valid costs");
1361 
1362  // Only calculate RelativeToEntryFreq when we are doing single region
1363  // outlining.
1364  BranchProbability RelativeToEntryFreq;
1365  if (Cloner.ClonedOI)
1366  RelativeToEntryFreq = getOutliningCallBBRelativeFreq(Cloner);
1367  else
1368  // RelativeToEntryFreq doesn't make sense when we have more than one
1369  // outlined call because each call will have a different relative frequency
1370  // to the entry block. We can consider using the average, but the
1371  // usefulness of that information is questionable. For now, assume we never
1372  // execute the calls to outlined functions.
1373  RelativeToEntryFreq = BranchProbability(0, 1);
1374 
1375  BlockFrequency WeightedRcost =
1376  BlockFrequency(*NonWeightedRcost.getValue()) * RelativeToEntryFreq;
1377 
1378  // The call sequence(s) to the outlined function(s) are larger than the sum of
1379  // the original outlined region size(s), it does not increase the chances of
1380  // inlining the function with outlining (The inliner uses the size increase to
1381  // model the cost of inlining a callee).
1382  if (!SkipCostAnalysis && Cloner.OutlinedRegionCost < SizeCost) {
1383  OptimizationRemarkEmitter OrigFuncORE(Cloner.OrigFunc);
1384  DebugLoc DLoc;
1385  BasicBlock *Block;
1386  std::tie(DLoc, Block) = getOneDebugLoc(*Cloner.ClonedFunc);
1387  OrigFuncORE.emit([&]() {
1388  return OptimizationRemarkAnalysis(DEBUG_TYPE, "OutlineRegionTooSmall",
1389  DLoc, Block)
1390  << ore::NV("Function", Cloner.OrigFunc)
1391  << " not partially inlined into callers (Original Size = "
1392  << ore::NV("OutlinedRegionOriginalSize", Cloner.OutlinedRegionCost)
1393  << ", Size of call sequence to outlined function = "
1394  << ore::NV("NewSize", SizeCost) << ")";
1395  });
1396  return false;
1397  }
1398 
1399  assert(Cloner.OrigFunc->users().empty() &&
1400  "F's users should all be replaced!");
1401 
1402  std::vector<User *> Users(Cloner.ClonedFunc->user_begin(),
1403  Cloner.ClonedFunc->user_end());
1404 
1405  DenseMap<User *, uint64_t> CallSiteToProfCountMap;
1406  auto CalleeEntryCount = Cloner.OrigFunc->getEntryCount();
1407  if (CalleeEntryCount)
1408  computeCallsiteToProfCountMap(Cloner.ClonedFunc, CallSiteToProfCountMap);
1409 
1410  uint64_t CalleeEntryCountV =
1411  (CalleeEntryCount ? CalleeEntryCount->getCount() : 0);
1412 
1413  bool AnyInline = false;
1414  for (User *User : Users) {
1415  // Don't bother with BlockAddress used by CallBr for asm goto.
1416  if (isa<BlockAddress>(User))
1417  continue;
1418 
1419  CallBase *CB = getSupportedCallBase(User);
1420 
1421  if (isLimitReached())
1422  continue;
1423 
1424  OptimizationRemarkEmitter CallerORE(CB->getCaller());
1425  if (!shouldPartialInline(*CB, Cloner, WeightedRcost, CallerORE))
1426  continue;
1427 
1428  // Construct remark before doing the inlining, as after successful inlining
1429  // the callsite is removed.
1430  OptimizationRemark OR(DEBUG_TYPE, "PartiallyInlined", CB);
1431  OR << ore::NV("Callee", Cloner.OrigFunc) << " partially inlined into "
1432  << ore::NV("Caller", CB->getCaller());
1433 
1434  InlineFunctionInfo IFI(nullptr, GetAssumptionCache, &PSI);
1435  // We can only forward varargs when we outlined a single region, else we
1436  // bail on vararg functions.
1437  if (!InlineFunction(*CB, IFI, /*MergeAttributes=*/false, nullptr, true,
1438  (Cloner.ClonedOI ? Cloner.OutlinedFunctions.back().first
1439  : nullptr))
1440  .isSuccess())
1441  continue;
1442 
1443  CallerORE.emit(OR);
1444 
1445  // Now update the entry count:
1446  if (CalleeEntryCountV && CallSiteToProfCountMap.count(User)) {
1447  uint64_t CallSiteCount = CallSiteToProfCountMap[User];
1448  CalleeEntryCountV -= std::min(CalleeEntryCountV, CallSiteCount);
1449  }
1450 
1451  AnyInline = true;
1452  NumPartialInlining++;
1453  // Update the stats
1454  if (Cloner.ClonedOI)
1455  NumPartialInlined++;
1456  else
1457  NumColdOutlinePartialInlined++;
1458  }
1459 
1460  if (AnyInline) {
1461  Cloner.IsFunctionInlined = true;
1462  if (CalleeEntryCount)
1463  Cloner.OrigFunc->setEntryCount(Function::ProfileCount(
1464  CalleeEntryCountV, CalleeEntryCount->getType()));
1465  OptimizationRemarkEmitter OrigFuncORE(Cloner.OrigFunc);
1466  OrigFuncORE.emit([&]() {
1467  return OptimizationRemark(DEBUG_TYPE, "PartiallyInlined", Cloner.OrigFunc)
1468  << "Partially inlined into at least one caller";
1469  });
1470  }
1471 
1472  return AnyInline;
1473 }
1474 
1477  return false;
1478 
1479  std::vector<Function *> Worklist;
1480  Worklist.reserve(M.size());
1481  for (Function &F : M)
1482  if (!F.use_empty() && !F.isDeclaration())
1483  Worklist.push_back(&F);
1484 
1485  bool Changed = false;
1486  while (!Worklist.empty()) {
1487  Function *CurrFunc = Worklist.back();
1488  Worklist.pop_back();
1489 
1490  if (CurrFunc->use_empty())
1491  continue;
1492 
1493  bool Recursive = false;
1494  for (User *U : CurrFunc->users())
1495  if (Instruction *I = dyn_cast<Instruction>(U))
1496  if (I->getParent()->getParent() == CurrFunc) {
1497  Recursive = true;
1498  break;
1499  }
1500  if (Recursive)
1501  continue;
1502 
1503  std::pair<bool, Function *> Result = unswitchFunction(*CurrFunc);
1504  if (Result.second)
1505  Worklist.push_back(Result.second);
1506  Changed |= Result.first;
1507  }
1508 
1509  return Changed;
1510 }
1511 
1513 
1514 INITIALIZE_PASS_BEGIN(PartialInlinerLegacyPass, "partial-inliner",
1515  "Partial Inliner", false, false)
1520 INITIALIZE_PASS_END(PartialInlinerLegacyPass, "partial-inliner",
1521  "Partial Inliner", false, false)
1522 
1524  return new PartialInlinerLegacyPass();
1525 }
1526 
1528  ModuleAnalysisManager &AM) {
1529  auto &FAM = AM.getResult<FunctionAnalysisManagerModuleProxy>(M).getManager();
1530 
1531  auto GetAssumptionCache = [&FAM](Function &F) -> AssumptionCache & {
1532  return FAM.getResult<AssumptionAnalysis>(F);
1533  };
1534 
1535  auto LookupAssumptionCache = [&FAM](Function &F) -> AssumptionCache * {
1537  };
1538 
1539  auto GetBFI = [&FAM](Function &F) -> BlockFrequencyInfo & {
1541  };
1542 
1543  auto GetTTI = [&FAM](Function &F) -> TargetTransformInfo & {
1544  return FAM.getResult<TargetIRAnalysis>(F);
1545  };
1546 
1547  auto GetTLI = [&FAM](Function &F) -> TargetLibraryInfo & {
1549  };
1550 
1552 
1553  if (PartialInlinerImpl(GetAssumptionCache, LookupAssumptionCache, GetTTI,
1554  GetTLI, PSI, GetBFI)
1555  .run(M))
1556  return PreservedAnalyses::none();
1557  return PreservedAnalyses::all();
1558 }
llvm::InstructionCost
Definition: InstructionCost.h:30
llvm::PreservedAnalyses
A set of analyses that are preserved following a run of a transformation pass.
Definition: PassManager.h:152
llvm::InlineCost::isAlways
bool isAlways() const
Definition: InlineCost.h:137
llvm::InlineCost::getCost
int getCost() const
Get the inline cost estimate.
Definition: InlineCost.h:143
llvm::BasicBlock::end
iterator end()
Definition: BasicBlock.h:308
llvm::TargetIRAnalysis
Analysis pass providing the TargetTransformInfo.
Definition: TargetTransformInfo.h:2586
llvm::OptimizationRemarkMissed
Diagnostic information for missed-optimization remarks.
Definition: DiagnosticInfo.h:734
llvm
This is an optimization pass for GlobalISel generic memory operations.
Definition: AddressRanges.h:18
M
We currently emits eax Perhaps this is what we really should generate is Is imull three or four cycles eax eax The current instruction priority is based on pattern complexity The former is more complex because it folds a load so the latter will not be emitted Perhaps we should use AddedComplexity to give LEA32r a higher priority We should always try to match LEA first since the LEA matching code does some estimate to determine whether the match is profitable if we care more about code then imull is better It s two bytes shorter than movl leal On a Pentium M
Definition: README.txt:252
Optional.h
ValueMapper.h
llvm::DataLayout
A parsed version of the target data layout string in and methods for querying it.
Definition: DataLayout.h:113
llvm::ISD::OR
@ OR
Definition: ISDOpcodes.h:667
llvm::AArch64PACKey::ID
ID
Definition: AArch64BaseInfo.h:818
llvm::BasicBlock::iterator
InstListType::iterator iterator
Instruction iterators...
Definition: BasicBlock.h:87
PHI
Rewrite undef for PHI
Definition: AMDGPURewriteUndefForPHI.cpp:101
llvm::ModulePass
ModulePass class - This class is used to implement unstructured interprocedural optimizations and ana...
Definition: Pass.h:248
llvm::InstructionCost::map
auto map(const Function &F) const -> InstructionCost
Definition: InstructionCost.h:246
llvm::BasicBlock::getParent
const Function * getParent() const
Return the enclosing method, or null if none.
Definition: BasicBlock.h:104
IntrinsicInst.h
llvm::AnalysisManager::getResult
PassT::Result & getResult(IRUnitT &IR, ExtraArgTs... ExtraArgs)
Get the result of an analysis pass for a given IR unit.
Definition: PassManager.h:774
PartialInlining.h
llvm::succ_end
Interval::succ_iterator succ_end(Interval *I)
Definition: Interval.h:102
llvm::Function
Definition: Function.h:60
DEBUG_TYPE
#define DEBUG_TYPE
Definition: PartialInlining.cpp:65
Pass.h
llvm::CodeExtractor::extractCodeRegion
Function * extractCodeRegion(const CodeExtractorAnalysisCache &CEAC)
Perform the extraction, returning the new function.
Definition: CodeExtractor.cpp:1628
llvm::SetVector::size
size_type size() const
Determine the number of elements in the SetVector.
Definition: SetVector.h:77
llvm::TargetTransformInfoWrapperPass::getTTI
TargetTransformInfo & getTTI(const Function &F)
Definition: TargetTransformInfo.cpp:1222
llvm::ProfileSummaryInfo::hasProfileSummary
bool hasProfileSummary() const
Returns true if profile summary is available.
Definition: ProfileSummaryInfo.h:69
llvm::InlineFunction
InlineResult InlineFunction(CallBase &CB, InlineFunctionInfo &IFI, bool MergeAttributes=false, AAResults *CalleeAAR=nullptr, bool InsertLifetime=true, Function *ForwardVarArgsTo=nullptr)
This function inlines the called function into the basic block of the caller.
Definition: InlineFunction.cpp:2046
llvm::SmallVector
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small.
Definition: SmallVector.h:1199
Statistic.h
output
Current output
Definition: README.txt:1350
ErrorHandling.h
llvm::TargetTransformInfo
This pass provides access to the codegen interfaces that are needed for IR-level transformations.
Definition: TargetTransformInfo.h:173
llvm::TargetTransformInfo::getIntrinsicInstrCost
InstructionCost getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA, TTI::TargetCostKind CostKind) const
Definition: TargetTransformInfo.cpp:968
llvm::PHINode::removeIncomingValue
Value * removeIncomingValue(unsigned Idx, bool DeletePHIIfEmpty=true)
Remove an incoming value.
Definition: Instructions.cpp:115
llvm::CallingConv::Cold
@ Cold
Attempts to make code in the caller as efficient as possible under the assumption that the call is no...
Definition: CallingConv.h:47
OptimizationRemarkEmitter.h
llvm::DominatorTree
Concrete subclass of DominatorTreeBase that is used to compute a normal dominator tree.
Definition: Dominators.h:166
FAM
FunctionAnalysisManager FAM
Definition: PassBuilderBindings.cpp:59
llvm::cl::Hidden
@ Hidden
Definition: CommandLine.h:140
MinBlockCounterExecution
static cl::opt< unsigned > MinBlockCounterExecution("min-block-execution", cl::init(100), cl::Hidden, cl::desc("Minimum block executions to consider " "its BranchProbabilityInfo valid"))
llvm::PreservedAnalyses::none
static PreservedAnalyses none()
Convenience factory function for the empty preserved set.
Definition: PassManager.h:155
BlockFrequency.h
DenseMap.h
Module.h
llvm::sys::path::end
const_iterator end(StringRef path)
Get end iterator over path.
Definition: Path.cpp:235
llvm::sys::path::begin
const_iterator begin(StringRef path, Style style=Style::native)
Get begin iterator over path.
Definition: Path.cpp:226
llvm::BasicBlock::getSingleSuccessor
const BasicBlock * getSingleSuccessor() const
Return the successor of this block if it has a single successor.
Definition: BasicBlock.cpp:315
llvm::ProfileSummaryInfo::isFunctionEntryCold
bool isFunctionEntryCold(const Function *F) const
Returns true if F has cold function entry.
Definition: ProfileSummaryInfo.cpp:223
llvm::isInlineViable
InlineResult isInlineViable(Function &Callee)
Minimal filter to detect invalid constructs for inlining.
Definition: InlineCost.cpp:2989
llvm::DenseMapBase< DenseMap< KeyT, ValueT, DenseMapInfo< KeyT >, llvm::detail::DenseMapPair< KeyT, ValueT > >, KeyT, ValueT, DenseMapInfo< KeyT >, llvm::detail::DenseMapPair< KeyT, ValueT > >::count
size_type count(const_arg_type_t< KeyT > Val) const
Return 1 if the specified key is in the map, 0 otherwise.
Definition: DenseMap.h:145
llvm::BranchProbability::getBranchProbability
static BranchProbability getBranchProbability(uint64_t Numerator, uint64_t Denominator)
Definition: BranchProbability.cpp:53
T
#define T
Definition: Mips16ISelLowering.cpp:341
llvm::ore::NV
DiagnosticInfoOptimizationBase::Argument NV
Definition: OptimizationRemarkEmitter.h:136
Operator.h
llvm::Value::user_begin
user_iterator user_begin()
Definition: Value.h:397
CodeExtractor.h
llvm::successors
auto successors(MachineBasicBlock *BB)
Definition: MachineSSAContext.h:29
llvm::max
Expected< ExpressionValue > max(const ExpressionValue &Lhs, const ExpressionValue &Rhs)
Definition: FileCheck.cpp:337
llvm::BasicBlock::getSinglePredecessor
const BasicBlock * getSinglePredecessor() const
Return the predecessor of this block if it has a single predecessor block.
Definition: BasicBlock.cpp:285
STLExtras.h
llvm::InlineCost::isNever
bool isNever() const
Definition: InlineCost.h:138
llvm::FastMathFlags
Convenience struct for specifying and reasoning about fast-math flags.
Definition: FMF.h:21
llvm::cl::ReallyHidden
@ ReallyHidden
Definition: CommandLine.h:141
LLVM_DEBUG
#define LLVM_DEBUG(X)
Definition: Debug.h:101
F
#define F(x, y, z)
Definition: MD5.cpp:55
ColdBranchRatio
static cl::opt< float > ColdBranchRatio("cold-branch-ratio", cl::init(0.1), cl::Hidden, cl::desc("Minimum BranchProbability to consider a region cold."))
llvm::BasicBlock
LLVM Basic Block Representation.
Definition: BasicBlock.h:55
InstrCost
static cl::opt< int > InstrCost("inline-instr-cost", cl::Hidden, cl::init(5), cl::desc("Cost of a single instruction when inlining"))
llvm::dbgs
raw_ostream & dbgs()
dbgs() - This returns a reference to a raw_ostream for debugging messages.
Definition: Debug.cpp:163
Instruction.h
CommandLine.h
llvm::BlockFrequencyInfo
BlockFrequencyInfo pass uses BlockFrequencyInfoImpl implementation to estimate IR basic block frequen...
Definition: BlockFrequencyInfo.h:37
llvm::PassRegistry::getPassRegistry
static PassRegistry * getPassRegistry()
getPassRegistry - Access the global registry object, which is automatically initialized at applicatio...
Definition: PassRegistry.cpp:24
llvm::getInlineCost
InlineCost getInlineCost(CallBase &Call, const InlineParams &Params, TargetTransformInfo &CalleeTTI, function_ref< AssumptionCache &(Function &)> GetAssumptionCache, function_ref< const TargetLibraryInfo &(Function &)> GetTLI, function_ref< BlockFrequencyInfo &(Function &)> GetBFI=nullptr, ProfileSummaryInfo *PSI=nullptr, OptimizationRemarkEmitter *ORE=nullptr)
Get an InlineCost object representing the cost of inlining this callsite.
Definition: InlineCost.cpp:2822
InlinePriorityMode::Cost
@ Cost
E
static GCRegistry::Add< CoreCLRGC > E("coreclr", "CoreCLR-compatible GC")
llvm::User
Definition: User.h:44
Intrinsics.h
InstrTypes.h
llvm::CallBase::getCalledFunction
Function * getCalledFunction() const
Returns the function called, or null if this is an indirect function invocation or the function signa...
Definition: InstrTypes.h:1397
llvm::BasicBlock::begin
iterator begin()
Instruction iterator methods.
Definition: BasicBlock.h:306
llvm::BranchProbabilityInfo
Analysis providing branch probability information.
Definition: BranchProbabilityInfo.h:113
SI
@ SI
Definition: SIInstrInfo.cpp:7966
llvm::AnalysisUsage
Represent the analysis usage information of a pass.
Definition: PassAnalysisSupport.h:47
llvm::InlineCost
Represents the cost of inlining a function.
Definition: InlineCost.h:90
llvm::ms_demangle::QualifierMangleMode::Result
@ Result
TargetLibraryInfo.h
DenseSet.h
false
Definition: StackSlotColoring.cpp:141
llvm::InlineConstants::getInstrCost
int getInstrCost()
Definition: InlineCost.cpp:183
llvm::succ_size
unsigned succ_size(MachineBasicBlock *BB)
Definition: MachineSSAContext.h:31
llvm::AssumptionCacheTracker::lookupAssumptionCache
AssumptionCache * lookupAssumptionCache(Function &F)
Return the cached assumptions for a function if it has already been scanned.
Definition: AssumptionCache.cpp:305
llvm::PHINode::getIncomingValueForBlock
Value * getIncomingValueForBlock(const BasicBlock *BB) const
Definition: Instructions.h:2884
llvm::pdb::PDB_SymType::Caller
@ Caller
llvm::Instruction
Definition: Instruction.h:42
llvm::STATISTIC
STATISTIC(NumFunctions, "Total number of functions")
llvm::predecessors
auto predecessors(MachineBasicBlock *BB)
Definition: MachineSSAContext.h:30
llvm::InlineResult::isSuccess
bool isSuccess() const
Definition: InlineCost.h:187
DebugLoc.h
llvm::getInlineParams
InlineParams getInlineParams()
Generate the parameters to tune the inline cost analysis based only on the commandline options.
Definition: InlineCost.cpp:3101
llvm::BlockFrequencyAnalysis
Analysis pass which computes BlockFrequencyInfo.
Definition: BlockFrequencyInfo.h:112
llvm::PHINode::getNumIncomingValues
unsigned getNumIncomingValues() const
Return the number of incoming edges.
Definition: Instructions.h:2791
MinRegionSizeRatio
static cl::opt< float > MinRegionSizeRatio("min-region-size-ratio", cl::init(0.1), cl::Hidden, cl::desc("Minimum ratio comparing relative sizes of each " "outline candidate and original function"))
llvm::Value::use_empty
bool use_empty() const
Definition: Value.h:344
llvm::CallingConv::ID
unsigned ID
LLVM IR allows to use arbitrary numbers as calling convention identifiers.
Definition: CallingConv.h:24
llvm::IntrinsicCostAttributes
Definition: TargetTransformInfo.h:120
BranchProbability.h
llvm::CallBase::getCaller
Function * getCaller()
Helper to get the caller (the parent function).
Definition: Instructions.cpp:284
INITIALIZE_PASS_END
#define INITIALIZE_PASS_END(passName, arg, name, cfg, analysis)
Definition: PassSupport.h:58
CFG.h
llvm::dxil::PointerTypeAnalysis::run
PointerTypeMap run(const Module &M)
Compute the PointerTypeMap for the module M.
Definition: PointerTypeAnalysis.cpp:189
LoopInfo.h
llvm::ProfileSummaryInfo
Analysis providing profile information.
Definition: ProfileSummaryInfo.h:40
llvm::InvokeInst
Invoke instruction.
Definition: Instructions.h:3811
llvm::all_equal
bool all_equal(std::initializer_list< T > Values)
Returns true if all Values in the initializer lists are equal or the list.
Definition: STLExtras.h:1985
OutlineRegionFreqPercent
static cl::opt< int > OutlineRegionFreqPercent("outline-region-freq-percent", cl::init(75), cl::Hidden, cl::desc("Relative frequency of outline region to " "the entry block"))
llvm::function_ref
An efficient, type-erasing, non-owning reference to a callable.
Definition: STLFunctionalExtras.h:36
llvm::Value::user_end
user_iterator user_end()
Definition: Value.h:405
llvm::DenseSet
Implements a dense probed hash-table based set.
Definition: DenseSet.h:268
BasicBlock.h
llvm::cl::opt< bool >
llvm::BlockFrequency
Definition: BlockFrequency.h:23
BranchProbabilityInfo.h
llvm::TargetLibraryInfoWrapperPass
Definition: TargetLibraryInfo.h:475
uint64_t
ProfileSummaryInfo.h
llvm::TargetTransformInfoWrapperPass
Wrapper pass for TargetTransformInfo.
Definition: TargetTransformInfo.h:2642
llvm::AssumptionAnalysis
A function analysis which provides an AssumptionCache.
Definition: AssumptionCache.h:173
INITIALIZE_PASS_DEPENDENCY
INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
IPO.h
llvm::PHINode::addIncoming
void addIncoming(Value *V, BasicBlock *BB)
Add an incoming value to the end of the PHI list.
Definition: Instructions.h:2849
ProfDataUtils.h
llvm::DenseMap
Definition: DenseMap.h:714
MaxNumPartialInlining
static cl::opt< int > MaxNumPartialInlining("max-partial-inlining", cl::init(-1), cl::Hidden, cl::desc("Max number of partial inlining. The default is unlimited"))
I
#define I(x, y, z)
Definition: MD5.cpp:58
Cloning.h
llvm::succ_begin
Interval::succ_iterator succ_begin(Interval *I)
succ_begin/succ_end - define methods so that Intervals may be used just like BasicBlocks can with the...
Definition: Interval.h:99
llvm::BlockFrequency::getFrequency
uint64_t getFrequency() const
Returns the frequency as a fixpoint number scaled by the entry frequency.
Definition: BlockFrequency.h:34
MarkOutlinedColdCC
static cl::opt< bool > MarkOutlinedColdCC("pi-mark-coldcc", cl::init(false), cl::Hidden, cl::desc("Mark outline function calls with ColdCC"))
llvm::cl::init
initializer< Ty > init(const Ty &Val)
Definition: CommandLine.h:447
llvm::ProfileSummaryInfo::getColdCountThreshold
uint64_t getColdCountThreshold() const
Returns ColdCountThreshold if set.
Definition: ProfileSummaryInfo.h:177
llvm::ProfileSummaryInfoWrapperPass
An analysis pass based on legacy pass manager to deliver ProfileSummaryInfo.
Definition: ProfileSummaryInfo.h:194
llvm::InlineCost::getCostDelta
int getCostDelta() const
Get the cost delta from the threshold for inlining.
Definition: InlineCost.h:173
llvm::is_contained
bool is_contained(R &&Range, const E &Element)
Wrapper function around std::find to detect if an element exists in a container.
Definition: STLExtras.h:1868
assert
assert(ImpDefSCC.getReg()==AMDGPU::SCC &&ImpDefSCC.isDef())
llvm::OptimizationRemarkEmitter::emit
void emit(DiagnosticInfoOptimizationBase &OptDiag)
Output the remark via the diagnostic handler and to the optimization record file.
Definition: OptimizationRemarkEmitter.cpp:77
InlineCost.h
llvm::ProfileSummaryInfo::hasInstrumentationProfile
bool hasInstrumentationProfile() const
Returns true if module M has instrumentation profile.
Definition: ProfileSummaryInfo.h:78
llvm::WinEH::EncodingType::CE
@ CE
Windows NT (Windows on ARM)
llvm::InstructionCost::getValue
std::optional< CostType > getValue() const
This function is intended to be used as sparingly as possible, since the class provides the full rang...
Definition: InstructionCost.h:88
inliner
partial inliner
Definition: PartialInlining.cpp:1520
llvm::Module
A Module instance is used to store all the information related to an LLVM module.
Definition: Module.h:65
OtherSucc
static MachineBasicBlock * OtherSucc(MachineBasicBlock *MBB, MachineBasicBlock *Succ)
Definition: ARMISelLowering.cpp:11118
DisablePartialInlining
static cl::opt< bool > DisablePartialInlining("disable-partial-inlining", cl::init(false), cl::Hidden, cl::desc("Disable partial inlining"))
hasProfileData
static bool hasProfileData(const Function &F, const FunctionOutliningInfo &OI)
Definition: PartialInlining.cpp:712
llvm::ProfileSummaryAnalysis
An analysis pass based on the new PM to deliver ProfileSummaryInfo.
Definition: ProfileSummaryInfo.h:212
llvm::initializePartialInlinerLegacyPassPass
void initializePartialInlinerLegacyPassPass(PassRegistry &)
MaxNumInlineBlocks
static cl::opt< unsigned > MaxNumInlineBlocks("max-num-inline-blocks", cl::init(5), cl::Hidden, cl::desc("Max number of blocks to be partially inlined"))
llvm::DominatorTreeBase::recalculate
void recalculate(ParentType &Func)
recalculate - compute a dominator tree for the given function
Definition: GenericDomTree.h:778
llvm::AssumptionCacheTracker
An immutable pass that tracks lazily created AssumptionCache objects.
Definition: AssumptionCache.h:202
llvm::ArrayRef
ArrayRef - Represent a constant reference to an array (0 or more elements consecutively in memory),...
Definition: APInt.h:32
llvm::LoopInfo
Definition: LoopInfo.h:1108
llvm::OptimizationRemarkEmitter
The optimization diagnostic interface.
Definition: OptimizationRemarkEmitter.h:33
llvm::min
Expected< ExpressionValue > min(const ExpressionValue &Lhs, const ExpressionValue &Rhs)
Definition: FileCheck.cpp:357
llvm::Function::setCallingConv
void setCallingConv(CallingConv::ID CC)
Definition: Function.h:242
llvm::AssumptionCache
A cache of @llvm.assume calls within a function.
Definition: AssumptionCache.h:42
llvm::TargetTransformInfo::TCK_SizeAndLatency
@ TCK_SizeAndLatency
The weighted sum of size and latency.
Definition: TargetTransformInfo.h:222
DP
So we should use XX3Form_Rcr to implement intrinsic Convert DP outs ins xscvdpsp No builtin are required Round &Convert QP DP(dword[1] is set to zero) No builtin are required Round to Quad Precision because you need to assign rounding mode in instruction Provide builtin(set f128:$vT,(int_ppc_vsx_xsrqpi f128:$vB))(set f128 yields< n x< ty > >< result > yields< ty >< result > No builtin are required Load Store load store see def memrix16 in PPCInstrInfo td Load Store Vector load store outs ins lxsdx set load store with conversion from to DP
Definition: README_P9.txt:520
llvm_unreachable
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
Definition: ErrorHandling.h:143
llvm::Value::getType
Type * getType() const
All values are typed, get the type of this value.
Definition: Value.h:255
llvm::Value::replaceAllUsesWith
void replaceAllUsesWith(Value *V)
Change all uses of this to point to a new Value.
Definition: Value.cpp:532
llvm::BranchProbability
Definition: BranchProbability.h:30
DL
MachineBasicBlock MachineBasicBlock::iterator DebugLoc DL
Definition: AArch64SLSHardening.cpp:76
llvm::CodeExtractor
Utility class for extracting code into a new function.
Definition: CodeExtractor.h:85
llvm::RegionInfo
Definition: RegionInfo.h:900
llvm::InstructionCost::isValid
bool isValid() const
Definition: InstructionCost.h:80
llvm::CloneFunction
Function * CloneFunction(Function *F, ValueToValueMapTy &VMap, ClonedCodeInfo *CodeInfo=nullptr)
Return a copy of the specified function and add it to that function's module.
Definition: CloneFunction.cpp:297
BlockFrequencyInfo.h
llvm::AMDGPUISD::BFI
@ BFI
Definition: AMDGPUISelLowering.h:432
llvm::OptimizationRemarkAnalysis
Diagnostic information for optimization analysis remarks.
Definition: DiagnosticInfo.h:780
llvm::ifs::IFSSymbolType::Func
@ Func
llvm::ValueMap< const Value *, WeakTrackingVH >
Callee
amdgpu Simplify well known AMD library false FunctionCallee Callee
Definition: AMDGPULibCalls.cpp:187
ForceLiveExit
static cl::opt< bool > ForceLiveExit("pi-force-live-exit-outline", cl::init(false), cl::Hidden, cl::desc("Force outline regions with live exits"))
Attributes.h
llvm::AssumptionCacheTracker::getAssumptionCache
AssumptionCache & getAssumptionCache(Function &F)
Get the cached assumptions for a function.
Definition: AssumptionCache.cpp:285
llvm::PreservedAnalyses::all
static PreservedAnalyses all()
Construct a special preserved set that preserves all passes.
Definition: PassManager.h:158
llvm::ISD::BR
@ BR
Control flow instructions. These all have token chains.
Definition: ISDOpcodes.h:982
llvm::Function::back
const BasicBlock & back() const
Definition: Function.h:716
llvm::AnalysisManager::getCachedResult
PassT::Result * getCachedResult(IRUnitT &IR) const
Get the cached result of an analysis pass for a given IR unit.
Definition: PassManager.h:793
Casting.h
DiagnosticInfo.h
Function.h
SkipCostAnalysis
static cl::opt< bool > SkipCostAnalysis("skip-partial-inlining-cost-analysis", cl::ReallyHidden, cl::desc("Skip Cost Analysis"))
llvm::TargetLibraryInfo
Provides information about what library functions are available for the current target.
Definition: TargetLibraryInfo.h:226
llvm::getCallsiteCost
int getCallsiteCost(const CallBase &Call, const DataLayout &DL)
Return the cost associated with a callsite, including parameter passing and the call/return instructi...
Definition: InlineCost.cpp:2788
llvm::InlineFunctionInfo
This class captures the data input to the InlineFunction call, and records the auxiliary results prod...
Definition: Cloning.h:203
Inliner
partial Partial Inliner
Definition: PartialInlining.cpp:1521
llvm::CodeExtractorAnalysisCache
A cache for the CodeExtractor analysis.
Definition: CodeExtractor.h:46
llvm::ProfileSummaryInfo::isColdBlock
bool isColdBlock(const BasicBlock *BB, BlockFrequencyInfo *BFI) const
Returns true if BasicBlock BB is considered cold.
Definition: ProfileSummaryInfo.cpp:333
DisableMultiRegionPartialInline
static cl::opt< bool > DisableMultiRegionPartialInline("disable-mr-partial-inlining", cl::init(false), cl::Hidden, cl::desc("Disable multi-region partial inlining"))
llvm::OptimizationRemark
Diagnostic information for applied optimization remarks.
Definition: DiagnosticInfo.h:689
Instructions.h
SmallVector.h
llvm::Instruction::getDebugLoc
const DebugLoc & getDebugLoc() const
Return the debug location for this node as a DebugLoc.
Definition: Instruction.h:359
User.h
llvm::BlockFrequencyInfo::getBlockProfileCount
Optional< uint64_t > getBlockProfileCount(const BasicBlock *BB, bool AllowSynthetic=false) const
Returns the estimated profile count of BB.
Definition: BlockFrequencyInfo.cpp:208
Dominators.h
llvm::Instruction::getParent
const BasicBlock * getParent() const
Definition: Instruction.h:91
Users
iv Induction Variable Users
Definition: IVUsers.cpp:48
TargetTransformInfo.h
llvm::DenseMapBase::reserve
void reserve(size_type NumEntries)
Grow the densemap so that it can contain at least NumEntries items before resizing again.
Definition: DenseMap.h:103
llvm::PHINode
Definition: Instructions.h:2699
llvm::BasicBlock::getTerminator
const Instruction * getTerminator() const LLVM_READONLY
Returns the terminator instruction if the block is well formed or null if the block is not well forme...
Definition: BasicBlock.h:119
llvm::ProfileSummaryInfo::getHotCountThreshold
uint64_t getHotCountThreshold() const
Returns HotCountThreshold if set.
Definition: ProfileSummaryInfo.h:173
llvm::MipsISD::Ins
@ Ins
Definition: MipsISelLowering.h:160
llvm::SmallVectorImpl< BasicBlock * >
llvm::CallBase
Base class for all callable instructions (InvokeInst and CallInst) Holds everything related to callin...
Definition: InstrTypes.h:1175
llvm::AnalysisManager
A container for analyses that lazily runs them and caches their results.
Definition: InstructionSimplify.h:42
llvm::createPartialInliningPass
ModulePass * createPartialInliningPass()
createPartialInliningPass - This pass inlines parts of functions.
Definition: PartialInlining.cpp:1523
llvm::InnerAnalysisManagerProxy
An analysis over an "outer" IR unit that provides access to an analysis manager over an "inner" IR un...
Definition: PassManager.h:931
llvm::CallInst
This class represents a function call, abstracting a target machine's calling convention.
Definition: Instructions.h:1474
llvm::extractBranchWeights
bool extractBranchWeights(const MDNode *ProfileData, SmallVectorImpl< uint32_t > &Weights)
Extract branch weights from MD_prof metadata.
Definition: ProfDataUtils.cpp:104
BB
Common register allocation spilling lr str ldr sxth r3 ldr mla r4 can lr mov lr str ldr sxth r3 mla r4 and then merge mul and lr str ldr sxth r3 mla r4 It also increase the likelihood the store may become dead bb27 Successors according to LLVM BB
Definition: README.txt:39
INITIALIZE_PASS_BEGIN
INITIALIZE_PASS_BEGIN(PartialInlinerLegacyPass, "partial-inliner", "Partial Inliner", false, false) INITIALIZE_PASS_END(PartialInlinerLegacyPass
ExtraOutliningPenalty
static cl::opt< unsigned > ExtraOutliningPenalty("partial-inlining-extra-penalty", cl::init(0), cl::Hidden, cl::desc("A debug option to add additional penalty to the computed one."))
llvm::AnalysisUsage::addRequired
AnalysisUsage & addRequired()
Definition: PassAnalysisSupport.h:75
llvm::SwitchInst
Multiway switch.
Definition: Instructions.h:3278
llvm::DebugLoc
A debug info location.
Definition: DebugLoc.h:33
llvm::Function::ProfileCount
Class to represent profile counts.
Definition: Function.h:253
llvm::cl::desc
Definition: CommandLine.h:413
llvm::Region
Definition: RegionInfo.h:889
llvm::BranchInst
Conditional or Unconditional Branch instruction.
Definition: Instructions.h:3134
llvm::SetVector< Value * >
llvm::pdb::PDB_SymType::Block
@ Block
InitializePasses.h
llvm::Value
LLVM Value Representation.
Definition: Value.h:74
llvm::TargetLibraryAnalysis
Analysis pass providing the TargetLibraryInfo.
Definition: TargetLibraryInfo.h:450
llvm::Value::users
iterator_range< user_iterator > users()
Definition: Value.h:421
llvm::CallBase::setCallingConv
void setCallingConv(CallingConv::ID CC)
Definition: InstrTypes.h:1460
llvm::Intrinsic::ID
unsigned ID
Definition: TargetTransformInfo.h:39