LLVM  13.0.0git
AMDGPUAlwaysInlinePass.cpp
Go to the documentation of this file.
1 //===-- AMDGPUAlwaysInlinePass.cpp - Inline All Functions -----------------===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 /// \file
10 /// This pass marks functions always_inline: those using LDS or region-address
11 /// globals and, if calls are disabled, all used definitions lacking noinline.
12 //
13 //===----------------------------------------------------------------------===//
14 
15 #include "AMDGPU.h"
16 #include "AMDGPUTargetMachine.h"
17 #include "Utils/AMDGPUBaseInfo.h"
18 #include "llvm/IR/Module.h"
19 #include "llvm/Pass.h"
21 
22 using namespace llvm;
23 
24 namespace {
25 
// Hidden boolean command-line option, off by default. When it is set,
// alwaysInlineImpl tags used function definitions with noinline (unless they
// were already selected for forced inlining or carry always_inline) instead
// of tagging them always_inline.
26 static cl::opt<bool> StressCalls(
27  "amdgpu-stress-function-calls",
28  cl::Hidden,
29  cl::desc("Force all functions to be noinline"),
30  cl::init(false));
31 
/// ModulePass wrapper whose runOnModule forwards to alwaysInlineImpl with the
/// GlobalOpt flag captured at construction time.
32 class AMDGPUAlwaysInline : public ModulePass {
// Forwarded to alwaysInlineImpl, where it decides whether function aliases
// whose uses were replaced are also erased from the module.
33  bool GlobalOpt;
34 
35 public:
// Pass identification member handed to the ModulePass base constructor.
36  static char ID;
37 
// GlobalOpt defaults to false here.
38  AMDGPUAlwaysInline(bool GlobalOpt = false) :
39  ModulePass(ID), GlobalOpt(GlobalOpt) { }
40  bool runOnModule(Module &M) override;
41 
// Reports every analysis as preserved.
// NOTE(review): runOnModule may add/remove function attributes and rewrite
// alias uses; confirm that no consumer analysis depends on those.
42  void getAnalysisUsage(AnalysisUsage &AU) const override {
43  AU.setPreservesAll();
44  }
45 };
46 
47 } // End anonymous namespace
48 
49 INITIALIZE_PASS(AMDGPUAlwaysInline, "amdgpu-always-inline",
50  "AMDGPU Inline All Functions", false, false)
51 
52 char AMDGPUAlwaysInline::ID = 0;
53 
// Worklist walk over the transitive users of GV.
//
// For each user that is an Instruction, the containing function is looked up.
// If that function does not have an entry-function calling convention, its
// noinline attribute is removed, it is recorded in FuncsToAlwaysInline, and
// the function itself is pushed on the worklist so that its own users (its
// callers) are processed the same way. Users that are not instructions simply
// have their users appended to the worklist.
//
// NOTE(review): this listing skips source lines 55 and 59. Line 55 should hold
// the function name and the GV parameter, and line 59 presumably declares the
// `Visited` set used below to skip already-seen users -- confirm against the
// original file.
54 static void
56  SmallPtrSetImpl<Function *> &FuncsToAlwaysInline) {
// Seed the worklist with the direct users of GV.
57  SmallVector<User *, 16> Stack(GV.users());
58 
60 
61  while (!Stack.empty()) {
62  User *U = Stack.pop_back_val();
// Process each user at most once.
63  if (!Visited.insert(U).second)
64  continue;
65 
66  if (Instruction *I = dyn_cast<Instruction>(U)) {
// Function that contains the using instruction.
67  Function *F = I->getParent()->getParent();
68  if (!AMDGPU::isEntryFunctionCC(F->getCallingConv())) {
69  // FIXME: This is a horrible hack. We should always respect noinline,
70  // and just let us hit the error when we can't handle this.
71  //
72  // Unfortunately, clang adds noinline to all functions at -O0. We have
73  // to override this here until that's fixed.
74  F->removeFnAttr(Attribute::NoInline);
75 
76  FuncsToAlwaysInline.insert(F);
// Queue F itself so that the functions calling it are visited too.
77  Stack.push_back(F);
78  }
79 
80  // No need to look at further users, but we do need to inline any callers.
81  continue;
82  }
83 
// Not an instruction: keep following the use chain through this user.
84  append_range(Stack, U->users());
85  }
86 }
87 
// Implementation shared by AMDGPUAlwaysInline::runOnModule and the
// ModuleAnalysisManager-based run entry point.
//
// Steps, in order:
//  1. For every alias whose aliasee is a Function, replace all uses of the
//     alias with that function. The alias is erased only if GlobalOpt is set.
//  2. For every global variable that passes the address-space check below,
//     collect the functions to force-inline via recursivelyVisitUsers.
//  3. If function calls are disabled or StressCalls is set, classify every
//     used function definition that lacks the conflicting attribute.
//  4. Add always_inline / noinline to the collected functions.
//
// Returns true if at least one function was selected in steps 2-3.
// NOTE(review): the alias rewriting in step 1 is not reflected in the return
// value -- confirm whether that is intended.
88 static bool alwaysInlineImpl(Module &M, bool GlobalOpt) {
89  std::vector<GlobalAlias*> AliasesToRemove;
90 
91  SmallPtrSet<Function *, 8> FuncsToAlwaysInline;
92  SmallPtrSet<Function *, 8> FuncsToNoInline;
93 
94  for (GlobalAlias &A : M.aliases()) {
95  if (Function* F = dyn_cast<Function>(A.getAliasee())) {
96  A.replaceAllUsesWith(F);
97  AliasesToRemove.push_back(&A);
98  }
99 
100  // FIXME: If the aliasee isn't a function, it's some kind of constant expr
101  // cast that won't be inlined through.
102  }
103 
// Erasure is deferred until the alias iteration above has finished.
104  if (GlobalOpt) {
105  for (GlobalAlias* A : AliasesToRemove) {
106  A->eraseFromParent();
107  }
108  }
109 
110  // Always force inlining of any function that uses an LDS global address. This
111  // is something of a workaround because we don't have a way of supporting LDS
112  // objects defined in functions. LDS is always allocated by a kernel, and it
113  // is difficult to manage LDS usage if a function may be used by multiple
114  // kernels.
115  //
116  // OpenCL doesn't allow declaring LDS in non-kernels, so in practice this
117  // should only appear when IPO passes manage to move LDS defined in a kernel
118  // into a single user function.
119 
120  for (GlobalVariable &GV : M.globals()) {
121  // TODO: Region address
122  unsigned AS = GV.getAddressSpace();
123  if ((AS == AMDGPUAS::REGION_ADDRESS) ||
124  (AS == AMDGPUAS::LOCAL_ADDRESS &&
// NOTE(review): source line 125 (the remainder of this condition and the
// opening brace) is missing from this listing; the page's symbol index lists
// AMDGPUTargetMachine::EnableLowerModuleLDS, which is presumably tested here
// -- confirm against the original file.
126  recursivelyVisitUsers(GV, FuncsToAlwaysInline);
127  }
128 
129  if (!AMDGPUTargetMachine::EnableFunctionCalls || StressCalls) {
// The attribute that conflicts with the one about to be added; functions
// that already carry it are left alone.
130  auto IncompatAttr
131  = StressCalls ? Attribute::AlwaysInline : Attribute::NoInline;
132 
133  for (Function &F : M) {
// Only definitions that have at least one use are considered.
134  if (!F.isDeclaration() && !F.use_empty() &&
135  !F.hasFnAttribute(IncompatAttr)) {
136  if (StressCalls) {
// Functions already selected for forced inlining keep that decision.
137  if (!FuncsToAlwaysInline.count(&F))
138  FuncsToNoInline.insert(&F);
139  } else
140  FuncsToAlwaysInline.insert(&F);
141  }
142  }
143  }
144 
145  for (Function *F : FuncsToAlwaysInline)
146  F->addFnAttr(Attribute::AlwaysInline);
147 
148  for (Function *F : FuncsToNoInline)
149  F->addFnAttr(Attribute::NoInline);
150 
151  return !FuncsToAlwaysInline.empty() || !FuncsToNoInline.empty();
152 }
153 
// Runs alwaysInlineImpl on M with the stored GlobalOpt flag and returns its
// result as the "module changed" indicator.
154 bool AMDGPUAlwaysInline::runOnModule(Module &M) {
155  return alwaysInlineImpl(M, GlobalOpt);
156 }
157 
// NOTE(review): the signature line (source line 158) is missing from this
// listing; the page's symbol index gives it as
// `ModulePass *createAMDGPUAlwaysInlinePass(bool GlobalOpt=true)` -- confirm.
// Allocates a new AMDGPUAlwaysInline pass carrying the GlobalOpt flag.
159  return new AMDGPUAlwaysInline(GlobalOpt);
160 }
161 
// NOTE(review): the first signature line (source line 162) is missing from
// this listing; the page's symbol index lists this definition as
// `PreservedAnalyses llvm::AMDGPUAlwaysInlinePass::run(Module &M,
// ModuleAnalysisManager &AM)` -- confirm against the original file.
163  ModuleAnalysisManager &AM) {
// The boolean result of alwaysInlineImpl is discarded and every analysis is
// reported as preserved, even when attributes were added or aliases rewritten.
// NOTE(review): confirm that reporting all analyses preserved is intended.
164  alwaysInlineImpl(M, GlobalOpt);
165  return PreservedAnalyses::all();
166 }
llvm::PreservedAnalyses
A set of analyses that are preserved following a run of a transformation pass.
Definition: PassManager.h:155
llvm
Definition: AllocatorList.h:23
M
We currently emits eax Perhaps this is what we really should generate is Is imull three or four cycles eax eax The current instruction priority is based on pattern complexity The former is more complex because it folds a load so the latter will not be emitted Perhaps we should use AddedComplexity to give LEA32r a higher priority We should always try to match LEA first since the LEA matching code does some estimate to determine whether the match is profitable if we care more about code then imull is better It s two bytes shorter than movl leal On a Pentium M
Definition: README.txt:252
llvm::ModulePass
ModulePass class - This class is used to implement unstructured interprocedural optimizations and ana...
Definition: Pass.h:238
llvm::Function
Definition: Function.h:61
Pass.h
recursivelyVisitUsers
static INITIALIZE_PASS(AMDGPUAlwaysInline, "amdgpu-always-inline", "AMDGPU Inline All Functions", false, false) char AMDGPUAlwaysInline void recursivelyVisitUsers(GlobalValue &GV, SmallPtrSetImpl< Function * > &FuncsToAlwaysInline)
Definition: AMDGPUAlwaysInlinePass.cpp:55
llvm::SmallVector
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small.
Definition: SmallVector.h:1167
llvm::GlobalVariable
Definition: GlobalVariable.h:40
llvm::GlobalAlias
Definition: GlobalAlias.h:27
llvm::cl::Hidden
@ Hidden
Definition: CommandLine.h:143
Module.h
llvm::SmallPtrSet< const Value *, 8 >
llvm::AMDGPUAS::REGION_ADDRESS
@ REGION_ADDRESS
Address space for region memory. (GDS)
Definition: AMDGPU.h:361
llvm::AMDGPUAS::LOCAL_ADDRESS
@ LOCAL_ADDRESS
Address space for local memory.
Definition: AMDGPU.h:364
F
#define F(x, y, z)
Definition: MD5.cpp:56
CommandLine.h
llvm::User
Definition: User.h:44
INITIALIZE_PASS
#define INITIALIZE_PASS(passName, arg, name, cfg, analysis)
Definition: PassSupport.h:37
llvm::AnalysisUsage
Represent the analysis usage information of a pass.
Definition: PassAnalysisSupport.h:47
llvm::Instruction
Definition: Instruction.h:45
alwaysInlineImpl
static bool alwaysInlineImpl(Module &M, bool GlobalOpt)
Definition: AMDGPUAlwaysInlinePass.cpp:88
llvm::AMDGPU::isEntryFunctionCC
bool isEntryFunctionCC(CallingConv::ID CC)
Definition: AMDGPUBaseInfo.cpp:1370
llvm::createAMDGPUAlwaysInlinePass
ModulePass * createAMDGPUAlwaysInlinePass(bool GlobalOpt=true)
Definition: AMDGPUAlwaysInlinePass.cpp:158
llvm::cl::opt< bool >
llvm::GlobalValue
Definition: GlobalValue.h:44
I
#define I(x, y, z)
Definition: MD5.cpp:59
llvm::cl::init
initializer< Ty > init(const Ty &Val)
Definition: CommandLine.h:443
llvm::Module
A Module instance is used to store all the information related to an LLVM module.
Definition: Module.h:67
llvm::AMDGPUTargetMachine::EnableLowerModuleLDS
static bool EnableLowerModuleLDS
Definition: AMDGPUTargetMachine.h:38
llvm::SmallPtrSetImpl::count
size_type count(ConstPtrType Ptr) const
count - Return 1 if the specified pointer is in the set, 0 otherwise.
Definition: SmallPtrSet.h:382
llvm::AMDGPUAlwaysInlinePass::run
PreservedAnalyses run(Module &M, ModuleAnalysisManager &AM)
Definition: AMDGPUAlwaysInlinePass.cpp:162
AMDGPU.h
llvm::append_range
void append_range(Container &C, Range &&R)
Wrapper function to append a range to a container.
Definition: STLExtras.h:1672
llvm::AnalysisUsage::setPreservesAll
void setPreservesAll()
Set by analyses that do not transform their input at all.
Definition: PassAnalysisSupport.h:130
llvm::PreservedAnalyses::all
static PreservedAnalyses all()
Construct a special preserved set that preserves all passes.
Definition: PassManager.h:161
llvm::AMDGPUTargetMachine::EnableFunctionCalls
static bool EnableFunctionCalls
Definition: AMDGPUTargetMachine.h:36
llvm::SmallPtrSetImpl
A templated base class for SmallPtrSet which provides the typesafe interface that is common across al...
Definition: SmallPtrSet.h:343
llvm::AnalysisManager
A container for analyses that lazily runs them and caches their results.
Definition: InstructionSimplify.h:44
llvm::cl::desc
Definition: CommandLine.h:414
llvm::Value::users
iterator_range< user_iterator > users()
Definition: Value.h:422
AMDGPUTargetMachine.h
AMDGPUBaseInfo.h
llvm::SmallPtrSetImpl::insert
std::pair< iterator, bool > insert(PtrType Ptr)
Inserts Ptr if and only if there is no element in the container equal to Ptr.
Definition: SmallPtrSet.h:364
llvm::Intrinsic::ID
unsigned ID
Definition: TargetTransformInfo.h:38