LLVM  14.0.0git
AMDGPUAlwaysInlinePass.cpp
Go to the documentation of this file.
1 //===-- AMDGPUAlwaysInlinePass.cpp - Promote Allocas ----------------------===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 /// \file
10 /// This pass marks non-entry functions that use LDS or region globals (and their
11 /// callers) as always_inline, and can force always_inline or noinline on all used function definitions.
12 //
13 //===----------------------------------------------------------------------===//
14 
15 #include "AMDGPU.h"
16 #include "AMDGPUTargetMachine.h"
17 #include "Utils/AMDGPUBaseInfo.h"
19 #include "llvm/IR/Module.h"
20 #include "llvm/Pass.h"
22 
23 using namespace llvm;
24 
25 namespace {
26 
27 static cl::opt<bool> StressCalls(
28  "amdgpu-stress-function-calls",
29  cl::Hidden,
30  cl::desc("Force all functions to be noinline"),
31  cl::init(false));
32 
33 class AMDGPUAlwaysInline : public ModulePass {
34  bool GlobalOpt;
35 
36 public:
37  static char ID;
38 
39  AMDGPUAlwaysInline(bool GlobalOpt = false) :
40  ModulePass(ID), GlobalOpt(GlobalOpt) { }
41  bool runOnModule(Module &M) override;
42 
43  void getAnalysisUsage(AnalysisUsage &AU) const override {
44  AU.setPreservesAll();
45  }
46 };
47 
48 } // End anonymous namespace
49 
50 INITIALIZE_PASS(AMDGPUAlwaysInline, "amdgpu-always-inline",
51  "AMDGPU Inline All Functions", false, false)
52 
53 char AMDGPUAlwaysInline::ID = 0;
54 
55 static void
57  SmallPtrSetImpl<Function *> &FuncsToAlwaysInline) {
58  SmallVector<User *, 16> Stack(GV.users());
59 
61 
62  while (!Stack.empty()) {
63  User *U = Stack.pop_back_val();
64  if (!Visited.insert(U).second)
65  continue;
66 
67  if (Instruction *I = dyn_cast<Instruction>(U)) {
68  Function *F = I->getParent()->getParent();
69  if (!AMDGPU::isEntryFunctionCC(F->getCallingConv())) {
70  // FIXME: This is a horrible hack. We should always respect noinline,
71  // and just let us hit the error when we can't handle this.
72  //
73  // Unfortunately, clang adds noinline to all functions at -O0. We have
74  // to override this here until that's fixed.
75  F->removeFnAttr(Attribute::NoInline);
76 
77  FuncsToAlwaysInline.insert(F);
78  Stack.push_back(F);
79  }
80 
81  // No need to look at further users, but we do need to inline any callers.
82  continue;
83  }
84 
85  append_range(Stack, U->users());
86  }
87 }
88 
89 static bool alwaysInlineImpl(Module &M, bool GlobalOpt) {
90  std::vector<GlobalAlias*> AliasesToRemove;
91 
92  SmallPtrSet<Function *, 8> FuncsToAlwaysInline;
93  SmallPtrSet<Function *, 8> FuncsToNoInline;
94  Triple TT(M.getTargetTriple());
95 
96  for (GlobalAlias &A : M.aliases()) {
97  if (Function* F = dyn_cast<Function>(A.getAliasee())) {
98  if (TT.getArch() == Triple::amdgcn &&
99  A.getLinkage() != GlobalValue::InternalLinkage)
100  continue;
101  A.replaceAllUsesWith(F);
102  AliasesToRemove.push_back(&A);
103  }
104 
105  // FIXME: If the aliasee isn't a function, it's some kind of constant expr
106  // cast that won't be inlined through.
107  }
108 
109  if (GlobalOpt) {
110  for (GlobalAlias* A : AliasesToRemove) {
111  A->eraseFromParent();
112  }
113  }
114 
115  // Always force inlining of any function that uses an LDS global address. This
116  // is something of a workaround because we don't have a way of supporting LDS
117  // objects defined in functions. LDS is always allocated by a kernel, and it
118  // is difficult to manage LDS usage if a function may be used by multiple
119  // kernels.
120  //
121  // OpenCL doesn't allow declaring LDS in non-kernels, so in practice this
122  // should only appear when IPO passes manages to move LDs defined in a kernel
123  // into a single user function.
124 
125  for (GlobalVariable &GV : M.globals()) {
126  // TODO: Region address
127  unsigned AS = GV.getAddressSpace();
128  if ((AS == AMDGPUAS::REGION_ADDRESS) ||
129  (AS == AMDGPUAS::LOCAL_ADDRESS &&
130  (!AMDGPUTargetMachine::EnableLowerModuleLDS || !GV.hasInitializer())))
131  recursivelyVisitUsers(GV, FuncsToAlwaysInline);
132  }
133 
134  if (!AMDGPUTargetMachine::EnableFunctionCalls || StressCalls) {
135  auto IncompatAttr
136  = StressCalls ? Attribute::AlwaysInline : Attribute::NoInline;
137 
138  for (Function &F : M) {
139  if (!F.isDeclaration() && !F.use_empty() &&
140  !F.hasFnAttribute(IncompatAttr)) {
141  if (StressCalls) {
142  if (!FuncsToAlwaysInline.count(&F))
143  FuncsToNoInline.insert(&F);
144  } else
145  FuncsToAlwaysInline.insert(&F);
146  }
147  }
148  }
149 
150  for (Function *F : FuncsToAlwaysInline)
151  F->addFnAttr(Attribute::AlwaysInline);
152 
153  for (Function *F : FuncsToNoInline)
154  F->addFnAttr(Attribute::NoInline);
155 
156  return !FuncsToAlwaysInline.empty() || !FuncsToNoInline.empty();
157 }
158 
159 bool AMDGPUAlwaysInline::runOnModule(Module &M) {
160  return alwaysInlineImpl(M, GlobalOpt);
161 }
162 
164  return new AMDGPUAlwaysInline(GlobalOpt);
165 }
166 
168  ModuleAnalysisManager &AM) {
169  alwaysInlineImpl(M, GlobalOpt);
170  return PreservedAnalyses::all();
171 }
llvm::PreservedAnalyses
A set of analyses that are preserved following a run of a transformation pass.
Definition: PassManager.h:155
llvm
This file implements support for optimizing divisions by a constant.
Definition: AllocatorList.h:23
M
We currently emits eax Perhaps this is what we really should generate is Is imull three or four cycles eax eax The current instruction priority is based on pattern complexity The former is more complex because it folds a load so the latter will not be emitted Perhaps we should use AddedComplexity to give LEA32r a higher priority We should always try to match LEA first since the LEA matching code does some estimate to determine whether the match is profitable if we care more about code then imull is better It s two bytes shorter than movl leal On a Pentium M
Definition: README.txt:252
llvm::ModulePass
ModulePass class - This class is used to implement unstructured interprocedural optimizations and ana...
Definition: Pass.h:238
llvm::Function
Definition: Function.h:62
Pass.h
recursivelyVisitUsers
static INITIALIZE_PASS(AMDGPUAlwaysInline, "amdgpu-always-inline", "AMDGPU Inline All Functions", false, false) char AMDGPUAlwaysInline void recursivelyVisitUsers(GlobalValue &GV, SmallPtrSetImpl< Function * > &FuncsToAlwaysInline)
Definition: AMDGPUAlwaysInlinePass.cpp:56
llvm::SmallVector
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small.
Definition: SmallVector.h:1168
llvm::Triple::amdgcn
@ amdgcn
Definition: Triple.h:72
llvm::GlobalVariable
Definition: GlobalVariable.h:40
llvm::GlobalAlias
Definition: GlobalAlias.h:27
llvm::Triple
Triple - Helper class for working with autoconf configuration names.
Definition: Triple.h:45
llvm::cl::Hidden
@ Hidden
Definition: CommandLine.h:143
Module.h
llvm::SmallPtrSet< const Value *, 8 >
F
#define F(x, y, z)
Definition: MD5.cpp:56
CommandLine.h
llvm::User
Definition: User.h:44
INITIALIZE_PASS
#define INITIALIZE_PASS(passName, arg, name, cfg, analysis)
Definition: PassSupport.h:37
llvm::AMDGPUAS::LOCAL_ADDRESS
@ LOCAL_ADDRESS
Address space for local memory.
Definition: AMDGPU.h:363
llvm::AnalysisUsage
Represent the analysis usage information of a pass.
Definition: PassAnalysisSupport.h:47
llvm::Instruction
Definition: Instruction.h:45
llvm::GlobalValue::InternalLinkage
@ InternalLinkage
Rename collisions when linking (static functions).
Definition: GlobalValue.h:55
alwaysInlineImpl
static bool alwaysInlineImpl(Module &M, bool GlobalOpt)
Definition: AMDGPUAlwaysInlinePass.cpp:89
llvm::AMDGPU::isEntryFunctionCC
bool isEntryFunctionCC(CallingConv::ID CC)
Definition: AMDGPUBaseInfo.cpp:1383
llvm::createAMDGPUAlwaysInlinePass
ModulePass * createAMDGPUAlwaysInlinePass(bool GlobalOpt=true)
Definition: AMDGPUAlwaysInlinePass.cpp:163
llvm::cl::opt< bool >
llvm::GlobalValue
Definition: GlobalValue.h:44
I
#define I(x, y, z)
Definition: MD5.cpp:59
llvm::cl::init
initializer< Ty > init(const Ty &Val)
Definition: CommandLine.h:441
llvm::AMDGPUAS::REGION_ADDRESS
@ REGION_ADDRESS
Address space for region memory. (GDS)
Definition: AMDGPU.h:360
llvm::Module
A Module instance is used to store all the information related to an LLVM module.
Definition: Module.h:67
llvm::AMDGPUTargetMachine::EnableLowerModuleLDS
static bool EnableLowerModuleLDS
Definition: AMDGPUTargetMachine.h:40
llvm::SmallPtrSetImpl::count
size_type count(ConstPtrType Ptr) const
count - Return 1 if the specified pointer is in the set, 0 otherwise.
Definition: SmallPtrSet.h:382
llvm::AMDGPUAlwaysInlinePass::run
PreservedAnalyses run(Module &M, ModuleAnalysisManager &AM)
Definition: AMDGPUAlwaysInlinePass.cpp:167
AMDGPU.h
llvm::append_range
void append_range(Container &C, Range &&R)
Wrapper function to append a range to a container.
Definition: STLExtras.h:1748
llvm::AnalysisUsage::setPreservesAll
void setPreservesAll()
Set by analyses that do not transform their input at all.
Definition: PassAnalysisSupport.h:130
llvm::PreservedAnalyses::all
static PreservedAnalyses all()
Construct a special preserved set that preserves all passes.
Definition: PassManager.h:161
llvm::AMDGPUTargetMachine::EnableFunctionCalls
static bool EnableFunctionCalls
Definition: AMDGPUTargetMachine.h:38
llvm::SmallPtrSetImpl
A templated base class for SmallPtrSet which provides the typesafe interface that is common across al...
Definition: SmallPtrSet.h:343
llvm::AnalysisManager
A container for analyses that lazily runs them and caches their results.
Definition: InstructionSimplify.h:44
llvm::cl::desc
Definition: CommandLine.h:412
CommandFlags.h
llvm::Value::users
iterator_range< user_iterator > users()
Definition: Value.h:422
AMDGPUTargetMachine.h
AMDGPUBaseInfo.h
llvm::SmallPtrSetImpl::insert
std::pair< iterator, bool > insert(PtrType Ptr)
Inserts Ptr if and only if there is no element in the container equal to Ptr.
Definition: SmallPtrSet.h:364
llvm::Intrinsic::ID
unsigned ID
Definition: TargetTransformInfo.h:37