LLVM  14.0.0git
AMDGPULDSUtils.cpp
//===- AMDGPULDSUtils.cpp -------------------------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// AMDGPU LDS related helper utility functions.
//
//===----------------------------------------------------------------------===//

#include "AMDGPULDSUtils.h"
#include "AMDGPU.h"
#include "Utils/AMDGPUBaseInfo.h"
#include "llvm/ADT/DepthFirstIterator.h"
#include "llvm/ADT/SetVector.h"
#include "llvm/Analysis/CallGraph.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/ReplaceConstant.h"

using namespace llvm;

namespace llvm {

namespace AMDGPU {

// A helper class for collecting all reachable callees for each kernel defined
// within the module.
class CollectReachableCallees {
  Module &M;
  CallGraph CG;
  SmallPtrSet<CallGraphNode *, 8> AddressTakenFunctions;

  // Collect all address-taken functions within the module.
  void collectAddressTakenFunctions() {
    auto *ECNode = CG.getExternalCallingNode();

    for (auto GI = ECNode->begin(), GE = ECNode->end(); GI != GE; ++GI) {
      auto *CGN = GI->second;
      auto *F = CGN->getFunction();
      if (!F || F->isDeclaration() || AMDGPU::isKernelCC(F))
        continue;
      AddressTakenFunctions.insert(CGN);
    }
  }

  // For the given kernel, collect all its reachable non-kernel functions.
  SmallPtrSet<Function *, 8> collectReachableCallees(Function *K) {
    SmallPtrSet<Function *, 8> ReachableCallees;

    // Call graph node which represents this kernel.
    auto *KCGN = CG[K];

    // Go through all call graph nodes reachable from the node representing
    // this kernel and visit all their call sites. If a call site is direct,
    // add the corresponding callee to the reachable callee set. If it is
    // indirect, resolve the indirect call site to the potentially reachable
    // callees, add them to the reachable callee set, and repeat the process
    // for the newly added potential callee nodes.
    //
    // FIXME: Need to handle bit-casted function pointers.
    //
    SmallVector<CallGraphNode *, 8> CGNStack(df_begin(KCGN), df_end(KCGN));
    SmallPtrSet<CallGraphNode *, 8> VisitedCGNodes;
    while (!CGNStack.empty()) {
      auto *CGN = CGNStack.pop_back_val();

      if (!VisitedCGNodes.insert(CGN).second)
        continue;

      // Ignore call graph nodes which do not have an associated function, or
      // whose associated function is not a definition.
      if (!CGN->getFunction() || CGN->getFunction()->isDeclaration())
        continue;

      for (auto GI = CGN->begin(), GE = CGN->end(); GI != GE; ++GI) {
        auto *RCB = cast<CallBase>(GI->first.getValue());
        auto *RCGN = GI->second;

        if (auto *DCallee = RCGN->getFunction()) {
          ReachableCallees.insert(DCallee);
        } else if (RCB->isIndirectCall()) {
          auto *RCBFTy = RCB->getFunctionType();
          for (auto *ACGN : AddressTakenFunctions) {
            auto *ACallee = ACGN->getFunction();
            if (ACallee->getFunctionType() == RCBFTy) {
              ReachableCallees.insert(ACallee);
              CGNStack.append(df_begin(ACGN), df_end(ACGN));
            }
          }
        }
      }
    }

    return ReachableCallees;
  }

public:
  explicit CollectReachableCallees(Module &M) : M(M), CG(CallGraph(M)) {
    // Collect address-taken functions.
    collectAddressTakenFunctions();
  }

  void collectReachableCallees(
      DenseMap<Function *, SmallPtrSet<Function *, 8>> &KernelToCallees) {
    // Collect the reachable callee set for each kernel defined in the module.
    for (Function &F : M.functions()) {
      if (!AMDGPU::isKernelCC(&F))
        continue;
      Function *K = &F;
      KernelToCallees[K] = collectReachableCallees(K);
    }
  }
};

void collectReachableCallees(
    Module &M,
    DenseMap<Function *, SmallPtrSet<Function *, 8>> &KernelToCallees) {
  CollectReachableCallees CRC{M};
  CRC.collectReachableCallees(KernelToCallees);
}
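// Example (illustrative sketch, not upstream code): a client that needs the
// per-kernel callee sets would call the free function above roughly as
// follows; the variable names are arbitrary.
//
//   DenseMap<Function *, SmallPtrSet<Function *, 8>> KernelToCallees;
//   AMDGPU::collectReachableCallees(M, KernelToCallees);
//   for (auto &KV : KernelToCallees)
//     ; // KV.first is a kernel, KV.second its reachable non-kernel callees.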

SmallPtrSet<Function *, 8> collectNonKernelAccessorsOfLDS(GlobalVariable *GV) {
  SmallPtrSet<Function *, 8> LDSAccessors;
  SmallVector<User *, 8> UserStack(GV->users());
  SmallPtrSet<User *, 8> VisitedUsers;

  while (!UserStack.empty()) {
    auto *U = UserStack.pop_back_val();

    // `U` is already visited? Continue to the next one.
    if (!VisitedUsers.insert(U).second)
      continue;

    // `U` is a global variable which is initialized with LDS. Ignore LDS.
    if (isa<GlobalValue>(U))
      continue;

    // Recursively explore constant users.
    if (isa<Constant>(U)) {
      append_range(UserStack, U->users());
      continue;
    }

    // `U` should be an instruction; if it belongs to a non-kernel function F,
    // then collect F.
    Function *F = cast<Instruction>(U)->getFunction();
    if (!AMDGPU::isKernelCC(F))
      LDSAccessors.insert(F);
  }

  return LDSAccessors;
}
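// Example (illustrative): given any LDS global `GV`, the helper above yields
// every non-kernel function that accesses it, including accesses reached
// through nested ConstantExpr users.
//
//   SmallPtrSet<Function *, 8> Accessors =
//       AMDGPU::collectNonKernelAccessorsOfLDS(GV);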

DenseMap<Function *, SmallPtrSet<Instruction *, 8>>
getFunctionToInstsMap(User *U, bool CollectKernelInsts) {
  DenseMap<Function *, SmallPtrSet<Instruction *, 8>> FunctionToInsts;
  SmallVector<User *, 8> UserStack;
  SmallPtrSet<User *, 8> VisitedUsers;

  UserStack.push_back(U);

  while (!UserStack.empty()) {
    auto *UU = UserStack.pop_back_val();

    if (!VisitedUsers.insert(UU).second)
      continue;

    if (isa<GlobalValue>(UU))
      continue;

    if (isa<Constant>(UU)) {
      append_range(UserStack, UU->users());
      continue;
    }

    auto *I = cast<Instruction>(UU);
    Function *F = I->getFunction();
    if (CollectKernelInsts) {
      if (!AMDGPU::isKernelCC(F)) {
        continue;
      }
    } else {
      if (AMDGPU::isKernelCC(F)) {
        continue;
      }
    }

    FunctionToInsts.insert(std::make_pair(F, SmallPtrSet<Instruction *, 8>()));
    FunctionToInsts[F].insert(I);
  }

  return FunctionToInsts;
}
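// Example (illustrative): with CollectKernelInsts == true the resulting map is
// restricted to instructions inside kernels, otherwise to instructions inside
// non-kernel functions. An LDS GlobalVariable is itself a User, so it can be
// passed directly.
//
//   auto KernelUses =
//       AMDGPU::getFunctionToInstsMap(GV, /*CollectKernelInsts=*/true);
//   // KernelUses[K] holds the instructions in kernel K that (transitively)
//   // use GV.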

bool isKernelCC(const Function *Func) {
  return AMDGPU::isModuleEntryFunctionCC(Func->getCallingConv());
}

Align getAlign(DataLayout const &DL, const GlobalVariable *GV) {
  return DL.getValueOrABITypeAlignment(GV->getPointerAlignment(DL),
                                       GV->getValueType());
}

static void collectFunctionUses(User *U, const Function *F,
                                SetVector<Instruction *> &InstUsers) {
  SmallVector<User *> Stack{U};

  while (!Stack.empty()) {
    U = Stack.pop_back_val();

    if (auto *I = dyn_cast<Instruction>(U)) {
      if (I->getFunction() == F)
        InstUsers.insert(I);
      continue;
    }

    if (!isa<ConstantExpr>(U))
      continue;

    append_range(Stack, U->users());
  }
}

void replaceConstantUsesInFunction(ConstantExpr *C, const Function *F) {
  SetVector<Instruction *> InstUsers;

  collectFunctionUses(C, F, InstUsers);
  for (Instruction *I : InstUsers) {
    convertConstantExprsToInstructions(I, C);
  }
}
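// Example (illustrative): given a ConstantExpr `CE` that refers to an LDS
// global and a function `F`, the call below rewrites every use of CE inside F
// into equivalent instructions, so the LDS reference can then be updated one
// instruction at a time.
//
//   AMDGPU::replaceConstantUsesInFunction(CE, F);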

bool hasUserInstruction(const GlobalValue *GV) {
  SmallPtrSet<const User *, 8> Visited;
  SmallVector<const User *, 16> Stack(GV->users());

  while (!Stack.empty()) {
    const User *U = Stack.pop_back_val();

    if (!Visited.insert(U).second)
      continue;

    if (isa<Instruction>(U))
      return true;

    append_range(Stack, U->users());
  }

  return false;
}

bool shouldLowerLDSToStruct(const GlobalVariable &GV, const Function *F) {
  // We are not interested in kernel LDS lowering for module LDS itself.
  if (F && GV.getName() == "llvm.amdgcn.module.lds")
    return false;

  bool Ret = false;
  SmallPtrSet<const User *, 8> Visited;
  SmallVector<const User *, 16> Stack(GV.users());
  SmallPtrSet<const GlobalValue *, 8> GlobalUsers;

  assert(!F || isKernelCC(F));

  while (!Stack.empty()) {
    const User *V = Stack.pop_back_val();
    Visited.insert(V);

    if (auto *G = dyn_cast<GlobalValue>(V)) {
      StringRef GName = G->getName();
      if (F && GName != "llvm.used" && GName != "llvm.compiler.used") {
        // For kernel LDS lowering, if G is not one of the "used" lists, then
        // we cannot lower the LDS GV since we cannot replace the use of GV
        // within G.
        return false;
      }
      GlobalUsers.insert(G);
      continue;
    }

    if (auto *I = dyn_cast<Instruction>(V)) {
      const Function *UF = I->getFunction();
      if (UF == F) {
        // Used from this kernel, we want to put it into the structure.
        Ret = true;
      } else if (!F) {
        // For module LDS lowering, lowering is required if the user
        // instruction is from a non-kernel function.
        Ret |= !isKernelCC(UF);
      }
      continue;
    }

    // User V should be a constant, recursively visit users of V.
    assert(isa<Constant>(V) && "Expected a constant.");
    append_range(Stack, V->users());
  }

  if (!F && !Ret) {
    // For module LDS lowering, we have not yet decided if we should lower GV
    // or not. Explore all global users of GV, and check if at least one of
    // these global users appears as a use within an instruction (possibly a
    // nested use via a constant expression); if so, then conservatively lower
    // LDS.
    for (auto *G : GlobalUsers)
      Ret |= hasUserInstruction(G);
  }

  return Ret;
}
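// Example (illustrative): the second argument selects the lowering mode.
// Passing a kernel asks whether GV should go into that kernel's LDS struct;
// passing nullptr asks the module-LDS question, i.e. whether GV is used from
// any non-kernel function (directly or via constant expressions).
//
//   bool ForKernel = AMDGPU::shouldLowerLDSToStruct(GV, KernelF);
//   bool ForModule = AMDGPU::shouldLowerLDSToStruct(GV, /*F=*/nullptr);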

std::vector<GlobalVariable *> findVariablesToLower(Module &M,
                                                   const Function *F) {
  std::vector<llvm::GlobalVariable *> LocalVars;
  for (auto &GV : M.globals()) {
    if (GV.getType()->getPointerAddressSpace() != AMDGPUAS::LOCAL_ADDRESS) {
      continue;
    }
    if (!GV.hasInitializer()) {
      // addrspace(3) without initializer implies cuda/hip extern __shared__
      // the semantics for such a variable appears to be that all extern
      // __shared__ variables alias one another, in which case this transform
      // is not required
      continue;
    }
    if (!isa<UndefValue>(GV.getInitializer())) {
      // Initializers are unimplemented for local address space.
      // Leave such variables in place for consistent error reporting.
      continue;
    }
    if (GV.isConstant()) {
      // A constant undef variable can't be written to, and any load is
      // undef, so it should be eliminated by the optimizer. It could be
      // dropped by the back end if not. This pass skips over it.
      continue;
    }
    if (!shouldLowerLDSToStruct(GV, F)) {
      continue;
    }
    LocalVars.push_back(&GV);
  }
  return LocalVars;
}
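// Example (illustrative): module LDS lowering passes nullptr so the decision
// is made module-wide, while kernel LDS lowering passes the kernel currently
// being processed.
//
//   std::vector<GlobalVariable *> ModuleVars =
//       AMDGPU::findVariablesToLower(M, /*F=*/nullptr);
//   std::vector<GlobalVariable *> KernelVars =
//       AMDGPU::findVariablesToLower(M, KernelF);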

SmallPtrSet<GlobalValue *, 32> getUsedList(Module &M) {
  SmallPtrSet<GlobalValue *, 32> UsedList;

  SmallVector<GlobalValue *, 32> TmpVec;
  collectUsedGlobalVariables(M, TmpVec, true);
  UsedList.insert(TmpVec.begin(), TmpVec.end());

  TmpVec.clear();
  collectUsedGlobalVariables(M, TmpVec, false);
  UsedList.insert(TmpVec.begin(), TmpVec.end());

  return UsedList;
}

} // end namespace AMDGPU

} // end namespace llvm
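Taken together, these helpers are the building blocks an AMDGPU LDS-lowering pass would use. The following is a minimal, hypothetical sketch of how a client might combine them; the function name runLDSLoweringSurvey and the diagnostic output are illustrative and not part of LLVM, and the includes assume the usual LLVM headers are available.

#include "AMDGPULDSUtils.h"
#include "llvm/IR/Module.h"
#include "llvm/Support/raw_ostream.h"

using namespace llvm;

// Hypothetical client: survey a module the way an LDS-lowering pass would.
static void runLDSLoweringSurvey(Module &M) {
  // Globals named in llvm.used / llvm.compiler.used may need special handling.
  SmallPtrSet<GlobalValue *, 32> UsedList = AMDGPU::getUsedList(M);

  // LDS variables that qualify for module LDS lowering (F == nullptr).
  std::vector<GlobalVariable *> ToLower =
      AMDGPU::findVariablesToLower(M, /*F=*/nullptr);

  // Per-kernel reachable callee sets, e.g. to find which kernels can reach a
  // function that accesses LDS.
  DenseMap<Function *, SmallPtrSet<Function *, 8>> KernelToCallees;
  AMDGPU::collectReachableCallees(M, KernelToCallees);

  for (GlobalVariable *GV : ToLower)
    errs() << "would lower: " << GV->getName()
           << (UsedList.count(GV) ? " (on a used list)\n" : "\n");
}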