LLVM 20.0.0git
AMDGPUMemoryUtils.cpp
Go to the documentation of this file.
1//===-- AMDGPUMemoryUtils.cpp - -------------------------------------------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8
9#include "AMDGPUMemoryUtils.h"
10#include "AMDGPU.h"
13#include "llvm/ADT/SmallSet.h"
17#include "llvm/IR/DataLayout.h"
20#include "llvm/IR/IntrinsicsAMDGPU.h"
22
23#define DEBUG_TYPE "amdgpu-memory-utils"
24
25using namespace llvm;
26
27namespace llvm::AMDGPU {
28
 // getAlign: alignment to use for global GV — its explicit pointer alignment
 // if one is set, otherwise the ABI alignment of GV's value type.
 // NOTE(review): the signature line is not visible in this extract; per the
 // index below it is `Align getAlign(const DataLayout &DL,
 // const GlobalVariable *GV)` — confirm against the header.
 30 return DL.getValueOrABITypeAlignment(GV->getPointerAlignment(DL),
 31 GV->getValueType());
 32}
33
 // isNamedBarrier: if GV's value type is (or, via nested first struct
 // elements, wraps) the target extension type "amdgcn.named.barrier",
 // return that TargetExtType; otherwise return nullptr.
 // NOTE(review): the signature line is not visible in this extract; per the
 // index it is `TargetExtType *isNamedBarrier(const GlobalVariable &GV)`.
 35 // TODO: Allow arrays and structs, if all members are barriers
 36 // in the same scope.
 37 // TODO: Disallow other uses of target("amdgcn.named.barrier") including:
 38 // - Structs containing barriers in different scope.
 39 // - Structs containing a mixture of barriers and other data.
 40 // - Globals in other address spaces.
 41 // - Allocas.
 42 Type *Ty = GV.getValueType();
 // Walk down through element 0 of nested structs until we hit a target
 // extension type (answer found) or anything else (not a barrier).
 43 while (true) {
 44 if (auto *TTy = dyn_cast<TargetExtType>(Ty))
 45 return TTy->getName() == "amdgcn.named.barrier" ? TTy : nullptr;
 46 if (auto *STy = dyn_cast<StructType>(Ty)) {
 // Empty structs cannot contain a barrier member.
 47 if (STy->getNumElements() == 0)
 48 return nullptr;
 49 Ty = STy->getElementType(0);
 50 continue;
 51 }
 52 return nullptr;
 53 }
 54}
55
 // isDynamicLDS: true iff GV is dynamic LDS — an addrspace(3) global whose
 // type has zero allocation size (size is provided at kernel launch).
 // NOTE(review): the signature line and the guard at original line 60 are
 // elided in this extract; the guard presumably checks
 // GV.getType()->getPointerAddressSpace() != AMDGPUAS::LOCAL_ADDRESS before
 // the `return false` below — confirm against the real source.
 57 // external zero size addrspace(3) without initializer is dynlds.
 58 const Module *M = GV.getParent();
 59 const DataLayout &DL = M->getDataLayout();
 61 return false;
 62 return DL.getTypeAllocSize(GV.getValueType()) == 0;
 63}
64
67 return false;
68 }
69 if (isDynamicLDS(GV)) {
70 return true;
71 }
72 if (GV.isConstant()) {
73 // A constant undef variable can't be written to, and any load is
74 // undef, so it should be eliminated by the optimizer. It could be
75 // dropped by the back end if not. This pass skips over it.
76 return false;
77 }
78 if (GV.hasInitializer() && !isa<UndefValue>(GV.getInitializer())) {
79 // Initializers are unimplemented for LDS address space.
80 // Leave such variables in place for consistent error reporting.
81 return false;
82 }
83 return true;
84}
85
87 // Constants are uniqued within LLVM. A ConstantExpr referring to a LDS
88 // global may have uses from multiple different functions as a result.
89 // This pass specialises LDS variables with respect to the kernel that
90 // allocates them.
91
92 // This is semantically equivalent to (the unimplemented as slow):
93 // for (auto &F : M.functions())
94 // for (auto &BB : F)
95 // for (auto &I : BB)
96 // for (Use &Op : I.operands())
97 // if (constantExprUsesLDS(Op))
98 // replaceConstantExprInFunction(I, Op);
99
100 SmallVector<Constant *> LDSGlobals;
101 for (auto &GV : M.globals())
103 LDSGlobals.push_back(&GV);
104 return convertUsersOfConstantsToInstructions(LDSGlobals);
105}
106
108 FunctionVariableMap &kernels,
109 FunctionVariableMap &Functions) {
110 // Get uses from the current function, excluding uses by called Functions
111 // Two output variables to avoid walking the globals list twice
112 for (auto &GV : M.globals()) {
114 continue;
115 for (User *V : GV.users()) {
116 if (auto *I = dyn_cast<Instruction>(V)) {
117 Function *F = I->getFunction();
118 if (isKernelLDS(F))
119 kernels[F].insert(&GV);
120 else
121 Functions[F].insert(&GV);
122 }
123 }
124 }
125}
126
127bool isKernelLDS(const Function *F) {
128 // Some weirdness here. AMDGPU::isKernelCC does not call into
129 // AMDGPU::isKernel with the calling conv, it instead calls into
130 // isModuleEntryFunction which returns true for more calling conventions
131 // than AMDGPU::isKernel does. There's a FIXME on AMDGPU::isKernel.
132 // There's also a test that checks that the LDS lowering does not hit on
133 // a graphics shader, denoted amdgpu_ps, so stay with the limited case.
134 // Putting LDS in the name of the function to draw attention to this.
135 return AMDGPU::isKernel(F->getCallingConv());
136}
137
 // getTransitiveUsesOfLDS: compute per-kernel LDS usage — both direct uses
 // and uses reachable through the call graph, with a conservative
 // worst-case set for indirect calls. Returns {DirectMapKernel,
 // IndirectMapKernel, HasSpecialGVs}, or an empty result when all LDS GVs
 // are absolute (a re-run of the pass with nothing left to do).
 // NOTE(review): the signature line (original 138) is elided in this
 // extract; per the index it is
 // `LDSUsesInfoTy getTransitiveUsesOfLDS(const CallGraph &CG, Module &M)`.
 139
 140 FunctionVariableMap DirectMapKernel;
 141 FunctionVariableMap DirectMapFunction;
 142 getUsesOfLDSByFunction(CG, M, DirectMapKernel, DirectMapFunction);
 143
 144 // Collect variables that are used by functions whose address has escaped
 145 DenseSet<GlobalVariable *> VariablesReachableThroughFunctionPointer;
 146 for (Function &F : M.functions()) {
 147 if (!isKernelLDS(&F))
 148 if (F.hasAddressTaken(nullptr,
 149 /* IgnoreCallbackUses */ false,
 150 /* IgnoreAssumeLikeCalls */ false,
 151 /* IgnoreLLVMUsed */ true,
 152 /* IgnoreArcAttachedCall */ false)) {
 153 set_union(VariablesReachableThroughFunctionPointer,
 154 DirectMapFunction[&F]);
 155 }
 156 }
 157
 // A call record with no callee Function is an unknown (e.g. indirect)
 // call target.
 158 auto FunctionMakesUnknownCall = [&](const Function *F) -> bool {
 159 assert(!F->isDeclaration());
 160 for (const CallGraphNode::CallRecord &R : *CG[F]) {
 161 if (!R.second->getFunction())
 162 return true;
 163 }
 164 return false;
 165 };
 166
 167 // Work out which variables are reachable through function calls
 168 FunctionVariableMap TransitiveMapFunction = DirectMapFunction;
 169
 170 // If the function makes any unknown call, assume the worst case that it can
 171 // access all variables accessed by functions whose address escaped
 172 for (Function &F : M.functions()) {
 173 if (!F.isDeclaration() && FunctionMakesUnknownCall(&F)) {
 174 if (!isKernelLDS(&F)) {
 175 set_union(TransitiveMapFunction[&F],
 176 VariablesReachableThroughFunctionPointer);
 177 }
 178 }
 179 }
 180
 181 // Direct implementation of collecting all variables reachable from each
 182 // function
 183 for (Function &Func : M.functions()) {
 184 if (Func.isDeclaration() || isKernelLDS(&Func))
 185 continue;
 186
 // Iterative DFS over the call graph rooted at Func; `seen` prevents
 // revisiting nodes and so terminates on call-graph cycles.
 187 DenseSet<Function *> seen; // catches cycles
 188 SmallVector<Function *, 4> wip = {&Func};
 189
 190 while (!wip.empty()) {
 191 Function *F = wip.pop_back_val();
 192
 193 // Can accelerate this by referring to transitive map for functions that
 194 // have already been computed, with more care than this
 195 set_union(TransitiveMapFunction[&Func], DirectMapFunction[F]);
 196
 197 for (const CallGraphNode::CallRecord &R : *CG[F]) {
 198 Function *Ith = R.second->getFunction();
 199 if (Ith) {
 200 if (!seen.contains(Ith)) {
 201 seen.insert(Ith);
 202 wip.push_back(Ith);
 203 }
 204 }
 205 }
 206 }
 207 }
 208
 209 // DirectMapKernel lists which variables are used by the kernel
 210 // find the variables which are used through a function call
 211 FunctionVariableMap IndirectMapKernel;
 212
 213 for (Function &Func : M.functions()) {
 214 if (Func.isDeclaration() || !isKernelLDS(&Func))
 215 continue;
 216
 217 for (const CallGraphNode::CallRecord &R : *CG[&Func]) {
 218 Function *Ith = R.second->getFunction();
 219 if (Ith) {
 220 set_union(IndirectMapKernel[&Func], TransitiveMapFunction[Ith]);
 221 } else {
 // Unknown callee: fold in the worst-case escaped-variable set.
 222 set_union(IndirectMapKernel[&Func],
 223 VariablesReachableThroughFunctionPointer);
 224 }
 225 }
 226 }
 227
 228 // Verify that we fall into one of 2 cases:
 229 // - All variables are either absolute
 230 // or direct mapped dynamic LDS that is not lowered.
 231 // this is a re-run of the pass
 232 // so we don't have anything to do.
 233 // - No variables are absolute.
 234 std::optional<bool> HasAbsoluteGVs;
 235 bool HasSpecialGVs = false;
 236 for (auto &Map : {DirectMapKernel, IndirectMapKernel}) {
 237 for (auto &[Fn, GVs] : Map) {
 238 for (auto *GV : GVs) {
 239 bool IsAbsolute = GV->isAbsoluteSymbolRef();
 240 bool IsDirectMapDynLDSGV =
 241 AMDGPU::isDynamicLDS(*GV) && DirectMapKernel.contains(Fn);
 242 if (IsDirectMapDynLDSGV)
 243 continue;
 244 if (isNamedBarrier(*GV)) {
 245 HasSpecialGVs = true;
 246 continue;
 247 }
 248 if (HasAbsoluteGVs.has_value()) {
 // NOTE(review): the call line (original 250) is elided here;
 // presumably `report_fatal_error(` precedes this message.
 249 if (*HasAbsoluteGVs != IsAbsolute) {
 251 "Module cannot mix absolute and non-absolute LDS GVs");
 252 }
 253 } else
 254 HasAbsoluteGVs = IsAbsolute;
 255 }
 256 }
 257 }
 258
 259 // If we only had absolute GVs, we have nothing to do, return an empty
 260 // result.
 261 if (HasAbsoluteGVs && *HasAbsoluteGVs)
 262 return {FunctionVariableMap(), FunctionVariableMap(), false};
 263
 264 return {std::move(DirectMapKernel), std::move(IndirectMapKernel),
 265 HasSpecialGVs};
 266}
267
 // removeFnAttrFromReachable: strip each attribute in FnAttrs from
 // KernelRoot and every function reachable from it in the call graph. On
 // the first unknown (indirect) callee, conservatively strip the attributes
 // from every externally-callable non-kernel function as well.
 // NOTE(review): the first signature line (original 268) and the `Visited`
 // declaration (original 274) are elided in this extract — `Visited` is
 // presumably a SmallPtrSet/DenseSet of Function*; confirm against the
 // source.
 269 ArrayRef<StringRef> FnAttrs) {
 270 for (StringRef Attr : FnAttrs)
 271 KernelRoot->removeFnAttr(Attr);
 272
 273 SmallVector<Function *> WorkList = {CG[KernelRoot]->getFunction()};
 275 bool SeenUnknownCall = false;
 276
 277 while (!WorkList.empty()) {
 278 Function *F = WorkList.pop_back_val();
 279
 280 for (auto &CallRecord : *CG[F]) {
 281 if (!CallRecord.second)
 282 continue;
 283
 284 Function *Callee = CallRecord.second->getFunction();
 285 if (!Callee) {
 // Only do the expensive external-calling-node walk once.
 286 if (!SeenUnknownCall) {
 287 SeenUnknownCall = true;
 288
 289 // If we see any indirect calls, assume nothing about potential
 290 // targets.
 291 // TODO: This could be refined to possible LDS global users.
 292 for (auto &ExternalCallRecord : *CG.getExternalCallingNode()) {
 293 Function *PotentialCallee =
 294 ExternalCallRecord.second->getFunction();
 295 assert(PotentialCallee);
 296 if (!isKernelLDS(PotentialCallee)) {
 297 for (StringRef Attr : FnAttrs)
 298 PotentialCallee->removeFnAttr(Attr);
 299 }
 300 }
 301 }
 302 } else {
 303 for (StringRef Attr : FnAttrs)
 304 Callee->removeFnAttr(Attr);
 305 if (Visited.insert(Callee).second)
 306 WorkList.push_back(Callee);
 307 }
 308 }
 309 }
 310}
311
 // isReallyAClobber: MemorySSA reports fences, barriers, and all atomics as
 // universal clobbers. Filter those out: return true only if Def may
 // actually write memory aliasing Ptr.
 // NOTE(review): the signature line is not visible in this extract; per the
 // index it is `bool isReallyAClobber(const Value *Ptr, MemoryDef *Def,
 // AAResults *AA)`.
 313 Instruction *DefInst = Def->getMemoryInst();
 314
 // Fences order memory but do not write it.
 315 if (isa<FenceInst>(DefInst))
 316 return false;
 317
 // AMDGPU barrier intrinsics synchronize but do not write memory either.
 318 if (const IntrinsicInst *II = dyn_cast<IntrinsicInst>(DefInst)) {
 319 switch (II->getIntrinsicID()) {
 320 case Intrinsic::amdgcn_s_barrier:
 321 case Intrinsic::amdgcn_s_barrier_signal:
 322 case Intrinsic::amdgcn_s_barrier_signal_var:
 323 case Intrinsic::amdgcn_s_barrier_signal_isfirst:
 324 case Intrinsic::amdgcn_s_barrier_init:
 325 case Intrinsic::amdgcn_s_barrier_join:
 326 case Intrinsic::amdgcn_s_barrier_wait:
 327 case Intrinsic::amdgcn_s_barrier_leave:
 328 case Intrinsic::amdgcn_s_get_barrier_state:
 329 case Intrinsic::amdgcn_s_wakeup_barrier:
 330 case Intrinsic::amdgcn_wave_barrier:
 331 case Intrinsic::amdgcn_sched_barrier:
 332 case Intrinsic::amdgcn_sched_group_barrier:
 333 return false;
 334 default:
 335 break;
 336 }
 337 }
 338
 339 // Ignore atomics not aliasing with the original load, any atomic is a
 340 // universal MemoryDef from MSSA's point of view too, just like a fence.
 341 const auto checkNoAlias = [AA, Ptr](auto I) -> bool {
 342 return I && AA->isNoAlias(I->getPointerOperand(), Ptr);
 343 };
 344
 345 if (checkNoAlias(dyn_cast<AtomicCmpXchgInst>(DefInst)) ||
 346 checkNoAlias(dyn_cast<AtomicRMWInst>(DefInst)))
 347 return false;
 348
 // Anything else that MSSA calls a def is treated as a real clobber.
 349 return true;
 350}
351
 // isClobberedInFunction: walk MemorySSA upwards from Load's nearest
 // dominating clobbering access; return true iff some MemoryDef really
 // clobbers the loaded location (per isReallyAClobber), false when every
 // path reaches the live-on-entry state.
 // NOTE(review): the first signature line (original 352) and the
 // declarations at original lines 355-357 are elided in this extract —
 // presumably `Loc` (MemoryLocation::get(Load)), the `WorkList` seeded with
 // the walker's clobbering access for Load, and a `Visited` set; confirm
 // against the real source.
 353 AAResults *AA) {
 354 MemorySSAWalker *Walker = MSSA->getWalker();
 358
 359 LLVM_DEBUG(dbgs() << "Checking clobbering of: " << *Load << '\n');
 360
 361 // Start with a nearest dominating clobbering access, it will be either
 362 // live on entry (nothing to do, load is not clobbered), MemoryDef, or
 363 // MemoryPhi if several MemoryDefs can define this memory state. In that
 364 // case add all Defs to WorkList and continue going up and checking all
 365 // the definitions of this memory location until the root. When all the
 366 // defs are exhausted and came to the entry state we have no clobber.
 367 // Along the scan ignore barriers and fences which are considered clobbers
 368 // by the MemorySSA, but not really writing anything into the memory.
 369 while (!WorkList.empty()) {
 370 MemoryAccess *MA = WorkList.pop_back_val();
 // Skip accesses already examined; MemoryPhis can form cycles.
 371 if (!Visited.insert(MA).second)
 372 continue;
 373
 // Reaching the entry state on this path means no clobber here.
 374 if (MSSA->isLiveOnEntryDef(MA))
 375 continue;
 376
 377 if (MemoryDef *Def = dyn_cast<MemoryDef>(MA)) {
 378 LLVM_DEBUG(dbgs() << " Def: " << *Def->getMemoryInst() << '\n');
 379
 380 if (isReallyAClobber(Load->getPointerOperand(), Def, AA)) {
 381 LLVM_DEBUG(dbgs() << " -> load is clobbered\n");
 382 return true;
 383 }
 384
 // Not a real clobber (fence/barrier/non-aliasing atomic): keep
 // scanning upwards from this def's defining access.
 385 WorkList.push_back(
 386 Walker->getClobberingMemoryAccess(Def->getDefiningAccess(), Loc));
 387 continue;
 388 }
 389
 // MemoryPhi: every incoming definition must be checked.
 390 const MemoryPhi *Phi = cast<MemoryPhi>(MA);
 391 for (const auto &Use : Phi->incoming_values())
 392 WorkList.push_back(cast<MemoryAccess>(&Use));
 393 }
 394
 395 LLVM_DEBUG(dbgs() << " -> no clobber\n");
 396 return false;
 397}
398
399} // end namespace llvm::AMDGPU
MachineBasicBlock MachineBasicBlock::iterator DebugLoc DL
This file provides interfaces used to build and manipulate a call graph, which is a very useful tool ...
#define LLVM_DEBUG(...)
Definition: Debug.h:106
#define F(x, y, z)
Definition: MD5.cpp:55
#define I(x, y, z)
Definition: MD5.cpp:58
This file exposes an interface to building/using memory SSA to walk memory instructions using a use/d...
uint64_t IntrinsicInst * II
assert(ImpDefSCC.getReg()==AMDGPU::SCC &&ImpDefSCC.isDef())
This file defines generic set operations that may be used on set's of different types,...
This file defines the SmallSet class.
bool isNoAlias(const MemoryLocation &LocA, const MemoryLocation &LocB)
A trivial helper function to check to see if the specified pointers are no-alias.
ArrayRef - Represent a constant reference to an array (0 or more elements consecutively in memory),...
Definition: ArrayRef.h:41
std::pair< std::optional< WeakTrackingVH >, CallGraphNode * > CallRecord
A pair of the calling instruction (a call or invoke) and the call graph node being called.
Definition: CallGraph.h:177
The basic data container for the call graph of a Module of IR.
Definition: CallGraph.h:71
CallGraphNode * getExternalCallingNode() const
Returns the CallGraphNode which is used to represent undetermined calls into the callgraph.
Definition: CallGraph.h:126
A parsed version of the target data layout string in and methods for querying it.
Definition: DataLayout.h:63
bool contains(const_arg_type_t< KeyT > Val) const
Return true if the specified key is in the map, false otherwise.
Definition: DenseMap.h:147
std::pair< iterator, bool > insert(const std::pair< KeyT, ValueT > &KV)
Definition: DenseMap.h:211
Implements a dense probed hash-table based set.
Definition: DenseSet.h:278
const Function & getFunction() const
Definition: Function.h:171
void removeFnAttr(Attribute::AttrKind Kind)
Remove function attributes from this function.
Definition: Function.cpp:689
Module * getParent()
Get the module that this global value is contained inside of...
Definition: GlobalValue.h:656
PointerType * getType() const
Global values are always pointers.
Definition: GlobalValue.h:294
Type * getValueType() const
Definition: GlobalValue.h:296
const Constant * getInitializer() const
getInitializer - Return the initializer for this global variable.
bool hasInitializer() const
Definitions have initializers, declarations don't.
bool isConstant() const
If the value is a global constant, its value is immutable throughout the runtime execution of the pro...
A wrapper class for inspecting calls to intrinsic functions.
Definition: IntrinsicInst.h:48
An instruction for reading from memory.
Definition: Instructions.h:176
Represents a read-write access to memory, whether it is a must-alias, or a may-alias.
Definition: MemorySSA.h:370
Representation for a specific memory location.
static MemoryLocation get(const LoadInst *LI)
Return a location with information about the memory reference by the given instruction.
Represents phi nodes for memory accesses.
Definition: MemorySSA.h:478
This is the generic walker interface for walkers of MemorySSA.
Definition: MemorySSA.h:1016
MemoryAccess * getClobberingMemoryAccess(const Instruction *I, BatchAAResults &AA)
Given a memory Mod/Ref/ModRef'ing instruction, calling this will give you the nearest dominating Memo...
Definition: MemorySSA.h:1045
Encapsulates MemorySSA, including all data associated with memory accesses.
Definition: MemorySSA.h:701
MemorySSAWalker * getWalker()
Definition: MemorySSA.cpp:1590
bool isLiveOnEntryDef(const MemoryAccess *MA) const
Return true if MA represents the live on entry value.
Definition: MemorySSA.h:739
A Module instance is used to store all the information related to an LLVM module.
Definition: Module.h:65
std::pair< iterator, bool > insert(PtrType Ptr)
Inserts Ptr if and only if there is no element in the container equal to Ptr.
Definition: SmallPtrSet.h:384
SmallPtrSet - This class implements a set which is optimized for holding SmallSize or less elements.
Definition: SmallPtrSet.h:519
SmallSet - This maintains a set of unique values, optimizing for the case when the set is small (less...
Definition: SmallSet.h:132
std::pair< const_iterator, bool > insert(const T &V)
insert - Insert an element into the set if it isn't already there.
Definition: SmallSet.h:181
bool empty() const
Definition: SmallVector.h:81
void push_back(const T &Elt)
Definition: SmallVector.h:413
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small.
Definition: SmallVector.h:1196
StringRef - Represent a constant reference to a string, i.e.
Definition: StringRef.h:51
Class to represent target extensions types, which are generally unintrospectable from target-independ...
Definition: DerivedTypes.h:744
The instances of the Type class are immutable: once they are created, they are never changed.
Definition: Type.h:45
unsigned getPointerAddressSpace() const
Get the address space of this pointer or pointer vector type.
A Use represents the edge between a Value definition and its users.
Definition: Use.h:43
LLVM Value Representation.
Definition: Value.h:74
Align getPointerAlignment(const DataLayout &DL) const
Returns an alignment of the pointer value.
Definition: Value.cpp:927
std::pair< iterator, bool > insert(const ValueT &V)
Definition: DenseSet.h:213
bool contains(const_arg_type_t< ValueT > V) const
Check if the set contains the given element.
Definition: DenseSet.h:193
@ LOCAL_ADDRESS
Address space for local memory.
LLVM_READNONE bool isKernel(CallingConv::ID CC)
bool isDynamicLDS(const GlobalVariable &GV)
void removeFnAttrFromReachable(CallGraph &CG, Function *KernelRoot, ArrayRef< StringRef > FnAttrs)
Strip FnAttr attribute from any functions where we may have introduced its use.
void getUsesOfLDSByFunction(const CallGraph &CG, Module &M, FunctionVariableMap &kernels, FunctionVariableMap &Functions)
bool isReallyAClobber(const Value *Ptr, MemoryDef *Def, AAResults *AA)
Given a Def clobbering a load from Ptr according to the MSSA check if this is actually a memory updat...
LDSUsesInfoTy getTransitiveUsesOfLDS(const CallGraph &CG, Module &M)
TargetExtType * isNamedBarrier(const GlobalVariable &GV)
bool isLDSVariableToLower(const GlobalVariable &GV)
bool eliminateConstantExprUsesOfLDSFromAllInstructions(Module &M)
Align getAlign(const DataLayout &DL, const GlobalVariable *GV)
bool isKernelLDS(const Function *F)
bool isClobberedInFunction(const LoadInst *Load, MemorySSA *MSSA, AAResults *AA)
Check is a Load is clobbered in its function.
DenseMap< Function *, DenseSet< GlobalVariable * > > FunctionVariableMap
This is an optimization pass for GlobalISel generic memory operations.
Definition: AddressRanges.h:18
bool convertUsersOfConstantsToInstructions(ArrayRef< Constant * > Consts, Function *RestrictToFunc=nullptr, bool RemoveDeadConstants=true, bool IncludeSelf=false)
Replace constant expressions users of the given constants with instructions.
raw_ostream & dbgs()
dbgs() - This returns a reference to a raw_ostream for debugging messages.
Definition: Debug.cpp:163
void report_fatal_error(Error Err, bool gen_crash_diag=true)
Report a serious error, calling any installed error handler.
Definition: Error.cpp:167
bool set_union(S1Ty &S1, const S2Ty &S2)
set_union(A, B) - Compute A := A u B, return whether A changed.
Definition: SetOperations.h:43
This struct is a compact representation of a valid (non-zero power of two) alignment.
Definition: Alignment.h:39