LLVM 20.0.0git
AMDGPUPromoteKernelArguments.cpp
Go to the documentation of this file.
1//===-- AMDGPUPromoteKernelArguments.cpp ----------------------------------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9/// \file This pass recursively promotes generic pointer arguments of a kernel
10/// into the global address space.
11///
12/// The pass walks kernel's pointer arguments, then loads from them. If a loaded
13/// value is a pointer and loaded pointer is unmodified in the kernel before the
14/// load, then promote loaded pointer to global. Then recursively continue.
15//
16//===----------------------------------------------------------------------===//
17
#include "AMDGPU.h"
#include "AMDGPUMemoryUtils.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/Analysis/MemorySSA.h"
#include "llvm/IR/IRBuilder.h"
#include "llvm/InitializePasses.h"
26#define DEBUG_TYPE "amdgpu-promote-kernel-arguments"
27
28using namespace llvm;
29
30namespace {
31
32class AMDGPUPromoteKernelArguments : public FunctionPass {
33 MemorySSA *MSSA;
34
35 AliasAnalysis *AA;
36
37 Instruction *ArgCastInsertPt;
38
40
41 void enqueueUsers(Value *Ptr);
42
43 bool promotePointer(Value *Ptr);
44
45 bool promoteLoad(LoadInst *LI);
46
47public:
48 static char ID;
49
50 AMDGPUPromoteKernelArguments() : FunctionPass(ID) {}
51
52 bool run(Function &F, MemorySSA &MSSA, AliasAnalysis &AA);
53
54 bool runOnFunction(Function &F) override;
55
56 void getAnalysisUsage(AnalysisUsage &AU) const override {
59 AU.setPreservesAll();
60 }
61};
62
63} // end anonymous namespace
64
65void AMDGPUPromoteKernelArguments::enqueueUsers(Value *Ptr) {
66 SmallVector<User *> PtrUsers(Ptr->users());
67
68 while (!PtrUsers.empty()) {
69 Instruction *U = dyn_cast<Instruction>(PtrUsers.pop_back_val());
70 if (!U)
71 continue;
72
73 switch (U->getOpcode()) {
74 default:
75 break;
76 case Instruction::Load: {
77 LoadInst *LD = cast<LoadInst>(U);
78 if (LD->getPointerOperand()->stripInBoundsOffsets() == Ptr &&
79 !AMDGPU::isClobberedInFunction(LD, MSSA, AA))
80 Ptrs.push_back(LD);
81
82 break;
83 }
84 case Instruction::GetElementPtr:
85 case Instruction::AddrSpaceCast:
86 case Instruction::BitCast:
87 if (U->getOperand(0)->stripInBoundsOffsets() == Ptr)
88 PtrUsers.append(U->user_begin(), U->user_end());
89 break;
90 }
91 }
92}
93
94bool AMDGPUPromoteKernelArguments::promotePointer(Value *Ptr) {
95 bool Changed = false;
96
97 LoadInst *LI = dyn_cast<LoadInst>(Ptr);
98 if (LI)
99 Changed |= promoteLoad(LI);
100
101 PointerType *PT = dyn_cast<PointerType>(Ptr->getType());
102 if (!PT)
103 return Changed;
104
105 if (PT->getAddressSpace() == AMDGPUAS::FLAT_ADDRESS ||
106 PT->getAddressSpace() == AMDGPUAS::GLOBAL_ADDRESS ||
107 PT->getAddressSpace() == AMDGPUAS::CONSTANT_ADDRESS)
108 enqueueUsers(Ptr);
109
110 if (PT->getAddressSpace() != AMDGPUAS::FLAT_ADDRESS)
111 return Changed;
112
113 IRBuilder<> B(LI ? &*std::next(cast<Instruction>(Ptr)->getIterator())
114 : ArgCastInsertPt);
115
116 // Cast pointer to global address space and back to flat and let
117 // Infer Address Spaces pass to do all necessary rewriting.
118 PointerType *NewPT =
119 PointerType::get(PT->getContext(), AMDGPUAS::GLOBAL_ADDRESS);
120 Value *Cast =
121 B.CreateAddrSpaceCast(Ptr, NewPT, Twine(Ptr->getName(), ".global"));
122 Value *CastBack =
123 B.CreateAddrSpaceCast(Cast, PT, Twine(Ptr->getName(), ".flat"));
124 Ptr->replaceUsesWithIf(CastBack,
125 [Cast](Use &U) { return U.getUser() != Cast; });
126
127 return true;
128}
129
130bool AMDGPUPromoteKernelArguments::promoteLoad(LoadInst *LI) {
131 if (!LI->isSimple())
132 return false;
133
134 LI->setMetadata("amdgpu.noclobber", MDNode::get(LI->getContext(), {}));
135 return true;
136}
137
138// skip allocas
141 for (BasicBlock::iterator E = BB.end(); InsPt != E; ++InsPt) {
142 AllocaInst *AI = dyn_cast<AllocaInst>(&*InsPt);
143
144 // If this is a dynamic alloca, the value may depend on the loaded kernargs,
145 // so loads will need to be inserted before it.
146 if (!AI || !AI->isStaticAlloca())
147 break;
148 }
149
150 return InsPt;
151}
152
153bool AMDGPUPromoteKernelArguments::run(Function &F, MemorySSA &MSSA,
154 AliasAnalysis &AA) {
155 if (skipFunction(F))
156 return false;
157
158 CallingConv::ID CC = F.getCallingConv();
159 if (CC != CallingConv::AMDGPU_KERNEL || F.arg_empty())
160 return false;
161
162 ArgCastInsertPt = &*getInsertPt(*F.begin());
163 this->MSSA = &MSSA;
164 this->AA = &AA;
165
166 for (Argument &Arg : F.args()) {
167 if (Arg.use_empty())
168 continue;
169
170 PointerType *PT = dyn_cast<PointerType>(Arg.getType());
171 if (!PT || (PT->getAddressSpace() != AMDGPUAS::FLAT_ADDRESS &&
172 PT->getAddressSpace() != AMDGPUAS::GLOBAL_ADDRESS &&
173 PT->getAddressSpace() != AMDGPUAS::CONSTANT_ADDRESS))
174 continue;
175
176 Ptrs.push_back(&Arg);
177 }
178
179 bool Changed = false;
180 while (!Ptrs.empty()) {
181 Value *Ptr = Ptrs.pop_back_val();
182 Changed |= promotePointer(Ptr);
183 }
184
185 return Changed;
186}
187
188bool AMDGPUPromoteKernelArguments::runOnFunction(Function &F) {
189 MemorySSA &MSSA = getAnalysis<MemorySSAWrapperPass>().getMSSA();
190 AliasAnalysis &AA = getAnalysis<AAResultsWrapperPass>().getAAResults();
191 return run(F, MSSA, AA);
192}
193
194INITIALIZE_PASS_BEGIN(AMDGPUPromoteKernelArguments, DEBUG_TYPE,
195 "AMDGPU Promote Kernel Arguments", false, false)
198INITIALIZE_PASS_END(AMDGPUPromoteKernelArguments, DEBUG_TYPE,
199 "AMDGPU Promote Kernel Arguments", false, false)
200
201char AMDGPUPromoteKernelArguments::ID = 0;
202
204 return new AMDGPUPromoteKernelArguments();
205}
206
210 MemorySSA &MSSA = AM.getResult<MemorySSAAnalysis>(F).getMSSA();
212 if (AMDGPUPromoteKernelArguments().run(F, MSSA, AA)) {
216 return PA;
217 }
218 return PreservedAnalyses::all();
219}
AMDGPU Promote Kernel Arguments
static BasicBlock::iterator getInsertPt(BasicBlock &BB)
#define DEBUG_TYPE
static GCRegistry::Add< OcamlGC > B("ocaml", "ocaml 3.10-compatible GC")
#define F(x, y, z)
Definition: MD5.cpp:55
This file exposes an interface to building/using memory SSA to walk memory instructions using a use/def graph.
#define INITIALIZE_PASS_DEPENDENCY(depName)
Definition: PassSupport.h:55
#define INITIALIZE_PASS_END(passName, arg, name, cfg, analysis)
Definition: PassSupport.h:57
#define INITIALIZE_PASS_BEGIN(passName, arg, name, cfg, analysis)
Definition: PassSupport.h:52
This file defines the SmallVector class.
A manager for alias analyses.
A wrapper pass to provide the legacy pass manager access to a suitably prepared AAResults object.
an instruction to allocate memory on the stack
Definition: Instructions.h:63
bool isStaticAlloca() const
Return true if this alloca is in the entry block of the function and is a constant size.
A container for analyses that lazily runs them and caches their results.
Definition: PassManager.h:253
PassT::Result & getResult(IRUnitT &IR, ExtraArgTs... ExtraArgs)
Get the result of an analysis pass for a given IR unit.
Definition: PassManager.h:410
Represent the analysis usage information of a pass.
AnalysisUsage & addRequired()
void setPreservesAll()
Set by analyses that do not transform their input at all.
This class represents an incoming formal argument to a Function.
Definition: Argument.h:31
LLVM Basic Block Representation.
Definition: BasicBlock.h:61
iterator end()
Definition: BasicBlock.h:461
const_iterator getFirstInsertionPt() const
Returns an iterator to the first instruction in this block that is suitable for inserting a non-PHI instruction.
Definition: BasicBlock.cpp:416
InstListType::iterator iterator
Instruction iterators...
Definition: BasicBlock.h:177
Represents analyses that only rely on functions' control flow.
Definition: Analysis.h:72
FunctionPass class - This class is used to implement most global optimizations.
Definition: Pass.h:310
virtual bool runOnFunction(Function &F)=0
runOnFunction - Virtual method overriden by subclasses to do the per-function processing of the pass.
This provides a uniform API for creating instructions and inserting them into a basic block: either a...
Definition: IRBuilder.h:2697
void setMetadata(unsigned KindID, MDNode *Node)
Set the metadata of the specified kind to the specified node.
Definition: Metadata.cpp:1679
An instruction for reading from memory.
Definition: Instructions.h:176
bool isSimple() const
Definition: Instructions.h:247
static MDTuple * get(LLVMContext &Context, ArrayRef< Metadata * > MDs)
Definition: Metadata.h:1543
An analysis that produces MemorySSA for a function.
Definition: MemorySSA.h:928
Legacy analysis pass which computes MemorySSA.
Definition: MemorySSA.h:985
Encapsulates MemorySSA, including all data associated with memory accesses.
Definition: MemorySSA.h:701
virtual void getAnalysisUsage(AnalysisUsage &) const
getAnalysisUsage - This function should be overriden by passes that need analysis information to do t...
Definition: Pass.cpp:98
A set of analyses that are preserved following a run of a transformation pass.
Definition: Analysis.h:111
static PreservedAnalyses all()
Construct a special preserved set that preserves all passes.
Definition: Analysis.h:117
void preserveSet()
Mark an analysis set as preserved.
Definition: Analysis.h:146
void preserve()
Mark an analysis as preserved.
Definition: Analysis.h:131
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small.
Definition: SmallVector.h:1196
Twine - A lightweight data structure for efficiently representing the concatenation of temporary values as strings.
Definition: Twine.h:81
A Use represents the edge between a Value definition and its users.
Definition: Use.h:43
LLVM Value Representation.
Definition: Value.h:74
LLVMContext & getContext() const
All values hold a context through their type.
Definition: Value.cpp:1075
@ CONSTANT_ADDRESS
Address space for constant memory (VTX2).
@ FLAT_ADDRESS
Address space for flat memory.
@ GLOBAL_ADDRESS
Address space for global memory (RAT0, VTX0).
bool isClobberedInFunction(const LoadInst *Load, MemorySSA *MSSA, AAResults *AA)
Check is a Load is clobbered in its function.
@ AMDGPU_KERNEL
Used for AMDGPU code object kernels.
Definition: CallingConv.h:200
unsigned ID
LLVM IR allows to use arbitrary numbers as calling convention identifiers.
Definition: CallingConv.h:24
PointerTypeMap run(const Module &M)
Compute the PointerTypeMap for the module M.
This is an optimization pass for GlobalISel generic memory operations.
Definition: AddressRanges.h:18
FunctionPass * createAMDGPUPromoteKernelArgumentsPass()
PreservedAnalyses run(Function &F, FunctionAnalysisManager &AM)