//===-- AMDGPULowerKernelArguments.cpp ------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
/// \file This pass replaces accesses to kernel arguments with loads at
/// offsets from the kernarg base pointer.
//
//===----------------------------------------------------------------------===//
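
// For illustration (schematic IR; names and offsets are illustrative): a
// kernel such as
//
//   define amdgpu_kernel void @k(i32 %n) { ... }
//
// has each use of %n rewritten into a load from the kernarg segment:
//
//   %k.kernarg.segment = call ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr()
//   %n.kernarg.offset = getelementptr inbounds i8, ptr addrspace(4) %k.kernarg.segment, i64 0
//   %n.load = load i32, ptr addrspace(4) %n.kernarg.offset, align 16, !invariant.load !0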

#include "AMDGPU.h"
#include "GCNSubtarget.h"
#include "llvm/CodeGen/TargetPassConfig.h"
#include "llvm/IR/IntrinsicsAMDGPU.h"
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/MDBuilder.h"

#define DEBUG_TYPE "amdgpu-lower-kernel-arguments"

using namespace llvm;

namespace {

class AMDGPULowerKernelArguments : public FunctionPass {
public:
  static char ID;

  AMDGPULowerKernelArguments() : FunctionPass(ID) {}

  bool runOnFunction(Function &F) override;

  void getAnalysisUsage(AnalysisUsage &AU) const override {
    AU.addRequired<TargetPassConfig>();
    AU.setPreservesAll();
  }
};

} // end anonymous namespace

// skip allocas
static BasicBlock::iterator getInsertPt(BasicBlock &BB) {
  BasicBlock::iterator InsPt = BB.getFirstInsertionPt();
  for (BasicBlock::iterator E = BB.end(); InsPt != E; ++InsPt) {
    AllocaInst *AI = dyn_cast<AllocaInst>(&*InsPt);

    // If this is a dynamic alloca, the value may depend on the loaded kernargs,
    // so loads will need to be inserted before it.
    if (!AI || !AI->isStaticAlloca())
      break;
  }

  return InsPt;
}

bool AMDGPULowerKernelArguments::runOnFunction(Function &F) {
  CallingConv::ID CC = F.getCallingConv();
  if (CC != CallingConv::AMDGPU_KERNEL || F.arg_empty())
    return false;

  auto &TPC = getAnalysis<TargetPassConfig>();

  const TargetMachine &TM = TPC.getTM<TargetMachine>();
  const GCNSubtarget &ST = TM.getSubtarget<GCNSubtarget>(F);
  LLVMContext &Ctx = F.getParent()->getContext();
  const DataLayout &DL = F.getParent()->getDataLayout();
  BasicBlock &EntryBlock = *F.begin();
  IRBuilder<> Builder(&*getInsertPt(EntryBlock));

  const Align KernArgBaseAlign(16); // FIXME: Increase if necessary
  const uint64_t BaseOffset = ST.getExplicitKernelArgOffset(F);

  Align MaxAlign;
  // FIXME: Alignment is broken with explicit arg offset.
  const uint64_t TotalKernArgSize = ST.getKernArgSegmentSize(F, MaxAlign);
  if (TotalKernArgSize == 0)
    return false;

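  // The kernarg segment base pointer is materialized once via the
  // llvm.amdgcn.kernarg.segment.ptr intrinsic, which returns a pointer into
  // read-only kernarg memory (the constant address space); every argument
  // access below becomes a GEP + load off this single base.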
  CallInst *KernArgSegment =
      Builder.CreateIntrinsic(Intrinsic::amdgcn_kernarg_segment_ptr, {}, {},
                              nullptr, F.getName() + ".kernarg.segment");

  KernArgSegment->addRetAttr(Attribute::NonNull);
  KernArgSegment->addRetAttr(
      Attribute::getWithDereferenceableBytes(Ctx, TotalKernArgSize));

  unsigned AS = KernArgSegment->getType()->getPointerAddressSpace();
  uint64_t ExplicitArgOffset = 0;

  for (Argument &Arg : F.args()) {
    const bool IsByRef = Arg.hasByRefAttr();
    Type *ArgTy = IsByRef ? Arg.getParamByRefType() : Arg.getType();
    MaybeAlign ParamAlign = IsByRef ? Arg.getParamAlign() : std::nullopt;
    Align ABITypeAlign = DL.getValueOrABITypeAlignment(ParamAlign, ArgTy);

    uint64_t Size = DL.getTypeSizeInBits(ArgTy);
    uint64_t AllocSize = DL.getTypeAllocSize(ArgTy);

    uint64_t EltOffset = alignTo(ExplicitArgOffset, ABITypeAlign) + BaseOffset;
    ExplicitArgOffset = alignTo(ExplicitArgOffset, ABITypeAlign) + AllocSize;
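
    // Worked example (illustrative; assumes BaseOffset == 0): for
    // kernel(i8 %a, i32 %b, i16 %c), %a sits at offset 0 (alloc size 1),
    // %b is padded up to its 4-byte ABI alignment (EltOffset 4, next free
    // byte 8), and %c lands at offset 8, leaving ExplicitArgOffset == 10.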

    if (Arg.use_empty())
      continue;

    // If this is byref, the loads are already explicit in the function. We
    // just need to rewrite the pointer values.
    if (IsByRef) {
      Value *ArgOffsetPtr = Builder.CreateConstInBoundsGEP1_64(
          Builder.getInt8Ty(), KernArgSegment, EltOffset,
          Arg.getName() + ".byval.kernarg.offset");

      Value *CastOffsetPtr = Builder.CreatePointerBitCastOrAddrSpaceCast(
          ArgOffsetPtr, Arg.getType());
      Arg.replaceAllUsesWith(CastOffsetPtr);
      continue;
    }
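
    // E.g. a byref-of-struct argument's pointer uses are simply redirected to
    // an (address-space-cast of the) GEP into the kernarg segment; no load is
    // emitted here because the function body already loads through it.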

    if (PointerType *PT = dyn_cast<PointerType>(ArgTy)) {
      // FIXME: Hack. We rely on AssertZext to be able to fold DS addressing
      // modes on SI to know the high bits are 0 so pointer adds don't wrap. We
      // can't represent this with range metadata because it's only allowed for
      // integer types.
      if ((PT->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS ||
           PT->getAddressSpace() == AMDGPUAS::REGION_ADDRESS) &&
          !ST.hasUsableDSOffset())
        continue;

      // FIXME: We can replace this with equivalent alias.scope/noalias
      // metadata, but this appears to be a lot of work.
      if (Arg.hasNoAliasAttr())
        continue;
    }

    auto *VT = dyn_cast<FixedVectorType>(ArgTy);
    bool IsV3 = VT && VT->getNumElements() == 3;
    bool DoShiftOpt = Size < 32 && !ArgTy->isAggregateType();

    VectorType *V4Ty = nullptr;

    int64_t AlignDownOffset = alignDown(EltOffset, 4);
    int64_t OffsetDiff = EltOffset - AlignDownOffset;
    Align AdjustedAlign = commonAlignment(
        KernArgBaseAlign, DoShiftOpt ? AlignDownOffset : EltOffset);
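
    // Worked example for the sub-dword path (offsets illustrative): an i16
    // argument at EltOffset 6 gives AlignDownOffset = 4 and OffsetDiff = 2;
    // the code below then loads an i32 at offset 4 and extracts the value,
    // roughly:
    //
    //   %shr = lshr i32 %load, 16        ; OffsetDiff * 8 bits
    //   %arg.load = trunc i32 %shr to i16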

    Value *ArgPtr;
    Type *AdjustedArgTy;
    if (DoShiftOpt) { // FIXME: Handle aggregate types
      // Since we don't have sub-dword scalar loads, avoid doing an extload by
      // loading earlier than the argument address, and extracting the relevant
      // bits.
      //
      // Additionally widen any sub-dword load to i32 even if suitably aligned,
      // so that CSE between different argument loads works easily.
      ArgPtr = Builder.CreateConstInBoundsGEP1_64(
          Builder.getInt8Ty(), KernArgSegment, AlignDownOffset,
          Arg.getName() + ".kernarg.offset.align.down");
      AdjustedArgTy = Builder.getInt32Ty();
    } else {
      ArgPtr = Builder.CreateConstInBoundsGEP1_64(
          Builder.getInt8Ty(), KernArgSegment, EltOffset,
          Arg.getName() + ".kernarg.offset");
      AdjustedArgTy = ArgTy;
    }

    if (IsV3 && Size >= 32) {
      V4Ty = FixedVectorType::get(VT->getElementType(), 4);
      // Use the hack that clang uses to avoid SelectionDAG ruining v3 loads
      AdjustedArgTy = V4Ty;
    }
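
    // E.g. a <3 x i32> argument (Size 96) is loaded as <4 x i32> here and
    // shuffled back down to three elements below.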

    ArgPtr = Builder.CreateBitCast(ArgPtr, AdjustedArgTy->getPointerTo(AS),
                                   ArgPtr->getName() + ".cast");
    LoadInst *Load =
        Builder.CreateAlignedLoad(AdjustedArgTy, ArgPtr, AdjustedAlign);
    Load->setMetadata(LLVMContext::MD_invariant_load, MDNode::get(Ctx, {}));
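    // Kernarg memory is never written during the kernel's execution, so the
    // load is marked !invariant.load, letting later passes CSE and hoist it.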

    MDBuilder MDB(Ctx);

    if (isa<PointerType>(ArgTy)) {
      if (Arg.hasNonNullAttr())
        Load->setMetadata(LLVMContext::MD_nonnull, MDNode::get(Ctx, {}));

      uint64_t DerefBytes = Arg.getDereferenceableBytes();
      if (DerefBytes != 0) {
        Load->setMetadata(
            LLVMContext::MD_dereferenceable,
            MDNode::get(Ctx,
                        MDB.createConstant(
                            ConstantInt::get(Builder.getInt64Ty(), DerefBytes))));
      }

      uint64_t DerefOrNullBytes = Arg.getDereferenceableOrNullBytes();
      if (DerefOrNullBytes != 0) {
        Load->setMetadata(
            LLVMContext::MD_dereferenceable_or_null,
            MDNode::get(Ctx,
                        MDB.createConstant(ConstantInt::get(Builder.getInt64Ty(),
                                                            DerefOrNullBytes))));
      }

      if (MaybeAlign ParamAlign = Arg.getParamAlign()) {
        Load->setMetadata(
            LLVMContext::MD_align,
            MDNode::get(Ctx, MDB.createConstant(ConstantInt::get(
                                 Builder.getInt64Ty(), ParamAlign->value()))));
      }
    }

    // TODO: Convert noalias arg to !noalias

    if (DoShiftOpt) {
      Value *ExtractBits = OffsetDiff == 0 ?
        Load : Builder.CreateLShr(Load, OffsetDiff * 8);

      IntegerType *ArgIntTy = Builder.getIntNTy(Size);
      Value *Trunc = Builder.CreateTrunc(ExtractBits, ArgIntTy);
      Value *NewVal = Builder.CreateBitCast(Trunc, ArgTy,
                                            Arg.getName() + ".load");
      Arg.replaceAllUsesWith(NewVal);
    } else if (IsV3) {
      Value *Shuf = Builder.CreateShuffleVector(Load, ArrayRef<int>{0, 1, 2},
                                                Arg.getName() + ".load");
      Arg.replaceAllUsesWith(Shuf);
    } else {
      Load->setName(Arg.getName() + ".load");
      Arg.replaceAllUsesWith(Load);
    }
  }

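  // Record the segment's alignment on the call's return value: at least the
  // 16-byte base assumed above, raised to the most-aligned argument if larger.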
  KernArgSegment->addRetAttr(
      Attribute::getWithAlignment(Ctx, std::max(KernArgBaseAlign, MaxAlign)));

  return true;
}

INITIALIZE_PASS_BEGIN(AMDGPULowerKernelArguments, DEBUG_TYPE,
                      "AMDGPU Lower Kernel Arguments", false, false)
INITIALIZE_PASS_END(AMDGPULowerKernelArguments, DEBUG_TYPE,
                    "AMDGPU Lower Kernel Arguments", false, false)

char AMDGPULowerKernelArguments::ID = 0;

FunctionPass *llvm::createAMDGPULowerKernelArgumentsPass() {
  return new AMDGPULowerKernelArguments();
}
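
// Note: the AMDGPU backend schedules this pass during codegen preparation
// (see AMDGPUPassConfig in AMDGPUTargetMachine.cpp), so kernels normally
// reach instruction selection with argument accesses already rewritten into
// kernarg-segment loads.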