#include "llvm/IR/IntrinsicsAMDGPU.h"

#define DEBUG_TYPE "amdgpu-lower-kernel-arguments"
class PreloadKernelArgInfo {
private:
  Function &F;
  const GCNSubtarget &ST;
  unsigned NumFreeUserSGPRs;

  enum HiddenArg : unsigned {
    HIDDEN_BLOCK_COUNT_X,
    HIDDEN_BLOCK_COUNT_Y,
    HIDDEN_BLOCK_COUNT_Z,
    HIDDEN_GROUP_SIZE_X,
    HIDDEN_GROUP_SIZE_Y,
    HIDDEN_GROUP_SIZE_Z,
    HIDDEN_REMAINDER_X,
    HIDDEN_REMAINDER_Y,
    HIDDEN_REMAINDER_Z,
    END_HIDDEN_ARGS
  };

  // Stores information about a specific hidden argument.
  struct HiddenArgInfo {
    // Byte offset from the location the implicitarg pointer points to.
    uint8_t Offset;
    // Size of the hidden argument in bytes.
    uint8_t Size;
    // Name given to the hidden argument in the kernel signature.
    const char *Name;
  };
  static constexpr HiddenArgInfo HiddenArgs[END_HIDDEN_ARGS] = {
      {0, 4, "_hidden_block_count_x"}, {4, 4, "_hidden_block_count_y"},
      {8, 4, "_hidden_block_count_z"}, {12, 2, "_hidden_group_size_x"},
      {14, 2, "_hidden_group_size_y"}, {16, 2, "_hidden_group_size_z"},
      {18, 2, "_hidden_remainder_x"},  {20, 2, "_hidden_remainder_y"},
      {22, 2, "_hidden_remainder_z"}};
  static HiddenArg getHiddenArgFromOffset(unsigned Offset) {
    for (unsigned I = 0; I < END_HIDDEN_ARGS; ++I)
      if (HiddenArgs[I].Offset == Offset)
        return static_cast<HiddenArg>(I);

    return END_HIDDEN_ARGS;
  }
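  // For example, getHiddenArgFromOffset(12) yields HIDDEN_GROUP_SIZE_X, while
  // an offset that does not start a hidden argument (e.g. 13) yields the
  // END_HIDDEN_ARGS sentinel.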
  static Type *getHiddenArgType(LLVMContext &Ctx, HiddenArg HA) {
    if (HA < END_HIDDEN_ARGS)
      return Type::getIntNTy(Ctx, HiddenArgs[HA].Size * 8);
    llvm_unreachable("Unexpected hidden argument.");
  }

  static const char *getHiddenArgName(HiddenArg HA) {
    if (HA < END_HIDDEN_ARGS) {
      return HiddenArgs[HA].Name;
    }
    llvm_unreachable("Unexpected hidden argument.");
  }
  // Clones the function, appending the hidden arguments that will be
  // preloaded (up to and including LastPreloadIndex) to its parameter list.
  Function *cloneFunctionWithPreloadImplicitArgs(unsigned LastPreloadIndex) {
    FunctionType *FT = F.getFunctionType();
    LLVMContext &Ctx = F.getParent()->getContext();
    SmallVector<Type *, 16> FTypes(FT->param_begin(), FT->param_end());
    for (unsigned I = 0; I <= LastPreloadIndex; ++I)
      FTypes.push_back(getHiddenArgType(Ctx, HiddenArg(I)));

    FunctionType *NFTy =
        FunctionType::get(FT->getReturnType(), FTypes, FT->isVarArg());
    Function *NF = Function::Create(NFTy, F.getLinkage(), F.getAddressSpace());
    NF->copyAttributesFrom(&F);
    NF->copyMetadata(&F, 0);

    F.getParent()->getFunctionList().insert(F.getIterator(), NF);
    NF->takeName(&F);
    NF->splice(NF->begin(), &F);

    // ... transfer the original arguments' names and uses to NF ...

    // Mark the appended hidden arguments as preloaded.
    Function::arg_iterator NFArg =
        NF->arg_begin() + FT->getNumParams(); // First appended hidden arg.
    AttrBuilder AB(Ctx);
    AB.addAttribute(Attribute::InReg);
    AB.addAttribute("amdgpu-hidden-argument");
    AttributeList AL = NF->getAttributes();
    for (unsigned I = 0; I <= LastPreloadIndex; ++I) {
      AL = AL.addParamAttributes(Ctx, NFArg->getArgNo(), AB);
      NFArg++->setName(getHiddenArgName(HiddenArg(I)));
    }
    NF->setAttributes(AL);

    F.replaceAllUsesWith(NF);
    return NF;
  }
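  // Illustrative effect on a hypothetical kernel, with LastPreloadIndex = 2
  // (the three 32-bit block counts):
  //   before: define amdgpu_kernel void @k(ptr addrspace(1) inreg %out)
  //   after:  define amdgpu_kernel void @k(ptr addrspace(1) inreg %out,
  //             i32 inreg "amdgpu-hidden-argument" %_hidden_block_count_x,
  //             i32 inreg "amdgpu-hidden-argument" %_hidden_block_count_y,
  //             i32 inreg "amdgpu-hidden-argument" %_hidden_block_count_z)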
public:
  PreloadKernelArgInfo(Function &F, const GCNSubtarget &ST) : F(F), ST(ST) {
    setInitialFreeUserSGPRsCount();
  }

  // Set the maximum number of user SGPRs still available for preloading.
  void setInitialFreeUserSGPRsCount() {
    GCNUserSGPRUsageInfo UserSGPRInfo(F, ST);
    NumFreeUserSGPRs = UserSGPRInfo.getNumFreeUserSGPRs();
  }
  bool tryAllocPreloadSGPRs(unsigned AllocSize, uint64_t ArgOffset,
                            uint64_t LastExplicitArgOffset) {
    // Check if this argument may be loaded into the same register as the
    // previous argument.
    if (ArgOffset - LastExplicitArgOffset < 4 &&
        !isAligned(Align(4), ArgOffset))
      return true;

    // Pad SGPRs for kernarg alignment.
    unsigned Padding = ArgOffset - LastExplicitArgOffset;
    unsigned PaddingSGPRs = alignTo(Padding, 4) / 4;
    unsigned NumPreloadSGPRs = alignTo(AllocSize, 4) / 4;
    if (NumPreloadSGPRs + PaddingSGPRs > NumFreeUserSGPRs)
      return false;

    NumFreeUserSGPRs -= (NumPreloadSGPRs + PaddingSGPRs);
    return true;
  }
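  // Worked example of the accounting above (illustrative numbers): an i32
  // argument (AllocSize = 4) at ArgOffset 8 with LastExplicitArgOffset 8
  // needs PaddingSGPRs = alignTo(0, 4) / 4 = 0 and NumPreloadSGPRs =
  // alignTo(4, 4) / 4 = 1, consuming one free user SGPR. An i16 at ArgOffset
  // 18 following an argument that ended at offset 16 instead takes the early
  // return: 18 - 16 < 4 and 18 is not 4-byte aligned, so it is packed into
  // the same SGPR as the previous argument and consumes nothing new.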
  // Try to allocate SGPRs to preload hidden kernel arguments.
  void tryAllocImplicitArgPreloadSGPRs(uint64_t ImplicitArgsBaseOffset,
                                       uint64_t LastExplicitArgOffset,
                                       IRBuilder<> &Builder) {
    Function *ImplicitArgPtr = Intrinsic::getDeclarationIfExists(
        F.getParent(), Intrinsic::amdgcn_implicitarg_ptr);
    if (!ImplicitArgPtr)
      return;

    const DataLayout &DL = F.getParent()->getDataLayout();
    // Pairs of loads from the implicitarg pointer and their byte offsets.
    SmallVector<std::pair<LoadInst *, unsigned>, 4> ImplicitArgLoads;
    for (auto *U : ImplicitArgPtr->users()) {
      auto *CI = dyn_cast<CallInst>(U);
      if (!CI || CI->getParent()->getParent() != &F)
        continue;

      for (auto *U : CI->users()) {
        int64_t Offset = 0;
        auto *Load = dyn_cast<LoadInst>(U); // Load directly from the pointer?
        if (!Load) {
          if (GetPointerBaseWithConstantOffset(U, Offset, DL) != CI)
            continue;

          Load = dyn_cast<LoadInst>(*U->user_begin()); // Load from a GEP?
        }

        if (!Load || !Load->isSimple())
          continue;

        // Only loads whose offset and type exactly match a known hidden
        // argument are candidates for preloading.
        LLVMContext &Ctx = F.getParent()->getContext();
        Type *LoadTy = Load->getType();
        HiddenArg HA = getHiddenArgFromOffset(Offset);
        if (HA == END_HIDDEN_ARGS || LoadTy != getHiddenArgType(Ctx, HA))
          continue;

        ImplicitArgLoads.push_back(std::make_pair(Load, Offset));
      }
    }

    if (ImplicitArgLoads.empty())
      return;

    // ... sort ImplicitArgLoads by ascending offset ...
    // If one load fails to preload, every later load (at a larger offset)
    // will fail too, so find the first load that cannot be preloaded.
    auto *PreloadEnd = std::find_if(
        ImplicitArgLoads.begin(), ImplicitArgLoads.end(),
        [&](const std::pair<LoadInst *, unsigned> &Load) {
          unsigned LoadSize = DL.getTypeStoreSize(Load.first->getType());
          unsigned LoadOffset = Load.second;
          if (!tryAllocPreloadSGPRs(LoadSize,
                                    LoadOffset + ImplicitArgsBaseOffset,
                                    LastExplicitArgOffset))
            return true;

          LastExplicitArgOffset =
              ImplicitArgsBaseOffset + LoadOffset + LoadSize;
          return false;
        });

    if (PreloadEnd == ImplicitArgLoads.begin())
      return;
    unsigned LastHiddenArgIndex = getHiddenArgFromOffset(PreloadEnd[-1].second);
    Function *NF = cloneFunctionWithPreloadImplicitArgs(LastHiddenArgIndex);
    for (const auto *I = ImplicitArgLoads.begin(); I != PreloadEnd; ++I) {
      unsigned LoadOffset = I->second;
      unsigned HiddenArgIndex = getHiddenArgFromOffset(LoadOffset);
      // Position of this hidden argument among NF's appended parameters.
      unsigned Index = NF->arg_size() - LastHiddenArgIndex + HiddenArgIndex - 1;
      I->first->replaceAllUsesWith(NF->getArg(Index));
    }
  }
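  // Worked example of the index arithmetic (hypothetical kernel): with two
  // explicit arguments and hidden args 0..3 preloaded, NF->arg_size() == 6
  // and LastHiddenArgIndex == 3, so the appended hidden args occupy indices
  // 2..5. A load of hidden arg 1 then maps to Index = 6 - 3 + 1 - 1 = 3.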
};

// Skip allocas when choosing the insertion point for the kernarg loads.
static BasicBlock::iterator getInsertPt(BasicBlock &BB) {
  BasicBlock::iterator InsPt = BB.getFirstInsertionPt();
  for (BasicBlock::iterator E = BB.end(); InsPt != E; ++InsPt) {
    AllocaInst *AI = dyn_cast<AllocaInst>(&*InsPt);
    // Only skip static allocas; a dynamic alloca may depend on kernarg loads,
    // so those loads must be inserted before it.
    if (!AI || !AI->isStaticAlloca())
      break;
  }
  return InsPt;
}
static bool lowerKernelArguments(Function &F, const TargetMachine &TM) {
  CallingConv::ID CC = F.getCallingConv();
  if (CC != CallingConv::AMDGPU_KERNEL || F.arg_empty())
    return false;

  const GCNSubtarget &ST = TM.getSubtarget<GCNSubtarget>(F);
  LLVMContext &Ctx = F.getParent()->getContext();
  const DataLayout &DL = F.getDataLayout();
  BasicBlock &EntryBlock = *F.begin();
  IRBuilder<> Builder(&EntryBlock, getInsertPt(EntryBlock));

  const Align KernArgBaseAlign(16);
  const uint64_t BaseOffset = ST.getExplicitKernelArgOffset();

  Align MaxAlign;
  const uint64_t TotalKernArgSize = ST.getKernArgSegmentSize(F, MaxAlign);
  if (TotalKernArgSize == 0)
    return false;

  CallInst *KernArgSegment =
      Builder.CreateIntrinsic(Intrinsic::amdgcn_kernarg_segment_ptr, {}, {},
                              nullptr, F.getName() + ".kernarg.segment");
  KernArgSegment->addRetAttr(Attribute::NonNull);
  KernArgSegment->addRetAttr(
      Attribute::getWithDereferenceableBytes(Ctx, TotalKernArgSize));
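  // The emitted call is roughly (names and sizes depend on the kernel):
  //   %k.kernarg.segment = call nonnull dereferenceable(N) ptr addrspace(4)
  //       @llvm.amdgcn.kernarg.segment.ptr()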
  uint64_t ExplicitArgOffset = 0;
  // Preloaded kernel arguments must be sequential.
  bool InPreloadSequence = true;
  PreloadKernelArgInfo PreloadInfo(F, ST);
  for (Argument &Arg : F.args()) {
    const bool IsByRef = Arg.hasByRefAttr();
    Type *ArgTy = IsByRef ? Arg.getParamByRefType() : Arg.getType();
    MaybeAlign ParamAlign = IsByRef ? Arg.getParamAlign() : std::nullopt;
    Align ABITypeAlign = DL.getValueOrABITypeAlignment(ParamAlign, ArgTy);

    uint64_t Size = DL.getTypeSizeInBits(ArgTy);
    uint64_t AllocSize = DL.getTypeAllocSize(ArgTy);

    uint64_t EltOffset = alignTo(ExplicitArgOffset, ABITypeAlign) + BaseOffset;
    uint64_t LastExplicitArgOffset = ExplicitArgOffset;
    ExplicitArgOffset = alignTo(ExplicitArgOffset, ABITypeAlign) + AllocSize;

    // Guard against the situation where hidden arguments have already been
    // lowered and added to the kernel function signature, i.e. if this pass
    // has run twice.
    if (Arg.hasAttribute("amdgpu-hidden-argument"))
      break;

    // Try to preload this argument into user SGPRs.
    if (Arg.hasInRegAttr() && InPreloadSequence && ST.hasKernargPreload() &&
        !Arg.getType()->isAggregateType())
      if (PreloadInfo.tryAllocPreloadSGPRs(AllocSize, EltOffset,
                                           LastExplicitArgOffset))
        continue;

    InPreloadSequence = false;

    if (Arg.use_empty())
      continue;
    // If this is byval, the loads are already explicit in the function. We
    // just need to rewrite the pointer values.
    if (IsByRef) {
      Value *ArgOffsetPtr = Builder.CreateConstInBoundsGEP1_64(
          Builder.getInt8Ty(), KernArgSegment, EltOffset,
          Arg.getName() + ".byval.kernarg.offset");

      Value *CastOffsetPtr =
          Builder.CreateAddrSpaceCast(ArgOffsetPtr, Arg.getType());
      Arg.replaceAllUsesWith(CastOffsetPtr);
      continue;
    }
    if (PointerType *PT = dyn_cast<PointerType>(ArgTy)) {
      // Lowering local/region pointers is only done when the subtarget has
      // usable DS instruction offsets.
      if ((PT->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS ||
           PT->getAddressSpace() == AMDGPUAS::REGION_ADDRESS) &&
          !ST.hasUsableDSOffset())
        continue;

      // Replacing a noalias argument with a plain load would drop the
      // aliasing guarantee, so leave such arguments alone.
      if (Arg.hasNoAliasAttr())
        continue;
    }
    auto *VT = dyn_cast<FixedVectorType>(ArgTy);
    bool IsV3 = VT && VT->getNumElements() == 3;
    bool DoShiftOpt = Size < 32 && !ArgTy->isAggregateType();

    VectorType *V4Ty = nullptr;

    int64_t AlignDownOffset = alignDown(EltOffset, 4);
    int64_t OffsetDiff = EltOffset - AlignDownOffset;
    Align AdjustedAlign = commonAlignment(
        KernArgBaseAlign, DoShiftOpt ? AlignDownOffset : EltOffset);
    Value *ArgPtr;
    Type *AdjustedArgTy;
    if (DoShiftOpt) {
      // There are no sub-dword scalar loads: load the containing dword at a
      // 4-byte aligned offset and shift/truncate the wanted bits out below.
      ArgPtr = Builder.CreateConstInBoundsGEP1_64(
          Builder.getInt8Ty(), KernArgSegment, AlignDownOffset,
          Arg.getName() + ".kernarg.offset.align.down");
      AdjustedArgTy = Builder.getInt32Ty();
    } else {
      ArgPtr = Builder.CreateConstInBoundsGEP1_64(
          Builder.getInt8Ty(), KernArgSegment, EltOffset,
          Arg.getName() + ".kernarg.offset");
      AdjustedArgTy = ArgTy;
    }
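    // Worked example (illustrative): an i16 argument at EltOffset 38 gives
    // AlignDownOffset = alignDown(38, 4) = 36 and OffsetDiff = 2, so an i32
    // is loaded at offset 36 and the argument's bits are recovered below by
    // a lshr of OffsetDiff * 8 = 16 bits followed by a trunc to i16.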
    if (IsV3 && Size >= 32) {
      V4Ty = FixedVectorType::get(VT->getElementType(), 4);
      // Load a vec4 instead; SelectionDAG handles v3 loads poorly.
      AdjustedArgTy = V4Ty;
    }
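    // e.g. a <3 x i32> argument is loaded as <4 x i32>; the extra lane is
    // dropped again by the shufflevector with mask <0, 1, 2> further down.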
    LoadInst *Load =
        Builder.CreateAlignedLoad(AdjustedArgTy, ArgPtr, AdjustedAlign);
    Load->setMetadata(LLVMContext::MD_invariant_load, MDNode::get(Ctx, {}));

    MDBuilder MDB(Ctx);

    if (Arg.hasAttribute(Attribute::NoUndef))
      Load->setMetadata(LLVMContext::MD_noundef, MDNode::get(Ctx, {}));

    if (Arg.hasAttribute(Attribute::Range)) {
      const ConstantRange &Range =
          Arg.getAttribute(Attribute::Range).getValueAsConstantRange();
      Load->setMetadata(LLVMContext::MD_range,
                        MDB.createRange(Range.getLower(), Range.getUpper()));
    }
    if (isa<PointerType>(ArgTy)) {
      if (Arg.hasNonNullAttr())
        Load->setMetadata(LLVMContext::MD_nonnull, MDNode::get(Ctx, {}));

      uint64_t DerefBytes = Arg.getDereferenceableBytes();
      if (DerefBytes != 0) {
        Load->setMetadata(
            LLVMContext::MD_dereferenceable,
            MDNode::get(Ctx,
                        MDB.createConstant(ConstantInt::get(
                            Builder.getInt64Ty(), DerefBytes))));
      }

      uint64_t DerefOrNullBytes = Arg.getDereferenceableOrNullBytes();
      if (DerefOrNullBytes != 0) {
        Load->setMetadata(
            LLVMContext::MD_dereferenceable_or_null,
            MDNode::get(Ctx,
                        MDB.createConstant(ConstantInt::get(
                            Builder.getInt64Ty(), DerefOrNullBytes))));
      }

      if (MaybeAlign ParamAlign = Arg.getParamAlign()) {
        Load->setMetadata(
            LLVMContext::MD_align,
            MDNode::get(Ctx, MDB.createConstant(ConstantInt::get(
                                 Builder.getInt64Ty(), ParamAlign->value()))));
      }
    }
    if (DoShiftOpt) {
      Value *ExtractBits = OffsetDiff == 0 ?
        Load : Builder.CreateLShr(Load, OffsetDiff * 8);

      IntegerType *ArgIntTy = Builder.getIntNTy(Size);
      Value *Trunc = Builder.CreateTrunc(ExtractBits, ArgIntTy);
      Value *NewVal = Builder.CreateBitCast(Trunc, ArgTy,
                                            Arg.getName() + ".load");
      Arg.replaceAllUsesWith(NewVal);
    } else if (IsV3) {
      Value *Shuf = Builder.CreateShuffleVector(Load, ArrayRef<int>{0, 1, 2},
                                                Arg.getName() + ".load");
      Arg.replaceAllUsesWith(Shuf);
    } else {
      Load->setName(Arg.getName() + ".load");
      Arg.replaceAllUsesWith(Load);
    }
  }

  KernArgSegment->addRetAttr(
      Attribute::getWithAlignment(Ctx, std::max(KernArgBaseAlign, MaxAlign)));
  if (InPreloadSequence) {
    uint64_t ImplicitArgsBaseOffset =
        alignTo(ExplicitArgOffset, ST.getAlignmentForImplicitArgPtr()) +
        BaseOffset;
    PreloadInfo.tryAllocImplicitArgPreloadSGPRs(ImplicitArgsBaseOffset,
                                                ExplicitArgOffset, Builder);
  }
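  // Worked example (illustrative): with BaseOffset 0, ExplicitArgOffset 36,
  // and an 8-byte implicit-arg alignment, ImplicitArgsBaseOffset =
  // alignTo(36, 8) = 40, so hidden argument 0 lives at kernarg offset 40.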
  return true;
}

bool AMDGPULowerKernelArguments::runOnFunction(Function &F) {
  auto &TPC = getAnalysis<TargetPassConfig>();
  const TargetMachine &TM = TPC.getTM<TargetMachine>();
  return lowerKernelArguments(F, TM);
}

INITIALIZE_PASS_BEGIN(AMDGPULowerKernelArguments, DEBUG_TYPE,
                      "AMDGPU Lower Kernel Arguments", false, false)
INITIALIZE_PASS_END(AMDGPULowerKernelArguments, DEBUG_TYPE,
                    "AMDGPU Lower Kernel Arguments", false, false)

char AMDGPULowerKernelArguments::ID = 0;

FunctionPass *llvm::createAMDGPULowerKernelArgumentsPass() {
  return new AMDGPULowerKernelArguments();
}