18#include "llvm/IR/IntrinsicsAMDGPU.h"
22#define DEBUG_TYPE "amdgpu-lower-kernel-arguments"
// Tracks how many user SGPRs remain available for preloading kernel
// arguments on this subtarget, and hands them out per-argument.
// NOTE(review): this excerpt is heavily elided; fields/methods between the
// visible lines are missing from this view.
28class PreloadKernelArgInfo {
// Count of user SGPRs still free for argument preloading; decremented by
// tryAllocPreloadSGPRs() below.
32 unsigned NumFreeUserSGPRs;
// Constructor (elided here) seeds the free-SGPR budget.
38 setInitialFreeUserSGPRsCount();
// Initializes NumFreeUserSGPRs from the subtarget's user-SGPR limit.
// (The subtraction of already-used SGPRs is presumably on an elided line
// — TODO confirm against the full source.)
43 void setInitialFreeUserSGPRsCount() {
44 const unsigned MaxUserSGPRs = ST.getMaxNumUserSGPRs();
// Try to reserve SGPRs to preload one argument of AllocSize bytes at
// ArgOffset. Returns false (on an elided line) when the budget is
// exceeded; on success the budget is reduced below.
50 bool tryAllocPreloadSGPRs(
unsigned AllocSize,
uint64_t ArgOffset,
// Bytes of padding between the previous argument's end and this
// argument's aligned offset; padding also consumes SGPRs.
58 unsigned Padding = ArgOffset - LastExplicitArgOffset;
// One SGPR holds 4 bytes, hence the divide-by-4 after rounding up.
59 unsigned PaddingSGPRs =
alignTo(Padding, 4) / 4;
60 unsigned NumPreloadSGPRs =
alignTo(AllocSize, 4) / 4;
// Reject the preload if padding + payload would overflow the budget.
61 if (NumPreloadSGPRs + PaddingSGPRs > NumFreeUserSGPRs)
// Success path: commit the SGPRs to this argument.
64 NumFreeUserSGPRs -= (NumPreloadSGPRs + PaddingSGPRs);
// Interior of lowerKernelArguments(): rewrites each kernel argument into an
// explicit load from the kernarg segment pointer, annotated with alignment
// and aliasing metadata, then RAUWs the original Argument.
// NOTE(review): many lines are elided in this excerpt; comments below only
// describe what the visible lines establish.
// Skip past static allocas at the insertion point (loop context elided).
89 AllocaInst *AI = dyn_cast<AllocaInst>(&*InsPt);
// The kernarg segment base is 16-byte aligned; per-arg alignment is derived
// from this plus each argument's offset.
111 const Align KernArgBaseAlign(16);
112 const uint64_t BaseOffset = ST.getExplicitKernelArgOffset();
// Total size of the explicit kernarg segment; also computes MaxAlign.
116 const uint64_t TotalKernArgSize = ST.getKernArgSegmentSize(
F, MaxAlign);
// Nothing to lower if the function takes no kernel arguments.
117 if (TotalKernArgSize == 0)
// Create the kernarg-segment intrinsic call (call itself elided) named
// "<fn>.kernarg.segment"; the returned pointer is known non-null.
122 nullptr,
F.getName() +
".kernarg.segment");
123 KernArgSegment->
addRetAttr(Attribute::NonNull);
// Arguments are preloadable only while a contiguous leading run of inreg
// arguments succeeds; the first failure ends the sequence for good.
129 bool InPreloadSequence =
true;
130 PreloadKernelArgInfo PreloadInfo(
F, ST);
// --- per-argument loop body (header elided) ---
// byref arguments describe their pointee type/alignment via attributes.
133 const bool IsByRef = Arg.hasByRefAttr();
134 Type *ArgTy = IsByRef ? Arg.getParamByRefType() : Arg.getType();
135 MaybeAlign ParamAlign = IsByRef ? Arg.getParamAlign() : std::nullopt;
// Fall back to the ABI type alignment when no explicit align is given.
136 Align ABITypeAlign =
DL.getValueOrABITypeAlignment(ParamAlign, ArgTy);
139 uint64_t AllocSize =
DL.getTypeAllocSize(ArgTy);
// EltOffset: this argument's absolute offset into the kernarg segment.
141 uint64_t EltOffset =
alignTo(ExplicitArgOffset, ABITypeAlign) + BaseOffset;
// Remember where the previous argument ended, so preload can account for
// inter-argument padding; then advance the running offset.
142 uint64_t LastExplicitArgOffset = ExplicitArgOffset;
143 ExplicitArgOffset =
alignTo(ExplicitArgOffset, ABITypeAlign) + AllocSize;
// Attempt SGPR preload: only for inreg, non-aggregate args, on subtargets
// with kernarg preload, and only while the sequence is unbroken.
146 if (Arg.hasInRegAttr() && InPreloadSequence && ST.hasKernargPreload() &&
147 !Arg.getType()->isAggregateType())
148 if (PreloadInfo.tryAllocPreloadSGPRs(AllocSize, EltOffset,
149 LastExplicitArgOffset))
// Any argument that cannot be preloaded terminates the sequence.
152 InPreloadSequence =
false;
// byref path: build a GEP to the argument's bytes within the segment
// (call elided around these operands).
161 Builder.
getInt8Ty(), KernArgSegment, EltOffset,
162 Arg.getName() +
".byval.kernarg.offset");
// Cast the offset pointer to the argument's address space (cast elided).
164 Value *CastOffsetPtr =
// Pointer arguments may need special handling per address space.
170 if (
PointerType *PT = dyn_cast<PointerType>(ArgTy)) {
// LOCAL_ADDRESS pointers are skipped when DS offsets are unusable
// (condition partially elided).
177 !ST.hasUsableDSOffset())
182 if (Arg.hasNoAliasAttr())
// v3 vectors are loaded as v4 and shuffled down (see IsV3/V4Ty below).
186 auto *VT = dyn_cast<FixedVectorType>(ArgTy);
187 bool IsV3 = VT && VT->getNumElements() == 3;
// Shift optimization: load a dword-aligned 32-bit chunk and shift out
// the wanted sub-dword value instead of doing a narrow load.
192 int64_t AlignDownOffset =
alignDown(EltOffset, 4);
193 int64_t OffsetDiff = EltOffset - AlignDownOffset;
// The load's alignment is the common alignment of the 16-byte base and
// whichever offset is actually loaded from.
195 KernArgBaseAlign, DoShiftOpt ? AlignDownOffset : EltOffset);
// GEP for the aligned-down load (shift-opt path).
208 Builder.
getInt8Ty(), KernArgSegment, AlignDownOffset,
209 Arg.
getName() +
".kernarg.offset.align.down");
// GEP for the natural-offset load (non-shift path).
213 Builder.
getInt8Ty(), KernArgSegment, EltOffset,
214 Arg.getName() +
".kernarg.offset");
215 AdjustedArgTy = ArgTy;
// v3 widening: load as v4 when the vector is at least 32 bits
// (Size comparison context partially elided).
218 if (IsV3 &&
Size >= 32) {
221 AdjustedArgTy = V4Ty;
// Kernarg memory never changes during the kernel: mark invariant.
226 Load->setMetadata(LLVMContext::MD_invariant_load,
MDNode::get(Ctx, {}));
// Propagate pointer-argument attributes onto the load as metadata.
230 if (isa<PointerType>(ArgTy)) {
231 if (Arg.hasNonNullAttr())
232 Load->setMetadata(LLVMContext::MD_nonnull,
MDNode::get(Ctx, {}));
234 uint64_t DerefBytes = Arg.getDereferenceableBytes();
235 if (DerefBytes != 0) {
237 LLVMContext::MD_dereferenceable,
240 ConstantInt::get(Builder.
getInt64Ty(), DerefBytes))));
243 uint64_t DerefOrNullBytes = Arg.getDereferenceableOrNullBytes();
244 if (DerefOrNullBytes != 0) {
246 LLVMContext::MD_dereferenceable_or_null,
249 DerefOrNullBytes))));
// align attribute on a pointer arg becomes !align metadata on the load.
252 if (
MaybeAlign ParamAlign = Arg.getParamAlign()) {
254 LLVMContext::MD_align,
256 Builder.
getInt64Ty(), ParamAlign->value()))));
// Shift-opt epilogue: shift the loaded dword right to extract the value
// (OffsetDiff is in bytes, hence * 8 for the bit shift).
263 Value *ExtractBits = OffsetDiff == 0 ?
264 Load : Builder.
CreateLShr(Load, OffsetDiff * 8);
// The replacement value takes the argument's name with a ".load" suffix
// on every path below.
269 Arg.getName() +
".load");
273 Arg.getName() +
".load");
276 Load->setName(Arg.getName() +
".load");
// Redirect all uses of the formal argument to the materialized load.
277 Arg.replaceAllUsesWith(Load);
// Legacy-PM entry point: fetches the TargetMachine via TargetPassConfig and
// delegates to lowerKernelArguments() (body largely elided in this excerpt).
287bool AMDGPULowerKernelArguments::runOnFunction(
Function &
F) {
288 auto &TPC = getAnalysis<TargetPassConfig>();
// Pass registration boilerplate (INITIALIZE_PASS macro arguments).
294 "AMDGPU Lower Kernel Arguments",
false,
false)
// Unique pass identity used by the legacy pass manager.
298char AMDGPULowerKernelArguments::
ID = 0;
// Factory for createAMDGPULowerKernelArgumentsPass() (signature elided).
301 return new AMDGPULowerKernelArguments();
AMDGPU Lower Kernel Arguments
static BasicBlock::iterator getInsertPt(BasicBlock &BB)
static bool lowerKernelArguments(Function &F, const TargetMachine &TM)
MachineBasicBlock MachineBasicBlock::iterator DebugLoc DL
AMD GCN specific subclass of TargetSubtarget.
const char LLVMTargetMachineRef TM
#define INITIALIZE_PASS_END(passName, arg, name, cfg, analysis)
#define INITIALIZE_PASS_BEGIN(passName, arg, name, cfg, analysis)
Target-Independent Code Generator Pass Configuration Options pass.
PreservedAnalyses run(Function &, FunctionAnalysisManager &)
an instruction to allocate memory on the stack
bool isStaticAlloca() const
Return true if this alloca is in the entry block of the function and is a constant size.
A container for analyses that lazily runs them and caches their results.
Represent the analysis usage information of a pass.
AnalysisUsage & addRequired()
void setPreservesAll()
Set by analyses that do not transform their input at all.
This class represents an incoming formal argument to a Function.
ArrayRef - Represent a constant reference to an array (0 or more elements consecutively in memory), i.e. a start pointer and a length.
static Attribute getWithDereferenceableBytes(LLVMContext &Context, uint64_t Bytes)
static Attribute getWithAlignment(LLVMContext &Context, Align Alignment)
Return a uniquified Attribute object that has the specific alignment set.
LLVM Basic Block Representation.
const_iterator getFirstInsertionPt() const
Returns an iterator to the first instruction in this block that is suitable for inserting a non-PHI instruction.
InstListType::iterator iterator
Instruction iterators...
Represents analyses that only rely on functions' control flow.
void addRetAttr(Attribute::AttrKind Kind)
Adds the attribute to the return value.
This class represents a function call, abstracting a target machine's calling convention.
A parsed version of the target data layout string in and methods for querying it.
static FixedVectorType * get(Type *ElementType, unsigned NumElts)
FunctionPass class - This class is used to implement most global optimizations.
virtual bool runOnFunction(Function &F)=0
runOnFunction - Virtual method overridden by subclasses to do the per-function processing of the pass.
unsigned getNumUsedUserSGPRs() const
IntegerType * getIntNTy(unsigned N)
Fetch the type representing an N-bit integer.
LoadInst * CreateAlignedLoad(Type *Ty, Value *Ptr, MaybeAlign Align, const char *Name)
CallInst * CreateIntrinsic(Intrinsic::ID ID, ArrayRef< Type * > Types, ArrayRef< Value * > Args, Instruction *FMFSource=nullptr, const Twine &Name="")
Create a call to intrinsic ID with Args, mangled using Types.
Value * CreateLShr(Value *LHS, Value *RHS, const Twine &Name="", bool isExact=false)
IntegerType * getInt32Ty()
Fetch the type representing a 32-bit integer.
IntegerType * getInt64Ty()
Fetch the type representing a 64-bit integer.
Value * CreateBitCast(Value *V, Type *DestTy, const Twine &Name="")
Value * CreateShuffleVector(Value *V1, Value *V2, Value *Mask, const Twine &Name="")
Value * CreateTrunc(Value *V, Type *DestTy, const Twine &Name="", bool IsNUW=false, bool IsNSW=false)
Value * CreateConstInBoundsGEP1_64(Type *Ty, Value *Ptr, uint64_t Idx0, const Twine &Name="")
IntegerType * getInt8Ty()
Fetch the type representing an 8-bit integer.
Value * CreateAddrSpaceCast(Value *V, Type *DestTy, const Twine &Name="")
This provides a uniform API for creating instructions and inserting them into a basic block: either a...
Class to represent integer types.
This is an important class for using LLVM in a threaded context.
An instruction for reading from memory.
ConstantAsMetadata * createConstant(Constant *C)
Return the given constant as metadata.
static MDTuple * get(LLVMContext &Context, ArrayRef< Metadata * > MDs)
virtual void getAnalysisUsage(AnalysisUsage &) const
getAnalysisUsage - This function should be overridden by passes that need analysis information to do their job.
A set of analyses that are preserved following a run of a transformation pass.
static PreservedAnalyses all()
Construct a special preserved set that preserves all passes.
void preserveSet()
Mark an analysis set as preserved.
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small.
Primary interface to the complete machine description for the target machine.
Target-Independent Code Generator Pass Configuration Options.
The instances of the Type class are immutable: once they are created, they are never changed.
bool isAggregateType() const
Return true if the type is an aggregate type.
LLVM Value Representation.
Type * getType() const
All values are typed, get the type of this value.
void replaceAllUsesWith(Value *V)
Change all uses of this to point to a new Value.
StringRef getName() const
Return a constant reference to the value's name.
@ REGION_ADDRESS
Address space for region memory. (GDS)
@ LOCAL_ADDRESS
Address space for local memory.
@ AMDGPU_KERNEL
Used for AMDGPU code object kernels.
unsigned ID
LLVM IR allows to use arbitrary numbers as calling convention identifiers.
This is an optimization pass for GlobalISel generic memory operations.
bool isAligned(Align Lhs, uint64_t SizeInBytes)
Checks that SizeInBytes is a multiple of the alignment.
constexpr T alignDown(U Value, V Align, W Skew=0)
Returns the largest unsigned integer less than or equal to Value and is Skew mod Align.
FunctionPass * createAMDGPULowerKernelArgumentsPass()
uint64_t alignTo(uint64_t Size, Align A)
Returns a multiple of A needed to store Size bytes.
Align commonAlignment(Align A, uint64_t Offset)
Returns the alignment that satisfies both alignments.
This struct is a compact representation of a valid (non-zero power of two) alignment.
This struct is a compact representation of a valid (power of two) or undefined (0) alignment.