24 #define DEBUG_TYPE "amdgpu-promote-alloca"
36 MDNode *MaxWorkGroupSizeRange;
45 std::pair<Value *, Value *> getLocalSizeYZ(
IRBuilder<> &Builder);
50 bool collectUsesWithPtrTypes(
Value *BaseAlloca,
52 std::vector<Value*> &WorkList)
const;
58 bool binaryOpIsDerivedFromSameAlloca(
Value *Alloca,
Value *Val,
60 int OpIdx0,
int OpIdx1)
const;
70 MaxWorkGroupSizeRange(
nullptr),
72 CurrentLocalMemUsage(0),
76 bool doInitialization(
Module &M)
override;
79 StringRef getPassName()
const override {
return "AMDGPU Promote Alloca"; }
94 "AMDGPU promote alloca to vector or LDS",
false,
false)
99 bool AMDGPUPromoteAlloca::doInitialization(
Module &M) {
104 DL = &Mod->getDataLayout();
113 const Triple &TT =
TM->getTargetTriple();
121 bool AMDGPUPromoteAlloca::runOnFunction(
Function &
F) {
122 if (!
TM || skipFunction(F))
138 DEBUG(
dbgs() <<
"Function has local memory argument. Promoting to "
139 "local memory disabled.\n");
145 if (LocalMemLimit == 0)
151 CurrentLocalMemUsage = 0;
162 unsigned Align = GV.getAlignment();
171 CurrentLocalMemUsage =
alignTo(CurrentLocalMemUsage, Align);
172 CurrentLocalMemUsage += AllocSize;
186 if (OccupancyHint == 0)
194 MaxOccupancy =
std::min(OccupancyHint, MaxOccupancy);
198 unsigned MaxSizeWithWaveCount
202 if (CurrentLocalMemUsage > MaxSizeWithWaveCount)
205 LocalMemLimit = MaxSizeWithWaveCount;
208 dbgs() << F.
getName() <<
" uses " << CurrentLocalMemUsage <<
" bytes of LDS\n"
209 <<
" Rounding size to " << MaxSizeWithWaveCount
210 <<
" with a maximum occupancy of " << MaxOccupancy <<
'\n'
211 <<
" and " << (LocalMemLimit - CurrentLocalMemUsage)
212 <<
" available for promotion\n"
216 for (
auto I = EntryBB.
begin(),
E = EntryBB.
end();
I !=
E; ) {
227 std::pair<Value *, Value *>
228 AMDGPUPromoteAlloca::getLocalSizeYZ(
IRBuilder<> &Builder) {
241 return std::make_pair(LocalSizeY, LocalSizeZ);
309 return std::make_pair(Y, LoadZU);
317 IntrID = IsAMDGCN ? Intrinsic::amdgcn_workitem_id_x
318 : Intrinsic::r600_read_tidig_x;
321 IntrID = IsAMDGCN ? Intrinsic::amdgcn_workitem_id_y
322 : Intrinsic::r600_read_tidig_y;
326 IntrID = IsAMDGCN ? Intrinsic::amdgcn_workitem_id_z
327 : Intrinsic::r600_read_tidig_z;
347 const std::map<GetElementPtrInst *, Value *> &GEPIdx) {
350 auto I = GEPIdx.find(GEP);
351 return I == GEPIdx.end() ?
nullptr : I->second;
373 case Instruction::BitCast:
374 case Instruction::AddrSpaceCast:
389 DEBUG(
dbgs() <<
"Alloca candidate for vectorization\n");
397 DEBUG(
dbgs() <<
" Cannot convert type to vector\n");
401 std::map<GetElementPtrInst*, Value*> GEPVectorIdx;
402 std::vector<Value*> WorkList;
403 for (
User *AllocaUser : Alloca->
users()) {
409 WorkList.push_back(AllocaUser);
418 DEBUG(
dbgs() <<
" Cannot compute vector index for GEP " << *GEP <<
'\n');
422 GEPVectorIdx[
GEP] = Index;
423 for (
User *GEPUser : AllocaUser->
users()) {
427 WorkList.push_back(GEPUser);
433 DEBUG(
dbgs() <<
" Converting alloca to vector "
434 << *AllocaTy <<
" -> " << *VectorTy <<
'\n');
436 for (
Value *V : WorkList) {
445 Value *BitCast = Builder.CreateBitCast(Alloca, VecPtrTy);
446 Value *VecValue = Builder.CreateLoad(BitCast);
447 Value *ExtractElement = Builder.CreateExtractElement(VecValue, Index);
457 Value *BitCast = Builder.CreateBitCast(Alloca, VecPtrTy);
458 Value *VecValue = Builder.CreateLoad(BitCast);
459 Value *NewVecValue = Builder.CreateInsertElement(VecValue,
462 Builder.CreateStore(NewVecValue, BitCast);
466 case Instruction::BitCast:
467 case Instruction::AddrSpaceCast:
483 case Intrinsic::memcpy:
484 case Intrinsic::memmove:
485 case Intrinsic::memset:
486 case Intrinsic::lifetime_start:
487 case Intrinsic::lifetime_end:
488 case Intrinsic::invariant_start:
489 case Intrinsic::invariant_end:
490 case Intrinsic::invariant_group_barrier:
491 case Intrinsic::objectsize:
498 bool AMDGPUPromoteAlloca::binaryOpIsDerivedFromSameAlloca(
Value *BaseAlloca,
508 if (isa<ConstantPointerNull>(OtherOp))
512 if (!isa<AllocaInst>(OtherObj))
521 if (OtherObj != BaseAlloca) {
522 DEBUG(
dbgs() <<
"Found a binary instruction with another alloca object\n");
529 bool AMDGPUPromoteAlloca::collectUsesWithPtrTypes(
532 std::vector<Value*> &WorkList)
const {
542 WorkList.push_back(
User);
547 if (UseInst->
getOpcode() == Instruction::PtrToInt)
550 if (
LoadInst *LI = dyn_cast<LoadInst>(UseInst)) {
551 if (LI->isVolatile())
557 if (
StoreInst *SI = dyn_cast<StoreInst>(UseInst)) {
558 if (
SI->isVolatile())
562 if (
SI->getPointerOperand() != Val)
564 }
else if (
AtomicRMWInst *RMW = dyn_cast<AtomicRMWInst>(UseInst)) {
565 if (RMW->isVolatile())
568 if (CAS->isVolatile())
574 if (
ICmpInst *ICmp = dyn_cast<ICmpInst>(UseInst)) {
575 if (!binaryOpIsDerivedFromSameAlloca(BaseAlloca, Val, ICmp, 0, 1))
579 WorkList.push_back(ICmp);
582 if (UseInst->
getOpcode() == Instruction::AddrSpaceCast) {
584 WorkList.push_back(
User);
594 if (!
GEP->isInBounds())
600 if (
SelectInst *SI = dyn_cast<SelectInst>(UseInst)) {
601 if (!binaryOpIsDerivedFromSameAlloca(BaseAlloca, Val, SI, 1, 2))
606 if (
PHINode *Phi = dyn_cast<PHINode>(UseInst)) {
609 switch (Phi->getNumIncomingValues()) {
613 if (!binaryOpIsDerivedFromSameAlloca(BaseAlloca, Val, Phi, 0, 1))
621 WorkList.push_back(
User);
622 if (!collectUsesWithPtrTypes(BaseAlloca,
User, WorkList))
630 void AMDGPUPromoteAlloca::handleAlloca(
AllocaInst &
I) {
641 DEBUG(
dbgs() <<
"Trying to promote " << I <<
'\n');
644 DEBUG(
dbgs() <<
" alloca is not a candidate for vectorization.\n");
676 NewSize += AllocSize;
678 if (NewSize > LocalMemLimit) {
680 <<
" bytes of local memory not available to promote\n");
684 CurrentLocalMemUsage = NewSize;
686 std::vector<Value*> WorkList;
688 if (!collectUsesWithPtrTypes(&I, &I, WorkList)) {
689 DEBUG(
dbgs() <<
" Do not know how to convert all uses\n");
693 DEBUG(
dbgs() <<
"Promoting alloca to local memory\n");
708 Value *TCntY, *TCntZ;
710 std::tie(TCntY, TCntZ) = getLocalSizeYZ(Builder);
711 Value *TIdX = getWorkitemID(Builder, 0);
712 Value *TIdY = getWorkitemID(Builder, 1);
713 Value *TIdZ = getWorkitemID(Builder, 2);
731 for (
Value *V : WorkList) {
734 if (
ICmpInst *CI = dyn_cast<ICmpInst>(V)) {
739 if (isa<ConstantPointerNull>(CI->
getOperand(0)))
742 if (isa<ConstantPointerNull>(CI->
getOperand(1)))
750 if (isa<AddrSpaceCastInst>(V))
758 V->mutateType(NewTy);
761 if (
SelectInst *SI = dyn_cast<SelectInst>(V)) {
762 if (isa<ConstantPointerNull>(
SI->getOperand(1)))
765 if (isa<ConstantPointerNull>(
SI->getOperand(2)))
767 }
else if (
PHINode *Phi = dyn_cast<PHINode>(V)) {
768 for (
unsigned I = 0,
E = Phi->getNumIncomingValues(); I !=
E; ++
I) {
769 if (isa<ConstantPointerNull>(Phi->getIncomingValue(I)))
780 case Intrinsic::lifetime_start:
781 case Intrinsic::lifetime_end:
785 case Intrinsic::memcpy: {
793 case Intrinsic::memmove: {
801 case Intrinsic::memset: {
809 case Intrinsic::invariant_start:
810 case Intrinsic::invariant_end:
811 case Intrinsic::invariant_group_barrier:
817 case Intrinsic::objectsize: {
821 Intrinsic::objectsize,
839 return new AMDGPUPromoteAlloca(TM);
unsigned getAlignment() const
OSType getOS() const
getOS - Get the parsed operating system type of this triple.
SymbolTableList< Instruction >::iterator eraseFromParent()
This method unlinks 'this' from the containing basic block and deletes it.
A parsed version of the target data layout string in and methods for querying it. ...
MDNode * createRange(const APInt &Lo, const APInt &Hi)
Return metadata describing the range [Lo, Hi).
AMDGPU specific subclass of TargetSubtarget.
A Module instance is used to store all the information related to an LLVM module. ...
Intrinsic::ID getIntrinsicID() const
Return the intrinsic ID of this intrinsic.
an instruction that atomically checks whether a specified value is in a memory location, and, if it is, stores a new value there.
Value * getValue() const
Return the arguments to the instruction.
unsigned getNumOperands() const
virtual void getAnalysisUsage(AnalysisUsage &) const
getAnalysisUsage - This function should be overridden by passes that need analysis information to do t...
unsigned getMaxLocalMemSizeWithWaveCount(unsigned WaveCount) const
Return the amount of LDS that can be used that will not restrict the occupancy lower than WaveCount...
This class represents a function call, abstracting a target machine's calling convention.
ArrayRef< Type * > params() const
static PointerType * get(Type *ElementType, unsigned AddressSpace)
This constructs a pointer to an object of the specified type in a numbered address space...
FunctionPass * createAMDGPUPromoteAlloca(const TargetMachine *TM=nullptr)
This class wraps the llvm.memset intrinsic.
const Function * getParent() const
Return the enclosing method, or null if none.
The two locations do not alias at all.
uint64_t alignTo(uint64_t Value, uint64_t Align, uint64_t Skew=0)
Returns the next integer (mod 2**64) that is greater than or equal to Value and is a multiple of Alig...
CallInst * CreateMemSet(Value *Ptr, Value *Val, uint64_t Size, unsigned Align, bool isVolatile=false, MDNode *TBAATag=nullptr, MDNode *ScopeTag=nullptr, MDNode *NoAliasTag=nullptr)
Create and insert a memset to the specified pointer and the specified value.
unsigned getAddressSpace() const
Return the address space of the Pointer type.
An instruction for reading from memory.
int getLocalMemorySize() const
Address space for local memory.
an instruction that atomically reads a memory location, combines it with another value, and then stores the result back.
Type * getPointerElementType() const
CallingConv::ID getCallingConv() const
getCallingConv()/setCallingConv(CC) - These methods get and set the calling convention of this functio...
uint64_t getArrayNumElements() const
static Constant * getNullValue(Type *Ty)
Constructor to create a '0' constant of arbitrary type.
StringRef getName() const
Return a constant reference to the value's name.
iterator begin()
Instruction iterator methods.
bool isArrayAllocation() const
Return true if there is an allocation size parameter to the allocation instruction that is not 1...
This class represents the LLVM 'select' instruction.
Twine - A lightweight data structure for efficiently representing the concatenation of temporary valu...
This class wraps the llvm.memmove intrinsic.
Type * getArrayElementType() const
A Use represents the edge between a Value definition and its users.
std::pair< unsigned, unsigned > getWavesPerEU(const Function &F) const
This provides a uniform API for creating instructions and inserting them into a basic block: either a...
Value * CreateAdd(Value *LHS, Value *RHS, const Twine &Name="", bool HasNUW=false, bool HasNSW=false)
Class to represent function types.
Value * CreateBitCast(Value *V, Type *DestTy, const Twine &Name="")
Class to represent array types.
CallInst * CreateMemMove(Value *Dst, Value *Src, uint64_t Size, unsigned Align, bool isVolatile=false, MDNode *TBAATag=nullptr, MDNode *ScopeTag=nullptr, MDNode *NoAliasTag=nullptr)
Create and insert a memmove between the specified pointers.
Function * getDeclaration(Module *M, ID id, ArrayRef< Type * > Tys=None)
Create or insert an LLVM Function declaration for an intrinsic, and return it.
ArchType getArch() const
getArch - Get the parsed architecture type of this triple.
An instruction for storing to memory.
void replaceAllUsesWith(Value *V)
Change all uses of this to point to a new Value.
Value * CreateInBoundsGEP(Value *Ptr, ArrayRef< Value * > IdxList, const Twine &Name="")
Type * getElementType() const
void SetInsertPoint(BasicBlock *TheBB)
This specifies that created instructions should be appended to the end of the specified block...
Maximum length of the test input libFuzzer tries to guess a good value based on the corpus and reports it always prefer smaller inputs during the corpus shuffle When libFuzzer itself reports a bug this exit code will be used If indicates the maximal total time in seconds to run the fuzzer minimizes the provided crash input Use with etc Experimental Use value profile to guide fuzzing Number of simultaneous worker processes to run the jobs If min(jobs, NumberOfCpuCores()/2)\" is used.") FUZZER_FLAG_INT(reload
Class to represent pointers.
static GCRegistry::Add< CoreCLRGC > E("coreclr","CoreCLR-compatible GC")
an instruction for type-safe pointer arithmetic to access elements of arrays and structs ...
static ConstantPointerNull * get(PointerType *T)
Static factory methods - Return objects of the specified value.
unsigned getMaxWavesPerEU() const
bool isShader(CallingConv::ID cc)
LLVM Basic Block Representation.
The instances of the Type class are immutable: once they are created, they are never changed...
Address space for constant memory (VTX2)
bool isVectorTy() const
True if this is an instance of VectorType.
Address space for private memory.
unsigned getAlignment() const
Return the alignment of the memory that is being allocated by the instruction.
Value * getRawDest() const
Represent the analysis usage information of a pass.
This instruction compares its operands according to the predicate given to the constructor.
uint64_t getNumElements() const
FunctionPass class - This class is used to implement most global optimizations.
Value * getOperand(unsigned i) const
bool isPointerTy() const
True if this is an instance of PointerType.
static UndefValue * get(Type *T)
Static factory methods - Return an 'undef' object of the specified type.
Value * GetUnderlyingObject(Value *V, const DataLayout &DL, unsigned MaxLookup=6)
This method strips off any GEP address adjustments and pointer casts from the specified value...
bool isPromoteAllocaEnabled() const
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
void setMetadata(unsigned KindID, MDNode *Node)
Set the metadata of the specified kind to the specified node.
Value * CreateMul(Value *LHS, Value *RHS, const Twine &Name="", bool HasNUW=false, bool HasNSW=false)
Triple - Helper class for working with autoconf configuration names.
void dump() const
Support for debugging, callable in GDB: V->dump()
unsigned getOccupancyWithLocalMemSize(uint32_t Bytes) const
Inverse of getMaxLocalMemSizeWithWaveCount.
#define INITIALIZE_TM_PASS(passName, arg, name, cfg, analysis)
This initializer registers TargetMachine constructor, so the pass being initialized can use target de...
unsigned getABITypeAlignment(Type *Ty) const
Returns the minimum ABI-required alignment for the specified type.
This is the shared class of boolean and integer constants.
uint64_t getTypeAllocSize(Type *Ty) const
Returns the offset in bytes between successive objects of the specified type, including alignment pad...
bool isStaticAlloca() const
Return true if this alloca is in the entry block of the function and is a constant size...
Type * getType() const
All values are typed, get the type of this value.
Value * getLength() const
This class wraps the llvm.memcpy intrinsic.
bool isZero() const
This is just a convenience method to make client code smaller for a common code.
void setPreservesCFG()
This function should be called by the pass, iff they do not:
Value * CreateConstInBoundsGEP1_64(Value *Ptr, uint64_t Idx0, const Twine &Name="")
void setOperand(unsigned i, Value *Val)
raw_ostream & dbgs()
dbgs() - This returns a reference to a raw_ostream for debugging messages.
Class to represent vector types.
Class for arbitrary precision integers.
iterator_range< user_iterator > users()
static MDTuple * get(LLVMContext &Context, ArrayRef< Metadata * > MDs)
CallInst * CreateMemCpy(Value *Dst, Value *Src, uint64_t Size, unsigned Align, bool isVolatile=false, MDNode *TBAATag=nullptr, MDNode *TBAAStructTag=nullptr, MDNode *ScopeTag=nullptr, MDNode *NoAliasTag=nullptr)
Create and insert a memcpy between the specified pointers.
void setUnnamedAddr(UnnamedAddr Val)
static IntegerType * getInt32Ty(LLVMContext &C)
FunctionType * getFunctionType() const
Returns the FunctionType for me.
CallInst * CreateCall(Value *Callee, ArrayRef< Value * > Args=None, const Twine &Name="", MDNode *FPMathTag=nullptr)
static ArrayType * get(Type *ElementType, uint64_t NumElements)
This static method is the primary way to construct an ArrayType.
LLVM_NODISCARD std::enable_if<!is_simple_type< Y >::value, typename cast_retty< X, const Y >::ret_type >::type dyn_cast(const Y &Val)
Rename collisions when linking (static functions).
void mutateType(Type *Ty)
Mutate the type of this Value to be of the specified type.
Value * getRawSource() const
Return the arguments to the instruction.
assert(ImpDefSCC.getReg()==AMDGPU::SCC &&ImpDefSCC.isDef())
LoadInst * CreateAlignedLoad(Value *Ptr, unsigned Align, const char *Name)
LLVM Value Representation.
unsigned getOpcode() const
Returns a member of one of the enums like Instruction::Add.
static VectorType * get(Type *ElementType, unsigned NumElements)
This static method is the primary way to construct a VectorType.
Value * CreateLShr(Value *LHS, Value *RHS, const Twine &Name="", bool isExact=false)
Primary interface to the complete machine description for the target machine.
PointerType * getPointerTo(unsigned AddrSpace=0) const
Return a pointer to the current type.
char & AMDGPUPromoteAllocaID
StringRef - Represent a constant reference to a string, i.e.
std::pair< unsigned, unsigned > getFlatWorkGroupSizes(const Function &F) const
void addAttribute(unsigned i, Attribute::AttrKind Kind)
adds the attribute to the list of attributes.
static GCMetadataPrinterRegistry::Add< OcamlGCMetadataPrinter > Y("ocaml","ocaml 3.10-compatible collector")
Type * getAllocatedType() const
Return the type that is being allocated by the instruction.
Value * getPointerOperand()
const BasicBlock * getParent() const
void addDereferenceableAttr(unsigned i, uint64_t Bytes)
adds the dereferenceable attribute to the list of attributes.
A wrapper class for inspecting calls to intrinsic functions.
an instruction to allocate memory on the stack
bool is_contained(R &&Range, const E &Element)
Wrapper function around std::find to detect if an element exists in a container.