#include "llvm/IR/IntrinsicsAMDGPU.h"
#include "llvm/IR/IntrinsicsR600.h"

#define DEBUG_TYPE "amdgpu-promote-alloca"
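// Command-line knobs controlling how aggressively allocas are promoted to
// vectors or to LDS.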
static cl::opt<bool> DisablePromoteAllocaToVector(
    "disable-promote-alloca-to-vector",
    cl::desc("Disable promote alloca to vector"));

static cl::opt<bool> DisablePromoteAllocaToLDS(
    "disable-promote-alloca-to-lds",
    cl::desc("Disable promote alloca to LDS"));

static cl::opt<unsigned> PromoteAllocaToVectorLimit(
    "amdgpu-promote-alloca-to-vector-limit",
    cl::desc("Maximum byte size to consider promote alloca to vector"));

static cl::opt<unsigned> PromoteAllocaToVectorMaxRegs(
    "amdgpu-promote-alloca-to-vector-max-regs",
    cl::desc(
        "Maximum vector size (in 32b registers) to use when promoting alloca"));

static cl::opt<unsigned> PromoteAllocaToVectorVGPRRatio(
    "amdgpu-promote-alloca-to-vector-vgpr-ratio",
    cl::desc("Ratio of VGPRs to budget for promoting alloca to vectors"));

static cl::opt<unsigned> LoopUserWeight(
    "promote-alloca-vector-loop-user-weight",
    cl::desc("The bonus weight of users of allocas within loop "
             "when sorting profitable allocas"));
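// A GEP on the alloca decomposed into the lane index
// VarIndex * VarMul + ConstIndex; Full caches the materialized index value.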
struct GEPToVectorIndex {
  Value *VarIndex = nullptr;
  ConstantInt *VarMul = nullptr;
  ConstantInt *ConstIndex = nullptr;
  Value *Full = nullptr;
};
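// Constant source/destination lane indices of a memcpy/memmove that stays
// within the promoted alloca.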
struct MemTransferInfo {
  ConstantInt *SrcIndex = nullptr;
  ConstantInt *DestIndex = nullptr;
};
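// Everything the pass learns about one alloca: its users, its profitability
// score, and the results of the vector and LDS analyses.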
struct AllocaAnalysis {
  AllocaInst *Alloca = nullptr;

  // Every pointer derived from the alloca, and every use of those pointers.
  SmallPtrSet<Value *, 8> Pointers;
  SmallVector<Use *> Uses;

  // Results of the promote-to-vector analysis.
  struct {
    FixedVectorType *Ty = nullptr;
    DenseMap<GetElementPtrInst *, GEPToVectorIndex> GEPVectorIdx;
    MapVector<MemTransferInst *, MemTransferInfo> TransferInfo;
    SmallVector<Instruction *> Worklist;
    SmallVector<Instruction *> UsersToRemove;
  } Vector;

  // Results of the promote-to-LDS analysis.
  struct {
    bool Enable = false;
    SmallVector<Instruction *> Worklist;
  } LDS;

  unsigned Score = 0;
  bool HaveSelectOrPHI = false;

  explicit AllocaAnalysis(AllocaInst *Alloca) : Alloca(Alloca) {}
};
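// Shared implementation behind both pass manager wrappers: analyzes every
// static alloca in a function and promotes it to a vector of VGPRs or, for
// kernels, to LDS.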
class AMDGPUPromoteAllocaImpl {
private:
  TargetMachine &TM;
  LoopInfo &LI;
  Module *Mod = nullptr;
  const DataLayout *DL = nullptr;

  unsigned MaxVGPRs = 0;
  unsigned VGPRBudgetRatio = 0;
  unsigned MaxVectorRegs = 0;
  unsigned LocalMemLimit = 0;
  unsigned CurrentLocalMemUsage = 0;

  bool IsAMDGCN = false;
  bool IsAMDHSA = false;

  std::pair<Value *, Value *> getLocalSizeYZ(IRBuilder<> &Builder);
  Value *getWorkitemID(IRBuilder<> &Builder, unsigned N);

  bool collectAllocaUses(AllocaAnalysis &AA) const;

  bool binaryOpIsDerivedFromSameAlloca(Value *Alloca, Value *Val,
                                       Instruction *Use, int OpIdx0,
                                       int OpIdx1) const;

  bool hasSufficientLocalMem(const Function &F);

  FixedVectorType *getVectorTypeForAlloca(Type *AllocaTy) const;
  void analyzePromoteToVector(AllocaAnalysis &AA) const;
  void promoteAllocaToVector(AllocaAnalysis &AA);
  void analyzePromoteToLDS(AllocaAnalysis &AA) const;
  bool tryPromoteAllocaToLDS(AllocaAnalysis &AA, bool SufficientLDS,
                             SetVector<IntrinsicInst *> &DeferredIntrs);
  void finishDeferredAllocaToLDSPromotion(
      SetVector<IntrinsicInst *> &DeferredIntrs);

  void scoreAlloca(AllocaAnalysis &AA) const;
  void setFunctionLimits(const Function &F);

public:
  AMDGPUPromoteAllocaImpl(TargetMachine &TM, LoopInfo &LI) : TM(TM), LI(LI) {
    const Triple &TT = TM.getTargetTriple();
    IsAMDGCN = TT.isAMDGCN();
    IsAMDHSA = TT.getOS() == Triple::AMDHSA;
  }

  bool run(Function &F, bool PromoteToLDS);
};
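// Legacy pass manager wrapper.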
class AMDGPUPromoteAlloca : public FunctionPass {
public:
  static char ID;

  AMDGPUPromoteAlloca() : FunctionPass(ID) {}

  bool runOnFunction(Function &F) override {
    if (auto *TPC = getAnalysisIfAvailable<TargetPassConfig>())
      return AMDGPUPromoteAllocaImpl(
                 TPC->getTM<TargetMachine>(),
                 getAnalysis<LoopInfoWrapperPass>().getLoopInfo())
          .run(F, /*PromoteToLDS=*/true);
    return false;
  }

  StringRef getPassName() const override { return "AMDGPU Promote Alloca"; }

  void getAnalysisUsage(AnalysisUsage &AU) const override {
    AU.setPreservesCFG();
    AU.addRequired<LoopInfoWrapperPass>();
    FunctionPass::getAnalysisUsage(AU);
  }
};
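// VGPR budget for vectorization: the maximum register count at the expected
// occupancy, clamped to 32 for non-entry functions that are not marked
// always-inline.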
static unsigned getMaxVGPRs(unsigned LDSBytes, const TargetMachine &TM,
                            const Function &F) {
  if (!TM.getTargetTriple().isAMDGCN())
    return 128;

  const GCNSubtarget &ST = TM.getSubtarget<GCNSubtarget>(F);
  unsigned DynamicVGPRBlockSize = AMDGPU::getDynamicVGPRBlockSize(F);
  if (DynamicVGPRBlockSize == 0 && ST.isDynamicVGPREnabled())
    DynamicVGPRBlockSize = ST.getDynamicVGPRBlockSize();

  unsigned MaxVGPRs = ST.getMaxNumVGPRs(
      ST.getWavesPerEU(ST.getFlatWorkGroupSizes(F), LDSBytes, F).first,
      DynamicVGPRBlockSize);

  // A non-entry function only has 32 caller-saved VGPRs; don't promote in a
  // way that forces spilling unless the function is known to be inlined.
  if (!F.hasFnAttribute(Attribute::AlwaysInline) &&
      !AMDGPU::isEntryFunctionCC(F.getCallingConv()))
    MaxVGPRs = std::min(MaxVGPRs, 32u);

  return MaxVGPRs;
}
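// Legacy pass registration and the new pass manager entry points.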
char AMDGPUPromoteAlloca::ID = 0;

INITIALIZE_PASS_BEGIN(AMDGPUPromoteAlloca, DEBUG_TYPE,
                      "AMDGPU promote alloca to vector or LDS", false, false)
INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass)
INITIALIZE_PASS_END(AMDGPUPromoteAlloca, DEBUG_TYPE,
                    "AMDGPU promote alloca to vector or LDS", false, false)

PreservedAnalyses AMDGPUPromoteAllocaPass::run(Function &F,
                                               FunctionAnalysisManager &AM) {
  auto &LI = AM.getResult<LoopAnalysis>(F);
  bool Changed = AMDGPUPromoteAllocaImpl(TM, LI).run(F, /*PromoteToLDS=*/true);
  if (Changed) {
    PreservedAnalyses PA;
    PA.preserveSet<CFGAnalyses>();
    return PA;
  }
  return PreservedAnalyses::all();
}

PreservedAnalyses
AMDGPUPromoteAllocaToVectorPass::run(Function &F, FunctionAnalysisManager &AM) {
  auto &LI = AM.getResult<LoopAnalysis>(F);
  bool Changed = AMDGPUPromoteAllocaImpl(TM, LI).run(F, /*PromoteToLDS=*/false);
  if (Changed) {
    PreservedAnalyses PA;
    PA.preserveSet<CFGAnalyses>();
    return PA;
  }
  return PreservedAnalyses::all();
}

FunctionPass *llvm::createAMDGPUPromoteAlloca() {
  return new AMDGPUPromoteAlloca();
}
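// Walk the alloca's transitive users through GEPs, selects and phis,
// recording every derived pointer and every use; fails if the pointer
// escapes or is mixed with a different underlying object.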
bool AMDGPUPromoteAllocaImpl::collectAllocaUses(AllocaAnalysis &AA) const {
  const auto RejectUser = [](Instruction *Inst, StringRef Msg) {
    LLVM_DEBUG(dbgs() << "  Cannot promote alloca: " << Msg << "\n"
                      << "    " << *Inst << "\n");
    return false;
  };

  SmallVector<Instruction *, 4> WorkList;
  WorkList.push_back(AA.Alloca);
  while (!WorkList.empty()) {
    auto *Cur = WorkList.pop_back_val();
    if (find(AA.Pointers, Cur) != AA.Pointers.end())
      continue;
    AA.Pointers.insert(Cur);
    for (auto &U : Cur->uses()) {
      auto *Inst = cast<Instruction>(U.getUser());
      // Storing the pointer itself (rather than through it) lets it escape.
      if (isa<StoreInst>(Inst) &&
          U.getOperandNo() != StoreInst::getPointerOperandIndex())
        return RejectUser(Inst, "pointer escapes via store");
      AA.Uses.push_back(&U);
      if (isa<GetElementPtrInst>(Inst)) {
        WorkList.push_back(Inst);
      } else if (auto *SI = dyn_cast<SelectInst>(Inst)) {
        if (!binaryOpIsDerivedFromSameAlloca(AA.Alloca, Cur, SI, 1, 2))
          return RejectUser(Inst, "select from mixed objects");
        WorkList.push_back(Inst);
        AA.HaveSelectOrPHI = true;
      } else if (auto *Phi = dyn_cast<PHINode>(Inst)) {
        // Repeating an operand is fine, but only two-input phis over the
        // same alloca are handled.
        switch (Phi->getNumIncomingValues()) {
        case 1:
          break;
        case 2:
          if (!binaryOpIsDerivedFromSameAlloca(AA.Alloca, Cur, Phi, 0, 1))
            return RejectUser(Inst, "phi from mixed objects");
          break;
        default:
          return RejectUser(Inst, "phi with too many operands");
        }
        WorkList.push_back(Inst);
        AA.HaveSelectOrPHI = true;
      }
    }
  }
  return true;
}
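// Score an alloca so the most profitable candidates are promoted first;
// users inside loops get a configurable bonus per loop level.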
void AMDGPUPromoteAllocaImpl::scoreAlloca(AllocaAnalysis &AA) const {
  unsigned Score = 0;
  for (auto *U : AA.Uses) {
    auto *Inst = cast<Instruction>(U->getUser());
    const unsigned UserScore =
        1 + (LoopUserWeight * LI.getLoopDepth(Inst->getParent()));
    LLVM_DEBUG(dbgs() << "  [+" << UserScore << "]:\t" << *Inst << "\n");
    Score += UserScore;
  }
  AA.Score = Score;
}
// Per-function limits: a function attribute overrides the option's default,
// but an explicit command-line occurrence overrides both.
void AMDGPUPromoteAllocaImpl::setFunctionLimits(const Function &F) {
  // The R600 register file is fixed, so cap promoted vectors there.
  const int R600MaxVectorRegs = 16;
  MaxVectorRegs = F.getFnAttributeAsParsedInteger(
      "amdgpu-promote-alloca-to-vector-max-regs",
      IsAMDGCN ? PromoteAllocaToVectorMaxRegs : R600MaxVectorRegs);
  if (PromoteAllocaToVectorMaxRegs.getNumOccurrences())
    MaxVectorRegs = PromoteAllocaToVectorMaxRegs;
  VGPRBudgetRatio = F.getFnAttributeAsParsedInteger(
      "amdgpu-promote-alloca-to-vector-vgpr-ratio",
      PromoteAllocaToVectorVGPRRatio);
  if (PromoteAllocaToVectorVGPRRatio.getNumOccurrences())
    VGPRBudgetRatio = PromoteAllocaToVectorVGPRRatio;
}
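// Pass entry point: analyze every static alloca in the entry block, sort the
// candidates by score, promote to vector while the VGPR budget lasts, and
// fall back to LDS where enabled.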
bool AMDGPUPromoteAllocaImpl::run(Function &F, bool PromoteToLDS) {
  Mod = F.getParent();
  DL = &Mod->getDataLayout();

  const AMDGPUSubtarget &ST = AMDGPUSubtarget::get(TM, F);
  if (!ST.isPromoteAllocaEnabled())
    return false;

  bool SufficientLDS = PromoteToLDS && hasSufficientLocalMem(F);
  MaxVGPRs = getMaxVGPRs(CurrentLocalMemUsage, TM, F);
  setFunctionLimits(F);

  unsigned VectorizationBudget =
      (PromoteAllocaToVectorLimit ? PromoteAllocaToVectorLimit * 8
                                  : (MaxVGPRs * 32)) /
      VGPRBudgetRatio;

  std::vector<AllocaAnalysis> Allocas;
  for (Instruction &I : F.getEntryBlock()) {
    if (auto *AI = dyn_cast<AllocaInst>(&I)) {
      // Array allocations are probably not worth handling, since an
      // allocation of the array type is the canonical form.
      if (!AI->isStaticAlloca() || AI->isArrayAllocation())
        continue;
      AllocaAnalysis AA{AI};
      if (collectAllocaUses(AA)) {
        analyzePromoteToVector(AA);
        if (PromoteToLDS)
          analyzePromoteToLDS(AA);
        if (AA.Vector.Ty || AA.LDS.Enable) {
          scoreAlloca(AA);
          Allocas.push_back(std::move(AA));
        }
      }
    }
  }

  stable_sort(Allocas, [](const auto &A, const auto &B) {
    return A.Score > B.Score;
  });

  LLVM_DEBUG({
    dbgs() << "Sorted Worklist:\n";
    for (const auto &AA : Allocas)
      dbgs() << "  " << *AA.Alloca << "\n";
  });

  bool Changed = false;
  SetVector<IntrinsicInst *> DeferredIntrs;
  for (AllocaAnalysis &AA : Allocas) {
    if (AA.Vector.Ty) {
      const unsigned AllocaCost =
          DL->getTypeSizeInBits(AA.Alloca->getAllocatedType());
      if (AllocaCost <= VectorizationBudget) {
        promoteAllocaToVector(AA);
        Changed = true;
        assert((VectorizationBudget - AllocaCost) < VectorizationBudget &&
               "Underflow!");
        VectorizationBudget -= AllocaCost;
        LLVM_DEBUG(dbgs() << "  Remaining vectorization budget:"
                          << VectorizationBudget << "\n");
        continue;
      }
      LLVM_DEBUG(dbgs() << "Alloca too big for vectorization (size:"
                        << AllocaCost << ", budget:" << VectorizationBudget
                        << "): " << *AA.Alloca << "\n");
    }

    if (PromoteToLDS && AA.LDS.Enable &&
        tryPromoteAllocaToLDS(AA, SufficientLDS, DeferredIntrs))
      Changed = true;
  }

  finishDeferredAllocaToLDSPromotion(DeferredIntrs);
  return Changed;
}
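// Whether this memset can be promoted: it must write the whole alloca,
// starting at offset zero, and must not be volatile.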
static bool isSupportedMemset(MemSetInst *I, AllocaInst *AI,
                              const DataLayout &DL) {
  using namespace PatternMatch;
  const unsigned Size = DL.getTypeStoreSize(AI->getAllocatedType());
  return I->getOperand(0) == AI &&
         match(I->getOperand(2), m_SpecificInt(Size)) && !I->isVolatile();
}
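// Return the vector lane index for a pointer into the promoted alloca,
// materializing (and caching) the index expression at the defining GEP.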
static Value *calculateVectorIndex(Value *Ptr, AllocaAnalysis &AA,
                                   IRBuilderBase &B) {
  if (Ptr == AA.Alloca)
    return B.getInt32(0);

  auto *GEP = cast<GetElementPtrInst>(Ptr->stripPointerCasts());
  auto I = AA.Vector.GEPVectorIdx.find(GEP);
  assert(I != AA.Vector.GEPVectorIdx.end() && "Must have entry for GEP!");

  if (!I->second.Full) {
    Value *Result = nullptr;
    IRBuilderBase::InsertPointGuard Guard(B);
    B.SetInsertPoint(GEP);
    if (I->second.VarIndex) {
      Result = I->second.VarIndex;
      Result = B.CreateSExtOrTrunc(Result, B.getInt32Ty());
      if (I->second.VarMul)
        Result = B.CreateMul(Result, I->second.VarMul);
    }
    if (I->second.ConstIndex) {
      if (Result)
        Result = B.CreateAdd(Result, I->second.ConstIndex);
      else
        Result = I->second.ConstIndex;
    }
    if (!Result)
      Result = B.getInt32(0);
    I->second.Full = Result;
  }
  return I->second.Full;
}
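// Try to decompose a (possibly nested) GEP on the alloca into the lane index
// form VarIndex * VarMul + ConstIndex; fails if any offset is not a multiple
// of the vector element size.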
static std::optional<GEPToVectorIndex>
computeGEPToVectorIndex(GetElementPtrInst *GEP, AllocaInst *Alloca,
                        Type *VecElemTy, const DataLayout &DL) {
  unsigned BW = DL.getIndexTypeSizeInBits(GEP->getType());
  SmallMapVector<Value *, APInt, 4> VarOffsets;
  APInt ConstOffset(BW, 0);

  // Walk backwards through nested GEPs to collect both constant and variable
  // offsets.
  Value *CurPtr = GEP;
  while (auto *CurGEP = dyn_cast<GetElementPtrInst>(CurPtr)) {
    if (!CurGEP->collectOffset(DL, BW, VarOffsets, ConstOffset))
      return std::nullopt;
    CurPtr = CurGEP->getPointerOperand();
  }
  assert(CurPtr == Alloca && "GEP not based on alloca");

  int64_t VecElemSize = DL.getTypeAllocSize(VecElemTy);
  if (VarOffsets.size() > 1)
    return std::nullopt;

  APInt IndexQuot, Rem;
  APInt::sdivrem(ConstOffset, APInt(BW, VecElemSize), IndexQuot, Rem);
  if (!Rem.isZero())
    return std::nullopt;

  GEPToVectorIndex Result;
  LLVMContext &Ctx = GEP->getContext();
  if (!ConstOffset.isZero())
    Result.ConstIndex = ConstantInt::get(Ctx, IndexQuot.sextOrTrunc(BW));

  if (VarOffsets.empty())
    return Result;

  const auto &VarOffset = VarOffsets.front();
  APInt OffsetQuot;
  APInt::sdivrem(VarOffset.second, APInt(BW, VecElemSize), OffsetQuot, Rem);
  if (Rem != 0 || OffsetQuot.isZero())
    return std::nullopt;

  Result.VarIndex = VarOffset.first;
  if (!OffsetQuot.isOne())
    Result.VarMul = ConstantInt::get(Ctx, OffsetQuot.sextOrTrunc(BW));

  return Result;
}
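// Rewrite a single user of the promoted alloca in terms of the vector value.
// GetCurVal supplies the vector's current value; the rewrite returns the
// updated vector value for stores and transfers, or nullptr when the vector
// is unchanged.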
static Value *promoteAllocaUserToVector(Instruction *Inst,
                                        const DataLayout &DL,
                                        AllocaAnalysis &AA,
                                        unsigned VecStoreSize,
                                        unsigned ElementSize,
                                        function_ref<Value *()> GetCurVal) {
  // InstSimplifyFolder lets the builder fold away the casts and index math it
  // emits whenever the DataLayout makes them trivial.
  IRBuilder<InstSimplifyFolder> Builder(Inst->getContext(),
                                        InstSimplifyFolder(DL));
  Builder.SetInsertPoint(Inst);

  const auto CreateTempPtrIntCast = [&Builder, DL](Value *Val,
                                                   Type *PtrTy) -> Value * {
    assert(DL.getTypeStoreSize(Val->getType()) == DL.getTypeStoreSize(PtrTy));
    const unsigned Size = DL.getTypeStoreSizeInBits(PtrTy);
    if (!PtrTy->isVectorTy())
      return Builder.CreateBitOrPointerCast(Val, Builder.getIntNTy(Size));
    const unsigned NumPtrElts = cast<FixedVectorType>(PtrTy)->getNumElements();
    assert((Size % NumPtrElts == 0) && "Vector size not divisible");
    Type *EltTy = Builder.getIntNTy(Size / NumPtrElts);
    return Builder.CreateBitOrPointerCast(
        Val, FixedVectorType::get(EltTy, NumPtrElts));
  };

  Type *VecEltTy = AA.Vector.Ty->getElementType();

  switch (Inst->getOpcode()) {
  case Instruction::Load: {
    Value *CurVal = GetCurVal();
    Value *Index = calculateVectorIndex(
        cast<LoadInst>(Inst)->getPointerOperand(), AA, Builder);

    // Loading the full vector: cast it directly to the access type.
    Type *AccessTy = Inst->getType();
    TypeSize AccessSize = DL.getTypeStoreSize(AccessTy);
    auto *CI = dyn_cast<ConstantInt>(Index);
    if (CI && CI->isZeroValue() && AccessSize == VecStoreSize) {
      if (AccessTy->isPtrOrPtrVectorTy())
        CurVal = CreateTempPtrIntCast(CurVal, AccessTy);
      else if (CurVal->getType()->isPtrOrPtrVectorTy())
        CurVal = CreateTempPtrIntCast(CurVal, CurVal->getType());
      Value *NewVal = Builder.CreateBitOrPointerCast(CurVal, AccessTy);
      Inst->replaceAllUsesWith(NewVal);
      return nullptr;
    }

    // Loading a subvector.
    if (isa<FixedVectorType>(AccessTy)) {
      assert(AccessSize.isKnownMultipleOf(DL.getTypeStoreSize(VecEltTy)));
      const unsigned NumLoadedElts =
          AccessSize / DL.getTypeStoreSize(VecEltTy);
      auto *SubVecTy = FixedVectorType::get(VecEltTy, NumLoadedElts);
      assert(DL.getTypeStoreSize(SubVecTy) == DL.getTypeStoreSize(AccessTy));

      Value *SubVec = PoisonValue::get(SubVecTy);
      for (unsigned K = 0; K < NumLoadedElts; ++K) {
        Value *CurIdx =
            Builder.CreateAdd(Index, ConstantInt::get(Index->getType(), K));
        SubVec = Builder.CreateInsertElement(
            SubVec, Builder.CreateExtractElement(CurVal, CurIdx), K);
      }

      if (AccessTy->isPtrOrPtrVectorTy())
        SubVec = CreateTempPtrIntCast(SubVec, AccessTy);
      else if (SubVecTy->isPtrOrPtrVectorTy())
        SubVec = CreateTempPtrIntCast(SubVec, SubVecTy);

      SubVec = Builder.CreateBitOrPointerCast(SubVec, AccessTy);
      Inst->replaceAllUsesWith(SubVec);
      return nullptr;
    }

    // Loading a single element.
    Value *ExtractElement = Builder.CreateExtractElement(CurVal, Index);
    if (AccessTy != VecEltTy)
      ExtractElement = Builder.CreateBitOrPointerCast(ExtractElement, AccessTy);

    Inst->replaceAllUsesWith(ExtractElement);
    return nullptr;
  }
  case Instruction::Store: {
    auto *SI = cast<StoreInst>(Inst);
    Value *Index = calculateVectorIndex(SI->getPointerOperand(), AA, Builder);
    Value *Val = SI->getValueOperand();

    // Storing the full vector: no need to read the current value first.
    Type *AccessTy = Val->getType();
    TypeSize AccessSize = DL.getTypeStoreSize(AccessTy);
    auto *CI = dyn_cast<ConstantInt>(Index);
    if (CI && CI->isZeroValue() && AccessSize == VecStoreSize) {
      if (AccessTy->isPtrOrPtrVectorTy())
        Val = CreateTempPtrIntCast(Val, AccessTy);
      else if (AA.Vector.Ty->isPtrOrPtrVectorTy())
        Val = CreateTempPtrIntCast(Val, AA.Vector.Ty);
      return Builder.CreateBitOrPointerCast(Val, AA.Vector.Ty);
    }

    // Storing a subvector.
    if (isa<FixedVectorType>(AccessTy)) {
      assert(AccessSize.isKnownMultipleOf(DL.getTypeStoreSize(VecEltTy)));
      const unsigned NumWrittenElts =
          AccessSize / DL.getTypeStoreSize(VecEltTy);
      const unsigned NumVecElts = AA.Vector.Ty->getNumElements();
      auto *SubVecTy = FixedVectorType::get(VecEltTy, NumWrittenElts);
      assert(DL.getTypeStoreSize(SubVecTy) == DL.getTypeStoreSize(AccessTy));

      if (SubVecTy->isPtrOrPtrVectorTy())
        Val = CreateTempPtrIntCast(Val, SubVecTy);
      else if (AccessTy->isPtrOrPtrVectorTy())
        Val = CreateTempPtrIntCast(Val, AccessTy);

      Val = Builder.CreateBitOrPointerCast(Val, SubVecTy);

      Value *CurVec = GetCurVal();
      for (unsigned K = 0, NumElts = std::min(NumWrittenElts, NumVecElts);
           K < NumElts; ++K) {
        Value *CurIdx =
            Builder.CreateAdd(Index, ConstantInt::get(Index->getType(), K));
        CurVec = Builder.CreateInsertElement(
            CurVec, Builder.CreateExtractElement(Val, K), CurIdx);
      }
      return CurVec;
    }

    // Storing a single element.
    if (Val->getType() != VecEltTy)
      Val = Builder.CreateBitOrPointerCast(Val, VecEltTy);
    return Builder.CreateInsertElement(GetCurVal(), Val, Index);
  }
  case Instruction::Call: {
    if (auto *MTI = dyn_cast<MemTransferInst>(Inst)) {
      // A memcpy/memmove within the alloca becomes a shufflevector.
      ConstantInt *Length = cast<ConstantInt>(MTI->getLength());
      unsigned NumCopied = Length->getZExtValue() / ElementSize;
      MemTransferInfo *TI = &AA.Vector.TransferInfo[MTI];
      unsigned SrcBegin = TI->SrcIndex->getZExtValue();
      unsigned DestBegin = TI->DestIndex->getZExtValue();

      SmallVector<int> Mask;
      for (unsigned Idx = 0; Idx < AA.Vector.Ty->getNumElements(); ++Idx) {
        if (Idx >= DestBegin && Idx < DestBegin + NumCopied) {
          Mask.push_back(SrcBegin < AA.Vector.Ty->getNumElements()
                             ? SrcBegin++
                             : PoisonMaskElem);
        } else {
          Mask.push_back(Idx);
        }
      }
      return Builder.CreateShuffleVector(GetCurVal(), Mask);
    }

    if (auto *MSI = dyn_cast<MemSetInst>(Inst)) {
      // Only whole-alloca memsets reach this point, so the result is a splat
      // of the byte-replicated fill value.
      Value *Elt = MSI->getOperand(1);
      const unsigned BytesPerElt = DL.getTypeStoreSize(VecEltTy);
      if (BytesPerElt > 1) {
        Value *EltBytes = Builder.CreateVectorSplat(BytesPerElt, Elt);
        if (VecEltTy->isPointerTy()) {
          Type *PtrInt = Builder.getIntNTy(BytesPerElt * 8);
          Elt = Builder.CreateBitCast(EltBytes, PtrInt);
          Elt = Builder.CreateIntToPtr(Elt, VecEltTy);
        } else {
          Elt = Builder.CreateBitCast(EltBytes, VecEltTy);
        }
      }
      return Builder.CreateVectorSplat(AA.Vector.Ty->getElementCount(), Elt);
    }

    if (auto *Intr = dyn_cast<IntrinsicInst>(Inst)) {
      if (Intr->getIntrinsicID() == Intrinsic::objectsize) {
        Intr->replaceAllUsesWith(
            Builder.getIntN(Intr->getType()->getIntegerBitWidth(),
                            DL.getTypeAllocSize(AA.Vector.Ty)));
        return nullptr;
      }
    }

    llvm_unreachable("Unsupported call when promoting alloca to vector");
  }
  default:
    llvm_unreachable("Unimplemented promote alloca instruction");
  }
}
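// Whether an access of type AccessTy can be mapped onto whole elements of
// the promoted vector with nothing more than bit/pointer casts.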
static bool isSupportedAccessType(FixedVectorType *VecTy, Type *AccessTy,
                                  const DataLayout &DL) {
  TypeSize AccTS = DL.getTypeStoreSize(AccessTy);
  // If the store size and type size differ, translating subvector accesses
  // would take more than a plain bitcast.
  if (AccTS * 8 != DL.getTypeSizeInBits(AccessTy))
    return false;
  return AccTS.isKnownMultipleOf(DL.getTypeStoreSize(VecTy->getElementType()));
}
template <typename InstContainer>
static void forEachWorkListItem(const InstContainer &WorkList,
                                std::function<void(Instruction *)> Fn) {
  // Bucket the users by block: SSAUpdater only handles cross-block values,
  // so multiple users within one block must be handled here, in order.
  DenseMap<BasicBlock *, SmallDenseSet<Instruction *>> UsesByBlock;
  for (Instruction *User : WorkList)
    UsesByBlock[User->getParent()].insert(User);

  for (Instruction *User : WorkList) {
    BasicBlock *BB = User->getParent();
    auto &BlockUses = UsesByBlock[BB];

    // Already processed, skip.
    if (BlockUses.empty())
      continue;

    // Only user in the block, directly process it.
    if (BlockUses.size() == 1) {
      Fn(User);
      continue;
    }

    // Multiple users in the block, do a linear scan to visit them in order.
    for (Instruction &Inst : *BB) {
      if (!BlockUses.contains(&Inst))
        continue;
      Fn(&Inst);
    }

    // Clear the block so we know it's been processed.
    BlockUses.clear();
  }
}
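// Map the allocated type (including nested arrays and arrays of vectors)
// onto a fixed vector type small enough for the register budget.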
FixedVectorType *
AMDGPUPromoteAllocaImpl::getVectorTypeForAlloca(Type *AllocaTy) const {
  if (DisablePromoteAllocaToVector) {
    LLVM_DEBUG(dbgs() << "  Promote alloca to vector is disabled\n");
    return nullptr;
  }

  auto *VectorTy = dyn_cast<FixedVectorType>(AllocaTy);
  if (auto *ArrayTy = dyn_cast<ArrayType>(AllocaTy)) {
    uint64_t NumElems = 1;
    Type *ElemTy;
    do {
      NumElems *= ArrayTy->getNumElements();
      ElemTy = ArrayTy->getElementType();
    } while ((ArrayTy = dyn_cast<ArrayType>(ElemTy)));

    // Check for an array of vectors.
    auto *InnerVectorTy = dyn_cast<FixedVectorType>(ElemTy);
    if (InnerVectorTy) {
      NumElems *= InnerVectorTy->getNumElements();
      ElemTy = InnerVectorTy->getElementType();
    }

    if (VectorType::isValidElementType(ElemTy) && NumElems > 0) {
      unsigned ElementSize = DL->getTypeSizeInBits(ElemTy) / 8;
      if (ElementSize > 0) {
        unsigned AllocaSize = DL->getTypeStoreSize(AllocaTy);
        // Expand the vector if needed to match the padding of the inner
        // type, i.e. odd-sized subvectors; the new vector's storage size must
        // match the alloca's for byte offsets and GEP computation to stay
        // correct.
        if (NumElems * ElementSize != AllocaSize)
          NumElems = AllocaSize / ElementSize;
        if (NumElems > 0 && (AllocaSize % ElementSize) == 0)
          VectorTy = FixedVectorType::get(ElemTy, NumElems);
      }
    }
  }

  if (!VectorTy) {
    LLVM_DEBUG(dbgs() << "  Cannot convert type to vector\n");
    return nullptr;
  }

  const unsigned MaxElements =
      (MaxVectorRegs * 32) / DL->getTypeSizeInBits(VectorTy->getElementType());

  if (VectorTy->getNumElements() > MaxElements ||
      VectorTy->getNumElements() < 2) {
    LLVM_DEBUG(dbgs() << "  " << *VectorTy
                      << " has an unsupported number of elements\n");
    return nullptr;
  }

  Type *VecEltTy = VectorTy->getElementType();
  unsigned ElementSizeInBits = DL->getTypeSizeInBits(VecEltTy);
  if (ElementSizeInBits != DL->getTypeAllocSizeInBits(VecEltTy)) {
    LLVM_DEBUG(dbgs() << "  Cannot convert to vector if the allocation size "
                         "does not match the type's size\n");
    return nullptr;
  }

  return VectorTy;
}
// Decide whether promotion to a vector is possible: the allocated type must
// map to a small fixed vector and every collected use must be expressible in
// terms of vector element operations.
void AMDGPUPromoteAllocaImpl::analyzePromoteToVector(AllocaAnalysis &AA) const {
  if (AA.HaveSelectOrPHI) {
    LLVM_DEBUG(dbgs() << "  Cannot convert to vector due to select or phi\n");
    return;
  }

  Type *AllocaTy = AA.Alloca->getAllocatedType();
  AA.Vector.Ty = getVectorTypeForAlloca(AllocaTy);
  if (!AA.Vector.Ty)
    return;

  const auto RejectUser = [&AA](Instruction *Inst, StringRef Msg) {
    LLVM_DEBUG(dbgs() << "  Cannot promote alloca to vector: " << Msg << "\n"
                      << "    " << *Inst << "\n");
    AA.Vector.Ty = nullptr;
  };

  Type *VecEltTy = AA.Vector.Ty->getElementType();
  unsigned ElementSize = DL->getTypeSizeInBits(VecEltTy) / 8;

  for (auto *U : AA.Uses) {
    Instruction *Inst = cast<Instruction>(U->getUser());

    if (Value *Ptr = getLoadStorePointerOperand(Inst)) {
      Type *AccessTy = getLoadStoreType(Inst);
      if (AccessTy->isAggregateType())
        return RejectUser(Inst, "unsupported load/store as aggregate");

      // Only simple (non-volatile, non-atomic) accesses are supported.
      bool IsSimple = isa<LoadInst>(Inst) ? cast<LoadInst>(Inst)->isSimple()
                                          : cast<StoreInst>(Inst)->isSimple();
      if (!IsSimple)
        return RejectUser(Inst, "not a simple load or store");

      Ptr = Ptr->stripPointerCasts();

      // Alloca already accessed as a whole vector.
      if (Ptr == AA.Alloca &&
          DL->getTypeStoreSize(AA.Alloca->getAllocatedType()) ==
              DL->getTypeStoreSize(AccessTy)) {
        AA.Vector.Worklist.push_back(Inst);
        continue;
      }

      if (!isSupportedAccessType(AA.Vector.Ty, AccessTy, *DL))
        return RejectUser(Inst, "not a supported access type");

      AA.Vector.Worklist.push_back(Inst);
      continue;
    }

    if (auto *GEP = dyn_cast<GetElementPtrInst>(Inst)) {
      auto Index = computeGEPToVectorIndex(GEP, AA.Alloca, VecEltTy, *DL);
      if (!Index)
        return RejectUser(Inst, "cannot compute vector index for GEP");

      AA.Vector.GEPVectorIdx[GEP] = std::move(Index.value());
      AA.Vector.UsersToRemove.push_back(Inst);
      continue;
    }

    if (auto *MSI = dyn_cast<MemSetInst>(Inst)) {
      if (!isSupportedMemset(MSI, AA.Alloca, *DL))
        return RejectUser(Inst, "unsupported memset");
      AA.Vector.Worklist.push_back(Inst);
      continue;
    }

    if (auto *TransferInst = dyn_cast<MemTransferInst>(Inst)) {
      if (TransferInst->isVolatile())
        return RejectUser(Inst, "mem transfer inst is volatile");

      ConstantInt *Len = dyn_cast<ConstantInt>(TransferInst->getLength());
      if (!Len || (Len->getZExtValue() % ElementSize))
        return RejectUser(Inst, "mem transfer inst length is non-constant or "
                                "not a multiple of the vector element size");

      // A transfer operand must resolve to a constant lane index.
      const auto GetPointerIndex = [&](Value *Ptr) -> ConstantInt * {
        if (Ptr == AA.Alloca)
          return ConstantInt::get(Ptr->getContext(), APInt(32, 0));
        auto *GEP = cast<GetElementPtrInst>(Ptr->stripPointerCasts());
        const auto &GEPI = AA.Vector.GEPVectorIdx.find(GEP)->second;
        if (GEPI.VarIndex)
          return nullptr;
        if (GEPI.ConstIndex)
          return GEPI.ConstIndex;
        return ConstantInt::get(Ptr->getContext(), APInt(32, 0));
      };

      MemTransferInfo *TI =
          &AA.Vector.TransferInfo.try_emplace(TransferInst).first->second;
      unsigned OpNum = U->getOperandNo();
      if (OpNum == 0) {
        Value *Dest = TransferInst->getDest();
        ConstantInt *Index = GetPointerIndex(Dest);
        if (!Index)
          return RejectUser(Inst, "could not calculate constant dest index");
        TI->DestIndex = Index;
      } else {
        assert(OpNum == 1);
        Value *Src = TransferInst->getSource();
        ConstantInt *Index = GetPointerIndex(Src);
        if (!Index)
          return RejectUser(Inst, "could not calculate constant src index");
        TI->SrcIndex = Index;
      }
      continue;
    }

    if (auto *Intr = dyn_cast<IntrinsicInst>(Inst)) {
      if (Intr->getIntrinsicID() == Intrinsic::objectsize) {
        AA.Vector.Worklist.push_back(Inst);
        continue;
      }
    }

    // Ignore assume-like intrinsics and comparisons feeding only into them.
    if (isAssumeLikeIntrinsic(Inst)) {
      if (!Inst->use_empty())
        return RejectUser(Inst, "assume-like intrinsic cannot have any users");
      AA.Vector.UsersToRemove.push_back(Inst);
      continue;
    }

    if (isa<ICmpInst>(Inst) && all_of(Inst->users(), [](User *U) {
          return isAssumeLikeIntrinsic(cast<Instruction>(U));
        })) {
      AA.Vector.UsersToRemove.push_back(Inst);
      continue;
    }

    return RejectUser(Inst, "unhandled alloca user");
  }

  // Both ends of every mem transfer must land in this alloca.
  for (const auto &Entry : AA.Vector.TransferInfo) {
    const MemTransferInfo &TI = Entry.second;
    if (!TI.SrcIndex || !TI.DestIndex)
      return RejectUser(Entry.first,
                        "mem transfer inst between different objects");
    AA.Vector.Worklist.push_back(Entry.first);
  }
}
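// Rewrite the alloca and every collected user into operations on one vector
// value, stitched across blocks with SSAUpdater.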
void AMDGPUPromoteAllocaImpl::promoteAllocaToVector(AllocaAnalysis &AA) {
  LLVM_DEBUG(dbgs() << "  type conversion: " << *AA.Alloca->getAllocatedType()
                    << " -> " << *AA.Vector.Ty << '\n');
  const unsigned VecStoreSize = DL->getTypeStoreSize(AA.Vector.Ty);

  Type *VecEltTy = AA.Vector.Ty->getElementType();
  const unsigned ElementSize = DL->getTypeSizeInBits(VecEltTy) / 8;

  SSAUpdater Updater;
  Updater.Initialize(AA.Vector.Ty, AA.Alloca->getName());
  Updater.AddAvailableValue(AA.Alloca->getParent(),
                            PoisonValue::get(AA.Vector.Ty));

  // Placeholders stand in for a block's incoming vector value while it is
  // still unknown; they are resolved once every user has been rewritten.
  SmallVector<Instruction *> Placeholders;
  forEachWorkListItem(AA.Vector.Worklist, [&](Instruction *I) {
    BasicBlock *BB = I->getParent();
    auto GetCurVal = [&]() -> Value * {
      if (Value *CurVal = Updater.FindValueForBlock(BB))
        return CurVal;
      if (!Placeholders.empty() && Placeholders.back()->getParent() == BB)
        return Placeholders.back();
      IRBuilder<> Builder(I);
      auto *Placeholder = cast<Instruction>(Builder.CreateFreeze(
          PoisonValue::get(AA.Vector.Ty), "promotealloca.placeholder"));
      Placeholders.push_back(Placeholder);
      return Placeholders.back();
    };

    Value *Result = promoteAllocaUserToVector(I, *DL, AA, VecStoreSize,
                                              ElementSize, GetCurVal);
    if (Result)
      Updater.AddAvailableValue(BB, Result);
  });

  for (Instruction *Placeholder : Placeholders) {
    Placeholder->replaceAllUsesWith(
        Updater.GetValueInMiddleOfBlock(Placeholder->getParent()));
    Placeholder->eraseFromParent();
  }

  for (Instruction *I : AA.Vector.Worklist) {
    assert(I->use_empty());
    I->eraseFromParent();
  }
  for (Instruction *I : reverse(AA.Vector.UsersToRemove)) {
    I->dropDroppableUses();
    assert(I->use_empty());
    I->eraseFromParent();
  }

  AA.Alloca->eraseFromParent();
}
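// Workgroup size in the Y and Z dimensions: via the r600 intrinsics, or on
// amdhsa by reading the dispatch packet.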
std::pair<Value *, Value *>
AMDGPUPromoteAllocaImpl::getLocalSizeYZ(IRBuilder<> &Builder) {
  Function &F = *Builder.GetInsertBlock()->getParent();
  const AMDGPUSubtarget &ST = AMDGPUSubtarget::get(TM, F);

  if (!IsAMDHSA) {
    CallInst *LocalSizeY =
        Builder.CreateIntrinsic(Intrinsic::r600_read_local_size_y, {}, {});
    CallInst *LocalSizeZ =
        Builder.CreateIntrinsic(Intrinsic::r600_read_local_size_z, {}, {});

    ST.makeLIDRangeMetadata(LocalSizeY);
    ST.makeLIDRangeMetadata(LocalSizeZ);

    return std::pair(LocalSizeY, LocalSizeZ);
  }

  // On amdhsa the workgroup size lives in the dispatch packet: load the
  // packed size words and extract the Y and Z components.
  CallInst *DispatchPtr =
      Builder.CreateIntrinsic(Intrinsic::amdgcn_dispatch_ptr, {}, {});
  F.removeFnAttr("amdgpu-no-dispatch-ptr");

  // Size of the dispatch packet struct.
  DispatchPtr->addDereferenceableRetAttr(64);

  Type *I32Ty = Type::getInt32Ty(Mod->getContext());

  // workgroup_size_x/y share one dword and workgroup_size_z sits in the next
  // one; two 32-bit loads are easier to CSE than one 64-bit load.
  Value *GEPXY = Builder.CreateConstInBoundsGEP1_64(I32Ty, DispatchPtr, 1);
  LoadInst *LoadXY = Builder.CreateAlignedLoad(I32Ty, GEPXY, Align(4));

  Value *GEPZU = Builder.CreateConstInBoundsGEP1_64(I32Ty, DispatchPtr, 2);
  LoadInst *LoadZU = Builder.CreateAlignedLoad(I32Ty, GEPZU, Align(4));

  MDNode *MD = MDNode::get(Mod->getContext(), {});
  LoadXY->setMetadata(LLVMContext::MD_invariant_load, MD);
  LoadZU->setMetadata(LLVMContext::MD_invariant_load, MD);
  ST.makeLIDRangeMetadata(LoadZU);

  // Extract the Y component; the upper half of LoadZU is already zero.
  Value *Y = Builder.CreateLShr(LoadXY, 16);

  return std::pair(Y, LoadZU);
}
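// Workitem ID in dimension N, with range metadata attached and the
// corresponding "amdgpu-no-workitem-id-*" attribute dropped.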
Value *AMDGPUPromoteAllocaImpl::getWorkitemID(IRBuilder<> &Builder,
                                              unsigned N) {
  Function *F = Builder.GetInsertBlock()->getParent();
  const AMDGPUSubtarget &ST = AMDGPUSubtarget::get(TM, *F);
  Intrinsic::ID IntrID = Intrinsic::not_intrinsic;
  StringRef AttrName;

  switch (N) {
  case 0:
    IntrID = IsAMDGCN ? (Intrinsic::ID)Intrinsic::amdgcn_workitem_id_x
                      : (Intrinsic::ID)Intrinsic::r600_read_tidig_x;
    AttrName = "amdgpu-no-workitem-id-x";
    break;
  case 1:
    IntrID = IsAMDGCN ? (Intrinsic::ID)Intrinsic::amdgcn_workitem_id_y
                      : (Intrinsic::ID)Intrinsic::r600_read_tidig_y;
    AttrName = "amdgpu-no-workitem-id-y";
    break;
  case 2:
    IntrID = IsAMDGCN ? (Intrinsic::ID)Intrinsic::amdgcn_workitem_id_z
                      : (Intrinsic::ID)Intrinsic::r600_read_tidig_z;
    AttrName = "amdgpu-no-workitem-id-z";
    break;
  default:
    llvm_unreachable("invalid dimension");
  }

  CallInst *CI = Builder.CreateIntrinsic(IntrID, {}, {});
  ST.makeLIDRangeMetadata(CI);
  F->removeFnAttr(AttrName);

  return CI;
}
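// Calls are only promotable to LDS when they are one of these
// well-understood intrinsics.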
static bool isCallPromotable(CallInst *CI) {
  IntrinsicInst *II = dyn_cast<IntrinsicInst>(CI);
  if (!II)
    return false;

  switch (II->getIntrinsicID()) {
  case Intrinsic::memcpy:
  case Intrinsic::memmove:
  case Intrinsic::memset:
  case Intrinsic::lifetime_start:
  case Intrinsic::lifetime_end:
  case Intrinsic::invariant_start:
  case Intrinsic::invariant_end:
  case Intrinsic::launder_invariant_group:
  case Intrinsic::strip_invariant_group:
  case Intrinsic::objectsize:
    return true;
  default:
    return false;
  }
}
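// For a select, phi or icmp mixing two pointers, the other pointer must be
// null or derived from the same alloca for promotion to stay sound.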
bool AMDGPUPromoteAllocaImpl::binaryOpIsDerivedFromSameAlloca(
    Value *BaseAlloca, Value *Val, Instruction *Inst, int OpIdx0,
    int OpIdx1) const {
  // Figure out which operand is the one we might not be promoting.
  Value *OtherOp = Inst->getOperand(OpIdx0);
  if (Val == OtherOp)
    OtherOp = Inst->getOperand(OpIdx1);

  if (isa<ConstantPointerNull>(OtherOp))
    return true;

  Value *OtherObj = getUnderlyingObject(OtherOp);
  if (!isa<AllocaInst>(OtherObj))
    return false;

  if (OtherObj != BaseAlloca) {
    LLVM_DEBUG(
        dbgs() << "Found a binary instruction with another alloca object\n");
    return false;
  }

  return true;
}
// Decide whether the alloca may be moved into LDS: only for kernel calling
// conventions, and only if no use lets the pointer escape or requires
// volatile semantics.
void AMDGPUPromoteAllocaImpl::analyzePromoteToLDS(AllocaAnalysis &AA) const {
  if (DisablePromoteAllocaToLDS) {
    LLVM_DEBUG(dbgs() << "  Promote alloca to LDS is disabled\n");
    return;
  }

  const Function &ContainingFunction = *AA.Alloca->getFunction();
  CallingConv::ID CC = ContainingFunction.getCallingConv();

  // Don't promote the alloca to LDS for shader calling conventions: the
  // workitem ID intrinsics are not supported there, and not all of LDS is
  // available for some of the stages.
  switch (CC) {
  case CallingConv::AMDGPU_KERNEL:
  case CallingConv::SPIR_KERNEL:
    break;
  default:
    LLVM_DEBUG(
        dbgs()
        << " promote alloca to LDS not supported with calling convention.\n");
    return;
  }

  for (Use *U : AA.Uses) {
    auto *UseInst = cast<Instruction>(U->getUser());
    if (find(AA.LDS.Worklist, UseInst) == AA.LDS.Worklist.end())
      AA.LDS.Worklist.push_back(UseInst);

    if (auto *CI = dyn_cast<CallInst>(UseInst)) {
      if (!isCallPromotable(CI))
        return;
      continue;
    }

    // Casting the pointer to an integer would let the address escape.
    if (UseInst->getOpcode() == Instruction::PtrToInt)
      return;

    if (auto *LI = dyn_cast<LoadInst>(UseInst)) {
      if (LI->isVolatile())
        return;
    } else if (auto *SI = dyn_cast<StoreInst>(UseInst)) {
      if (SI->isVolatile())
        return;
    } else if (auto *RMW = dyn_cast<AtomicRMWInst>(UseInst)) {
      if (RMW->isVolatile())
        return;
    } else if (auto *CAS = dyn_cast<AtomicCmpXchgInst>(UseInst)) {
      if (CAS->isVolatile())
        return;
    } else if (auto *ICmp = dyn_cast<ICmpInst>(UseInst)) {
      // Both compare operands must be based on this alloca.
      if (!binaryOpIsDerivedFromSameAlloca(AA.Alloca, U->get(), ICmp, 0, 1))
        return;
    } else if (auto *GEP = dyn_cast<GetElementPtrInst>(UseInst)) {
      // An out-of-bounds GEP could address memory outside the LDS slice.
      if (!GEP->isInBounds())
        return;
    }
  }

  AA.LDS.Enable = true;
}
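// Compute how much LDS the module's globals already use from this function
// and whether enough headroom remains at the target occupancy for promotion.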
bool AMDGPUPromoteAllocaImpl::hasSufficientLocalMem(const Function &F) {
  FunctionType *FTy = F.getFunctionType();
  const AMDGPUSubtarget &ST = AMDGPUSubtarget::get(TM, F);

  // If the function has any arguments in the local address space, then it's
  // possible these arguments require the entire local memory space, so we
  // cannot use local memory in the pass.
  for (Type *ParamTy : FTy->params()) {
    PointerType *PtrTy = dyn_cast<PointerType>(ParamTy);
    if (PtrTy && PtrTy->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS) {
      LocalMemLimit = 0;
      LLVM_DEBUG(dbgs() << "Function has local memory argument. Promoting to "
                           "local memory disabled.\n");
      return false;
    }
  }

  LocalMemLimit = ST.getAddressableLocalMemorySize();
  if (LocalMemLimit == 0)
    return false;

  SmallVector<const Constant *, 16> Stack;
  SmallPtrSet<const Constant *, 8> VisitedConstants;
  SmallPtrSet<const GlobalVariable *, 8> UsedLDS;

  auto visitUsers = [&](const GlobalVariable *GV,
                        const Constant *Val) -> bool {
    for (const User *U : Val->users()) {
      if (const Instruction *Use = dyn_cast<Instruction>(U)) {
        if (Use->getFunction() == &F)
          return true;
      } else {
        const Constant *C = cast<Constant>(U);
        if (VisitedConstants.insert(C).second)
          Stack.push_back(C);
      }
    }
    return false;
  };

  for (GlobalVariable &GV : Mod->globals()) {
    if (GV.getAddressSpace() != AMDGPUAS::LOCAL_ADDRESS)
      continue;

    if (visitUsers(&GV, &GV)) {
      UsedLDS.insert(&GV);
      Stack.clear();
      continue;
    }

    // For any ConstantExpr uses, recursively search the users until we see a
    // function.
    while (!Stack.empty()) {
      const Constant *C = Stack.pop_back_val();
      if (visitUsers(&GV, C)) {
        UsedLDS.insert(&GV);
        Stack.clear();
        break;
      }
    }
  }

  const DataLayout &DL = Mod->getDataLayout();
  SmallVector<std::pair<uint64_t, Align>, 16> AllocatedSizes;
  AllocatedSizes.reserve(UsedLDS.size());

  for (const GlobalVariable *GV : UsedLDS) {
    Align Alignment =
        DL.getValueOrABITypeAlignment(GV->getAlign(), GV->getValueType());
    uint64_t AllocSize = DL.getTypeAllocSize(GV->getValueType());

    // HIP uses an extern unsized array in local address space for dynamically
    // allocated shared memory; promotion must be disabled in that case.
    if (GV->hasExternalLinkage() && AllocSize == 0) {
      LocalMemLimit = 0;
      LLVM_DEBUG(dbgs() << "Function has a reference to externally allocated "
                           "local memory. Promoting to local memory "
                           "disabled.\n");
      return false;
    }

    AllocatedSizes.emplace_back(AllocSize, Alignment);
  }

  // Sort by alignment to estimate the worst-case padding.
  sort(AllocatedSizes, llvm::less_second());

  // Check how much local memory is being used by global objects.
  CurrentLocalMemUsage = 0;
  for (auto Alloc : AllocatedSizes) {
    CurrentLocalMemUsage = alignTo(CurrentLocalMemUsage, Alloc.second);
    CurrentLocalMemUsage += Alloc.first;
  }

  unsigned MaxOccupancy =
      ST.getWavesPerEU(ST.getFlatWorkGroupSizes(F), CurrentLocalMemUsage, F)
          .second;

  // Round up to the next tier of usage.
  unsigned MaxSizeWithWaveCount =
      ST.getMaxLocalMemSizeWithWaveCount(MaxOccupancy, F);

  // The program may already use more LDS than is usable at maximum occupancy.
  if (CurrentLocalMemUsage > MaxSizeWithWaveCount)
    return false;

  LocalMemLimit = MaxSizeWithWaveCount;

  LLVM_DEBUG(dbgs() << F.getName() << " uses " << CurrentLocalMemUsage
                    << " bytes of LDS\n"
                    << "  Rounding size to " << MaxSizeWithWaveCount
                    << " with a maximum occupancy of " << MaxOccupancy << '\n'
                    << " and " << (LocalMemLimit - CurrentLocalMemUsage)
                    << " available for promotion\n");

  return true;
}
// Replace the alloca with a per-workitem slice of a workgroup-sized LDS
// array, then rewrite every user for the new address space.
bool AMDGPUPromoteAllocaImpl::tryPromoteAllocaToLDS(
    AllocaAnalysis &AA, bool SufficientLDS,
    SetVector<IntrinsicInst *> &DeferredIntrs) {
  LLVM_DEBUG(dbgs() << "Trying to promote to LDS: " << *AA.Alloca << '\n');

  if (!SufficientLDS)
    return false;

  const DataLayout &DL = Mod->getDataLayout();
  IRBuilder<> Builder(AA.Alloca);

  const Function &ContainingFunction = *AA.Alloca->getParent()->getParent();
  const AMDGPUSubtarget &ST = AMDGPUSubtarget::get(TM, ContainingFunction);

  unsigned WorkGroupSize = ST.getFlatWorkGroupSizes(ContainingFunction).second;

  Align Alignment = DL.getValueOrABITypeAlignment(
      AA.Alloca->getAlign(), AA.Alloca->getAllocatedType());

  uint32_t NewSize = alignTo(CurrentLocalMemUsage, Alignment);
  uint32_t AllocSize =
      WorkGroupSize * DL.getTypeAllocSize(AA.Alloca->getAllocatedType());
  NewSize += AllocSize;

  if (NewSize > LocalMemLimit) {
    LLVM_DEBUG(dbgs() << "  " << AllocSize
                      << " bytes of local memory not available to promote\n");
    return false;
  }

  CurrentLocalMemUsage = NewSize;

  Function *F = AA.Alloca->getFunction();

  Type *GVTy = ArrayType::get(AA.Alloca->getAllocatedType(), WorkGroupSize);
  GlobalVariable *GV = new GlobalVariable(
      *Mod, GVTy, false, GlobalValue::InternalLinkage, PoisonValue::get(GVTy),
      Twine(F->getName()) + Twine('.') + AA.Alloca->getName(), nullptr,
      GlobalVariable::NotThreadLocal, AMDGPUAS::LOCAL_ADDRESS);
  GV->setUnnamedAddr(GlobalValue::UnnamedAddr::Global);
  GV->setAlignment(Alignment);

  Value *TCntY, *TCntZ;
  std::tie(TCntY, TCntZ) = getLocalSizeYZ(Builder);
  Value *TIdX = getWorkitemID(Builder, 0);
  Value *TIdY = getWorkitemID(Builder, 1);
  Value *TIdZ = getWorkitemID(Builder, 2);

  // Flat workitem index: (TIdX * TCntY * TCntZ) + (TIdY * TCntZ) + TIdZ.
  Value *Tmp0 = Builder.CreateMul(TCntY, TCntZ);
  Tmp0 = Builder.CreateMul(Tmp0, TIdX);
  Value *Tmp1 = Builder.CreateMul(TIdY, TCntZ);
  Value *TID = Builder.CreateAdd(Tmp0, Tmp1);
  TID = Builder.CreateAdd(TID, TIdZ);

  LLVMContext &Context = Mod->getContext();
  Value *Indices[] = {Constant::getNullValue(Type::getInt32Ty(Context)), TID};

  Value *Offset = Builder.CreateInBoundsGEP(GVTy, GV, Indices);
  AA.Alloca->mutateType(Offset->getType());
  AA.Alloca->replaceAllUsesWith(Offset);
  AA.Alloca->eraseFromParent();

  PointerType *NewPtrTy = PointerType::get(Context, AMDGPUAS::LOCAL_ADDRESS);

  for (Value *V : AA.LDS.Worklist) {
    CallInst *Call = dyn_cast<CallInst>(V);
    if (!Call) {
      if (ICmpInst *CI = dyn_cast<ICmpInst>(V)) {
        // Retype constant null operands of compares.
        if (isa<ConstantPointerNull>(CI->getOperand(0)))
          CI->setOperand(0, ConstantPointerNull::get(NewPtrTy));
        if (isa<ConstantPointerNull>(CI->getOperand(1)))
          CI->setOperand(1, ConstantPointerNull::get(NewPtrTy));
        continue;
      }

      assert(V->getType()->isPtrOrPtrVectorTy());

      Type *NewTy = V->getType()->getWithNewType(NewPtrTy);
      V->mutateType(NewTy);

      // Retype constant null operands of selects and phis as well.
      if (SelectInst *SI = dyn_cast<SelectInst>(V)) {
        if (isa<ConstantPointerNull>(SI->getOperand(1)))
          SI->setOperand(1, ConstantPointerNull::get(NewPtrTy));
        if (isa<ConstantPointerNull>(SI->getOperand(2)))
          SI->setOperand(2, ConstantPointerNull::get(NewPtrTy));
      } else if (PHINode *Phi = dyn_cast<PHINode>(V)) {
        for (unsigned I = 0, E = Phi->getNumIncomingValues(); I != E; ++I) {
          if (isa<ConstantPointerNull>(Phi->getIncomingValue(I)))
            Phi->setIncomingValue(I, ConstantPointerNull::get(NewPtrTy));
        }
      }

      continue;
    }

    IntrinsicInst *Intr = cast<IntrinsicInst>(Call);
    Builder.SetInsertPoint(Intr);
    switch (Intr->getIntrinsicID()) {
    case Intrinsic::lifetime_start:
    case Intrinsic::lifetime_end:
      // These intrinsics are for address space 0 only.
      Intr->eraseFromParent();
      continue;
    case Intrinsic::memcpy:
    case Intrinsic::memmove:
      // These have two pointer operands; defer them until every value has
      // its final type.
      DeferredIntrs.insert(Intr);
      continue;
    case Intrinsic::memset: {
      MemSetInst *MemSet = cast<MemSetInst>(Intr);
      Builder.CreateMemSet(MemSet->getRawDest(), MemSet->getValue(),
                           MemSet->getLength(), MemSet->getDestAlign(),
                           MemSet->isVolatile());
      Intr->eraseFromParent();
      continue;
    }
    case Intrinsic::invariant_start:
    case Intrinsic::invariant_end:
    case Intrinsic::launder_invariant_group:
    case Intrinsic::strip_invariant_group: {
      // Re-emit the intrinsic against the new pointer type.
      SmallVector<Value *> Args;
      if (Intr->getIntrinsicID() == Intrinsic::invariant_start) {
        Args.emplace_back(Intr->getArgOperand(0));
      } else if (Intr->getIntrinsicID() == Intrinsic::invariant_end) {
        Args.emplace_back(Intr->getArgOperand(0));
        Args.emplace_back(Intr->getArgOperand(1));
      }
      Args.emplace_back(Offset);
      Function *Decl = Intrinsic::getOrInsertDeclaration(
          Mod, Intr->getIntrinsicID(), Offset->getType());
      CallInst *NewIntr =
          CallInst::Create(Decl, Args, Intr->getName(), Intr->getIterator());
      Intr->mutateType(NewIntr->getType());
      Intr->replaceAllUsesWith(NewIntr);
      Intr->eraseFromParent();
      continue;
    }
    case Intrinsic::objectsize: {
      Value *Src = Intr->getOperand(0);
      Function *ObjectSize = Intrinsic::getOrInsertDeclaration(
          Mod, Intrinsic::objectsize, {Intr->getType(), NewPtrTy});
      CallInst *NewCall = Builder.CreateCall(
          ObjectSize,
          {Src, Intr->getOperand(1), Intr->getOperand(2), Intr->getOperand(3)});
      Intr->replaceAllUsesWith(NewCall);
      Intr->eraseFromParent();
      continue;
    }
    default:
      Intr->print(errs());
      llvm_unreachable("Don't know how to promote alloca intrinsic use.");
    }
  }

  return true;
}
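// memcpy/memmove were deferred because both pointer operands may have been
// retyped; re-emit them now that every value has its final type.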
void AMDGPUPromoteAllocaImpl::finishDeferredAllocaToLDSPromotion(
    SetVector<IntrinsicInst *> &DeferredIntrs) {
  IRBuilder<> Builder(Mod->getContext());
  for (IntrinsicInst *Intr : DeferredIntrs) {
    Builder.SetInsertPoint(Intr);
    Intrinsic::ID ID = Intr->getIntrinsicID();
    assert(ID == Intrinsic::memcpy || ID == Intrinsic::memmove);

    MemTransferInst *MI = cast<MemTransferInst>(Intr);
    auto *B = Builder.CreateMemTransferInst(
        ID, MI->getRawDest(), MI->getDestAlign(), MI->getRawSource(),
        MI->getSourceAlign(), MI->getLength(), MI->isVolatile());

    // Carry over the dereferenceable bytes on both pointer arguments.
    for (unsigned I = 0; I != 2; ++I) {
      if (uint64_t Bytes = Intr->getParamDereferenceableBytes(I))
        B->addDereferenceableParamAttr(I, Bytes);
    }

    Intr->eraseFromParent();
  }
}