104#include "llvm/IR/IntrinsicsAMDGPU.h"
115#define DEBUG_TYPE "amdgpu-sw-lower-lds"
116#define COV5_HIDDEN_DYN_LDS_SIZE_ARG 15
119using namespace AMDGPU;
124 AsanInstrumentLDS(
"amdgpu-asan-instrument-lds",
125 cl::desc(
"Run asan instrumentation on LDS instructions "
126 "lowered to global memory"),
131struct LDSAccessTypeInfo {
139struct KernelLDSParameters {
143 LDSAccessTypeInfo DirectAccess;
144 LDSAccessTypeInfo IndirectAccess;
146 LDSToReplacementIndicesMap;
154struct NonKernelLDSParameters {
161struct AsanInstrumentInfo {
167struct FunctionsAndLDSAccess {
175class AMDGPUSwLowerLDS {
178 DomTreeCallback Callback)
179 : M(
Mod), AMDGPUTM(TM), IRB(M.getContext()), DTCallback(Callback) {}
181 void getUsesOfLDSByNonKernels();
182 void getNonKernelsWithLDSArguments(
const CallGraph &CG);
187 void buildSwLDSGlobal(
Function *Func);
188 void buildSwDynLDSGlobal(
Function *Func);
189 void populateSwMetadataGlobal(
Function *Func);
190 void populateSwLDSAttributeAndMetadata(
Function *Func);
191 void populateLDSToReplacementIndicesMap(
Function *Func);
192 void getLDSMemoryInstructions(
Function *Func,
194 void replaceKernelLDSAccesses(
Function *Func);
195 Value *getTranslatedGlobalMemoryGEPOfLDSPointer(
Value *LoadMallocPtr,
197 void translateLDSMemoryOperationsToGlobalMemory(
202 void buildNonKernelLDSOffsetTable(NonKernelLDSParameters &NKLDSParams);
203 void buildNonKernelLDSBaseTable(NonKernelLDSParameters &NKLDSParams);
205 getAddressesOfVariablesInKernel(
Function *Func,
207 void lowerNonKernelLDSAccesses(
Function *Func,
209 NonKernelLDSParameters &NKLDSParams);
211 updateMallocSizeForDynamicLDS(
Function *Func,
Value **CurrMallocSize,
212 Value *HiddenDynLDSSize,
220 DomTreeCallback DTCallback;
221 FunctionsAndLDSAccess FuncLDSAccessInfo;
222 AsanInstrumentInfo AsanInfo;
225template <
typename T>
SetVector<T> sortByName(std::vector<T> &&V) {
228 sort(V, [](
const auto *L,
const auto *R) {
229 return L->getName() < R->getName();
238 std::vector<GlobalVariable *>(Variables.
begin(), Variables.
end()));
246 if (Kernels.size() > UINT32_MAX) {
250 sortByName(std::vector<Function *>(Kernels.begin(), Kernels.end()));
251 for (
size_t i = 0; i < Kernels.size(); i++) {
256 Func->setMetadata(
"llvm.amdgcn.lds.kernel.id",
259 return OrderedKernels;
262void AMDGPUSwLowerLDS::getNonKernelsWithLDSArguments(
const CallGraph &CG) {
266 for (
auto &K : FuncLDSAccessInfo.KernelToLDSParametersMap) {
271 for (
auto &
I : *CGN) {
280 Type *ArgTy = (*AI).getType();
285 FuncLDSAccessInfo.NonKernelsWithLDSArgument.insert(CalledFunc);
288 FuncLDSAccessInfo.KernelsWithIndirectLDSAccess.insert(Func);
294void AMDGPUSwLowerLDS::getUsesOfLDSByNonKernels() {
295 for (
GlobalVariable *GV : FuncLDSAccessInfo.AllNonKernelLDSAccess) {
300 if (
auto *
I = dyn_cast<Instruction>(V)) {
302 if (!
isKernelLDS(
F) &&
F->hasFnAttribute(Attribute::SanitizeAddress) &&
304 FuncLDSAccessInfo.NonKernelToLDSAccessMap[
F].insert(GV);
318 ConstantInt::get(IntTy,
Address + 1));
319 GV->
setMetadata(LLVMContext::MD_absolute_symbol, MetadataNode);
330 Func->addFnAttr(
"amdgpu-lds-size", Buffer);
336 IRBuilder<> Builder(Entry, Entry->getFirstNonPHIIt());
339 Intrinsic::donothing, {});
341 Value *UseInstance[1] = {
348void AMDGPUSwLowerLDS::buildSwLDSGlobal(
Function *Func) {
351 auto &LDSParams = FuncLDSAccessInfo.KernelToLDSParametersMap[Func];
359 LDSParams.SwLDS->setSanitizerMetadata(MD);
362void AMDGPUSwLowerLDS::buildSwDynLDSGlobal(
Function *Func) {
364 auto &LDSParams = FuncLDSAccessInfo.KernelToLDSParametersMap[Func];
365 if (LDSParams.DirectAccess.DynamicLDSGlobals.empty() &&
366 LDSParams.IndirectAccess.DynamicLDSGlobals.empty())
369 auto *emptyCharArray = ArrayType::get(IRB.getInt8Ty(), 0);
372 "llvm.amdgcn." + Func->getName() +
".dynlds",
nullptr,
374 markUsedByKernel(Func, LDSParams.SwDynLDS);
377 LDSParams.SwDynLDS->setSanitizerMetadata(MD);
380void AMDGPUSwLowerLDS::populateSwLDSAttributeAndMetadata(
Function *Func) {
381 auto &LDSParams = FuncLDSAccessInfo.KernelToLDSParametersMap[Func];
382 bool IsDynLDSUsed = LDSParams.SwDynLDS ?
true :
false;
384 recordLDSAbsoluteAddress(M, LDSParams.SwLDS, 0);
385 addLDSSizeAttribute(Func,
Offset, IsDynLDSUsed);
386 if (LDSParams.SwDynLDS)
387 recordLDSAbsoluteAddress(M, LDSParams.SwDynLDS,
Offset);
390void AMDGPUSwLowerLDS::populateSwMetadataGlobal(
Function *Func) {
393 auto &LDSParams = FuncLDSAccessInfo.KernelToLDSParametersMap[Func];
394 auto &Ctx = M.getContext();
395 auto &
DL = M.getDataLayout();
396 std::vector<Type *> Items;
398 std::vector<Constant *> Initializers;
399 Align MaxAlignment(1);
402 MaxAlignment = std::max(MaxAlignment, GVAlign);
405 for (
GlobalVariable *GV : LDSParams.DirectAccess.StaticLDSGlobals)
406 UpdateMaxAlignment(GV);
408 for (
GlobalVariable *GV : LDSParams.DirectAccess.DynamicLDSGlobals)
409 UpdateMaxAlignment(GV);
411 for (
GlobalVariable *GV : LDSParams.IndirectAccess.StaticLDSGlobals)
412 UpdateMaxAlignment(GV);
414 for (
GlobalVariable *GV : LDSParams.IndirectAccess.DynamicLDSGlobals)
415 UpdateMaxAlignment(GV);
420 MDItemOS <<
"llvm.amdgcn.sw.lds." << Func->getName() <<
".md.item";
424 uint32_t &MallocSize = LDSParams.MallocSize;
426 int AsanScale = AsanInfo.Scale;
427 auto buildInitializerForSwLDSMD =
429 for (
auto &GV : LDSGlobals) {
432 UniqueLDSGlobals.
insert(GV);
435 const uint64_t SizeInBytes =
DL.getTypeAllocSize(Ty);
437 Constant *ItemStartOffset = ConstantInt::get(Int32Ty, MallocSize);
438 Constant *SizeInBytesConst = ConstantInt::get(Int32Ty, SizeInBytes);
443 MallocSize += SizeInBytes;
445 LDSParams.RedzoneOffsetAndSizeVector.emplace_back(MallocSize,
447 MallocSize += RightRedzoneSize;
450 alignTo(SizeInBytes + RightRedzoneSize, MaxAlignment);
452 ConstantInt::get(Int32Ty, AlignedSize);
454 MallocSize =
alignTo(MallocSize, MaxAlignment);
457 AlignedSizeInBytesConst});
458 Initializers.push_back(InitItem);
462 SwLDSVector.
insert(LDSParams.SwLDS);
463 buildInitializerForSwLDSMD(SwLDSVector);
464 buildInitializerForSwLDSMD(LDSParams.DirectAccess.StaticLDSGlobals);
465 buildInitializerForSwLDSMD(LDSParams.IndirectAccess.StaticLDSGlobals);
466 buildInitializerForSwLDSMD(LDSParams.DirectAccess.DynamicLDSGlobals);
467 buildInitializerForSwLDSMD(LDSParams.IndirectAccess.DynamicLDSGlobals);
470 Type *Ty = LDSParams.SwLDS->getValueType();
471 const uint64_t SizeInBytes =
DL.getTypeAllocSize(Ty);
473 LDSParams.LDSSize = AlignedSize;
476 MDTypeOS <<
"llvm.amdgcn.sw.lds." << Func->getName() <<
".md.type";
481 MDOS <<
"llvm.amdgcn.sw.lds." << Func->getName() <<
".md";
487 LDSParams.SwLDSMetadata->setInitializer(data);
490 LDSParams.SwLDS->setAlignment(MaxAlignment);
491 if (LDSParams.SwDynLDS)
492 LDSParams.SwDynLDS->setAlignment(MaxAlignment);
495 LDSParams.SwLDSMetadata->setSanitizerMetadata(MD);
498void AMDGPUSwLowerLDS::populateLDSToReplacementIndicesMap(
Function *Func) {
501 auto &LDSParams = FuncLDSAccessInfo.KernelToLDSParametersMap[Func];
505 for (
auto &GV : LDSGlobals) {
508 UniqueLDSGlobals.
insert(GV);
509 LDSParams.LDSToReplacementIndicesMap[GV] = {0,
Idx, 0};
515 SwLDSVector.
insert(LDSParams.SwLDS);
516 PopulateIndices(SwLDSVector,
Idx);
517 PopulateIndices(LDSParams.DirectAccess.StaticLDSGlobals,
Idx);
518 PopulateIndices(LDSParams.IndirectAccess.StaticLDSGlobals,
Idx);
519 PopulateIndices(LDSParams.DirectAccess.DynamicLDSGlobals,
Idx);
520 PopulateIndices(LDSParams.IndirectAccess.DynamicLDSGlobals,
Idx);
524 Value *Replacement) {
526 auto ReplaceUsesLambda = [Func](
const Use &U) ->
bool {
527 auto *V = U.getUser();
528 if (
auto *Inst = dyn_cast<Instruction>(V)) {
529 auto *Func1 = Inst->getParent()->getParent();
538void AMDGPUSwLowerLDS::replaceKernelLDSAccesses(
Function *Func) {
539 auto &LDSParams = FuncLDSAccessInfo.KernelToLDSParametersMap[Func];
547 auto &IndirectAccess = LDSParams.IndirectAccess;
548 auto &DirectAccess = LDSParams.DirectAccess;
552 for (
auto &GV : LDSGlobals) {
555 if ((IndirectAccess.StaticLDSGlobals.contains(GV) ||
556 IndirectAccess.DynamicLDSGlobals.contains(GV)) &&
557 (!DirectAccess.StaticLDSGlobals.contains(GV) &&
558 !DirectAccess.DynamicLDSGlobals.contains(GV)))
562 UniqueLDSGlobals.
insert(GV);
563 auto &Indices = LDSParams.LDSToReplacementIndicesMap[GV];
564 assert(Indices.size() == 3);
565 Constant *GEPIdx[] = {ConstantInt::get(Int32Ty, Indices[0]),
566 ConstantInt::get(Int32Ty, Indices[1]),
567 ConstantInt::get(Int32Ty, Indices[2])};
569 SwLDSMetadataStructType, SwLDSMetadata, GEPIdx,
true);
571 Value *BasePlusOffset =
572 IRB.CreateInBoundsGEP(IRB.getInt8Ty(), SwLDS, {Offset});
575 replacesUsesOfGlobalInFunction(Func, GV, BasePlusOffset);
578 ReplaceLDSGlobalUses(DirectAccess.StaticLDSGlobals);
579 ReplaceLDSGlobalUses(IndirectAccess.StaticLDSGlobals);
580 ReplaceLDSGlobalUses(DirectAccess.DynamicLDSGlobals);
581 ReplaceLDSGlobalUses(IndirectAccess.DynamicLDSGlobals);
584void AMDGPUSwLowerLDS::updateMallocSizeForDynamicLDS(
587 auto &LDSParams = FuncLDSAccessInfo.KernelToLDSParametersMap[Func];
592 assert(SwLDS && SwLDSMetadata);
596 Value *MaxAlignValue = IRB.getInt32(MaxAlignment);
597 Value *MaxAlignValueMinusOne = IRB.getInt32(MaxAlignment - 1);
600 auto &Indices = LDSParams.LDSToReplacementIndicesMap[DynGV];
602 Constant *Index0 = ConstantInt::get(Int32Ty, 0);
603 Constant *Index1 = ConstantInt::get(Int32Ty, Indices[1]);
605 Constant *Index2Offset = ConstantInt::get(Int32Ty, 0);
606 auto *GEPForOffset = IRB.CreateInBoundsGEP(
607 MetadataStructType, SwLDSMetadata, {Index0, Index1, Index2Offset});
609 IRB.CreateStore(*CurrMallocSize, GEPForOffset);
611 Constant *Index2Size = ConstantInt::get(Int32Ty, 1);
612 auto *GEPForSize = IRB.CreateInBoundsGEP(MetadataStructType, SwLDSMetadata,
613 {Index0, Index1, Index2Size});
615 Value *CurrDynLDSSize = IRB.CreateLoad(Int32Ty, HiddenDynLDSSize);
616 IRB.CreateStore(CurrDynLDSSize, GEPForSize);
617 Constant *Index2AlignedSize = ConstantInt::get(Int32Ty, 2);
618 auto *GEPForAlignedSize = IRB.CreateInBoundsGEP(
619 MetadataStructType, SwLDSMetadata, {Index0, Index1, Index2AlignedSize});
621 Value *AlignedDynLDSSize =
622 IRB.CreateAdd(CurrDynLDSSize, MaxAlignValueMinusOne);
623 AlignedDynLDSSize = IRB.CreateUDiv(AlignedDynLDSSize, MaxAlignValue);
624 AlignedDynLDSSize = IRB.CreateMul(AlignedDynLDSSize, MaxAlignValue);
625 IRB.CreateStore(AlignedDynLDSSize, GEPForAlignedSize);
628 *CurrMallocSize = IRB.CreateAdd(*CurrMallocSize, AlignedDynLDSSize);
638 return DILocation::get(SP->
getContext(), SP->getLine(), 1, SP);
642void AMDGPUSwLowerLDS::getLDSMemoryInstructions(
646 if (
LoadInst *LI = dyn_cast<LoadInst>(&Inst)) {
648 LDSInstructions.
insert(&Inst);
649 }
else if (
StoreInst *SI = dyn_cast<StoreInst>(&Inst)) {
651 LDSInstructions.
insert(&Inst);
652 }
else if (
AtomicRMWInst *RMW = dyn_cast<AtomicRMWInst>(&Inst)) {
654 LDSInstructions.
insert(&Inst);
657 LDSInstructions.
insert(&Inst);
665AMDGPUSwLowerLDS::getTranslatedGlobalMemoryGEPOfLDSPointer(
Value *LoadMallocPtr,
667 assert(LDSPtr &&
"Invalid LDS pointer operand");
668 Value *PtrToInt = IRB.CreatePtrToInt(LDSPtr, IRB.getInt32Ty());
670 IRB.CreateInBoundsGEP(IRB.getInt8Ty(), LoadMallocPtr, {PtrToInt});
674void AMDGPUSwLowerLDS::translateLDSMemoryOperationsToGlobalMemory(
677 LLVM_DEBUG(
dbgs() <<
"Translating LDS memory operations to global memory : "
680 IRB.SetInsertPoint(Inst);
681 if (
LoadInst *LI = dyn_cast<LoadInst>(Inst)) {
682 Value *LIOperand = LI->getPointerOperand();
684 getTranslatedGlobalMemoryGEPOfLDSPointer(LoadMallocPtr, LIOperand);
685 LoadInst *NewLI = IRB.CreateAlignedLoad(LI->getType(), Replacement,
686 LI->getAlign(), LI->isVolatile());
687 NewLI->
setAtomic(LI->getOrdering(), LI->getSyncScopeID());
688 AsanInfo.Instructions.insert(NewLI);
689 LI->replaceAllUsesWith(NewLI);
690 LI->eraseFromParent();
691 }
else if (
StoreInst *SI = dyn_cast<StoreInst>(Inst)) {
692 Value *SIOperand = SI->getPointerOperand();
694 getTranslatedGlobalMemoryGEPOfLDSPointer(LoadMallocPtr, SIOperand);
695 StoreInst *NewSI = IRB.CreateAlignedStore(
696 SI->getValueOperand(), Replacement, SI->getAlign(), SI->isVolatile());
697 NewSI->
setAtomic(SI->getOrdering(), SI->getSyncScopeID());
698 AsanInfo.Instructions.insert(NewSI);
699 SI->replaceAllUsesWith(NewSI);
700 SI->eraseFromParent();
701 }
else if (
AtomicRMWInst *RMW = dyn_cast<AtomicRMWInst>(Inst)) {
702 Value *RMWPtrOperand = RMW->getPointerOperand();
703 Value *RMWValOperand = RMW->getValOperand();
704 Value *Replacement = getTranslatedGlobalMemoryGEPOfLDSPointer(
705 LoadMallocPtr, RMWPtrOperand);
707 RMW->getOperation(), Replacement, RMWValOperand, RMW->getAlign(),
708 RMW->getOrdering(), RMW->getSyncScopeID());
710 AsanInfo.Instructions.insert(NewRMW);
711 RMW->replaceAllUsesWith(NewRMW);
712 RMW->eraseFromParent();
714 Value *XCHGPtrOperand = XCHG->getPointerOperand();
715 Value *Replacement = getTranslatedGlobalMemoryGEPOfLDSPointer(
716 LoadMallocPtr, XCHGPtrOperand);
718 Replacement, XCHG->getCompareOperand(), XCHG->getNewValOperand(),
719 XCHG->getAlign(), XCHG->getSuccessOrdering(),
720 XCHG->getFailureOrdering(), XCHG->getSyncScopeID());
722 AsanInfo.Instructions.insert(NewXCHG);
723 XCHG->replaceAllUsesWith(NewXCHG);
724 XCHG->eraseFromParent();
730void AMDGPUSwLowerLDS::poisonRedzones(
Function *Func,
Value *MallocPtr) {
731 auto &LDSParams = FuncLDSAccessInfo.KernelToLDSParametersMap[Func];
735 "__asan_poison_region",
736 FunctionType::get(VoidTy, {Int64Ty, Int64Ty},
false));
738 auto RedzonesVec = LDSParams.RedzoneOffsetAndSizeVector;
739 size_t VecSize = RedzonesVec.size();
740 for (
unsigned i = 0; i < VecSize; i++) {
741 auto &RedzonePair = RedzonesVec[i];
742 uint64_t RedzoneOffset = RedzonePair.first;
743 uint64_t RedzoneSize = RedzonePair.second;
744 Value *RedzoneAddrOffset = IRB.CreateInBoundsGEP(
745 IRB.getInt8Ty(), MallocPtr, {IRB.getInt64(RedzoneOffset)});
746 Value *RedzoneAddress = IRB.CreatePtrToInt(RedzoneAddrOffset, Int64Ty);
747 IRB.CreateCall(AsanPoisonRegion,
748 {RedzoneAddress, IRB.getInt64(RedzoneSize)});
752void AMDGPUSwLowerLDS::lowerKernelLDSAccesses(
Function *Func,
754 LLVM_DEBUG(
dbgs() <<
"Sw Lowering Kernel LDS for : " << Func->getName());
755 auto &LDSParams = FuncLDSAccessInfo.KernelToLDSParametersMap[Func];
756 auto &Ctx = M.getContext();
757 auto *PrevEntryBlock = &Func->getEntryBlock();
759 getLDSMemoryInstructions(Func, LDSInstructions);
767 IRB.SetInsertPoint(WIdBlock, WIdBlock->begin());
770 IRB.SetCurrentDebugLocation(FirstDL);
771 Value *WIdx = IRB.CreateIntrinsic(Intrinsic::amdgcn_workitem_id_x, {}, {});
772 Value *WIdy = IRB.CreateIntrinsic(Intrinsic::amdgcn_workitem_id_y, {}, {});
773 Value *WIdz = IRB.CreateIntrinsic(Intrinsic::amdgcn_workitem_id_z, {}, {});
774 Value *XYOr = IRB.CreateOr(WIdx, WIdy);
775 Value *XYZOr = IRB.CreateOr(XYOr, WIdz);
776 Value *WIdzCond = IRB.CreateICmpEQ(XYZOr, IRB.getInt32(0));
780 IRB.CreateCondBr(WIdzCond, MallocBlock, PrevEntryBlock);
783 IRB.SetInsertPoint(MallocBlock, MallocBlock->begin());
790 assert(SwLDS && SwLDSMetadata);
794 Value *CurrMallocSize;
800 for (
auto &GV : LDSGlobals) {
803 UniqueLDSGlobals.
insert(GV);
807 GetUniqueLDSGlobals(LDSParams.DirectAccess.StaticLDSGlobals);
808 GetUniqueLDSGlobals(LDSParams.IndirectAccess.StaticLDSGlobals);
809 unsigned NumStaticLDS = 1 + UniqueLDSGlobals.
size();
810 UniqueLDSGlobals.
clear();
813 auto *GEPForEndStaticLDSOffset =
814 IRB.CreateInBoundsGEP(MetadataStructType, SwLDSMetadata,
815 {ConstantInt::get(Int32Ty, 0),
816 ConstantInt::get(Int32Ty, NumStaticLDS - 1),
817 ConstantInt::get(Int32Ty, 0)});
819 auto *GEPForEndStaticLDSSize =
820 IRB.CreateInBoundsGEP(MetadataStructType, SwLDSMetadata,
821 {ConstantInt::get(Int32Ty, 0),
822 ConstantInt::get(Int32Ty, NumStaticLDS - 1),
823 ConstantInt::get(Int32Ty, 2)});
825 Value *EndStaticLDSOffset =
826 IRB.CreateLoad(Int32Ty, GEPForEndStaticLDSOffset);
827 Value *EndStaticLDSSize = IRB.CreateLoad(Int32Ty, GEPForEndStaticLDSSize);
828 CurrMallocSize = IRB.CreateAdd(EndStaticLDSOffset, EndStaticLDSSize);
830 CurrMallocSize = IRB.getInt32(MallocSize);
832 if (LDSParams.SwDynLDS) {
835 "Dynamic LDS size query is only supported for CO V5 and later.");
838 IRB.CreateIntrinsic(Intrinsic::amdgcn_implicitarg_ptr, {}, {});
839 Value *HiddenDynLDSSize = IRB.CreateInBoundsGEP(
840 ImplicitArg->
getType(), ImplicitArg,
842 UniqueLDSGlobals.
clear();
843 GetUniqueLDSGlobals(LDSParams.DirectAccess.DynamicLDSGlobals);
844 GetUniqueLDSGlobals(LDSParams.IndirectAccess.DynamicLDSGlobals);
845 updateMallocSizeForDynamicLDS(Func, &CurrMallocSize, HiddenDynLDSSize,
849 CurrMallocSize = IRB.CreateZExt(CurrMallocSize, Int64Ty);
853 Value *ReturnAddress =
854 IRB.CreateIntrinsic(Intrinsic::returnaddress, {}, {IRB.getInt32(0)});
857 FunctionType::get(Int64Ty, {Int64Ty, Int64Ty},
false));
858 Value *RAPtrToInt = IRB.CreatePtrToInt(ReturnAddress, Int64Ty);
859 Value *MallocCall = IRB.CreateCall(MallocFunc, {CurrMallocSize, RAPtrToInt});
865 IRB.CreateStore(MallocPtr, SwLDS);
868 poisonRedzones(Func, MallocPtr);
871 IRB.CreateBr(PrevEntryBlock);
875 IRB.SetInsertPoint(PrevEntryBlock, PrevEntryBlock->begin());
876 auto *XYZCondPhi = IRB.CreatePHI(Int1Ty, 2,
"xyzCond");
877 XYZCondPhi->addIncoming(IRB.getInt1(0), WIdBlock);
878 XYZCondPhi->addIncoming(IRB.getInt1(1), MallocBlock);
880 IRB.CreateIntrinsic(Intrinsic::amdgcn_s_barrier, {}, {});
883 Value *LoadMallocPtr =
887 replaceKernelLDSAccesses(Func);
891 translateLDSMemoryOperationsToGlobalMemory(Func, LoadMallocPtr,
899 if (
ReturnInst *RI = dyn_cast<ReturnInst>(&BB.back())) {
900 RI->eraseFromParent();
901 IRB.SetInsertPoint(&BB, BB.end());
902 IRB.CreateBr(CondFreeBlock);
908 IRB.SetInsertPoint(CondFreeBlock, CondFreeBlock->begin());
909 IRB.CreateIntrinsic(Intrinsic::amdgcn_s_barrier, {}, {});
910 IRB.CreateCondBr(XYZCondPhi, FreeBlock, EndBlock);
913 IRB.SetInsertPoint(FreeBlock, FreeBlock->begin());
918 FunctionType::get(IRB.getVoidTy(), {Int64Ty, Int64Ty},
false));
920 IRB.CreateIntrinsic(Intrinsic::returnaddress, {}, IRB.getInt32(0));
921 Value *RAPToInt = IRB.CreatePtrToInt(ReturnAddr, Int64Ty);
922 Value *MallocPtrToInt = IRB.CreatePtrToInt(LoadMallocPtr, Int64Ty);
923 IRB.CreateCall(AsanFreeFunc, {MallocPtrToInt, RAPToInt});
925 IRB.CreateBr(EndBlock);
928 IRB.SetInsertPoint(EndBlock, EndBlock->begin());
931 DTU.
applyUpdates({{DominatorTree::Insert, WIdBlock, MallocBlock},
932 {DominatorTree::Insert, MallocBlock, PrevEntryBlock},
933 {DominatorTree::Insert, CondFreeBlock, FreeBlock},
934 {DominatorTree::Insert, FreeBlock, EndBlock}});
937Constant *AMDGPUSwLowerLDS::getAddressesOfVariablesInKernel(
940 auto &LDSParams = FuncLDSAccessInfo.KernelToLDSParametersMap[Func];
944 auto *SwLDSMetadataStructType =
950 for (
auto *GV : Variables) {
951 if (!LDSParams.LDSToReplacementIndicesMap.contains(GV)) {
956 auto &Indices = LDSParams.LDSToReplacementIndicesMap[GV];
957 Constant *GEPIdx[] = {ConstantInt::get(Int32Ty, Indices[0]),
958 ConstantInt::get(Int32Ty, Indices[1]),
959 ConstantInt::get(Int32Ty, Indices[2])};
961 SwLDSMetadata, GEPIdx,
true);
962 Elements.push_back(
GEP);
967void AMDGPUSwLowerLDS::buildNonKernelLDSBaseTable(
968 NonKernelLDSParameters &NKLDSParams) {
972 auto &Kernels = NKLDSParams.OrderedKernels;
976 const size_t NumberKernels = Kernels.size();
979 std::vector<Constant *> OverallConstantExprElts(NumberKernels);
980 for (
size_t i = 0; i < NumberKernels; i++) {
982 auto &LDSParams = FuncLDSAccessInfo.KernelToLDSParametersMap[Func];
985 Constant *GEPIdx[] = {ConstantInt::get(Int32Ty, 0)};
988 OverallConstantExprElts[i] =
GEP;
998 NKLDSParams.LDSBaseTable->setSanitizerMetadata(MD);
1001void AMDGPUSwLowerLDS::buildNonKernelLDSOffsetTable(
1002 NonKernelLDSParameters &NKLDSParams) {
1010 auto &Variables = NKLDSParams.OrdereLDSGlobals;
1011 auto &Kernels = NKLDSParams.OrderedKernels;
1012 if (Variables.
empty() || Kernels.empty())
1014 const size_t NumberVariables = Variables.
size();
1015 const size_t NumberKernels = Kernels.size();
1021 ArrayType::get(KernelOffsetsType, NumberKernels);
1022 std::vector<Constant *> overallConstantExprElts(NumberKernels);
1023 for (
size_t i = 0; i < NumberKernels; i++) {
1025 overallConstantExprElts[i] =
1026 getAddressesOfVariablesInKernel(Func, Variables);
1036 NKLDSParams.LDSOffsetTable->setSanitizerMetadata(MD);
1039void AMDGPUSwLowerLDS::lowerNonKernelLDSAccesses(
1041 NonKernelLDSParameters &NKLDSParams) {
1044 LLVM_DEBUG(
dbgs() <<
"Sw LDS lowering, lower non-kernel access for : "
1045 << Func->getName());
1046 auto InsertAt = Func->getEntryBlock().getFirstNonPHIOrDbgOrAlloca();
1047 IRB.SetInsertPoint(InsertAt);
1051 getLDSMemoryInstructions(Func, LDSInstructions);
1053 auto *KernelId = IRB.CreateIntrinsic(Intrinsic::amdgcn_lds_kernel_id, {}, {});
1056 auto &OrdereLDSGlobals = NKLDSParams.OrdereLDSGlobals;
1057 Value *BaseGEP = IRB.CreateInBoundsGEP(
1058 LDSBaseTable->
getValueType(), LDSBaseTable, {IRB.getInt32(0), KernelId});
1061 Value *LoadMallocPtr =
1066 std::find(OrdereLDSGlobals.begin(), OrdereLDSGlobals.end(), GV);
1067 assert(GVIt != OrdereLDSGlobals.end());
1068 uint32_t GVOffset = std::distance(OrdereLDSGlobals.begin(), GVIt);
1070 Value *OffsetGEP = IRB.CreateInBoundsGEP(
1072 {IRB.getInt32(0), KernelId, IRB.getInt32(GVOffset)});
1075 Value *
Offset = IRB.CreateLoad(IRB.getInt32Ty(), OffsetLoad);
1076 Value *BasePlusOffset =
1077 IRB.CreateInBoundsGEP(IRB.getInt8Ty(), BaseLoad, {Offset});
1078 LLVM_DEBUG(
dbgs() <<
"Sw LDS Lowering, Replace non-kernel LDS for "
1080 replacesUsesOfGlobalInFunction(Func, GV, BasePlusOffset);
1082 translateLDSMemoryOperationsToGlobalMemory(Func, LoadMallocPtr,
1086static void reorderStaticDynamicIndirectLDSSet(KernelLDSParameters &LDSParams) {
1089 auto &DirectAccess = LDSParams.DirectAccess;
1090 auto &IndirectAccess = LDSParams.IndirectAccess;
1091 LDSParams.DirectAccess.StaticLDSGlobals = sortByName(
1092 std::vector<GlobalVariable *>(DirectAccess.StaticLDSGlobals.begin(),
1093 DirectAccess.StaticLDSGlobals.end()));
1094 LDSParams.DirectAccess.DynamicLDSGlobals = sortByName(
1095 std::vector<GlobalVariable *>(DirectAccess.DynamicLDSGlobals.begin(),
1096 DirectAccess.DynamicLDSGlobals.end()));
1097 LDSParams.IndirectAccess.StaticLDSGlobals = sortByName(
1098 std::vector<GlobalVariable *>(IndirectAccess.StaticLDSGlobals.begin(),
1099 IndirectAccess.StaticLDSGlobals.end()));
1100 LDSParams.IndirectAccess.DynamicLDSGlobals = sortByName(
1101 std::vector<GlobalVariable *>(IndirectAccess.DynamicLDSGlobals.begin(),
1102 IndirectAccess.DynamicLDSGlobals.end()));
1105void AMDGPUSwLowerLDS::initAsanInfo() {
1111 bool OrShadowOffset;
1113 false, &
Offset, &Scale, &OrShadowOffset);
1114 AsanInfo.Scale = Scale;
1115 AsanInfo.Offset =
Offset;
1118bool AMDGPUSwLowerLDS::run() {
1119 bool Changed =
false;
1130 bool DirectAccess) {
1131 for (
auto &K : LDSAccesses) {
1133 if (!
F || K.second.empty())
1137 if (!
F->hasFnAttribute(Attribute::SanitizeAddress))
1141 FuncLDSAccessInfo.KernelToLDSParametersMap.insert(
1142 {
F, KernelLDSParameters()});
1144 auto &LDSParams = FuncLDSAccessInfo.KernelToLDSParametersMap[
F];
1146 FuncLDSAccessInfo.KernelsWithIndirectLDSAccess.insert(
F);
1148 if (!DirectAccess) {
1150 LDSParams.IndirectAccess.DynamicLDSGlobals.insert(GV);
1152 LDSParams.IndirectAccess.StaticLDSGlobals.insert(GV);
1153 FuncLDSAccessInfo.AllNonKernelLDSAccess.insert(GV);
1156 LDSParams.DirectAccess.DynamicLDSGlobals.insert(GV);
1158 LDSParams.DirectAccess.StaticLDSGlobals.insert(GV);
1164 PopulateKernelStaticDynamicLDS(LDSUsesInfo.
direct_access,
true);
1170 for (
auto &K : FuncLDSAccessInfo.KernelToLDSParametersMap) {
1172 auto &LDSParams = FuncLDSAccessInfo.KernelToLDSParametersMap[Func];
1173 if (LDSParams.DirectAccess.StaticLDSGlobals.empty() &&
1174 LDSParams.DirectAccess.DynamicLDSGlobals.empty() &&
1175 LDSParams.IndirectAccess.StaticLDSGlobals.empty() &&
1176 LDSParams.IndirectAccess.DynamicLDSGlobals.empty()) {
1180 {
"amdgpu-no-workitem-id-x",
1181 "amdgpu-no-workitem-id-y",
1182 "amdgpu-no-workitem-id-z"});
1183 reorderStaticDynamicIndirectLDSSet(LDSParams);
1184 buildSwLDSGlobal(Func);
1185 buildSwDynLDSGlobal(Func);
1186 populateSwMetadataGlobal(Func);
1187 populateSwLDSAttributeAndMetadata(Func);
1188 populateLDSToReplacementIndicesMap(Func);
1190 DomTreeUpdater::UpdateStrategy::Lazy);
1191 lowerKernelLDSAccesses(Func, DTU);
1197 getUsesOfLDSByNonKernels();
1200 getNonKernelsWithLDSArguments(CG);
1202 if (!FuncLDSAccessInfo.NonKernelToLDSAccessMap.empty() ||
1203 !FuncLDSAccessInfo.NonKernelsWithLDSArgument.empty()) {
1204 NonKernelLDSParameters NKLDSParams;
1205 NKLDSParams.OrderedKernels = getOrderedIndirectLDSAccessingKernels(
1206 FuncLDSAccessInfo.KernelsWithIndirectLDSAccess);
1207 NKLDSParams.OrdereLDSGlobals = getOrderedNonKernelAllLDSGlobals(
1208 FuncLDSAccessInfo.AllNonKernelLDSAccess);
1209 buildNonKernelLDSBaseTable(NKLDSParams);
1210 buildNonKernelLDSOffsetTable(NKLDSParams);
1211 for (
auto &K : FuncLDSAccessInfo.NonKernelToLDSAccessMap) {
1215 std::vector<GlobalVariable *>(LDSGlobals.
begin(), LDSGlobals.
end()));
1216 lowerNonKernelLDSAccesses(Func, OrderedLDSGlobals, NKLDSParams);
1218 for (
Function *Func : FuncLDSAccessInfo.NonKernelsWithLDSArgument) {
1219 auto &K = FuncLDSAccessInfo.NonKernelToLDSAccessMap;
1220 if (K.find(Func) != K.end())
1223 lowerNonKernelLDSAccesses(Func, Vec, NKLDSParams);
1240 if (AsanInstrumentLDS) {
1245 for (
auto &Operand : InterestingOperands) {
1246 OperandsToInstrument.
push_back(Operand);
1249 for (
auto &Operand : OperandsToInstrument) {
1252 Operand.Alignment.valueOrOne(), Operand.TypeStoreSize,
1253 Operand.IsWrite,
nullptr,
false,
false, AsanInfo.Scale,
1262class AMDGPUSwLowerLDSLegacy :
public ModulePass {
1277char AMDGPUSwLowerLDSLegacy::ID = 0;
1281 "AMDGPU Software lowering of LDS",
false,
false)
1286bool AMDGPUSwLowerLDSLegacy::runOnModule(
Module &M) {
1289 if (!M.getModuleFlag(
"nosanitize_address"))
1292 getAnalysisIfAvailable<DominatorTreeWrapperPass>();
1297 auto &TPC = getAnalysis<TargetPassConfig>();
1300 AMDGPUSwLowerLDS SwLowerLDSImpl(M, *AMDGPUTM, DTCallback);
1301 bool IsChanged = SwLowerLDSImpl.run();
1307 return new AMDGPUSwLowerLDSLegacy(TM);
1314 if (!M.getModuleFlag(
"nosanitize_address"))
1320 AMDGPUSwLowerLDS SwLowerLDSImpl(M,
TM, DTCallback);
1321 bool IsChanged = SwLowerLDSImpl.run();
amdgpu sw lower AMDGPU Software lowering of LDS
#define COV5_HIDDEN_DYN_LDS_SIZE_ARG
The AMDGPU TargetMachine interface definition for hw codegen targets.
MachineBasicBlock MachineBasicBlock::iterator DebugLoc DL
This file provides interfaces used to build and manipulate a call graph, which is a very useful tool ...
This file contains the declarations for the subclasses of Constant, which represent the different fla...
Returns the sub type a function will return at a given Idx Should correspond to the result type of an ExtractValue instruction executed with just that one unsigned Idx
This file defines the DenseMap class.
This file defines the DenseSet and SmallDenseSet classes.
if(auto Err=PB.parsePassPipeline(MPM, Passes)) return wrap(std MPM run * Mod
FunctionAnalysisManager FAM
#define INITIALIZE_PASS_DEPENDENCY(depName)
#define INITIALIZE_PASS_END(passName, arg, name, cfg, analysis)
#define INITIALIZE_PASS_BEGIN(passName, arg, name, cfg, analysis)
pre isel intrinsic lowering
assert(ImpDefSCC.getReg()==AMDGPU::SCC &&ImpDefSCC.isDef())
This file implements a set that has insertion order iteration characteristics.
Target-Independent Code Generator Pass Configuration Options pass.
static DebugLoc getOrCreateDebugLoc(const Instruction *InsertBefore, DISubprogram *SP)
A container for analyses that lazily runs them and caches their results.
PassT::Result & getResult(IRUnitT &IR, ExtraArgTs... ExtraArgs)
Get the result of an analysis pass for a given IR unit.
Represent the analysis usage information of a pass.
AnalysisUsage & addPreserved()
Add the specified Pass class to the set of analyses preserved by this pass.
An instruction that atomically checks whether a specified value is in a memory location,...
void setVolatile(bool V)
Specify whether this is a volatile cmpxchg.
an instruction that atomically reads a memory location, combines it with another value,...
void setVolatile(bool V)
Specify whether this is a volatile RMW or not.
LLVM Basic Block Representation.
static BasicBlock * Create(LLVMContext &Context, const Twine &Name="", Function *Parent=nullptr, BasicBlock *InsertBefore=nullptr)
Creates a new BasicBlock.
A node in the call graph for a module.
Function * getFunction() const
Returns the function that this call graph node represents.
The basic data container for the call graph of a Module of IR.
static Constant * get(ArrayType *T, ArrayRef< Constant * > V)
static Constant * getGetElementPtr(Type *Ty, Constant *C, ArrayRef< Constant * > IdxList, GEPNoWrapFlags NW=GEPNoWrapFlags::none(), std::optional< ConstantRange > InRange=std::nullopt, Type *OnlyIfReducedTy=nullptr)
Getelementptr form.
static Constant * get(StructType *T, ArrayRef< Constant * > V)
This is an important base class in LLVM.
void removeDeadConstantUsers() const
If there are any dead constant users dangling off of this constant, remove them.
Implements a dense probed hash-table based set.
Analysis pass which computes a DominatorTree.
Legacy analysis pass which computes a DominatorTree.
DominatorTree & getDomTree()
Concrete subclass of DominatorTreeBase that is used to compute a normal dominator tree.
A handy container for a FunctionType+Callee-pointer pair, which can be passed around as a single enti...
void applyUpdates(ArrayRef< UpdateT > Updates)
Submit updates to all available trees.
void setMetadata(unsigned KindID, MDNode *Node)
Set a particular kind of metadata attachment.
uint64_t getAlignment() const
FIXME: Remove this function once transition to Align is over.
bool isDeclaration() const
Return true if the primary definition of this global value is outside of the current translation unit...
PointerType * getType() const
Global values are always pointers.
@ InternalLinkage
Rename collisions when linking (static functions).
@ ExternalLinkage
Externally visible function.
Type * getValueType() const
void eraseFromParent()
eraseFromParent - This method unlinks 'this' from the containing module and deletes it.
Value * CreateConstInBoundsGEP1_32(Type *Ty, Value *Ptr, unsigned Idx0, const Twine &Name="")
CallInst * CreateCall(FunctionType *FTy, Value *Callee, ArrayRef< Value * > Args={}, const Twine &Name="", MDNode *FPMathTag=nullptr)
This provides a uniform API for creating instructions and inserting them into a basic block: either a...
An analysis over an "outer" IR unit that provides access to an analysis manager over an "inner" IR un...
const DebugLoc & getDebugLoc() const
Return the debug location for this node as a DebugLoc.
This is an important class for using LLVM in a threaded context.
An instruction for reading from memory.
void setAtomic(AtomicOrdering Ordering, SyncScope::ID SSID=SyncScope::System)
Sets the ordering constraint and the synchronization scope ID of this load instruction.
MDNode * createRange(const APInt &Lo, const APInt &Hi)
Return metadata describing the range [Lo, Hi).
static MDTuple * get(LLVMContext &Context, ArrayRef< Metadata * > MDs)
LLVMContext & getContext() const
void push_back(MachineInstr *MI)
ModulePass class - This class is used to implement unstructured interprocedural optimizations and ana...
virtual bool runOnModule(Module &M)=0
runOnModule - Virtual method overriden by subclasses to process the module being operated on.
A Module instance is used to store all the information related to an LLVM module.
A container for an operand bundle being viewed as a set of values rather than a set of uses.
static PassRegistry * getPassRegistry()
getPassRegistry - Access the global registry object, which is automatically initialized at applicatio...
virtual void getAnalysisUsage(AnalysisUsage &) const
getAnalysisUsage - This function should be overriden by passes that need analysis information to do t...
static PoisonValue * get(Type *T)
Static factory methods - Return a 'poison' object of the specified type.
A set of analyses that are preserved following a run of a transformation pass.
static PreservedAnalyses all()
Construct a special preserved set that preserves all passes.
void preserve()
Mark an analysis as preserved.
Return a value (possibly void), from a function.
A vector that has set insertion semantics.
size_type size() const
Determine the number of elements in the SetVector.
iterator end()
Get an iterator to the end of the SetVector.
void clear()
Completely clear the SetVector.
bool empty() const
Determine if the SetVector is empty or not.
iterator begin()
Get an iterator to the beginning of the SetVector.
bool insert(const value_type &X)
Insert a new element into the SetVector.
SmallString - A SmallString is just a SmallVector with methods and accessors that make it work better...
void push_back(const T &Elt)
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small.
An instruction for storing to memory.
void setAtomic(AtomicOrdering Ordering, SyncScope::ID SSID=SyncScope::System)
Sets the ordering constraint and the synchronization scope ID of this store instruction.
StringRef - Represent a constant reference to a string, i.e.
Class to represent struct types.
static StructType * create(LLVMContext &Context, StringRef Name)
This creates an identified struct.
Target-Independent Code Generator Pass Configuration Options.
Triple - Helper class for working with autoconf configuration names.
The instances of the Type class are immutable: once they are created, they are never changed.
bool isPointerTy() const
True if this is an instance of PointerType.
static IntegerType * getInt1Ty(LLVMContext &C)
unsigned getPointerAddressSpace() const
Get the address space of this pointer or pointer vector type.
static Type * getVoidTy(LLVMContext &C)
static IntegerType * getInt32Ty(LLVMContext &C)
static IntegerType * getInt64Ty(LLVMContext &C)
A Use represents the edge between a Value definition and its users.
LLVM Value Representation.
Type * getType() const
All values are typed, get the type of this value.
iterator_range< user_iterator > users()
void printAsOperand(raw_ostream &O, bool PrintType=true, const Module *M=nullptr) const
Print the name of this Value out to the specified raw_ostream.
void replaceUsesWithIf(Value *New, llvm::function_ref< bool(Use &U)> ShouldReplace)
Go through the uses list for this definition and make each use point to "New" if the callback ShouldRep...
StringRef getName() const
Return a constant reference to the value's name.
An efficient, type-erasing, non-owning reference to a callable.
A raw_ostream that writes to an std::string.
A raw_ostream that writes to an SmallVector or SmallString.
StringRef str() const
Return a StringRef for the vector contents.
@ LOCAL_ADDRESS
Address space for local memory.
@ GLOBAL_ADDRESS
Address space for global memory (RAT0, VTX0).
void getInterestingMemoryOperands(Module &M, Instruction *I, SmallVectorImpl< InterestingMemoryOperand > &Interesting)
Get all the memory operands from the instruction that needs to be instrumented.
bool isDynamicLDS(const GlobalVariable &GV)
unsigned getAMDHSACodeObjectVersion(const Module &M)
void removeFnAttrFromReachable(CallGraph &CG, Function *KernelRoot, ArrayRef< StringRef > FnAttrs)
Strip FnAttr attribute from any functions where we may have introduced its use.
LDSUsesInfoTy getTransitiveUsesOfLDS(const CallGraph &CG, Module &M)
bool isLDSVariableToLower(const GlobalVariable &GV)
bool eliminateConstantExprUsesOfLDSFromAllInstructions(Module &M)
Align getAlign(const DataLayout &DL, const GlobalVariable *GV)
bool isKernelLDS(const Function *F)
void instrumentAddress(Module &M, IRBuilder<> &IRB, Instruction *OrigIns, Instruction *InsertBefore, Value *Addr, Align Alignment, TypeSize TypeStoreSize, bool IsWrite, Value *SizeArgument, bool UseCalls, bool Recover, int AsanScale, int AsanOffset)
Instrument the memory operand Addr.
uint64_t getRedzoneSizeForGlobal(int AsanScale, uint64_t SizeInBytes)
Given SizeInBytes of the Value to be instrumented, returns the redzone size corresponding to it.
unsigned ID
LLVM IR allows the use of arbitrary numbers as calling convention identifiers.
Function * getOrInsertDeclaration(Module *M, ID id, ArrayRef< Type * > Tys={})
Look up the Function declaration of the intrinsic id in the Module M.
initializer< Ty > init(const Ty &Val)
This is an optimization pass for GlobalISel generic memory operations.
ModulePass * createAMDGPUSwLowerLDSLegacyPass(const AMDGPUTargetMachine *TM=nullptr)
void initializeAMDGPUSwLowerLDSLegacyPass(PassRegistry &)
iterator_range< early_inc_iterator_impl< detail::IterOfRange< RangeT > > > make_early_inc_range(RangeT &&Range)
Make a range that does early increment to allow mutation of the underlying range without disrupting i...
char & AMDGPUSwLowerLDSLegacyPassID
void sort(IteratorTy Start, IteratorTy End)
raw_ostream & dbgs()
dbgs() - This returns a reference to a raw_ostream for debugging messages.
void report_fatal_error(Error Err, bool gen_crash_diag=true)
Report a serious error, calling any installed error handler.
uint64_t alignTo(uint64_t Size, Align A)
Returns a multiple of A needed to store Size bytes.
bool is_contained(R &&Range, const E &Element)
Returns true if Element is found in Range.
void getAddressSanitizerParams(const Triple &TargetTriple, int LongSize, bool IsKasan, uint64_t *ShadowBase, int *MappingScale, bool *OrShadowOffset)
const AMDGPUTargetMachine & TM
PreservedAnalyses run(Module &M, ModuleAnalysisManager &AM)
FunctionVariableMap direct_access
FunctionVariableMap indirect_access
This struct is a compact representation of a valid (non-zero power of two) alignment.