#include "llvm/IR/IntrinsicsAMDGPU.h"
#include "llvm/IR/IntrinsicsNVPTX.h"

#define DEBUG_TYPE "openmp-opt"

    "openmp-opt-disable",
    cl::desc("Disable OpenMP specific optimizations."),

    "openmp-opt-enable-merging",

    cl::desc("Disable function internalization."),

    "openmp-hide-memory-transfer-latency",
    cl::desc("[WIP] Tries to hide the latency of host to device memory"

    "openmp-opt-disable-deglobalization",
    cl::desc("Disable OpenMP optimizations involving deglobalization."),

    "openmp-opt-disable-spmdization",
    cl::desc("Disable OpenMP optimizations involving SPMD-ization."),

    "openmp-opt-disable-folding",

    "openmp-opt-disable-state-machine-rewrite",
    cl::desc("Disable OpenMP optimizations that replace the state machine."),

    "openmp-opt-disable-barrier-elimination",
    cl::desc("Disable OpenMP optimizations that eliminate barriers."),

    "openmp-opt-print-module-after",
    cl::desc("Print the current module after OpenMP optimizations."),

    "openmp-opt-print-module-before",
    cl::desc("Print the current module before OpenMP optimizations."),

    "openmp-opt-inline-device",

    cl::desc("Maximal number of attributor iterations."),

    cl::desc("Maximum amount of shared memory to use."),
    cl::init(std::numeric_limits<unsigned>::max()));
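// Statistics gathered by this pass. They count deduplicated runtime calls,
// deleted and merged parallel regions, identified runtime functions and
// kernels, SPMD conversions, state-machine rewrites, memory moved to shared
// memory, and eliminated barriers.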
    "Number of OpenMP runtime calls deduplicated");
    "Number of OpenMP parallel regions deleted");
    "Number of OpenMP runtime functions identified");
    "Number of OpenMP runtime function uses identified");
    "Number of OpenMP target region entry points (=kernels) identified");
    "Number of non-OpenMP target region kernels identified");
    "Number of OpenMP target region entry points (=kernels) executed in "
    "SPMD-mode instead of generic-mode");
STATISTIC(NumOpenMPTargetRegionKernelsWithoutStateMachine,
          "Number of OpenMP target region entry points (=kernels) executed in "
          "generic-mode without a state machine");
STATISTIC(NumOpenMPTargetRegionKernelsCustomStateMachineWithFallback,
          "Number of OpenMP target region entry points (=kernels) executed in "
          "generic-mode with customized state machines with fallback");
STATISTIC(NumOpenMPTargetRegionKernelsCustomStateMachineWithoutFallback,
          "Number of OpenMP target region entry points (=kernels) executed in "
          "generic-mode with customized state machines without fallback");
STATISTIC(
    NumOpenMPParallelRegionsReplacedInGPUStateMachine,
    "Number of OpenMP parallel regions replaced with ID in GPU state machines");
    "Number of OpenMP parallel regions merged");
    "Amount of memory pushed to shared memory");
STATISTIC(NumBarriersEliminated, "Number of redundant barriers eliminated");
#define KERNEL_ENVIRONMENT_IDX(MEMBER, IDX)                                    \
  constexpr const unsigned MEMBER##Idx = IDX;

#undef KERNEL_ENVIRONMENT_IDX

#define KERNEL_ENVIRONMENT_CONFIGURATION_IDX(MEMBER, IDX)                      \
  constexpr const unsigned MEMBER##Idx = IDX;

#undef KERNEL_ENVIRONMENT_CONFIGURATION_IDX

#define KERNEL_ENVIRONMENT_GETTER(MEMBER, RETURNTYPE)                          \
  RETURNTYPE *get##MEMBER##FromKernelEnvironment(ConstantStruct *KernelEnvC) { \
    return cast<RETURNTYPE>(KernelEnvC->getAggregateElement(MEMBER##Idx));     \

#undef KERNEL_ENVIRONMENT_GETTER

#define KERNEL_ENVIRONMENT_CONFIGURATION_GETTER(MEMBER)                        \
  ConstantInt *get##MEMBER##FromKernelEnvironment(ConstantStruct *KernelEnvC) {\
    ConstantStruct *ConfigC =                                                  \
        getConfigurationFromKernelEnvironment(KernelEnvC);                     \
    return dyn_cast<ConstantInt>(ConfigC->getAggregateElement(MEMBER##Idx));   \

#undef KERNEL_ENVIRONMENT_CONFIGURATION_GETTER

  constexpr const int InitKernelEnvironmentArgNo = 0;
  return cast<GlobalVariable>(

struct AAHeapToShared;
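// OMPInformationCache collects everything the pass needs about the module:
// the OpenMPIRBuilder, the known OpenMP runtime functions (RFIs) together
// with their uses, and the internal control variables (ICVs).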
        OpenMPPostLink(OpenMPPostLink) {

    OMPBuilder.initialize();
    initializeRuntimeFunctions(M);
    initializeInternalControlVars();

  struct InternalControlVarInfo {

  struct RuntimeFunctionInfo {

    void clearUsesMap() { UsesMap.clear(); }

    operator bool() const { return Declaration; }

    UseVector &getOrCreateUseVector(Function *F) {
      std::shared_ptr<UseVector> &UV = UsesMap[F];
        UV = std::make_shared<UseVector>();

    const UseVector *getUseVector(Function &F) const {
      auto I = UsesMap.find(&F);
      if (I != UsesMap.end())
        return I->second.get();

    size_t getNumFunctionsWithUses() const { return UsesMap.size(); }

    size_t getNumArgs() const { return ArgumentTypes.size(); }

      UseVector &UV = getOrCreateUseVector(F);

      while (!ToBeDeleted.empty()) {

    decltype(UsesMap)::iterator begin() { return UsesMap.begin(); }
    decltype(UsesMap)::iterator end() { return UsesMap.end(); }

                  RuntimeFunction::OMPRTL___last>

                  InternalControlVar::ICV___last>

  void initializeInternalControlVars() {
#define ICV_RT_SET(_Name, RTL)                                                 \
    auto &ICV = ICVs[_Name];                                                   \

#define ICV_RT_GET(Name, RTL)                                                  \
    auto &ICV = ICVs[Name];                                                    \

#define ICV_DATA_ENV(Enum, _Name, _EnvVarName, Init)                           \
    auto &ICV = ICVs[Enum];                                                    \
    ICV.InitKind = Init;                                                       \
    ICV.EnvVarName = _EnvVarName;                                              \
    switch (ICV.InitKind) {                                                    \
    case ICV_IMPLEMENTATION_DEFINED:                                           \
      ICV.InitValue = nullptr;                                                 \
      ICV.InitValue = ConstantInt::get(                                        \
          Type::getInt32Ty(OMPBuilder.Int32->getContext()), 0);                \
      ICV.InitValue = ConstantInt::getFalse(OMPBuilder.Int1->getContext());    \

#include "llvm/Frontend/OpenMP/OMPKinds.def"

  static bool declMatchesRTFTypes(Function *F, Type *RTFRetType,

    if (F->getReturnType() != RTFRetType)
    if (F->arg_size() != RTFArgTypes.size())

    auto *RTFTyIt = RTFArgTypes.begin();
      if (Arg.getType() != *RTFTyIt)

  unsigned collectUses(RuntimeFunctionInfo &RFI, bool CollectStats = true) {
    unsigned NumUses = 0;
    if (!RFI.Declaration)

      NumOpenMPRuntimeFunctionsIdentified += 1;
      NumOpenMPRuntimeFunctionUsesIdentified += RFI.Declaration->getNumUses();

    for (Use &U : RFI.Declaration->uses()) {
      if (Instruction *UserI = dyn_cast<Instruction>(U.getUser())) {
        if (!CGSCC || CGSCC->empty() || CGSCC->contains(UserI->getFunction())) {
          RFI.getOrCreateUseVector(UserI->getFunction()).push_back(&U);
        RFI.getOrCreateUseVector(nullptr).push_back(&U);

    auto &RFI = RFIs[RTF];
    collectUses(RFI, false);

  void recollectUses() {
    for (int Idx = 0; Idx < RFIs.size(); ++Idx)

    RuntimeFunctionInfo &RFI = RFIs[Fn];
    if (RFI.Declaration && RFI.Declaration->isDeclaration())

  void initializeRuntimeFunctions(Module &M) {

#define OMP_TYPE(VarName, ...)                                                 \
  Type *VarName = OMPBuilder.VarName;                                          \

#define OMP_ARRAY_TYPE(VarName, ...)                                           \
  ArrayType *VarName##Ty = OMPBuilder.VarName##Ty;                             \
  PointerType *VarName##PtrTy = OMPBuilder.VarName##PtrTy;                     \
  (void)VarName##PtrTy;

#define OMP_FUNCTION_TYPE(VarName, ...)                                        \
  FunctionType *VarName = OMPBuilder.VarName;                                  \
  PointerType *VarName##Ptr = OMPBuilder.VarName##Ptr;                         \

#define OMP_STRUCT_TYPE(VarName, ...)                                          \
  StructType *VarName = OMPBuilder.VarName;                                    \
  PointerType *VarName##Ptr = OMPBuilder.VarName##Ptr;                         \

#define OMP_RTL(_Enum, _Name, _IsVarArg, _ReturnType, ...)                     \
    SmallVector<Type *, 8> ArgsTypes({__VA_ARGS__});                           \
    Function *F = M.getFunction(_Name);                                        \
    RTLFunctions.insert(F);                                                    \
    if (declMatchesRTFTypes(F, OMPBuilder._ReturnType, ArgsTypes)) {           \
      RuntimeFunctionIDMap[F] = _Enum;                                         \
      auto &RFI = RFIs[_Enum];                                                 \
      RFI.IsVarArg = _IsVarArg;                                                \
      RFI.ReturnType = OMPBuilder._ReturnType;                                 \
      RFI.ArgumentTypes = std::move(ArgsTypes);                                \
      RFI.Declaration = F;                                                     \
      unsigned NumUses = collectUses(RFI);                                     \
      dbgs() << TAG << RFI.Name << (RFI.Declaration ? "" : " not")             \
      if (RFI.Declaration)                                                     \
        dbgs() << TAG << "-> got " << NumUses << " uses in "                   \
               << RFI.getNumFunctionsWithUses()                                \
               << " different functions.\n";                                   \

#include "llvm/Frontend/OpenMP/OMPKinds.def"

    for (StringRef Prefix : {"__kmpc", "_ZN4ompx", "omp_"})
      if (F.hasFnAttribute(Attribute::NoInline) &&
          F.getName().starts_with(Prefix) &&
          !F.hasFnAttribute(Attribute::OptimizeNone))
        F.removeFnAttr(Attribute::NoInline);

  bool OpenMPPostLink = false;
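// Abstract-state helper: a BooleanState combined with a SetVector of elements.
// Inserting into the set can optionally invalidate the boolean state
// (InsertInvalidates), so adding "unknown" elements degrades the state.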
template <typename Ty, bool InsertInvalidates = true>

  bool contains(const Ty &Elem) const { return Set.contains(Elem); }
  bool insert(const Ty &Elem) {
    if (InsertInvalidates)
    return Set.insert(Elem);

  const Ty &operator[](int Idx) const { return Set[Idx]; }
  bool operator==(const BooleanStateWithSetVector &RHS) const {
    return BooleanState::operator==(RHS) && Set == RHS.Set;

  bool operator!=(const BooleanStateWithSetVector &RHS) const {
    return !(*this == RHS);

  bool empty() const { return Set.empty(); }
  size_t size() const { return Set.size(); }

  BooleanStateWithSetVector &operator^=(const BooleanStateWithSetVector &RHS) {
    BooleanState::operator^=(RHS);
    Set.insert(RHS.Set.begin(), RHS.Set.end());

  typename decltype(Set)::iterator begin() { return Set.begin(); }
  typename decltype(Set)::iterator end() { return Set.end(); }

template <typename Ty, bool InsertInvalidates = true>
using BooleanStateWithPtrSetVector =
    BooleanStateWithSetVector<Ty *, InsertInvalidates>;
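// KernelInfoState is the abstract state used by AAKernelInfo. It tracks the
// parallel regions reached from a kernel, SPMD compatibility, the reaching
// kernel entries, and nested parallelism, and provides the best/worst state
// and merge (^=) operations required by the Attributor.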
  bool IsAtFixpoint = false;

  BooleanStateWithPtrSetVector<CallBase, false> ReachedKnownParallelRegions;

  BooleanStateWithPtrSetVector<CallBase> ReachedUnknownParallelRegions;

  BooleanStateWithPtrSetVector<Instruction, false> SPMDCompatibilityTracker;

  bool IsKernelEntry = false;

  BooleanStateWithPtrSetVector<Function, false> ReachingKernelEntries;

  BooleanStateWithSetVector<uint8_t> ParallelLevels;

  bool NestedParallelism = false;

  KernelInfoState() = default;
  KernelInfoState(bool BestState) {

  bool isAtFixpoint() const override { return IsAtFixpoint; }

    ParallelLevels.indicatePessimisticFixpoint();
    ReachingKernelEntries.indicatePessimisticFixpoint();
    SPMDCompatibilityTracker.indicatePessimisticFixpoint();
    ReachedKnownParallelRegions.indicatePessimisticFixpoint();
    ReachedUnknownParallelRegions.indicatePessimisticFixpoint();
    NestedParallelism = true;

    ParallelLevels.indicateOptimisticFixpoint();
    ReachingKernelEntries.indicateOptimisticFixpoint();
    SPMDCompatibilityTracker.indicateOptimisticFixpoint();
    ReachedKnownParallelRegions.indicateOptimisticFixpoint();
    ReachedUnknownParallelRegions.indicateOptimisticFixpoint();

  KernelInfoState &getAssumed() { return *this; }
  const KernelInfoState &getAssumed() const { return *this; }

  bool operator==(const KernelInfoState &RHS) const {
    if (SPMDCompatibilityTracker != RHS.SPMDCompatibilityTracker)
    if (ReachedKnownParallelRegions != RHS.ReachedKnownParallelRegions)
    if (ReachedUnknownParallelRegions != RHS.ReachedUnknownParallelRegions)
    if (ReachingKernelEntries != RHS.ReachingKernelEntries)
    if (ParallelLevels != RHS.ParallelLevels)
    if (NestedParallelism != RHS.NestedParallelism)

  bool mayContainParallelRegion() {
    return !ReachedKnownParallelRegions.empty() ||
           !ReachedUnknownParallelRegions.empty();

  static KernelInfoState getBestState() { return KernelInfoState(true); }

  static KernelInfoState getBestState(KernelInfoState &KIS) {
    return getBestState();

  static KernelInfoState getWorstState() { return KernelInfoState(false); }

  KernelInfoState operator^=(const KernelInfoState &KIS) {
    if (KIS.KernelInitCB) {
      if (KernelInitCB && KernelInitCB != KIS.KernelInitCB)
      KernelInitCB = KIS.KernelInitCB;
    if (KIS.KernelDeinitCB) {
      if (KernelDeinitCB && KernelDeinitCB != KIS.KernelDeinitCB)
      KernelDeinitCB = KIS.KernelDeinitCB;
    if (KIS.KernelEnvC) {
      if (KernelEnvC && KernelEnvC != KIS.KernelEnvC)
      KernelEnvC = KIS.KernelEnvC;
    SPMDCompatibilityTracker ^= KIS.SPMDCompatibilityTracker;
    ReachedKnownParallelRegions ^= KIS.ReachedKnownParallelRegions;
    ReachedUnknownParallelRegions ^= KIS.ReachedUnknownParallelRegions;
    NestedParallelism |= KIS.NestedParallelism;

  KernelInfoState operator&=(const KernelInfoState &KIS) {
    return (*this ^= KIS);
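// OffloadArray captures the values stored into one of the offload arrays
// (base pointers, pointers, sizes) that are passed to
// __tgt_target_data_begin_mapper, by scanning the stores that precede the
// runtime call.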
  OffloadArray() = default;

    if (!Array.getAllocatedType()->isArrayTy())
    if (!getValues(Array, Before))

    this->Array = &Array;

  static const unsigned DeviceIDArgNum = 1;
  static const unsigned BasePtrsArgNum = 3;
  static const unsigned PtrsArgNum = 4;
  static const unsigned SizesArgNum = 5;

    const uint64_t NumValues = Array.getAllocatedType()->getArrayNumElements();
    StoredValues.assign(NumValues, nullptr);
    LastAccesses.assign(NumValues, nullptr);

      if (BB != Before.getParent())
      if (!isa<StoreInst>(&I))

      auto *S = cast<StoreInst>(&I);
      LastAccesses[Idx] = S;

    const unsigned NumValues = StoredValues.size();
    for (unsigned I = 0; I < NumValues; ++I) {
      if (!StoredValues[I] || !LastAccesses[I])
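// OpenMPOpt is the driver of this pass. run() repeatedly applies the
// Attributor and the hand-rolled optimizations below: state-machine rewrites,
// parallel-region deletion and merging, runtime-call deduplication, and
// hiding of host-to-device memory transfer latency.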
  using OptimizationRemarkGetter =

            OptimizationRemarkGetter OREGetter,
            OMPInformationCache &OMPInfoCache, Attributor &A)
        OREGetter(OREGetter), OMPInfoCache(OMPInfoCache), A(A) {}

  bool remarksEnabled() {
    auto &Ctx = M.getContext();
    return Ctx.getDiagHandlerPtr()->isAnyRemarkEnabled(DEBUG_TYPE);

  bool run(bool IsModulePass) {

    bool Changed = false;

      Changed |= runAttributor(IsModulePass);

      OMPInfoCache.recollectUses();

      Changed |= rewriteDeviceCodeStateMachine();

      if (remarksEnabled())
        analysisGlobalization();

      Changed |= runAttributor(IsModulePass);

      OMPInfoCache.recollectUses();

      Changed |= deleteParallelRegions();

      Changed |= hideMemTransfersLatency();
      Changed |= deduplicateRuntimeCalls();

      if (mergeParallelRegions()) {
        deduplicateRuntimeCalls();

      if (OMPInfoCache.OpenMPPostLink)
        Changed |= removeRuntimeSymbols();

  void printICVs() const {

    for (auto ICV : ICVs) {
      auto ICVInfo = OMPInfoCache.ICVs[ICV];
        return ORA << "OpenMP ICV " << ore::NV("OpenMPICV", ICVInfo.Name)
                   << (ICVInfo.InitValue
                           ? toString(ICVInfo.InitValue->getValue(), 10, true)
                           : "IMPLEMENTATION_DEFINED");

      emitRemark<OptimizationRemarkAnalysis>(F, "OpenMPICVTracker", Remark);

  void printKernels() const {

        return ORA << "OpenMP GPU kernel "
                   << ore::NV("OpenMPGPUKernel", F->getName()) << "\n";

      emitRemark<OptimizationRemarkAnalysis>(F, "OpenMPGPU", Remark);

  static CallInst *getCallIfRegularCall(
      Use &U, OMPInformationCache::RuntimeFunctionInfo *RFI = nullptr) {
    CallInst *CI = dyn_cast<CallInst>(U.getUser());

  static CallInst *getCallIfRegularCall(
      Value &V, OMPInformationCache::RuntimeFunctionInfo *RFI = nullptr) {
    CallInst *CI = dyn_cast<CallInst>(&V);
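// Merge adjacent parallel regions in the same basic block into a single
// region. Code between two mergable __kmpc_fork_call sites is wrapped in a
// sequential (master + barrier) region so its semantics are preserved.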
  bool mergeParallelRegions() {
    const unsigned CallbackCalleeOperand = 2;
    const unsigned CallbackFirstArgOperand = 3;

    OMPInformationCache::RuntimeFunctionInfo &RFI =
        OMPInfoCache.RFIs[OMPRTL___kmpc_fork_call];

    if (!RFI.Declaration)

    OMPInformationCache::RuntimeFunctionInfo UnmergableCallsInfo[] = {
        OMPInfoCache.RFIs[OMPRTL___kmpc_push_proc_bind],
        OMPInfoCache.RFIs[OMPRTL___kmpc_push_num_threads],

    bool Changed = false;

      BasicBlock *StartBB = nullptr, *EndBB = nullptr;
      auto BodyGenCB = [&](InsertPointTy AllocaIP, InsertPointTy CodeGenIP) {
        BasicBlock *CGStartBB = CodeGenIP.getBlock();
            SplitBlock(CGStartBB, &*CodeGenIP.getPoint(), DT, LI);
        assert(StartBB != nullptr && "StartBB should not be null");
        assert(EndBB != nullptr && "EndBB should not be null");
        EndBB->getTerminator()->setSuccessor(0, CGEndBB);

      auto PrivCB = [&](InsertPointTy AllocaIP, InsertPointTy CodeGenIP,
                        Value &, Value &Inner,
                        Value *&ReplacementValue) -> InsertPointTy {
        ReplacementValue = &Inner;

      auto FiniCB = [&](InsertPointTy CodeGenIP) {};

      auto CreateSequentialRegion = [&](Function *OuterFn,
            SplitBlock(ParentBB, SeqEndI->getNextNode(), DT, LI);
            SplitBlock(ParentBB, SeqStartI, DT, LI, nullptr, "seq.par.merged");
               "Expected a different CFG");

        auto BodyGenCB = [&](InsertPointTy AllocaIP, InsertPointTy CodeGenIP) {
          BasicBlock *CGStartBB = CodeGenIP.getBlock();
              SplitBlock(CGStartBB, &*CodeGenIP.getPoint(), DT, LI);
          assert(SeqStartBB != nullptr && "SeqStartBB should not be null");
          assert(SeqEndBB != nullptr && "SeqEndBB should not be null");

        auto FiniCB = [&](InsertPointTy CodeGenIP) {};

          for (User *Usr : I.users()) {
              OutsideUsers.insert(&UsrI);

          if (OutsideUsers.empty())

              I.getType(), DL.getAllocaAddrSpace(), nullptr,
              I.getName() + ".seq.output.alloc", OuterFn->front().begin());

          new StoreInst(&I, AllocaI, SeqStartBB->getTerminator()->getIterator());

                I.getName() + ".seq.output.load",

            InsertPointTy(ParentBB, ParentBB->end()), DL);
        InsertPointTy SeqAfterIP =
            OMPInfoCache.OMPBuilder.createMaster(Loc, BodyGenCB, FiniCB);

        OMPInfoCache.OMPBuilder.createBarrier(SeqAfterIP, OMPD_parallel);

      assert(MergableCIs.size() > 1 && "Assumed multiple mergable CIs");

        OR << "Parallel region merged with parallel region"
           << (MergableCIs.size() > 2 ? "s" : "") << " at ";
          if (CI != MergableCIs.back())

      emitRemark<OptimizationRemark>(MergableCIs.front(), "OMP150", Remark);

                        << " parallel regions in " << OriginalFn->getName()

      EndBB = SplitBlock(BB, MergableCIs.back()->getNextNode(), DT, LI);
          SplitBlock(EndBB, &*EndBB->getFirstInsertionPt(), DT, LI);

      assert(BB->getUniqueSuccessor() == StartBB && "Expected a different CFG");
      const DebugLoc DL = BB->getTerminator()->getDebugLoc();

      for (auto *It = MergableCIs.begin(), *End = MergableCIs.end() - 1;

          CreateSequentialRegion(OriginalFn, BB, ForkCI->getNextNode(),

      InsertPointTy AfterIP = OMPInfoCache.OMPBuilder.createParallel(
          Loc, AllocaIP, BodyGenCB, PrivCB, FiniCB, nullptr, nullptr,
          OMP_PROC_BIND_default, false);

      OMPInfoCache.OMPBuilder.finalize(OriginalFn);

      for (auto *CI : MergableCIs) {

        FunctionType *FT = OMPInfoCache.OMPBuilder.ParallelTask;

        for (unsigned U = CallbackFirstArgOperand, E = CI->arg_size(); U < E;

        for (unsigned U = CallbackFirstArgOperand, E = CI->arg_size(); U < E;
              U - (CallbackFirstArgOperand - CallbackCalleeOperand), A);

        if (CI != MergableCIs.back()) {
          OMPInfoCache.OMPBuilder.createBarrier(

      assert(OutlinedFn != OriginalFn && "Outlining failed");
      CGUpdater.registerOutlinedFunction(*OriginalFn, *OutlinedFn);
      CGUpdater.reanalyzeFunction(*OriginalFn);

      NumOpenMPParallelRegionsMerged += MergableCIs.size();

      CallInst *CI = getCallIfRegularCall(U, &RFI);

    RFI.foreachUse(SCC, DetectPRsCB);

    for (auto &It : BB2PRMap) {
      auto &CIs = It.getSecond();

      auto IsMergable = [&](Instruction &I, bool IsBeforeMergableRegion) {
        if (I.isTerminator())
        if (!isa<CallInst>(&I))

        if (IsBeforeMergableRegion) {
          if (!CalledFunction)
          for (const auto &RFI : UnmergableCallsInfo) {
            if (CalledFunction == RFI.Declaration)

          if (!isa<IntrinsicInst>(CI))

        if (CIs.count(&I)) {

        if (IsMergable(I, MergableCIs.empty()))

          for (; It != End; ++It) {
            if (CIs.count(&SkipI)) {
                                << " due to " << I << "\n");

        if (MergableCIs.size() > 1) {
          MergableCIsVector.push_back(MergableCIs);
                            << " parallel regions in block " << BB->getName()

        MergableCIs.clear();

      if (!MergableCIsVector.empty()) {

        for (auto &MergableCIs : MergableCIsVector)
          Merge(MergableCIs, BB);
        MergableCIsVector.clear();

      OMPInfoCache.recollectUsesForFunction(OMPRTL___kmpc_fork_call);
      OMPInfoCache.recollectUsesForFunction(OMPRTL___kmpc_barrier);
      OMPInfoCache.recollectUsesForFunction(OMPRTL___kmpc_master);
      OMPInfoCache.recollectUsesForFunction(OMPRTL___kmpc_end_master);
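// Delete parallel regions whose callback is known to be side-effect free
// (readonly and willreturn), and emit an optimization remark for each one.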
  bool deleteParallelRegions() {
    const unsigned CallbackCalleeOperand = 2;

    OMPInformationCache::RuntimeFunctionInfo &RFI =
        OMPInfoCache.RFIs[OMPRTL___kmpc_fork_call];

    if (!RFI.Declaration)

    bool Changed = false;

      CallInst *CI = getCallIfRegularCall(U);

      auto *Fn = dyn_cast<Function>(

      if (!Fn->onlyReadsMemory())
      if (!Fn->hasFnAttribute(Attribute::WillReturn))

        return OR << "Removing parallel region with no side-effects.";
      emitRemark<OptimizationRemark>(CI, "OMP160", Remark);

      ++NumOpenMPParallelRegionsDeleted;

    RFI.foreachUse(SCC, DeleteCallCB);

  bool deduplicateRuntimeCalls() {
    bool Changed = false;

        OMPRTL_omp_get_num_threads,
        OMPRTL_omp_in_parallel,
        OMPRTL_omp_get_cancellation,
        OMPRTL_omp_get_supported_active_levels,
        OMPRTL_omp_get_level,
        OMPRTL_omp_get_ancestor_thread_num,
        OMPRTL_omp_get_team_size,
        OMPRTL_omp_get_active_level,
        OMPRTL_omp_in_final,
        OMPRTL_omp_get_proc_bind,
        OMPRTL_omp_get_num_places,
        OMPRTL_omp_get_num_procs,
        OMPRTL_omp_get_place_num,
        OMPRTL_omp_get_partition_num_places,
        OMPRTL_omp_get_partition_place_nums};

    collectGlobalThreadIdArguments(GTIdArgs);
                      << " global thread ID arguments\n");

      for (auto DeduplicableRuntimeCallID : DeduplicableRuntimeCallIDs)
        Changed |= deduplicateRuntimeCalls(
            *F, OMPInfoCache.RFIs[DeduplicableRuntimeCallID]);

      Value *GTIdArg = nullptr;
        if (GTIdArgs.count(&Arg)) {
        Changed |= deduplicateRuntimeCalls(
            *F, OMPInfoCache.RFIs[OMPRTL___kmpc_global_thread_num], GTIdArg);

  bool removeRuntimeSymbols() {
      if (!GV->getType()->isPointerTy())

        GlobalVariable *Client =
            dyn_cast<GlobalVariable>(C->stripPointerCasts());

      GV->eraseFromParent();
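// [WIP] Split __tgt_target_data_begin_mapper calls into an "issue" and a
// "wait" part so the data transfer can overlap with independent host code.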
  bool hideMemTransfersLatency() {
    auto &RFI = OMPInfoCache.RFIs[OMPRTL___tgt_target_data_begin_mapper];
    bool Changed = false;
      auto *RTCall = getCallIfRegularCall(U, &RFI);

      OffloadArray OffloadArrays[3];
      if (!getValuesInOffloadArrays(*RTCall, OffloadArrays))

      LLVM_DEBUG(dumpValuesInOffloadArrays(OffloadArrays));

      bool WasSplit = false;
      Instruction *WaitMovementPoint = canBeMovedDownwards(*RTCall);
      if (WaitMovementPoint)
        WasSplit = splitTargetDataBeginRTC(*RTCall, *WaitMovementPoint);

      Changed |= WasSplit;

    if (OMPInfoCache.runtimeFnsAvailable(
            {OMPRTL___tgt_target_data_begin_mapper_issue,
             OMPRTL___tgt_target_data_begin_mapper_wait}))
      RFI.foreachUse(SCC, SplitMemTransfers);

  void analysisGlobalization() {
    auto &RFI = OMPInfoCache.RFIs[OMPRTL___kmpc_alloc_shared];

    auto CheckGlobalization = [&](Use &U, Function &Decl) {
      if (CallInst *CI = getCallIfRegularCall(U, &RFI)) {
                 << "Found thread data sharing on the GPU. "
                 << "Expect degraded performance due to data globalization.";
        emitRemark<OptimizationRemarkMissed>(CI, "OMP112", Remark);

    RFI.foreachUse(SCC, CheckGlobalization);

  bool getValuesInOffloadArrays(CallInst &RuntimeCall,
    assert(OAs.size() == 3 && "Need space for three offload arrays!");

    Value *BasePtrsArg =

    if (!isa<AllocaInst>(V))
    auto *BasePtrsArray = cast<AllocaInst>(V);
    if (!OAs[0].initialize(*BasePtrsArray, RuntimeCall))

    if (!isa<AllocaInst>(V))
    auto *PtrsArray = cast<AllocaInst>(V);
    if (!OAs[1].initialize(*PtrsArray, RuntimeCall))

    if (isa<GlobalValue>(V))
      return isa<Constant>(V);
    if (!isa<AllocaInst>(V))
    auto *SizesArray = cast<AllocaInst>(V);
    if (!OAs[2].initialize(*SizesArray, RuntimeCall))

    assert(OAs.size() == 3 && "There are three offload arrays to debug!");

    std::string ValuesStr;
    std::string Separator = " --- ";

    for (auto *BP : OAs[0].StoredValues) {
    LLVM_DEBUG(dbgs() << "\t\toffload_baseptrs: " << ValuesStr << "\n");

    for (auto *P : OAs[1].StoredValues) {
    for (auto *S : OAs[2].StoredValues) {
    LLVM_DEBUG(dbgs() << "\t\toffload_sizes: " << ValuesStr << "\n");

    bool IsWorthIt = false;

    return RuntimeCall.getParent()->getTerminator();

  bool splitTargetDataBeginRTC(CallInst &RuntimeCall,
    auto &IRBuilder = OMPInfoCache.OMPBuilder;
        Entry.getFirstNonPHIOrDbgOrAlloca());
        IRBuilder.AsyncInfo, nullptr, "handle");

        M, OMPRTL___tgt_target_data_begin_mapper_issue);

    for (auto &Arg : RuntimeCall.args())
      Args.push_back(Arg.get());
    Args.push_back(Handle);

    OMPInfoCache.setCallingConvention(IssueDecl, IssueCallsite);

        M, OMPRTL___tgt_target_data_begin_mapper_wait);

    Value *WaitParams[2] = {
            OffloadArray::DeviceIDArgNum),
        WaitDecl, WaitParams, "", WaitMovementPoint.getIterator());
    OMPInfoCache.setCallingConvention(WaitDecl, WaitCallsite);
  static Value *combinedIdentStruct(Value *CurrentIdent, Value *NextIdent,
                                    bool GlobalOnly, bool &SingleChoice) {
    if (CurrentIdent == NextIdent)
      return CurrentIdent;

    if (!GlobalOnly || isa<GlobalValue>(NextIdent)) {
      SingleChoice = !CurrentIdent;

  getCombinedIdentFromCallUsesIn(OMPInformationCache::RuntimeFunctionInfo &RFI,
    bool SingleChoice = true;
    Value *Ident = nullptr;
      CallInst *CI = getCallIfRegularCall(U, &RFI);
      if (!CI || &F != &Caller)
                                   true, SingleChoice);
    RFI.foreachUse(SCC, CombineIdentStruct);

    if (!Ident || !SingleChoice) {
      if (!OMPInfoCache.OMPBuilder.getInsertionPoint().getBlock())
            &F.getEntryBlock(), F.getEntryBlock().begin()));
          OMPInfoCache.OMPBuilder.getOrCreateDefaultSrcLocStr(SrcLocStrSize);
      Ident = OMPInfoCache.OMPBuilder.getOrCreateIdent(Loc, SrcLocStrSize);

  bool deduplicateRuntimeCalls(Function &F,
                               OMPInformationCache::RuntimeFunctionInfo &RFI,
                               Value *ReplVal = nullptr) {
    auto *UV = RFI.getUseVector(F);
    if (!UV || UV->size() + (ReplVal != nullptr) < 2)

        dbgs() << TAG << "Deduplicate " << UV->size() << " uses of " << RFI.Name
               << (ReplVal ? " with an existing value\n" : "\n") << "\n");

    assert((!ReplVal || (isa<Argument>(ReplVal) &&
                         cast<Argument>(ReplVal)->getParent() == &F)) &&
           "Unexpected replacement value!");

    auto CanBeMoved = [this](CallBase &CB) {
      unsigned NumArgs = CB.arg_size();
      if (CB.getArgOperand(0)->getType() != OMPInfoCache.OMPBuilder.IdentPtr)
      for (unsigned U = 1; U < NumArgs; ++U)
        if (isa<Instruction>(CB.getArgOperand(U)))

      for (Use *U : *UV) {
        if (CallInst *CI = getCallIfRegularCall(*U, &RFI)) {
          if (!CanBeMoved(*CI))

      assert(IP && "Expected insertion point!");
      cast<Instruction>(ReplVal)->moveBefore(IP);

    if (CallBase *CI = dyn_cast<CallBase>(ReplVal)) {
        Value *Ident = getCombinedIdentFromCallUsesIn(RFI, F,

    bool Changed = false;
      CallInst *CI = getCallIfRegularCall(U, &RFI);
      if (!CI || CI == ReplVal || &F != &Caller)

        return OR << "OpenMP runtime call "
                  << ore::NV("OpenMPOptRuntime", RFI.Name) << " deduplicated.";
        emitRemark<OptimizationRemark>(CI, "OMP170", Remark);
        emitRemark<OptimizationRemark>(&F, "OMP170", Remark);

      ++NumOpenMPRuntimeCallsDeduplicated;

    RFI.foreachUse(SCC, ReplaceAndDeleteCB);
      if (!F.hasLocalLinkage())
      for (Use &U : F.uses()) {
        if (CallInst *CI = getCallIfRegularCall(U)) {
          if (CI == &RefCI || GTIdArgs.count(ArgOp) ||
              getCallIfRegularCall(
                  *ArgOp, &OMPInfoCache.RFIs[OMPRTL___kmpc_global_thread_num]))

    auto AddUserArgs = [&](Value &GTId) {
      for (Use &U : GTId.uses())
        if (CallInst *CI = dyn_cast<CallInst>(U.getUser()))
            if (CallArgOpIsGTId(*Callee, U.getOperandNo(), *CI))

    OMPInformationCache::RuntimeFunctionInfo &GlobThreadNumRFI =
        OMPInfoCache.RFIs[OMPRTL___kmpc_global_thread_num];

    GlobThreadNumRFI.foreachUse(SCC, [&](Use &U, Function &F) {
      if (CallInst *CI = getCallIfRegularCall(U, &GlobThreadNumRFI))

    for (unsigned U = 0; U < GTIdArgs.size(); ++U)
      AddUserArgs(*GTIdArgs[U]);

    return getUniqueKernelFor(*I.getFunction());

  bool rewriteDeviceCodeStateMachine();
  template <typename RemarkKind, typename RemarkCallBack>
                  RemarkCallBack &&RemarkCB) const {
    auto &ORE = OREGetter(F);

        return RemarkCB(RemarkKind(DEBUG_TYPE, RemarkName, I))
               << " [" << RemarkName << "]";
          [&]() { return RemarkCB(RemarkKind(DEBUG_TYPE, RemarkName, I)); });

  template <typename RemarkKind, typename RemarkCallBack>
                  RemarkCallBack &&RemarkCB) const {
    auto &ORE = OREGetter(F);

        return RemarkCB(RemarkKind(DEBUG_TYPE, RemarkName, F))
               << " [" << RemarkName << "]";
          [&]() { return RemarkCB(RemarkKind(DEBUG_TYPE, RemarkName, F)); });

  OptimizationRemarkGetter OREGetter;

  OMPInformationCache &OMPInfoCache;

  bool runAttributor(bool IsModulePass) {
    registerAAs(IsModulePass);

                      << " functions, result: " << Changed << ".\n");

    if (Changed == ChangeStatus::CHANGED)
      OMPInfoCache.invalidateAnalyses();

    return Changed == ChangeStatus::CHANGED;

  void registerAAs(bool IsModulePass);
  if (OMPInfoCache.CGSCC && !OMPInfoCache.CGSCC->empty() &&
      !OMPInfoCache.CGSCC->contains(&F))

  std::optional<Kernel> &CachedKernel = UniqueKernelMap[&F];
    return *CachedKernel;

    return *CachedKernel;

  CachedKernel = nullptr;
  if (!F.hasLocalLinkage()) {

      return ORA << "Potentially unknown OpenMP target region caller.";
    emitRemark<OptimizationRemarkAnalysis>(&F, "OMP100", Remark);

  auto GetUniqueKernelForUse = [&](const Use &U) -> Kernel {
    if (auto *Cmp = dyn_cast<ICmpInst>(U.getUser())) {
      if (Cmp->isEquality())
        return getUniqueKernelFor(*Cmp);

    if (auto *CB = dyn_cast<CallBase>(U.getUser())) {
      if (CB->isCallee(&U))
        return getUniqueKernelFor(*CB);

      OMPInformationCache::RuntimeFunctionInfo &KernelParallelRFI =
          OMPInfoCache.RFIs[OMPRTL___kmpc_parallel_51];
      if (OpenMPOpt::getCallIfRegularCall(*U.getUser(), &KernelParallelRFI))
        return getUniqueKernelFor(*CB);

  OMPInformationCache::foreachUse(F, [&](const Use &U) {
    PotentialKernels.insert(GetUniqueKernelForUse(U));

  if (PotentialKernels.size() == 1)
    K = *PotentialKernels.begin();

  UniqueKernelMap[&F] = K;

bool OpenMPOpt::rewriteDeviceCodeStateMachine() {
  OMPInformationCache::RuntimeFunctionInfo &KernelParallelRFI =
      OMPInfoCache.RFIs[OMPRTL___kmpc_parallel_51];

  bool Changed = false;
  if (!KernelParallelRFI)

    bool UnknownUse = false;
    bool KernelParallelUse = false;
    unsigned NumDirectCalls = 0;

    OMPInformationCache::foreachUse(*F, [&](Use &U) {
      if (auto *CB = dyn_cast<CallBase>(U.getUser()))
        if (CB->isCallee(&U)) {

      if (isa<ICmpInst>(U.getUser())) {
        ToBeReplacedStateMachineUses.push_back(&U);

          OpenMPOpt::getCallIfRegularCall(*U.getUser(), &KernelParallelRFI);
      const unsigned int WrapperFunctionArgNo = 6;
      if (!KernelParallelUse && CI &&
        KernelParallelUse = true;
        ToBeReplacedStateMachineUses.push_back(&U);

    if (!KernelParallelUse)

    if (UnknownUse || NumDirectCalls != 1 ||
        ToBeReplacedStateMachineUses.size() > 2) {
        return ORA << "Parallel region is used in "
                   << (UnknownUse ? "unknown" : "unexpected")
                   << " ways. Will not attempt to rewrite the state machine.";
      emitRemark<OptimizationRemarkAnalysis>(F, "OMP101", Remark);

        return ORA << "Parallel region is not called from a unique kernel. "
                      "Will not attempt to rewrite the state machine.";
      emitRemark<OptimizationRemarkAnalysis>(F, "OMP102", Remark);

    for (Use *U : ToBeReplacedStateMachineUses)
                                                      ID, U->get()->getType()));

    ++NumOpenMPParallelRegionsReplacedInGPUStateMachine;
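// AAICVTracker and its specializations (function, function-returned, call
// site, call-site-returned) track the values of OpenMP internal control
// variables (ICVs) so getter calls can be folded to a unique value when the
// corresponding setter value is known.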
struct AAICVTracker : public StateWrapper<BooleanState, AbstractAttribute> {

  bool isAssumedTracked() const { return getAssumed(); }

  bool isKnownTracked() const { return getAssumed(); }

    return std::nullopt;

  virtual std::optional<Value *>

  const std::string getName() const override { return "AAICVTracker"; }

  const char *getIdAddr() const override { return &ID; }

  static const char ID;
struct AAICVTrackerFunction : public AAICVTracker {
      : AAICVTracker(IRP, A) {}

  const std::string getAsStr(Attributor *) const override {
    return "ICVTrackerFunction";

  void trackStatistics() const override {}

    return ChangeStatus::UNCHANGED;

                  InternalControlVar::ICV___last>
      ICVReplacementValuesMap;

    auto &OMPInfoCache = static_cast<OMPInformationCache &>(A.getInfoCache());

      auto &SetterRFI = OMPInfoCache.RFIs[OMPInfoCache.ICVs[ICV].Setter];

      auto &ValuesMap = ICVReplacementValuesMap[ICV];
        CallInst *CI = OpenMPOpt::getCallIfRegularCall(U);

        if (ValuesMap.insert(std::make_pair(CI, CI->getArgOperand(0))).second)
          HasChanged = ChangeStatus::CHANGED;

        std::optional<Value *> ReplVal = getValueForCall(A, I, ICV);
        if (ReplVal && ValuesMap.insert(std::make_pair(&I, *ReplVal)).second)
          HasChanged = ChangeStatus::CHANGED;

      SetterRFI.foreachUse(TrackValues, F);

      bool UsedAssumedInformation = false;
      A.checkForAllInstructions(CallCheck, *this, {Instruction::Call},
                                UsedAssumedInformation,

      if (HasChanged == ChangeStatus::CHANGED && !ValuesMap.count(Entry))
        ValuesMap.insert(std::make_pair(Entry, nullptr));

    const auto *CB = dyn_cast<CallBase>(&I);
    if (!CB || CB->hasFnAttr("no_openmp") ||
        CB->hasFnAttr("no_openmp_routines"))
      return std::nullopt;

    auto &OMPInfoCache = static_cast<OMPInformationCache &>(A.getInfoCache());
    auto &GetterRFI = OMPInfoCache.RFIs[OMPInfoCache.ICVs[ICV].Getter];
    auto &SetterRFI = OMPInfoCache.RFIs[OMPInfoCache.ICVs[ICV].Setter];
    Function *CalledFunction = CB->getCalledFunction();

    if (CalledFunction == nullptr)
    if (CalledFunction == GetterRFI.Declaration)
      return std::nullopt;
    if (CalledFunction == SetterRFI.Declaration) {
      if (ICVReplacementValuesMap[ICV].count(&I))
        return ICVReplacementValuesMap[ICV].lookup(&I);

    const auto *ICVTrackingAA = A.getAAFor<AAICVTracker>(

    if (ICVTrackingAA->isAssumedTracked()) {
      std::optional<Value *> URV =
          ICVTrackingAA->getUniqueReplacementValue(ICV);

  std::optional<Value *>
    return std::nullopt;

    const auto &ValuesMap = ICVReplacementValuesMap[ICV];
    if (ValuesMap.count(I))
      return ValuesMap.lookup(I);

    std::optional<Value *> ReplVal;

    while (!Worklist.empty()) {
      if (!Visited.insert(CurrInst).second)

      if (ValuesMap.count(CurrInst)) {
        std::optional<Value *> NewReplVal = ValuesMap.lookup(CurrInst);
          ReplVal = NewReplVal;

        if (ReplVal != NewReplVal)

      std::optional<Value *> NewReplVal = getValueForCall(A, *CurrInst, ICV);
          ReplVal = NewReplVal;

        if (ReplVal != NewReplVal)

      if (CurrBB == I->getParent() && ReplVal)

        if (const Instruction *Terminator = Pred->getTerminator())
struct AAICVTrackerFunctionReturned : AAICVTracker {
      : AAICVTracker(IRP, A) {}

  const std::string getAsStr(Attributor *) const override {
    return "ICVTrackerFunctionReturned";

  void trackStatistics() const override {}

    return ChangeStatus::UNCHANGED;

                  InternalControlVar::ICV___last>
      ICVReplacementValuesMap;

  std::optional<Value *>
    return ICVReplacementValuesMap[ICV];

    const auto *ICVTrackingAA = A.getAAFor<AAICVTracker>(

    if (!ICVTrackingAA->isAssumedTracked())
      return indicatePessimisticFixpoint();

      std::optional<Value *> &ReplVal = ICVReplacementValuesMap[ICV];
      std::optional<Value *> UniqueICVValue;

        std::optional<Value *> NewReplVal =
            ICVTrackingAA->getReplacementValue(ICV, &I, A);

        if (UniqueICVValue && UniqueICVValue != NewReplVal)

        UniqueICVValue = NewReplVal;

      bool UsedAssumedInformation = false;
      if (!A.checkForAllInstructions(CheckReturnInst, *this, {Instruction::Ret},
                                     UsedAssumedInformation,
        UniqueICVValue = nullptr;

      if (UniqueICVValue == ReplVal)

      ReplVal = UniqueICVValue;
      Changed = ChangeStatus::CHANGED;
struct AAICVTrackerCallSite : AAICVTracker {
      : AAICVTracker(IRP, A) {}

    assert(getAnchorScope() && "Expected anchor function");

    auto &OMPInfoCache = static_cast<OMPInformationCache &>(A.getInfoCache());

      auto ICVInfo = OMPInfoCache.ICVs[ICV];
      auto &Getter = OMPInfoCache.RFIs[ICVInfo.Getter];
      if (Getter.Declaration == getAssociatedFunction()) {
        AssociatedICV = ICVInfo.Kind;

    indicatePessimisticFixpoint();

    if (!ReplVal || !*ReplVal)
      return ChangeStatus::UNCHANGED;

    A.deleteAfterManifest(*getCtxI());

    return ChangeStatus::CHANGED;

  const std::string getAsStr(Attributor *) const override {
    return "ICVTrackerCallSite";

  void trackStatistics() const override {}

  std::optional<Value *> ReplVal;

    const auto *ICVTrackingAA = A.getAAFor<AAICVTracker>(

    if (!ICVTrackingAA->isAssumedTracked())
      return indicatePessimisticFixpoint();

    std::optional<Value *> NewReplVal =
        ICVTrackingAA->getReplacementValue(AssociatedICV, getCtxI(), A);

    if (ReplVal == NewReplVal)
      return ChangeStatus::UNCHANGED;

    ReplVal = NewReplVal;
    return ChangeStatus::CHANGED;

  std::optional<Value *>

struct AAICVTrackerCallSiteReturned : AAICVTracker {
      : AAICVTracker(IRP, A) {}

  const std::string getAsStr(Attributor *) const override {
    return "ICVTrackerCallSiteReturned";

  void trackStatistics() const override {}

    return ChangeStatus::UNCHANGED;

                  InternalControlVar::ICV___last>
      ICVReplacementValuesMap;

  std::optional<Value *>
    return ICVReplacementValuesMap[ICV];

    const auto *ICVTrackingAA = A.getAAFor<AAICVTracker>(
        DepClassTy::REQUIRED);

    if (!ICVTrackingAA->isAssumedTracked())
      return indicatePessimisticFixpoint();

      std::optional<Value *> &ReplVal = ICVReplacementValuesMap[ICV];
      std::optional<Value *> NewReplVal =
          ICVTrackingAA->getUniqueReplacementValue(ICV);

      if (ReplVal == NewReplVal)

      ReplVal = NewReplVal;
      Changed = ChangeStatus::CHANGED;
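// AAExecutionDomainFunction computes, per basic block and per call site,
// whether code is executed by the initial thread only and whether it is
// reached from / reaching only "aligned" barriers. This information drives
// the elimination of redundant barriers and is queried by AAHeapToShared.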
static bool hasFunctionEndAsUniqueSuccessor(const BasicBlock *BB) {
  return hasFunctionEndAsUniqueSuccessor(Successor);

  ~AAExecutionDomainFunction() { delete RPOT; }

    assert(F && "Expected anchor function");

    unsigned TotalBlocks = 0, InitialThreadBlocks = 0, AlignedBlocks = 0;
    for (auto &It : BEDMap) {
      InitialThreadBlocks += It.getSecond().IsExecutedByInitialThreadOnly;
      AlignedBlocks += It.getSecond().IsReachedFromAlignedBarrierOnly &&
                       It.getSecond().IsReachingAlignedBarrierOnly;

    return "[AAExecutionDomain] " + std::to_string(InitialThreadBlocks) + "/" +
           std::to_string(AlignedBlocks) + " of " +
           std::to_string(TotalBlocks) +
           " executed by initial thread / aligned";

             << BB.getName() << " is executed by a single thread.\n";

    auto HandleAlignedBarrier = [&](CallBase *CB) {
      const ExecutionDomainTy &ED = CB ? CEDMap[{CB, PRE}] : BEDMap[nullptr];
      if (!ED.IsReachedFromAlignedBarrierOnly ||
          ED.EncounteredNonLocalSideEffect)
      if (!ED.EncounteredAssumes.empty() && !A.isModulePass())

        DeletedBarriers.insert(CB);
        A.deleteAfterManifest(*CB);
        ++NumBarriersEliminated;
      } else if (!ED.AlignedBarriers.empty()) {
                                 ED.AlignedBarriers.end());
        while (!Worklist.empty()) {
          if (!Visited.insert(LastCB))
          if (!hasFunctionEndAsUniqueSuccessor(LastCB->getParent()))
          if (!DeletedBarriers.count(LastCB)) {
            ++NumBarriersEliminated;
            A.deleteAfterManifest(*LastCB);

          const ExecutionDomainTy &LastED = CEDMap[{LastCB, PRE}];
          Worklist.append(LastED.AlignedBarriers.begin(),
                          LastED.AlignedBarriers.end());

      if (!ED.EncounteredAssumes.empty() && (CB || !ED.AlignedBarriers.empty()))
        for (auto *AssumeCB : ED.EncounteredAssumes)
          A.deleteAfterManifest(*AssumeCB);

    for (auto *CB : AlignedBarriers)
      HandleAlignedBarrier(CB);

    HandleAlignedBarrier(nullptr);
  mergeInPredecessorBarriersAndAssumptions(Attributor &A, ExecutionDomainTy &ED,
                                           const ExecutionDomainTy &PredED);

  bool mergeInPredecessor(Attributor &A, ExecutionDomainTy &ED,
                          const ExecutionDomainTy &PredED,
                          bool InitialEdgeOnly = false);

  bool handleCallees(Attributor &A, ExecutionDomainTy &EntryBBED);

    assert(BB.getParent() == getAnchorScope() && "Block is out of scope!");
    return BEDMap.lookup(&BB).IsExecutedByInitialThreadOnly;

    assert(I.getFunction() == getAnchorScope() &&
           "Instruction is out of scope!");

    bool ForwardIsOk = true;

      auto *CB = dyn_cast<CallBase>(CurI);
      if (CB != &I && AlignedBarriers.contains(const_cast<CallBase *>(CB)))
      const auto &It = CEDMap.find({CB, PRE});
      if (It == CEDMap.end())
      if (!It->getSecond().IsReachingAlignedBarrierOnly)
        ForwardIsOk = false;

    if (!CurI && !BEDMap.lookup(I.getParent()).IsReachingAlignedBarrierOnly)
      ForwardIsOk = false;

      auto *CB = dyn_cast<CallBase>(CurI);
      if (CB != &I && AlignedBarriers.contains(const_cast<CallBase *>(CB)))
      const auto &It = CEDMap.find({CB, POST});
      if (It == CEDMap.end())
      if (It->getSecond().IsReachedFromAlignedBarrierOnly)

      return BEDMap.lookup(nullptr).IsReachedFromAlignedBarrierOnly;
    return BEDMap.lookup(PredBB).IsReachedFromAlignedBarrierOnly;

           "No request should be made against an invalid state!");
    return BEDMap.lookup(&BB);

  std::pair<ExecutionDomainTy, ExecutionDomainTy>
           "No request should be made against an invalid state!");
    return {CEDMap.lookup({&CB, PRE}), CEDMap.lookup({&CB, POST})};

           "No request should be made against an invalid state!");
    return InterProceduralED;

    if (!Cmp || !Cmp->isTrueWhenEqual() || !Cmp->isEquality())

      if (C->isAllOnesValue()) {
        auto *CB = dyn_cast<CallBase>(Cmp->getOperand(0));
        auto &OMPInfoCache =
            static_cast<OMPInformationCache &>(A.getInfoCache());
        auto &RFI = OMPInfoCache.RFIs[OMPRTL___kmpc_target_init];
        CB = CB ? OpenMPOpt::getCallIfRegularCall(*CB, &RFI) : nullptr;

            KernelInfo::getExecModeFromKernelEnvironment(KernelEnvC);

      if (auto *II = dyn_cast<IntrinsicInst>(Cmp->getOperand(0)))
        if (II->getIntrinsicID() == Intrinsic::nvvm_read_ptx_sreg_tid_x)

      if (auto *II = dyn_cast<IntrinsicInst>(Cmp->getOperand(0)))
        if (II->getIntrinsicID() == Intrinsic::amdgcn_workitem_id_x)

  ExecutionDomainTy InterProceduralED;

  static bool setAndRecord(bool &R, bool V) {
void AAExecutionDomainFunction::mergeInPredecessorBarriersAndAssumptions(
    Attributor &A, ExecutionDomainTy &ED, const ExecutionDomainTy &PredED) {
  for (auto *EA : PredED.EncounteredAssumes)
    ED.addAssumeInst(A, *EA);

  for (auto *AB : PredED.AlignedBarriers)
    ED.addAlignedBarrier(A, *AB);

bool AAExecutionDomainFunction::mergeInPredecessor(
    Attributor &A, ExecutionDomainTy &ED, const ExecutionDomainTy &PredED,
    bool InitialEdgeOnly) {

  bool Changed = false;
      setAndRecord(ED.IsExecutedByInitialThreadOnly,
                   InitialEdgeOnly || (PredED.IsExecutedByInitialThreadOnly &&
                                       ED.IsExecutedByInitialThreadOnly));

  Changed |= setAndRecord(ED.IsReachedFromAlignedBarrierOnly,
                          ED.IsReachedFromAlignedBarrierOnly &&
                              PredED.IsReachedFromAlignedBarrierOnly);
  Changed |= setAndRecord(ED.EncounteredNonLocalSideEffect,
                          ED.EncounteredNonLocalSideEffect |
                              PredED.EncounteredNonLocalSideEffect);

  if (ED.IsReachedFromAlignedBarrierOnly)
    mergeInPredecessorBarriersAndAssumptions(A, ED, PredED);
    ED.clearAssumeInstAndAlignedBarriers();

bool AAExecutionDomainFunction::handleCallees(Attributor &A,
                                              ExecutionDomainTy &EntryBBED) {
                                        DepClassTy::OPTIONAL);
    if (!EDAA || !EDAA->getState().isValidState())
        EDAA->getExecutionDomain(*cast<CallBase>(ACS.getInstruction())));

  ExecutionDomainTy ExitED;
  bool AllCallSitesKnown;
  if (A.checkForAllCallSites(PredForCallSite, *this,
                             AllCallSitesKnown)) {
    for (const auto &[CSInED, CSOutED] : CallSiteEDs) {
      mergeInPredecessor(A, EntryBBED, CSInED);
      ExitED.IsReachingAlignedBarrierOnly &=
          CSOutED.IsReachingAlignedBarrierOnly;

      EntryBBED.IsExecutedByInitialThreadOnly = false;
      EntryBBED.IsReachedFromAlignedBarrierOnly = true;
      EntryBBED.EncounteredNonLocalSideEffect = false;
      ExitED.IsReachingAlignedBarrierOnly = false;

      EntryBBED.IsExecutedByInitialThreadOnly = false;
      EntryBBED.IsReachedFromAlignedBarrierOnly = false;
      EntryBBED.EncounteredNonLocalSideEffect = true;
      ExitED.IsReachingAlignedBarrierOnly = false;

  bool Changed = false;
  auto &FnED = BEDMap[nullptr];
  Changed |= setAndRecord(FnED.IsReachedFromAlignedBarrierOnly,
                          FnED.IsReachedFromAlignedBarrierOnly &
                              EntryBBED.IsReachedFromAlignedBarrierOnly);
  Changed |= setAndRecord(FnED.IsReachingAlignedBarrierOnly,
                          FnED.IsReachingAlignedBarrierOnly &
                              ExitED.IsReachingAlignedBarrierOnly);
  Changed |= setAndRecord(FnED.IsExecutedByInitialThreadOnly,
                          EntryBBED.IsExecutedByInitialThreadOnly);
  bool Changed = false;

  auto HandleAlignedBarrier = [&](CallBase &CB, ExecutionDomainTy &ED) {
    Changed |= AlignedBarriers.insert(&CB);

    auto &CallInED = CEDMap[{&CB, PRE}];
    Changed |= mergeInPredecessor(A, CallInED, ED);
    CallInED.IsReachingAlignedBarrierOnly = true;

    ED.EncounteredNonLocalSideEffect = false;
    ED.IsReachedFromAlignedBarrierOnly = true;

    ED.clearAssumeInstAndAlignedBarriers();
    ED.addAlignedBarrier(A, CB);
    auto &CallOutED = CEDMap[{&CB, POST}];
    Changed |= mergeInPredecessor(A, CallOutED, ED);

      A.getAAFor<AAIsDead>(*this, getIRPosition(), DepClassTy::OPTIONAL);

  for (auto &RIt : *RPOT) {

    bool IsEntryBB = &BB == &EntryBB;
    bool AlignedBarrierLastInBlock = IsEntryBB && IsKernel;
    bool IsExplicitlyAligned = IsEntryBB && IsKernel;
    ExecutionDomainTy ED;
      Changed |= handleCallees(A, ED);

      if (LivenessAA && LivenessAA->isAssumedDead(&BB))
        if (LivenessAA && LivenessAA->isEdgeDead(PredBB, &BB))
        bool InitialEdgeOnly = isInitialThreadOnlyEdge(
            A, dyn_cast<BranchInst>(PredBB->getTerminator()), BB);
        mergeInPredecessor(A, ED, BEDMap[PredBB], InitialEdgeOnly);

      bool UsedAssumedInformation;
      if (A.isAssumedDead(I, *this, LivenessAA, UsedAssumedInformation,
                          false, DepClassTy::OPTIONAL,

      if (auto *II = dyn_cast<IntrinsicInst>(&I)) {
        if (auto *AI = dyn_cast_or_null<AssumeInst>(II)) {
          ED.addAssumeInst(A, *AI);

        if (II->isAssumeLikeIntrinsic())

      if (auto *FI = dyn_cast<FenceInst>(&I)) {
        if (!ED.EncounteredNonLocalSideEffect) {
          if (ED.IsReachedFromAlignedBarrierOnly)

          case AtomicOrdering::NotAtomic:
          case AtomicOrdering::Unordered:
          case AtomicOrdering::Monotonic:
          case AtomicOrdering::Acquire:
          case AtomicOrdering::Release:
          case AtomicOrdering::AcquireRelease:
          case AtomicOrdering::SequentiallyConsistent:

        NonNoOpFences.insert(FI);

      auto *CB = dyn_cast<CallBase>(&I);
      bool IsAlignedBarrier =

      AlignedBarrierLastInBlock &= IsNoSync;
      IsExplicitlyAligned &= IsNoSync;

        if (IsAlignedBarrier) {
          HandleAlignedBarrier(*CB, ED);
          AlignedBarrierLastInBlock = true;
          IsExplicitlyAligned = true;

        if (isa<MemIntrinsic>(&I)) {
          if (!ED.EncounteredNonLocalSideEffect &&
            ED.EncounteredNonLocalSideEffect = true;
            ED.IsReachedFromAlignedBarrierOnly = false;

        auto &CallInED = CEDMap[{CB, PRE}];
        Changed |= mergeInPredecessor(A, CallInED, ED);

        if (!IsNoSync && Callee && !Callee->isDeclaration()) {
          if (EDAA && EDAA->getState().isValidState()) {
                CalleeED.IsReachedFromAlignedBarrierOnly;
            AlignedBarrierLastInBlock = ED.IsReachedFromAlignedBarrierOnly;
            if (IsNoSync || !CalleeED.IsReachedFromAlignedBarrierOnly)
              ED.EncounteredNonLocalSideEffect |=
                  CalleeED.EncounteredNonLocalSideEffect;
              ED.EncounteredNonLocalSideEffect =
                  CalleeED.EncounteredNonLocalSideEffect;
            if (!CalleeED.IsReachingAlignedBarrierOnly) {
                  setAndRecord(CallInED.IsReachingAlignedBarrierOnly, false);
            if (CalleeED.IsReachedFromAlignedBarrierOnly)
              mergeInPredecessorBarriersAndAssumptions(A, ED, CalleeED);
            auto &CallOutED = CEDMap[{CB, POST}];
            Changed |= mergeInPredecessor(A, CallOutED, ED);

          ED.IsReachedFromAlignedBarrierOnly = false;
          Changed |= setAndRecord(CallInED.IsReachingAlignedBarrierOnly, false);

        AlignedBarrierLastInBlock &= ED.IsReachedFromAlignedBarrierOnly;

        auto &CallOutED = CEDMap[{CB, POST}];
        Changed |= mergeInPredecessor(A, CallOutED, ED);

      if (!I.mayHaveSideEffects() && !I.mayReadFromMemory())

        if (MemAA && MemAA->getState().isValidState() &&
            MemAA->checkForAllAccessesToMemoryKind(

      auto &InfoCache = A.getInfoCache();
      if (!I.mayHaveSideEffects() && InfoCache.isOnlyUsedByAssume(I))

      if (auto *LI = dyn_cast<LoadInst>(&I))
        if (LI->hasMetadata(LLVMContext::MD_invariant_load))

      if (!ED.EncounteredNonLocalSideEffect &&
        ED.EncounteredNonLocalSideEffect = true;

    bool IsEndAndNotReachingAlignedBarriersOnly = false;
    if (!isa<UnreachableInst>(BB.getTerminator()) &&
        !BB.getTerminator()->getNumSuccessors()) {

      Changed |= mergeInPredecessor(A, InterProceduralED, ED);

      auto &FnED = BEDMap[nullptr];
      if (IsKernel && !IsExplicitlyAligned)
        FnED.IsReachingAlignedBarrierOnly = false;
      Changed |= mergeInPredecessor(A, FnED, ED);

      if (!FnED.IsReachingAlignedBarrierOnly) {
        IsEndAndNotReachingAlignedBarriersOnly = true;
        SyncInstWorklist.push_back(BB.getTerminator());
        auto &BBED = BEDMap[&BB];
        Changed |= setAndRecord(BBED.IsReachingAlignedBarrierOnly, false);

    ExecutionDomainTy &StoredED = BEDMap[&BB];
    ED.IsReachingAlignedBarrierOnly = StoredED.IsReachingAlignedBarrierOnly &
                                      !IsEndAndNotReachingAlignedBarriersOnly;

    if (ED.IsExecutedByInitialThreadOnly !=
            StoredED.IsExecutedByInitialThreadOnly ||
        ED.IsReachedFromAlignedBarrierOnly !=
            StoredED.IsReachedFromAlignedBarrierOnly ||
        ED.EncounteredNonLocalSideEffect !=
            StoredED.EncounteredNonLocalSideEffect)

    StoredED = std::move(ED);
  while (!SyncInstWorklist.empty()) {

    bool HitAlignedBarrierOrKnownEnd = false;
      auto *CB = dyn_cast<CallBase>(CurInst);
      auto &CallOutED = CEDMap[{CB, POST}];
      Changed |= setAndRecord(CallOutED.IsReachingAlignedBarrierOnly, false);
      auto &CallInED = CEDMap[{CB, PRE}];
      HitAlignedBarrierOrKnownEnd =
          AlignedBarriers.count(CB) || !CallInED.IsReachingAlignedBarrierOnly;
      if (HitAlignedBarrierOrKnownEnd)
      Changed |= setAndRecord(CallInED.IsReachingAlignedBarrierOnly, false);

    if (HitAlignedBarrierOrKnownEnd)

      if (LivenessAA && LivenessAA->isEdgeDead(PredBB, SyncBB))
      if (!Visited.insert(PredBB))
      auto &PredED = BEDMap[PredBB];
      if (setAndRecord(PredED.IsReachingAlignedBarrierOnly, false)) {
        SyncInstWorklist.push_back(PredBB->getTerminator());

    if (SyncBB != &EntryBB)
      setAndRecord(InterProceduralED.IsReachingAlignedBarrierOnly, false);

  return Changed ? ChangeStatus::CHANGED : ChangeStatus::UNCHANGED;
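// AAHeapToShared replaces __kmpc_alloc_shared/__kmpc_free_shared pairs of
// constant size with a static shared-memory buffer, subject to the shared
// memory limit tracked in SharedMemoryUsed.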
struct AAHeapToShared : public StateWrapper<BooleanState, AbstractAttribute> {

  static AAHeapToShared &createForPosition(const IRPosition &IRP,

  virtual bool isAssumedHeapToShared(CallBase &CB) const = 0;

  virtual bool isAssumedHeapToSharedRemovedFree(CallBase &CB) const = 0;

  const std::string getName() const override { return "AAHeapToShared"; }

  const char *getIdAddr() const override { return &ID; }

  static const char ID;
struct AAHeapToSharedFunction : public AAHeapToShared {
      : AAHeapToShared(IRP, A) {}

  const std::string getAsStr(Attributor *) const override {
    return "[AAHeapToShared] " + std::to_string(MallocCalls.size()) +
           " malloc calls eligible.";

  void trackStatistics() const override {}

  void findPotentialRemovedFreeCalls(Attributor &A) {
    auto &OMPInfoCache = static_cast<OMPInformationCache &>(A.getInfoCache());
    auto &FreeRFI = OMPInfoCache.RFIs[OMPRTL___kmpc_free_shared];

    PotentialRemovedFreeCalls.clear();

      for (auto *U : CB->users()) {
        if (C && C->getCalledFunction() == FreeRFI.Declaration)

      if (FreeCalls.size() != 1)

      PotentialRemovedFreeCalls.insert(FreeCalls.front());

      indicatePessimisticFixpoint();

    auto &OMPInfoCache = static_cast<OMPInformationCache &>(A.getInfoCache());
    auto &RFI = OMPInfoCache.RFIs[OMPRTL___kmpc_alloc_shared];
    if (!RFI.Declaration)

                       bool &) -> std::optional<Value *> { return nullptr; };

    for (User *U : RFI.Declaration->users())
      if (CallBase *CB = dyn_cast<CallBase>(U)) {
        MallocCalls.insert(CB);

    findPotentialRemovedFreeCalls(A);

  bool isAssumedHeapToShared(CallBase &CB) const override {
    return isValidState() && MallocCalls.count(&CB);

  bool isAssumedHeapToSharedRemovedFree(CallBase &CB) const override {
    return isValidState() && PotentialRemovedFreeCalls.count(&CB);

    if (MallocCalls.empty())
      return ChangeStatus::UNCHANGED;

    auto &OMPInfoCache = static_cast<OMPInformationCache &>(A.getInfoCache());
    auto &FreeCall = OMPInfoCache.RFIs[OMPRTL___kmpc_free_shared];

                                         DepClassTy::OPTIONAL);
      if (HS && HS->isAssumedHeapToStack(*CB))

      for (auto *U : CB->users()) {
        if (C && C->getCalledFunction() == FreeCall.Declaration)

      if (FreeCalls.size() != 1)

                          << " with shared memory."
                          << " Shared memory usage is limited to "

                        << " with " << AllocSize->getZExtValue()
                        << " bytes of shared memory\n");

      Type *Int8ArrTy = ArrayType::get(Int8Ty, AllocSize->getZExtValue());

          static_cast<unsigned>(AddressSpace::Shared));

        return OR << "Replaced globalized variable with "
                  << ore::NV("SharedMemory", AllocSize->getZExtValue())
                  << (AllocSize->isOne() ? " byte " : " bytes ")
                  << "of shared memory.";

             "HeapToShared on allocation without alignment attribute");
      SharedMem->setAlignment(*Alignment);

      A.deleteAfterManifest(*CB);
      A.deleteAfterManifest(*FreeCalls.front());

      SharedMemoryUsed += AllocSize->getZExtValue();
      NumBytesMovedToSharedMemory = SharedMemoryUsed;
      Changed = ChangeStatus::CHANGED;

    if (MallocCalls.empty())
      return indicatePessimisticFixpoint();
    auto &OMPInfoCache = static_cast<OMPInformationCache &>(A.getInfoCache());
    auto &RFI = OMPInfoCache.RFIs[OMPRTL___kmpc_alloc_shared];
    if (!RFI.Declaration)
      return ChangeStatus::UNCHANGED;

    auto NumMallocCalls = MallocCalls.size();

    for (User *U : RFI.Declaration->users()) {
      if (CallBase *CB = dyn_cast<CallBase>(U)) {
        if (CB->getCaller() != F)
        if (!MallocCalls.count(CB))
        if (!isa<ConstantInt>(CB->getArgOperand(0))) {
          MallocCalls.remove(CB);

        if (!ED || !ED->isExecutedByInitialThreadOnly(*CB))
          MallocCalls.remove(CB);

    findPotentialRemovedFreeCalls(A);

    if (NumMallocCalls != MallocCalls.size())
      return ChangeStatus::CHANGED;

    return ChangeStatus::UNCHANGED;

  unsigned SharedMemoryUsed = 0;
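// AAKernelInfo models the state of an OpenMP target region entry point
// (kernel): reached parallel regions, SPMD compatibility, and the kernel
// environment configuration it may rewrite (exec mode, thread/team bounds,
// nested parallelism, use of the generic state machine).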
struct AAKernelInfo : public StateWrapper<KernelInfoState, AbstractAttribute> {

  static bool requiresCalleeForCallBase() { return false; }

  void trackStatistics() const override {}

  const std::string getAsStr(Attributor *) const override {
    if (!isValidState())
    return std::string(SPMDCompatibilityTracker.isAssumed() ? "SPMD"
           std::string(SPMDCompatibilityTracker.isAtFixpoint() ? " [FIX]"
           std::string(" #PRs: ") +
           (ReachedKnownParallelRegions.isValidState()
                ? std::to_string(ReachedKnownParallelRegions.size())
           ", #Unknown PRs: " +
           (ReachedUnknownParallelRegions.isValidState()
           ", #Reaching Kernels: " +
           (ReachingKernelEntries.isValidState()
           (ParallelLevels.isValidState()
           ", NestedPar: " + (NestedParallelism ? "yes" : "no");

  const std::string getName() const override { return "AAKernelInfo"; }

  const char *getIdAddr() const override { return &ID; }

  static const char ID;
3657struct AAKernelInfoFunction : AAKernelInfo {
3659 : AAKernelInfo(IRP, A) {}
3664 return GuardedInstructions;
3667 void setConfigurationOfKernelEnvironment(ConstantStruct *ConfigC) {
3669 KernelEnvC, ConfigC, {KernelInfo::ConfigurationIdx});
3670 assert(NewKernelEnvC && "Failed to create new kernel environment");
3671 KernelEnvC = cast<ConstantStruct>(NewKernelEnvC);
3674#define KERNEL_ENVIRONMENT_CONFIGURATION_SETTER(MEMBER) \
3675 void set##MEMBER##OfKernelEnvironment(ConstantInt *NewVal) { \
3676 ConstantStruct *ConfigC = \
3677 KernelInfo::getConfigurationFromKernelEnvironment(KernelEnvC); \
3678 Constant *NewConfigC = ConstantFoldInsertValueInstruction( \
3679 ConfigC, NewVal, {KernelInfo::MEMBER##Idx}); \
3680 assert(NewConfigC && "Failed to create new configuration environment"); \
3681 setConfigurationOfKernelEnvironment(cast<ConstantStruct>(NewConfigC)); \
3692#undef KERNEL_ENVIRONMENT_CONFIGURATION_SETTER
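// The macro above expands into one setter per configuration member, e.g.
// setExecModeOfKernelEnvironment and
// setUseGenericStateMachineOfKernelEnvironment, both of which are used
// further down in initialize() and manifest().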
3699 auto &OMPInfoCache =
static_cast<OMPInformationCache &
>(
A.getInfoCache());
3703 OMPInformationCache::RuntimeFunctionInfo &InitRFI =
3704 OMPInfoCache.RFIs[OMPRTL___kmpc_target_init];
3705 OMPInformationCache::RuntimeFunctionInfo &DeinitRFI =
3706 OMPInfoCache.RFIs[OMPRTL___kmpc_target_deinit];
3710 auto StoreCallBase = [](
Use &U,
3711 OMPInformationCache::RuntimeFunctionInfo &RFI,
3713 CallBase *CB = OpenMPOpt::getCallIfRegularCall(U, &RFI);
3715 "Unexpected use of __kmpc_target_init or __kmpc_target_deinit!");
3717 "Multiple uses of __kmpc_target_init or __kmpc_target_deinit!");
3723 StoreCallBase(U, InitRFI, KernelInitCB);
3727 DeinitRFI.foreachUse(
3729 StoreCallBase(U, DeinitRFI, KernelDeinitCB);
3735 if (!KernelInitCB || !KernelDeinitCB)
3739 ReachingKernelEntries.insert(Fn);
3740 IsKernelEntry =
true;
3748 KernelConfigurationSimplifyCB =
3750 bool &UsedAssumedInformation) -> std::optional<Constant *> {
3751 if (!isAtFixpoint()) {
3754 UsedAssumedInformation =
true;
3755 A.recordDependence(*
this, *AA, DepClassTy::OPTIONAL);
3760 A.registerGlobalVariableSimplificationCallback(
3761 *KernelEnvGV, KernelConfigurationSimplifyCB);
3764 bool CanChangeToSPMD = OMPInfoCache.runtimeFnsAvailable(
3765 {OMPRTL___kmpc_get_hardware_thread_id_in_block,
3766 OMPRTL___kmpc_barrier_simple_spmd});
3770 KernelInfo::getExecModeFromKernelEnvironment(KernelEnvC);
3775 SPMDCompatibilityTracker.indicateOptimisticFixpoint();
3779 SPMDCompatibilityTracker.indicatePessimisticFixpoint();
3781 setExecModeOfKernelEnvironment(AssumedExecModeC);
3788 setMinThreadsOfKernelEnvironment(ConstantInt::get(Int32Ty, MinThreads));
3790 setMaxThreadsOfKernelEnvironment(ConstantInt::get(Int32Ty,
MaxThreads));
3791 auto [MinTeams, MaxTeams] =
3794 setMinTeamsOfKernelEnvironment(ConstantInt::get(Int32Ty, MinTeams));
3796 setMaxTeamsOfKernelEnvironment(ConstantInt::get(Int32Ty, MaxTeams));
3799 KernelInfo::getMayUseNestedParallelismFromKernelEnvironment(KernelEnvC);
3800 ConstantInt *AssumedMayUseNestedParallelismC = ConstantInt::get(
3802 setMayUseNestedParallelismOfKernelEnvironment(
3803 AssumedMayUseNestedParallelismC);
3807 KernelInfo::getUseGenericStateMachineFromKernelEnvironment(
3810 ConstantInt::get(UseGenericStateMachineC->
getIntegerType(),
false);
3811 setUseGenericStateMachineOfKernelEnvironment(
3812 AssumedUseGenericStateMachineC);
3818 if (!OMPInfoCache.RFIs[RFKind].Declaration)
3820 A.registerVirtualUseCallback(*OMPInfoCache.RFIs[RFKind].Declaration, CB);
3824 auto AddDependence = [](Attributor &A, const AAKernelInfo *KI,
3827 A.recordDependence(*KI, *QueryingAA, DepClassTy::OPTIONAL);
3841 if (SPMDCompatibilityTracker.isValidState())
3842 return AddDependence(A, this, QueryingAA);
3844 if (!ReachedKnownParallelRegions.isValidState())
3845 return AddDependence(A, this, QueryingAA);
3851 RegisterVirtualUse(OMPRTL___kmpc_get_hardware_num_threads_in_block,
3852 CustomStateMachineUseCB);
3853 RegisterVirtualUse(OMPRTL___kmpc_get_warp_size, CustomStateMachineUseCB);
3854 RegisterVirtualUse(OMPRTL___kmpc_barrier_simple_generic,
3855 CustomStateMachineUseCB);
3856 RegisterVirtualUse(OMPRTL___kmpc_kernel_parallel,
3857 CustomStateMachineUseCB);
3858 RegisterVirtualUse(OMPRTL___kmpc_kernel_end_parallel,
3859 CustomStateMachineUseCB);
3863 if (SPMDCompatibilityTracker.isAtFixpoint())
3870 if (!SPMDCompatibilityTracker.isValidState())
3871 return AddDependence(A, this, QueryingAA);
3874 RegisterVirtualUse(OMPRTL___kmpc_get_hardware_thread_id_in_block,
3883 if (!SPMDCompatibilityTracker.isValidState())
3884 return AddDependence(A, this, QueryingAA);
3885 if (SPMDCompatibilityTracker.empty())
3886 return AddDependence(A, this, QueryingAA);
3887 if (!mayContainParallelRegion())
3888 return AddDependence(A, this, QueryingAA);
3891 RegisterVirtualUse(OMPRTL___kmpc_barrier_simple_spmd, SPMDBarrierUseCB);
3895 static std::string sanitizeForGlobalName(std::string S) {
3899 return !((C >= 'a' && C <= 'z') || (C >= 'A' && C <= 'Z') ||
3900 (C >= '0' && C <= '9') || C == '_');
3911 if (!KernelInitCB || !KernelDeinitCB)
3912 return ChangeStatus::UNCHANGED;
3916 bool HasBuiltStateMachine = true;
3917 if (!changeToSPMDMode(A, Changed)) {
3919 HasBuiltStateMachine = buildCustomStateMachine(A, Changed);
3921 HasBuiltStateMachine = false;
3928 KernelInfo::getUseGenericStateMachineFromKernelEnvironment(
3929 ExistingKernelEnvC);
3930 if (!HasBuiltStateMachine)
3931 setUseGenericStateMachineOfKernelEnvironment(
3932 OldUseGenericStateMachineVal);
3939 Changed = ChangeStatus::CHANGED;
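// Guarding helper for SPMD-ization: each side-effecting instruction that must
// only run on the main thread is wrapped in a region that checks the hardware
// thread id, broadcasts any value used outside the region through a shared
// memory global, and synchronizes with __kmpc_barrier_simple_spmd around the
// guarded region.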
3945 void insertInstructionGuardsHelper(Attributor &A) {
3946 auto &OMPInfoCache = static_cast<OMPInformationCache &>(A.getInfoCache());
3948 auto CreateGuardedRegion = [&](Instruction *RegionStartI,
3982 DT, LI, MSU, "region.guarded.end");
3985 MSU, "region.barrier");
3988 DT, LI, MSU, "region.exit");
3990 SplitBlock(ParentBB, RegionStartI, DT, LI, MSU, "region.guarded");
3993 "Expected a different CFG");
3996 ParentBB, ParentBB->getTerminator(), DT, LI, MSU, "region.check.tid");
3999 A.registerManifestAddedBasicBlock(*RegionEndBB);
4000 A.registerManifestAddedBasicBlock(*RegionBarrierBB);
4001 A.registerManifestAddedBasicBlock(*RegionExitBB);
4002 A.registerManifestAddedBasicBlock(*RegionStartBB);
4003 A.registerManifestAddedBasicBlock(*RegionCheckTidBB);
4005 bool HasBroadcastValues = false;
4010 for (Use &U : I.uses()) {
4016 if (OutsideUses.empty())
4019 HasBroadcastValues = true;
4024 M, I.getType(), false,
4026 sanitizeForGlobalName(
4027 (I.getName() + ".guarded.output.alloc").str()),
4029 static_cast<unsigned>(AddressSpace::Shared));
4036 I.getType(), SharedMem, I.getName() + ".guarded.output.load",
4040 for (Use *U : OutsideUses)
4041 A.changeUseAfterManifest(*U, *LoadI);
4044 auto &OMPInfoCache =
static_cast<OMPInformationCache &
>(
A.getInfoCache());
4050 InsertPointTy(ParentBB, ParentBB->
end()),
DL);
4051 OMPInfoCache.OMPBuilder.updateToLocation(Loc);
4054 OMPInfoCache.OMPBuilder.getOrCreateSrcLocStr(Loc, SrcLocStrSize);
4056 OMPInfoCache.OMPBuilder.getOrCreateIdent(SrcLocStr, SrcLocStrSize);
4062 InsertPointTy(RegionCheckTidBB, RegionCheckTidBB->
end()),
DL);
4063 OMPInfoCache.OMPBuilder.updateToLocation(LocRegionCheckTid);
4065 OMPInfoCache.OMPBuilder.getOrCreateRuntimeFunction(
4066 M, OMPRTL___kmpc_get_hardware_thread_id_in_block);
4068 OMPInfoCache.OMPBuilder.Builder.CreateCall(HardwareTidFn, {});
4070 OMPInfoCache.setCallingConvention(HardwareTidFn, Tid);
4071 Value *TidCheck = OMPInfoCache.OMPBuilder.Builder.CreateIsNull(Tid);
4072 OMPInfoCache.OMPBuilder.Builder
4073 .CreateCondBr(TidCheck, RegionStartBB, RegionBarrierBB)
4079 OMPInfoCache.OMPBuilder.getOrCreateRuntimeFunction(
4080 M, OMPRTL___kmpc_barrier_simple_spmd);
4081 OMPInfoCache.OMPBuilder.updateToLocation(InsertPointTy(
4084 OMPInfoCache.OMPBuilder.Builder.CreateCall(BarrierFn, {Ident, Tid});
4086 OMPInfoCache.setCallingConvention(BarrierFn, Barrier);
4089 if (HasBroadcastValues) {
4094 OMPInfoCache.setCallingConvention(BarrierFn, Barrier);
4098 auto &AllocSharedRFI = OMPInfoCache.RFIs[OMPRTL___kmpc_alloc_shared];
4100 for (
Instruction *GuardedI : SPMDCompatibilityTracker) {
4102 if (!Visited.
insert(BB).second)
4108 while (++IP != IPEnd) {
4109 if (!IP->mayHaveSideEffects() && !IP->mayReadFromMemory())
4112 if (OpenMPOpt::getCallIfRegularCall(*
I, &AllocSharedRFI))
4114 if (!
I->user_empty() || !SPMDCompatibilityTracker.contains(
I)) {
4115 LastEffect =
nullptr;
4122 for (
auto &Reorder : Reorders)
4128 for (
Instruction *GuardedI : SPMDCompatibilityTracker) {
4130 auto *CalleeAA =
A.lookupAAFor<AAKernelInfo>(
4133 assert(CalleeAA !=
nullptr &&
"Expected Callee AAKernelInfo");
4134 auto &CalleeAAFunction = *cast<AAKernelInfoFunction>(CalleeAA);
4136 if (CalleeAAFunction.getGuardedInstructions().contains(GuardedI))
4139 Instruction *GuardedRegionStart =
nullptr, *GuardedRegionEnd =
nullptr;
4143 if (SPMDCompatibilityTracker.contains(&
I)) {
4144 CalleeAAFunction.getGuardedInstructions().insert(&
I);
4145 if (GuardedRegionStart)
4146 GuardedRegionEnd = &
I;
4148 GuardedRegionStart = GuardedRegionEnd = &
I;
4155 if (GuardedRegionStart) {
4157 std::make_pair(GuardedRegionStart, GuardedRegionEnd));
4158 GuardedRegionStart =
nullptr;
4159 GuardedRegionEnd =
nullptr;
4164 for (
auto &GR : GuardedRegions)
4165 CreateGuardedRegion(GR.first, GR.second);
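// Fallback when no parallel region can be reached: instead of guarding
// individual instructions, the entire user code is executed by thread 0 only;
// all other threads branch directly to an early-return block.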
4168 void forceSingleThreadPerWorkgroupHelper(Attributor &A) {
4177 auto &Ctx = getAnchorValue().getContext();
4184 KernelInitCB->getNextNode(), "main.thread.user_code");
4189 A.registerManifestAddedBasicBlock(*InitBB);
4190 A.registerManifestAddedBasicBlock(*UserCodeBB);
4191 A.registerManifestAddedBasicBlock(*ReturnBB);
4200 auto &OMPInfoCache = static_cast<OMPInformationCache &>(A.getInfoCache());
4202 OMPInfoCache.OMPBuilder.getOrCreateRuntimeFunction(
4203 M, OMPRTL___kmpc_get_hardware_thread_id_in_block);
4208 OMPInfoCache.setCallingConvention(ThreadIdInBlockFn, ThreadIdInBlock);
4214 ConstantInt::get(ThreadIdInBlock->getType(), 0),
4215 "thread.is_main", InitBB);
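// SPMD-ization: if SPMD compatibility is not even assumed, emit analysis
// remarks for each offending instruction (suggesting `ompx_spmd_amenable`
// where a call is involved) and give up. Otherwise guard the tracked
// instructions (or force a single thread when no parallel region exists),
// flip the kernel's exec-mode constant to SPMD, and emit a remark.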
4221 auto &OMPInfoCache = static_cast<OMPInformationCache &>(A.getInfoCache());
4223 if (!SPMDCompatibilityTracker.isAssumed()) {
4224 for (Instruction *NonCompatibleI : SPMDCompatibilityTracker) {
4225 if (!NonCompatibleI)
4229 if (auto *CB = dyn_cast<CallBase>(NonCompatibleI))
4230 if (OMPInfoCache.RTLFunctions.contains(CB->getCalledFunction()))
4234 ORA << "Value has potential side effects preventing SPMD-mode "
4236 if (isa<CallBase>(NonCompatibleI)) {
4237 ORA << ". Add `[[omp::assume(\"ompx_spmd_amenable\")]]` to "
4238 "the called function to override";
4246 << *NonCompatibleI << "\n");
4258 Kernel = CB->getCaller();
4266 KernelInfo::getExecModeFromKernelEnvironment(ExistingKernelEnvC);
4272 Changed = ChangeStatus::CHANGED;
4276 if (mayContainParallelRegion())
4277 insertInstructionGuardsHelper(A);
4279 forceSingleThreadPerWorkgroupHelper(A);
4284 "Initially non-SPMD kernel has SPMD exec mode!");
4285 setExecModeOfKernelEnvironment(
4289 ++NumOpenMPTargetRegionKernelsSPMD;
4292 return OR << "Transformed generic-mode kernel to SPMD-mode.";
4304 if (!ReachedKnownParallelRegions.isValidState())
4307 auto &OMPInfoCache = static_cast<OMPInformationCache &>(A.getInfoCache());
4308 if (!OMPInfoCache.runtimeFnsAvailable(
4309 {OMPRTL___kmpc_get_hardware_num_threads_in_block,
4310 OMPRTL___kmpc_get_warp_size, OMPRTL___kmpc_barrier_simple_generic,
4311 OMPRTL___kmpc_kernel_parallel, OMPRTL___kmpc_kernel_end_parallel}))
4322 KernelInfo::getUseGenericStateMachineFromKernelEnvironment(
4323 ExistingKernelEnvC);
4325 KernelInfo::getExecModeFromKernelEnvironment(ExistingKernelEnvC);
4330 if (UseStateMachineC->isZero() ||
4334 Changed = ChangeStatus::CHANGED;
4337 setUseGenericStateMachineOfKernelEnvironment(
4344 if (!mayContainParallelRegion()) {
4345 ++NumOpenMPTargetRegionKernelsWithoutStateMachine;
4348 return OR << "Removing unused state machine from generic-mode kernel.";
4356 if (ReachedUnknownParallelRegions.empty()) {
4357 ++NumOpenMPTargetRegionKernelsCustomStateMachineWithoutFallback;
4360 return OR << "Rewriting generic-mode kernel with a customized state "
4365 ++NumOpenMPTargetRegionKernelsCustomStateMachineWithFallback;
4368 return OR << "Generic-mode kernel is executed with a customized state "
4369 "machine that requires a fallback.";
4374 for (CallBase *UnknownParallelRegionCB : ReachedUnknownParallelRegions) {
4375 if (!UnknownParallelRegionCB)
4378 return ORA << "Call may contain unknown parallel regions. Use "
4379 << "`[[omp::assume(\"omp_no_parallelism\")]]` to "
4417 auto &Ctx = getAnchorValue().getContext();
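// The blocks created below implement the generic-mode worker loop: workers
// wait at a barrier, fetch the parallel region work function published by the
// main thread via __kmpc_kernel_parallel, execute it (through a direct-call
// if-cascade over the known parallel regions, with an indirect-call fallback
// for unknown ones), signal completion with __kmpc_kernel_end_parallel, and
// loop until the kernel is done.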
4421 BasicBlock *InitBB = KernelInitCB->getParent();
4423 KernelInitCB->getNextNode(),
"thread.user_code.check");
4427 Ctx,
"worker_state_machine.begin",
Kernel, UserCodeEntryBB);
4429 Ctx,
"worker_state_machine.finished",
Kernel, UserCodeEntryBB);
4431 Ctx,
"worker_state_machine.is_active.check",
Kernel, UserCodeEntryBB);
4434 Kernel, UserCodeEntryBB);
4437 Kernel, UserCodeEntryBB);
4439 Ctx,
"worker_state_machine.done.barrier",
Kernel, UserCodeEntryBB);
4440 A.registerManifestAddedBasicBlock(*InitBB);
4441 A.registerManifestAddedBasicBlock(*UserCodeEntryBB);
4442 A.registerManifestAddedBasicBlock(*IsWorkerCheckBB);
4443 A.registerManifestAddedBasicBlock(*StateMachineBeginBB);
4444 A.registerManifestAddedBasicBlock(*StateMachineFinishedBB);
4445 A.registerManifestAddedBasicBlock(*StateMachineIsActiveCheckBB);
4446 A.registerManifestAddedBasicBlock(*StateMachineIfCascadeCurrentBB);
4447 A.registerManifestAddedBasicBlock(*StateMachineEndParallelBB);
4448 A.registerManifestAddedBasicBlock(*StateMachineDoneBarrierBB);
4450 const DebugLoc &DLoc = KernelInitCB->getDebugLoc();
4456 ConstantInt::get(KernelInitCB->getType(), -1),
4457 "thread.is_worker", InitBB);
4463 OMPInfoCache.OMPBuilder.getOrCreateRuntimeFunction(
4464 M, OMPRTL___kmpc_get_hardware_num_threads_in_block);
4466 OMPInfoCache.OMPBuilder.getOrCreateRuntimeFunction(
4467 M, OMPRTL___kmpc_get_warp_size);
4470 OMPInfoCache.setCallingConvention(BlockHwSizeFn, BlockHwSize);
4474 OMPInfoCache.setCallingConvention(WarpSizeFn, WarpSize);
4477 BlockHwSize, WarpSize,
"block.size", IsWorkerCheckBB);
4481 "thread.is_main_or_worker", IsWorkerCheckBB);
4484 IsMainOrWorker, IsWorkerCheckBB);
4488 Type *VoidPtrTy = PointerType::getUnqual(Ctx);
4490 new AllocaInst(VoidPtrTy,
DL.getAllocaAddrSpace(),
nullptr,
4494 OMPInfoCache.OMPBuilder.updateToLocation(
4497 StateMachineBeginBB->
end()),
4500 Value *Ident = KernelInfo::getIdentFromKernelEnvironment(KernelEnvC);
4501 Value *GTid = KernelInitCB;
4504 OMPInfoCache.OMPBuilder.getOrCreateRuntimeFunction(
4505 M, OMPRTL___kmpc_barrier_simple_generic);
4508 OMPInfoCache.setCallingConvention(BarrierFn, Barrier);
4512 (
unsigned int)AddressSpace::Generic) {
4514 WorkFnAI, PointerType::get(Ctx, (
unsigned int)AddressSpace::Generic),
4515 WorkFnAI->
getName() +
".generic", StateMachineBeginBB);
4520 OMPInfoCache.OMPBuilder.getOrCreateRuntimeFunction(
4521 M, OMPRTL___kmpc_kernel_parallel);
4523 KernelParallelFn, {WorkFnAI},
"worker.is_active", StateMachineBeginBB);
4524 OMPInfoCache.setCallingConvention(KernelParallelFn, IsActiveWorker);
4527 StateMachineBeginBB);
4537 StateMachineBeginBB);
4538 IsDone->setDebugLoc(DLoc);
4540 IsDone, StateMachineBeginBB)
4544 StateMachineDoneBarrierBB, IsActiveWorker,
4545 StateMachineIsActiveCheckBB)
4551 const unsigned int WrapperFunctionArgNo = 6;
4556 for (
int I = 0, E = ReachedKnownParallelRegions.size();
I < E; ++
I) {
4557 auto *CB = ReachedKnownParallelRegions[
I];
4558 auto *ParallelRegion = dyn_cast<Function>(
4559 CB->getArgOperand(WrapperFunctionArgNo)->stripPointerCasts());
4561 Ctx,
"worker_state_machine.parallel_region.execute",
Kernel,
4562 StateMachineEndParallelBB);
4564 ->setDebugLoc(DLoc);
4570 Kernel, StateMachineEndParallelBB);
4571 A.registerManifestAddedBasicBlock(*PRExecuteBB);
4572 A.registerManifestAddedBasicBlock(*PRNextBB);
4577 if (
I + 1 < E || !ReachedUnknownParallelRegions.empty()) {
4580 "worker.check_parallel_region", StateMachineIfCascadeCurrentBB);
4588 StateMachineIfCascadeCurrentBB)
4590 StateMachineIfCascadeCurrentBB = PRNextBB;
4596 if (!ReachedUnknownParallelRegions.empty()) {
4597 StateMachineIfCascadeCurrentBB->
setName(
4598 "worker_state_machine.parallel_region.fallback.execute");
4600 StateMachineIfCascadeCurrentBB)
4601 ->setDebugLoc(DLoc);
4604 StateMachineIfCascadeCurrentBB)
4608 OMPInfoCache.OMPBuilder.getOrCreateRuntimeFunction(
4609 M, OMPRTL___kmpc_kernel_end_parallel);
4612 OMPInfoCache.setCallingConvention(EndParallelFn, EndParallel);
4618 ->setDebugLoc(DLoc);
4628 KernelInfoState StateBefore = getState();
4634 struct UpdateKernelEnvCRAII {
4635 AAKernelInfoFunction &AA;
4637 UpdateKernelEnvCRAII(AAKernelInfoFunction &AA) : AA(AA) {}
4639 ~UpdateKernelEnvCRAII() {
4646 if (!AA.isValidState()) {
4647 AA.KernelEnvC = ExistingKernelEnvC;
4651 if (!AA.ReachedKnownParallelRegions.isValidState())
4652 AA.setUseGenericStateMachineOfKernelEnvironment(
4653 KernelInfo::getUseGenericStateMachineFromKernelEnvironment(
4654 ExistingKernelEnvC));
4656 if (!AA.SPMDCompatibilityTracker.isValidState())
4657 AA.setExecModeOfKernelEnvironment(
4658 KernelInfo::getExecModeFromKernelEnvironment(ExistingKernelEnvC));
4661 KernelInfo::getMayUseNestedParallelismFromKernelEnvironment(
4663 ConstantInt *NewMayUseNestedParallelismC = ConstantInt::get(
4664 MayUseNestedParallelismC->
getIntegerType(), AA.NestedParallelism);
4665 AA.setMayUseNestedParallelismOfKernelEnvironment(
4666 NewMayUseNestedParallelismC);
4673 if (isa<CallBase>(
I))
4676 if (!
I.mayWriteToMemory())
4678 if (
auto *SI = dyn_cast<StoreInst>(&
I)) {
4681 DepClassTy::OPTIONAL);
4684 DepClassTy::OPTIONAL);
4685 if (UnderlyingObjsAA &&
4686 UnderlyingObjsAA->forallUnderlyingObjects([&](
Value &Obj) {
4687 if (AA::isAssumedThreadLocalObject(A, Obj, *this))
4691 auto *CB = dyn_cast<CallBase>(&Obj);
4692 return CB && HS && HS->isAssumedHeapToStack(*CB);
4698 SPMDCompatibilityTracker.insert(&
I);
4702 bool UsedAssumedInformationInCheckRWInst =
false;
4703 if (!SPMDCompatibilityTracker.isAtFixpoint())
4704 if (!
A.checkForAllReadWriteInstructions(
4705 CheckRWInst, *
this, UsedAssumedInformationInCheckRWInst))
4708 bool UsedAssumedInformationFromReachingKernels =
false;
4709 if (!IsKernelEntry) {
4710 updateParallelLevels(
A);
4712 bool AllReachingKernelsKnown =
true;
4713 updateReachingKernelEntries(
A, AllReachingKernelsKnown);
4714 UsedAssumedInformationFromReachingKernels = !AllReachingKernelsKnown;
4716 if (!SPMDCompatibilityTracker.empty()) {
4717 if (!ParallelLevels.isValidState())
4718 SPMDCompatibilityTracker.indicatePessimisticFixpoint();
4719 else if (!ReachingKernelEntries.isValidState())
4720 SPMDCompatibilityTracker.indicatePessimisticFixpoint();
4726 for (
auto *
Kernel : ReachingKernelEntries) {
4727 auto *CBAA =
A.getAAFor<AAKernelInfo>(
4729 if (CBAA && CBAA->SPMDCompatibilityTracker.isValidState() &&
4730 CBAA->SPMDCompatibilityTracker.isAssumed())
4734 if (!CBAA || !CBAA->SPMDCompatibilityTracker.isAtFixpoint())
4735 UsedAssumedInformationFromReachingKernels =
true;
4737 if (SPMD != 0 &&
Generic != 0)
4738 SPMDCompatibilityTracker.indicatePessimisticFixpoint();
4744 bool AllParallelRegionStatesWereFixed =
true;
4745 bool AllSPMDStatesWereFixed =
true;
4747 auto &CB = cast<CallBase>(
I);
4748 auto *CBAA =
A.getAAFor<AAKernelInfo>(
4752 getState() ^= CBAA->getState();
4753 AllSPMDStatesWereFixed &= CBAA->SPMDCompatibilityTracker.isAtFixpoint();
4754 AllParallelRegionStatesWereFixed &=
4755 CBAA->ReachedKnownParallelRegions.isAtFixpoint();
4756 AllParallelRegionStatesWereFixed &=
4757 CBAA->ReachedUnknownParallelRegions.isAtFixpoint();
4761 bool UsedAssumedInformationInCheckCallInst =
false;
4762 if (!
A.checkForAllCallLikeInstructions(
4763 CheckCallInst, *
this, UsedAssumedInformationInCheckCallInst)) {
4765 <<
"Failed to visit all call-like instructions!\n";);
4766 return indicatePessimisticFixpoint();
4771 if (!UsedAssumedInformationInCheckCallInst &&
4772 AllParallelRegionStatesWereFixed) {
4773 ReachedKnownParallelRegions.indicateOptimisticFixpoint();
4774 ReachedUnknownParallelRegions.indicateOptimisticFixpoint();
4779 if (!UsedAssumedInformationInCheckRWInst &&
4780 !UsedAssumedInformationInCheckCallInst &&
4781 !UsedAssumedInformationFromReachingKernels && AllSPMDStatesWereFixed)
4782 SPMDCompatibilityTracker.indicateOptimisticFixpoint();
4784 return StateBefore == getState() ? ChangeStatus::UNCHANGED
4785 : ChangeStatus::CHANGED;
4791 bool &AllReachingKernelsKnown) {
4795 assert(Caller &&
"Caller is nullptr");
4797 auto *CAA =
A.getOrCreateAAFor<AAKernelInfo>(
4799 if (CAA && CAA->ReachingKernelEntries.isValidState()) {
4800 ReachingKernelEntries ^= CAA->ReachingKernelEntries;
4806 ReachingKernelEntries.indicatePessimisticFixpoint();
4811 if (!
A.checkForAllCallSites(PredCallSite, *
this,
4813 AllReachingKernelsKnown))
4814 ReachingKernelEntries.indicatePessimisticFixpoint();
4819 auto &OMPInfoCache =
static_cast<OMPInformationCache &
>(
A.getInfoCache());
4820 OMPInformationCache::RuntimeFunctionInfo &Parallel51RFI =
4821 OMPInfoCache.RFIs[OMPRTL___kmpc_parallel_51];
4826 assert(Caller &&
"Caller is nullptr");
4830 if (CAA && CAA->ParallelLevels.isValidState()) {
4836 if (Caller == Parallel51RFI.Declaration) {
4837 ParallelLevels.indicatePessimisticFixpoint();
4841 ParallelLevels ^= CAA->ParallelLevels;
4848 ParallelLevels.indicatePessimisticFixpoint();
4853 bool AllCallSitesKnown =
true;
4854 if (!
A.checkForAllCallSites(PredCallSite, *
this,
4857 ParallelLevels.indicatePessimisticFixpoint();
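// Call-site specialization. Calls into known OpenMP runtime functions are
// classified directly (see the switch over RuntimeFunctionIDMap below);
// unknown callees are treated as potential parallel regions unless annotated
// with `omp_no_openmp` / `omp_no_parallelism`, and otherwise the state of the
// callee's function-level AAKernelInfo is merged in.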
4864struct AAKernelInfoCallSite : AAKernelInfo {
4866 : AAKernelInfo(IRP, A) {}
4870 AAKernelInfo::initialize(A);
4872 CallBase &CB = cast<CallBase>(getAssociatedValue());
4877 if (AssumptionAA && AssumptionAA->hasAssumption("ompx_spmd_amenable")) {
4878 indicateOptimisticFixpoint();
4886 indicateOptimisticFixpoint();
4895 auto &OMPInfoCache = static_cast<OMPInformationCache &>(A.getInfoCache());
4896 const auto &It = OMPInfoCache.RuntimeFunctionIDMap.find(Callee);
4897 if (It == OMPInfoCache.RuntimeFunctionIDMap.end()) {
4899 if (!Callee || !A.isFunctionIPOAmendable(*Callee)) {
4903 if (!AssumptionAA ||
4904 !(AssumptionAA->hasAssumption("omp_no_openmp") ||
4905 AssumptionAA->hasAssumption("omp_no_parallelism")))
4906 ReachedUnknownParallelRegions.insert(&CB);
4910 if (!SPMDCompatibilityTracker.isAtFixpoint()) {
4911 SPMDCompatibilityTracker.indicatePessimisticFixpoint();
4912 SPMDCompatibilityTracker.insert(&CB);
4917 indicateOptimisticFixpoint();
4923 if (NumCallees > 1) {
4924 indicatePessimisticFixpoint();
4931 case OMPRTL___kmpc_is_spmd_exec_mode:
4932 case OMPRTL___kmpc_distribute_static_fini:
4933 case OMPRTL___kmpc_for_static_fini:
4934 case OMPRTL___kmpc_global_thread_num:
4935 case OMPRTL___kmpc_get_hardware_num_threads_in_block:
4936 case OMPRTL___kmpc_get_hardware_num_blocks:
4937 case OMPRTL___kmpc_single:
4938 case OMPRTL___kmpc_end_single:
4939 case OMPRTL___kmpc_master:
4940 case OMPRTL___kmpc_end_master:
4941 case OMPRTL___kmpc_barrier:
4942 case OMPRTL___kmpc_nvptx_parallel_reduce_nowait_v2:
4943 case OMPRTL___kmpc_nvptx_teams_reduce_nowait_v2:
4944 case OMPRTL___kmpc_error:
4945 case OMPRTL___kmpc_flush:
4946 case OMPRTL___kmpc_get_hardware_thread_id_in_block:
4947 case OMPRTL___kmpc_get_warp_size:
4948 case OMPRTL_omp_get_thread_num:
4949 case OMPRTL_omp_get_num_threads:
4950 case OMPRTL_omp_get_max_threads:
4951 case OMPRTL_omp_in_parallel:
4952 case OMPRTL_omp_get_dynamic:
4953 case OMPRTL_omp_get_cancellation:
4954 case OMPRTL_omp_get_nested:
4955 case OMPRTL_omp_get_schedule:
4956 case OMPRTL_omp_get_thread_limit:
4957 case OMPRTL_omp_get_supported_active_levels:
4958 case OMPRTL_omp_get_max_active_levels:
4959 case OMPRTL_omp_get_level:
4960 case OMPRTL_omp_get_ancestor_thread_num:
4961 case OMPRTL_omp_get_team_size:
4962 case OMPRTL_omp_get_active_level:
4963 case OMPRTL_omp_in_final:
4964 case OMPRTL_omp_get_proc_bind:
4965 case OMPRTL_omp_get_num_places:
4966 case OMPRTL_omp_get_num_procs:
4967 case OMPRTL_omp_get_place_proc_ids:
4968 case OMPRTL_omp_get_place_num:
4969 case OMPRTL_omp_get_partition_num_places:
4970 case OMPRTL_omp_get_partition_place_nums:
4971 case OMPRTL_omp_get_wtime:
4973 case OMPRTL___kmpc_distribute_static_init_4:
4974 case OMPRTL___kmpc_distribute_static_init_4u:
4975 case OMPRTL___kmpc_distribute_static_init_8:
4976 case OMPRTL___kmpc_distribute_static_init_8u:
4977 case OMPRTL___kmpc_for_static_init_4:
4978 case OMPRTL___kmpc_for_static_init_4u:
4979 case OMPRTL___kmpc_for_static_init_8:
4980 case OMPRTL___kmpc_for_static_init_8u: {
4982 unsigned ScheduleArgOpNo = 2;
4983 auto *ScheduleTypeCI =
4985 unsigned ScheduleTypeVal =
4986 ScheduleTypeCI ? ScheduleTypeCI->getZExtValue() : 0;
4988 case OMPScheduleType::UnorderedStatic:
4989 case OMPScheduleType::UnorderedStaticChunked:
4990 case OMPScheduleType::OrderedDistribute:
4991 case OMPScheduleType::OrderedDistributeChunked:
4994 SPMDCompatibilityTracker.indicatePessimisticFixpoint();
4995 SPMDCompatibilityTracker.insert(&CB);
4999 case OMPRTL___kmpc_target_init:
5002 case OMPRTL___kmpc_target_deinit:
5003 KernelDeinitCB = &CB;
5005 case OMPRTL___kmpc_parallel_51:
5006 if (!handleParallel51(
A, CB))
5007 indicatePessimisticFixpoint();
5009 case OMPRTL___kmpc_omp_task:
5011 SPMDCompatibilityTracker.indicatePessimisticFixpoint();
5012 SPMDCompatibilityTracker.insert(&CB);
5013 ReachedUnknownParallelRegions.insert(&CB);
5015 case OMPRTL___kmpc_alloc_shared:
5016 case OMPRTL___kmpc_free_shared:
5022 SPMDCompatibilityTracker.indicatePessimisticFixpoint();
5023 SPMDCompatibilityTracker.insert(&CB);
5029 indicateOptimisticFixpoint();
5033 A.getAAFor<
AACallEdges>(*
this, getIRPosition(), DepClassTy::OPTIONAL);
5034 if (!AACE || !AACE->getState().isValidState() || AACE->hasUnknownCallee()) {
5035 CheckCallee(getAssociatedFunction(), 1);
5038 const auto &OptimisticEdges = AACE->getOptimisticEdges();
5039 for (
auto *Callee : OptimisticEdges) {
5040 CheckCallee(Callee, OptimisticEdges.size());
5051 auto &OMPInfoCache =
static_cast<OMPInformationCache &
>(
A.getInfoCache());
5052 KernelInfoState StateBefore = getState();
5054 auto CheckCallee = [&](
Function *
F,
int NumCallees) {
5055 const auto &It = OMPInfoCache.RuntimeFunctionIDMap.find(
F);
5059 if (It == OMPInfoCache.RuntimeFunctionIDMap.end()) {
5062 A.getAAFor<AAKernelInfo>(*
this, FnPos, DepClassTy::REQUIRED);
5064 return indicatePessimisticFixpoint();
5065 if (getState() == FnAA->getState())
5066 return ChangeStatus::UNCHANGED;
5067 getState() = FnAA->getState();
5068 return ChangeStatus::CHANGED;
5071 return indicatePessimisticFixpoint();
5073 CallBase &CB = cast<CallBase>(getAssociatedValue());
5074 if (It->getSecond() == OMPRTL___kmpc_parallel_51) {
5075 if (!handleParallel51(
A, CB))
5076 return indicatePessimisticFixpoint();
5077 return StateBefore == getState() ? ChangeStatus::UNCHANGED
5078 : ChangeStatus::CHANGED;
5084 (It->getSecond() == OMPRTL___kmpc_alloc_shared ||
5085 It->getSecond() == OMPRTL___kmpc_free_shared) &&
5086 "Expected a __kmpc_alloc_shared or __kmpc_free_shared runtime call");
5090 auto *HeapToSharedAA =
A.getAAFor<AAHeapToShared>(
5098 case OMPRTL___kmpc_alloc_shared:
5099 if ((!HeapToStackAA || !HeapToStackAA->isAssumedHeapToStack(CB)) &&
5100 (!HeapToSharedAA || !HeapToSharedAA->isAssumedHeapToShared(CB)))
5101 SPMDCompatibilityTracker.insert(&CB);
5103 case OMPRTL___kmpc_free_shared:
5104 if ((!HeapToStackAA ||
5105 !HeapToStackAA->isAssumedHeapToStackRemovedFree(CB)) &&
5107 !HeapToSharedAA->isAssumedHeapToSharedRemovedFree(CB)))
5108 SPMDCompatibilityTracker.insert(&CB);
5111 SPMDCompatibilityTracker.indicatePessimisticFixpoint();
5112 SPMDCompatibilityTracker.insert(&CB);
5114 return ChangeStatus::CHANGED;
5118 A.getAAFor<
AACallEdges>(*
this, getIRPosition(), DepClassTy::OPTIONAL);
5119 if (!AACE || !AACE->getState().isValidState() || AACE->hasUnknownCallee()) {
5120 if (
Function *
F = getAssociatedFunction())
5124 for (
auto *Callee : OptimisticEdges) {
5125 CheckCallee(Callee, OptimisticEdges.size());
5131 return StateBefore == getState() ? ChangeStatus::UNCHANGED
5132 : ChangeStatus::CHANGED;
5138 const unsigned int NonWrapperFunctionArgNo = 5;
5139 const unsigned int WrapperFunctionArgNo = 6;
5140 auto ParallelRegionOpArgNo = SPMDCompatibilityTracker.isAssumed()
5141 ? NonWrapperFunctionArgNo
5142 : WrapperFunctionArgNo;
5144 auto *ParallelRegion = dyn_cast<Function>(
5146 if (!ParallelRegion)
5149 ReachedKnownParallelRegions.insert(&CB);
5151 auto *FnAA =
A.getAAFor<AAKernelInfo>(
5153 NestedParallelism |= !FnAA || !FnAA->getState().isValidState() ||
5154 !FnAA->ReachedKnownParallelRegions.empty() ||
5155 !FnAA->ReachedKnownParallelRegions.isValidState() ||
5156 !FnAA->ReachedUnknownParallelRegions.isValidState() ||
5157 !FnAA->ReachedUnknownParallelRegions.empty();
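// Attribute that folds calls to selected OpenMP runtime functions into
// constants once the surrounding kernels' state is known, e.g.
// __kmpc_is_spmd_exec_mode, __kmpc_parallel_level, and the hardware
// thread/block count queries backed by kernel launch-bound attributes.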
5162struct AAFoldRuntimeCall
5163 :
public StateWrapper<BooleanState, AbstractAttribute> {
5169 void trackStatistics()
const override {}
5172 static AAFoldRuntimeCall &createForPosition(
const IRPosition &IRP,
5176 const std::string
getName()
const override {
return "AAFoldRuntimeCall"; }
5179 const char *getIdAddr()
const override {
return &
ID; }
5187 static const char ID;
5190struct AAFoldRuntimeCallCallSiteReturned : AAFoldRuntimeCall {
5192 : AAFoldRuntimeCall(IRP, A) {}
5195 const std::string getAsStr(Attributor *) const override {
5196 if (!isValidState())
5199 std::string Str("simplified value: ");
5201 if (!SimplifiedValue)
5202 return Str + std::string("none");
5204 if (!*SimplifiedValue)
5205 return Str + std::string("nullptr");
5207 if (ConstantInt *CI = dyn_cast<ConstantInt>(*SimplifiedValue))
5208 return Str + std::to_string(CI->getSExtValue());
5210 return Str + std::string("unknown");
5215 indicatePessimisticFixpoint();
5219 auto &OMPInfoCache = static_cast<OMPInformationCache &>(A.getInfoCache());
5220 const auto &It = OMPInfoCache.RuntimeFunctionIDMap.find(Callee);
5221 assert(It != OMPInfoCache.RuntimeFunctionIDMap.end() &&
5222 "Expected a known OpenMP runtime function");
5224 RFKind = It->getSecond();
5226 CallBase &CB = cast<CallBase>(getAssociatedValue());
5227 A.registerSimplificationCallback(
5230 bool &UsedAssumedInformation) -> std::optional<Value *> {
5231 assert((isValidState() ||
5232 (SimplifiedValue && *SimplifiedValue == nullptr)) &&
5233 "Unexpected invalid state!");
5235 if (!isAtFixpoint()) {
5236 UsedAssumedInformation = true;
5238 A.recordDependence(*this, *AA, DepClassTy::OPTIONAL);
5240 return SimplifiedValue;
5247 case OMPRTL___kmpc_is_spmd_exec_mode:
5248 Changed |= foldIsSPMDExecMode(A);
5250 case OMPRTL___kmpc_parallel_level:
5251 Changed |= foldParallelLevel(A);
5253 case OMPRTL___kmpc_get_hardware_num_threads_in_block:
5254 Changed = Changed | foldKernelFnAttribute(A, "omp_target_thread_limit");
5256 case OMPRTL___kmpc_get_hardware_num_blocks:
5257 Changed = Changed | foldKernelFnAttribute(A, "omp_target_num_teams");
5269 if (SimplifiedValue && *SimplifiedValue) {
5272 A.deleteAfterManifest(I);
5276 if (auto *C = dyn_cast<ConstantInt>(*SimplifiedValue))
5277 return OR << "Replacing OpenMP runtime call "
5279 << ore::NV("FoldedValue", C->getZExtValue()) << ".";
5280 return OR << "Replacing OpenMP runtime call "
5288 << **SimplifiedValue << "\n");
5290 Changed = ChangeStatus::CHANGED;
5297 SimplifiedValue = nullptr;
5298 return AAFoldRuntimeCall::indicatePessimisticFixpoint();
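// foldIsSPMDExecMode: __kmpc_is_spmd_exec_mode can only be folded if every
// kernel that can reach this call agrees on its execution mode; a mix of SPMD
// and non-SPMD reaching kernels forces a pessimistic fixpoint.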
5304 std::optional<Value *> SimplifiedValueBefore = SimplifiedValue;
5306 unsigned AssumedSPMDCount = 0, KnownSPMDCount = 0;
5307 unsigned AssumedNonSPMDCount = 0, KnownNonSPMDCount = 0;
5308 auto *CallerKernelInfoAA =
A.getAAFor<AAKernelInfo>(
5311 if (!CallerKernelInfoAA ||
5312 !CallerKernelInfoAA->ReachingKernelEntries.isValidState())
5313 return indicatePessimisticFixpoint();
5315 for (
Kernel K : CallerKernelInfoAA->ReachingKernelEntries) {
5317 DepClassTy::REQUIRED);
5319 if (!AA || !AA->isValidState()) {
5320 SimplifiedValue =
nullptr;
5321 return indicatePessimisticFixpoint();
5324 if (AA->SPMDCompatibilityTracker.isAssumed()) {
5325 if (AA->SPMDCompatibilityTracker.isAtFixpoint())
5330 if (AA->SPMDCompatibilityTracker.isAtFixpoint())
5331 ++KnownNonSPMDCount;
5333 ++AssumedNonSPMDCount;
5337 if ((AssumedSPMDCount + KnownSPMDCount) &&
5338 (AssumedNonSPMDCount + KnownNonSPMDCount))
5339 return indicatePessimisticFixpoint();
5341 auto &Ctx = getAnchorValue().getContext();
5342 if (KnownSPMDCount || AssumedSPMDCount) {
5343 assert(KnownNonSPMDCount == 0 && AssumedNonSPMDCount == 0 &&
5344 "Expected only SPMD kernels!");
5348 }
else if (KnownNonSPMDCount || AssumedNonSPMDCount) {
5349 assert(KnownSPMDCount == 0 && AssumedSPMDCount == 0 &&
5350 "Expected only non-SPMD kernels!");
5358 assert(!SimplifiedValue &&
"SimplifiedValue should be none");
5361 return SimplifiedValue == SimplifiedValueBefore ? ChangeStatus::UNCHANGED
5362 : ChangeStatus::CHANGED;
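// foldParallelLevel: the parallel level is folded in a similar way, again
// requiring that all reaching kernels are known, carry valid parallel-level
// information, and agree on their (known or assumed) execution mode.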
5367 std::optional<Value *> SimplifiedValueBefore = SimplifiedValue;
5369 auto *CallerKernelInfoAA =
A.getAAFor<AAKernelInfo>(
5372 if (!CallerKernelInfoAA ||
5373 !CallerKernelInfoAA->ParallelLevels.isValidState())
5374 return indicatePessimisticFixpoint();
5376 if (!CallerKernelInfoAA->ReachingKernelEntries.isValidState())
5377 return indicatePessimisticFixpoint();
5379 if (CallerKernelInfoAA->ReachingKernelEntries.empty()) {
5380 assert(!SimplifiedValue &&
5381 "SimplifiedValue should keep none at this point");
5382 return ChangeStatus::UNCHANGED;
5385 unsigned AssumedSPMDCount = 0, KnownSPMDCount = 0;
5386 unsigned AssumedNonSPMDCount = 0, KnownNonSPMDCount = 0;
5387 for (
Kernel K : CallerKernelInfoAA->ReachingKernelEntries) {
5389 DepClassTy::REQUIRED);
5390 if (!AA || !AA->SPMDCompatibilityTracker.isValidState())
5391 return indicatePessimisticFixpoint();
5393 if (AA->SPMDCompatibilityTracker.isAssumed()) {
5394 if (AA->SPMDCompatibilityTracker.isAtFixpoint())
5399 if (AA->SPMDCompatibilityTracker.isAtFixpoint())
5400 ++KnownNonSPMDCount;
5402 ++AssumedNonSPMDCount;
5406 if ((AssumedSPMDCount + KnownSPMDCount) &&
5407 (AssumedNonSPMDCount + KnownNonSPMDCount))
5408 return indicatePessimisticFixpoint();
5410 auto &Ctx = getAnchorValue().getContext();
5414 if (AssumedSPMDCount || KnownSPMDCount) {
5415 assert(KnownNonSPMDCount == 0 && AssumedNonSPMDCount == 0 &&
5416 "Expected only SPMD kernels!");
5419 assert(KnownSPMDCount == 0 && AssumedSPMDCount == 0 &&
5420 "Expected only non-SPMD kernels!");
5423 return SimplifiedValue == SimplifiedValueBefore ? ChangeStatus::UNCHANGED
5424 : ChangeStatus::CHANGED;
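// foldKernelFnAttribute: folds hardware thread/block count queries to the
// value of a kernel function attribute (e.g. "omp_target_thread_limit"), but
// only if every reaching kernel carries the same attribute value.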
5429 int32_t CurrentAttrValue = -1;
5430 std::optional<Value *> SimplifiedValueBefore = SimplifiedValue;
5432 auto *CallerKernelInfoAA =
A.getAAFor<AAKernelInfo>(
5435 if (!CallerKernelInfoAA ||
5436 !CallerKernelInfoAA->ReachingKernelEntries.isValidState())
5437 return indicatePessimisticFixpoint();
5440 for (
Kernel K : CallerKernelInfoAA->ReachingKernelEntries) {
5441 int32_t NextAttrVal =
K->getFnAttributeAsParsedInteger(Attr, -1);
5443 if (NextAttrVal == -1 ||
5444 (CurrentAttrValue != -1 && CurrentAttrValue != NextAttrVal))
5445 return indicatePessimisticFixpoint();
5446 CurrentAttrValue = NextAttrVal;
5449 if (CurrentAttrValue != -1) {
5450 auto &Ctx = getAnchorValue().getContext();
5454 return SimplifiedValue == SimplifiedValueBefore ? ChangeStatus::UNCHANGED
5455 : ChangeStatus::CHANGED;
5461 std::optional<Value *> SimplifiedValue;
5471 auto &RFI = OMPInfoCache.RFIs[RF];
5473 CallInst *CI = OpenMPOpt::getCallIfRegularCall(U, &RFI);
5476 A.getOrCreateAAFor<AAFoldRuntimeCall>(
5478 DepClassTy::NONE,
false,
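// registerAAs seeds the Attributor: an AAKernelInfo per kernel entry, the
// runtime-call folding attributes for the foldable functions listed below,
// ICV trackers for ICV getter calls, and the per-function attributes
// registered in registerAAsForFunction.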
5484void OpenMPOpt::registerAAs(bool IsModulePass) {
5494 A.getOrCreateAAFor<AAKernelInfo>(
5496 DepClassTy::NONE, false,
5500 OMPInformationCache::RuntimeFunctionInfo &InitRFI =
5501 OMPInfoCache.RFIs[OMPRTL___kmpc_target_init];
5502 InitRFI.foreachUse(SCC, CreateKernelInfoCB);
5504 registerFoldRuntimeCall(OMPRTL___kmpc_is_spmd_exec_mode);
5505 registerFoldRuntimeCall(OMPRTL___kmpc_parallel_level);
5506 registerFoldRuntimeCall(OMPRTL___kmpc_get_hardware_num_threads_in_block);
5507 registerFoldRuntimeCall(OMPRTL___kmpc_get_hardware_num_blocks);
5512 for (int Idx = 0; Idx < OMPInfoCache.ICVs.size() - 1; ++Idx) {
5515 auto &GetterRFI = OMPInfoCache.RFIs[ICVInfo.Getter];
5518 CallInst *CI = OpenMPOpt::getCallIfRegularCall(U, &GetterRFI);
5522 auto &CB = cast<CallBase>(*CI);
5525 A.getOrCreateAAFor<AAICVTracker>(CBPos);
5529 GetterRFI.foreachUse(SCC, CreateAA);
5538 for (auto *F : SCC) {
5539 if (F->isDeclaration())
5545 if (F->hasLocalLinkage()) {
5547 const auto *CB = dyn_cast<CallBase>(U.getUser());
5548 return CB && CB->isCallee(&U) &&
5549 A.isRunOn(const_cast<Function *>(CB->getCaller()));
5553 registerAAsForFunction(A, *F);
5563 if (F.hasFnAttribute(Attribute::Convergent))
5567 if (auto *LI = dyn_cast<LoadInst>(&I)) {
5568 bool UsedAssumedInformation = false;
5575 if (auto *CI = dyn_cast<CallBase>(&I)) {
5580 if (auto *SI = dyn_cast<StoreInst>(&I)) {
5586 if (auto *FI = dyn_cast<FenceInst>(&I)) {
5590 if (auto *II = dyn_cast<IntrinsicInst>(&I)) {
5591 if (II->getIntrinsicID() == Intrinsic::assume) {
5600const char AAICVTracker::ID = 0;
5601const char AAKernelInfo::ID = 0;
5603const char AAHeapToShared::ID = 0;
5604const char AAFoldRuntimeCall::ID = 0;
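// createForPosition factories: each abstract attribute is only legal for
// particular IR positions (function, call site, returned value) and hits
// llvm_unreachable for any other position.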
5606AAICVTracker &AAICVTracker::createForPosition(const IRPosition &IRP,
5608 AAICVTracker *AA = nullptr;
5616 AA = new (A.Allocator) AAICVTrackerFunctionReturned(IRP, A);
5619 AA = new (A.Allocator) AAICVTrackerCallSiteReturned(IRP, A);
5622 AA = new (A.Allocator) AAICVTrackerCallSite(IRP, A);
5625 AA = new (A.Allocator) AAICVTrackerFunction(IRP, A);
5634 AAExecutionDomainFunction *AA = nullptr;
5644 "AAExecutionDomain can only be created for function position!");
5646 AA = new (A.Allocator) AAExecutionDomainFunction(IRP, A);
5653AAHeapToShared &AAHeapToShared::createForPosition(const IRPosition &IRP,
5655 AAHeapToSharedFunction *AA = nullptr;
5665 "AAHeapToShared can only be created for function position!");
5667 AA = new (A.Allocator) AAHeapToSharedFunction(IRP, A);
5674AAKernelInfo &AAKernelInfo::createForPosition(const IRPosition &IRP,
5676 AAKernelInfo *AA = nullptr;
5686 AA = new (A.Allocator) AAKernelInfoCallSite(IRP, A);
5689 AA = new (A.Allocator) AAKernelInfoFunction(IRP, A);
5696AAFoldRuntimeCall &AAFoldRuntimeCall::createForPosition(const IRPosition &IRP,
5698 AAFoldRuntimeCall *AA = nullptr;
5707 llvm_unreachable("KernelInfo can only be created for call site position!");
5709 AA = new (A.Allocator) AAFoldRuntimeCallCallSiteReturned(IRP, A);
5730 if (Kernels.contains(&F))
5732 for (const User *U : F.users())
5733 if (!isa<BlockAddress>(U))
5742 return ORA << "Could not internalize function. "
5743 << "Some optimizations may not be possible. [OMP140]";
5747 bool Changed = false;
5755 if (!F.isDeclaration() && !Kernels.contains(&F) && IsCalled(F) &&
5759 } else if (!F.hasLocalLinkage() && !F.hasFnAttribute(Attribute::Cold)) {
5772 if (!F.isDeclaration() && !InternalizedMap.lookup(&F)) {
5791 OMPInformationCache InfoCache(M, AG, Allocator, nullptr, PostLink);
5793 unsigned MaxFixpointIterations =
5805 return F.hasFnAttribute("kernel");
5810 OpenMPOpt OMPOpt(SCC, CGUpdater, OREGetter, InfoCache, A);
5811 Changed |= OMPOpt.run(true);
5816 if (!F.isDeclaration() && !Kernels.contains(&F) &&
5817 !F.hasFnAttribute(Attribute::NoInline))
5818 F.addFnAttr(Attribute::AlwaysInline);
5848 Module &M = *C.begin()->getFunction().getParent();
5871 OMPInformationCache InfoCache(*(Functions.back()->getParent()), AG, Allocator,
5872 &Functions, PostLink);
5874 unsigned MaxFixpointIterations =
5888 OpenMPOpt OMPOpt(SCC, CGUpdater, OREGetter, InfoCache, A);
5889 bool Changed = OMPOpt.run(false);
5906 NamedMDNode *MD = M.getNamedMetadata("nvvm.annotations");
5915 MDString *KindID = dyn_cast<MDString>(Op->getOperand(1));
5916 if (!KindID || KindID->getString() != "kernel")
5920 mdconst::dyn_extract_or_null<Function>(Op->getOperand(0));
5927 ++NumOpenMPTargetRegionKernels;
5928 Kernels.insert(KernelFn);
5930 ++NumNonOpenMPTargetRegionKernels;
5937 Metadata *MD = M.getModuleFlag("openmp");
5945 Metadata *MD = M.getModuleFlag("openmp-device");
MachineBasicBlock MachineBasicBlock::iterator DebugLoc DL
Expand Atomic instructions
static cl::opt< unsigned > SetFixpointIterations("attributor-max-iterations", cl::Hidden, cl::desc("Maximal number of fixpoint iterations."), cl::init(32))
static const Function * getParent(const Value *V)
static GCRegistry::Add< ErlangGC > A("erlang", "erlang-compatible garbage collector")
This file provides interfaces used to manipulate a call graph, regardless if it is a "old style" Call...
This file provides interfaces used to build and manipulate a call graph, which is a very useful tool ...
This file contains the declarations for the subclasses of Constant, which represent the different fla...
dxil pretty DXIL Metadata Pretty Printer
Returns the sub type a function will return at a given Idx Should correspond to the result type of an ExtractValue instruction executed with just that one unsigned Idx
This file defines an array type that can be indexed using scoped enum values.
static void emitRemark(const Function &F, OptimizationRemarkEmitter &ORE, bool Skip)
static bool lookup(const GsymReader &GR, DataExtractor &Data, uint64_t &Offset, uint64_t BaseAddr, uint64_t Addr, SourceLocations &SrcLocs, llvm::Error &Err)
A Lookup helper functions.
This file provides utility analysis objects describing memory locations.
uint64_t IntrinsicInst * II
This file defines constans and helpers used when dealing with OpenMP.
This file defines constans that will be used by both host and device compilation.
static constexpr auto TAG
static cl::opt< bool > HideMemoryTransferLatency("openmp-hide-memory-transfer-latency", cl::desc("[WIP] Tries to hide the latency of host to device memory" " transfers"), cl::Hidden, cl::init(false))
static cl::opt< bool > DisableOpenMPOptStateMachineRewrite("openmp-opt-disable-state-machine-rewrite", cl::desc("Disable OpenMP optimizations that replace the state machine."), cl::Hidden, cl::init(false))
static cl::opt< bool > EnableParallelRegionMerging("openmp-opt-enable-merging", cl::desc("Enable the OpenMP region merging optimization."), cl::Hidden, cl::init(false))
static cl::opt< bool > AlwaysInlineDeviceFunctions("openmp-opt-inline-device", cl::desc("Inline all applicible functions on the device."), cl::Hidden, cl::init(false))
static cl::opt< bool > PrintModuleAfterOptimizations("openmp-opt-print-module-after", cl::desc("Print the current module after OpenMP optimizations."), cl::Hidden, cl::init(false))
#define KERNEL_ENVIRONMENT_CONFIGURATION_GETTER(MEMBER)
#define KERNEL_ENVIRONMENT_CONFIGURATION_IDX(MEMBER, IDX)
#define KERNEL_ENVIRONMENT_CONFIGURATION_SETTER(MEMBER)
static cl::opt< bool > PrintOpenMPKernels("openmp-print-gpu-kernels", cl::init(false), cl::Hidden)
static cl::opt< bool > DisableOpenMPOptFolding("openmp-opt-disable-folding", cl::desc("Disable OpenMP optimizations involving folding."), cl::Hidden, cl::init(false))
static cl::opt< bool > PrintModuleBeforeOptimizations("openmp-opt-print-module-before", cl::desc("Print the current module before OpenMP optimizations."), cl::Hidden, cl::init(false))
static cl::opt< unsigned > SetFixpointIterations("openmp-opt-max-iterations", cl::Hidden, cl::desc("Maximal number of attributor iterations."), cl::init(256))
static cl::opt< bool > DisableInternalization("openmp-opt-disable-internalization", cl::desc("Disable function internalization."), cl::Hidden, cl::init(false))
static cl::opt< bool > PrintICVValues("openmp-print-icv-values", cl::init(false), cl::Hidden)
static cl::opt< bool > DisableOpenMPOptimizations("openmp-opt-disable", cl::desc("Disable OpenMP specific optimizations."), cl::Hidden, cl::init(false))
static cl::opt< unsigned > SharedMemoryLimit("openmp-opt-shared-limit", cl::Hidden, cl::desc("Maximum amount of shared memory to use."), cl::init(std::numeric_limits< unsigned >::max()))
static cl::opt< bool > EnableVerboseRemarks("openmp-opt-verbose-remarks", cl::desc("Enables more verbose remarks."), cl::Hidden, cl::init(false))
static cl::opt< bool > DisableOpenMPOptDeglobalization("openmp-opt-disable-deglobalization", cl::desc("Disable OpenMP optimizations involving deglobalization."), cl::Hidden, cl::init(false))
static cl::opt< bool > DisableOpenMPOptBarrierElimination("openmp-opt-disable-barrier-elimination", cl::desc("Disable OpenMP optimizations that eliminate barriers."), cl::Hidden, cl::init(false))
static cl::opt< bool > DeduceICVValues("openmp-deduce-icv-values", cl::init(false), cl::Hidden)
#define KERNEL_ENVIRONMENT_IDX(MEMBER, IDX)
#define KERNEL_ENVIRONMENT_GETTER(MEMBER, RETURNTYPE)
static cl::opt< bool > DisableOpenMPOptSPMDization("openmp-opt-disable-spmdization", cl::desc("Disable OpenMP optimizations involving SPMD-ization."), cl::Hidden, cl::init(false))
FunctionAnalysisManager FAM
This file builds on the ADT/GraphTraits.h file to build a generic graph post order iterator.
static StringRef getName(Value *V)
assert(ImpDefSCC.getReg()==AMDGPU::SCC &&ImpDefSCC.isDef())
This file implements a set that has insertion order iteration characteristics.
This file defines the SmallPtrSet class.
This file defines the SmallVector class.
This file defines the 'Statistic' class, which is designed to be an easy way to expose various metric...
#define STATISTIC(VARNAME, DESC)
static const int BlockSize
static void initialize(TargetLibraryInfoImpl &TLI, const Triple &T, ArrayRef< StringLiteral > StandardNames)
Initialize the set of available library functions based on the specified target triple.
static bool contains(SmallPtrSetImpl< ConstantExpr * > &Cache, ConstantExpr *Expr, Constant *C)
static cl::opt< unsigned > MaxThreads("xcore-max-threads", cl::Optional, cl::desc("Maximum number of threads (for emulation thread-local storage)"), cl::Hidden, cl::value_desc("number"), cl::init(8))
This class represents a conversion between pointers from one address space to another.
an instruction to allocate memory on the stack
A container for analyses that lazily runs them and caches their results.
PassT::Result & getResult(IRUnitT &IR, ExtraArgTs... ExtraArgs)
Get the result of an analysis pass for a given IR unit.
This class represents an incoming formal argument to a Function.
ArrayRef - Represent a constant reference to an array (0 or more elements consecutively in memory),...
size_t size() const
size - Get the array size.
AttributeSet getParamAttrs(unsigned ArgNo) const
The attributes for the argument or parameter at the given index are returned.
LLVM Basic Block Representation.
iterator begin()
Instruction iterator methods.
const_iterator getFirstInsertionPt() const
Returns an iterator to the first instruction in this block that is suitable for inserting a non-PHI i...
reverse_iterator rbegin()
static BasicBlock * Create(LLVMContext &Context, const Twine &Name="", Function *Parent=nullptr, BasicBlock *InsertBefore=nullptr)
Creates a new BasicBlock.
BasicBlock * splitBasicBlock(iterator I, const Twine &BBName="", bool Before=false)
Split the basic block into two basic blocks at the specified instruction.
const BasicBlock * getUniqueSuccessor() const
Return the successor of this block if it has a unique successor.
InstListType::reverse_iterator reverse_iterator
const Function * getParent() const
Return the enclosing method, or null if none.
const Instruction * getTerminator() const LLVM_READONLY
Returns the terminator instruction if the block is well formed or null if the block is not well forme...
Conditional or Unconditional Branch instruction.
bool isConditional() const
static BranchInst * Create(BasicBlock *IfTrue, InsertPosition InsertBefore=nullptr)
BasicBlock * getSuccessor(unsigned i) const
Value * getCondition() const
Allocate memory in an ever growing pool, as if by bump-pointer.
Base class for all callable instructions (InvokeInst and CallInst) Holds everything related to callin...
void setCallingConv(CallingConv::ID CC)
Function * getCalledFunction() const
Returns the function called, or null if this is an indirect function invocation or the function signa...
bool doesNotAccessMemory(unsigned OpNo) const
bool isIndirectCall() const
Return true if the callsite is an indirect call.
bool isCallee(Value::const_user_iterator UI) const
Determine whether the passed iterator points to the callee operand's Use.
Value * getArgOperand(unsigned i) const
void setArgOperand(unsigned i, Value *v)
iterator_range< User::op_iterator > args()
Iteration adapter for range-for loops.
unsigned getArgOperandNo(const Use *U) const
Given a use for a arg operand, get the arg operand number that corresponds to it.
unsigned arg_size() const
AttributeList getAttributes() const
Return the parameter attributes for this call.
void addParamAttr(unsigned ArgNo, Attribute::AttrKind Kind)
Adds the attribute to the indicated argument.
bool isArgOperand(const Use *U) const
bool hasOperandBundles() const
Return true if this User has any operand bundles.
Function * getCaller()
Helper to get the caller (the parent function).
Wrapper to unify "old style" CallGraph and "new style" LazyCallGraph.
void initialize(LazyCallGraph &LCG, LazyCallGraph::SCC &SCC, CGSCCAnalysisManager &AM, CGSCCUpdateResult &UR)
Initializers for usage outside of a CGSCC pass, inside a CGSCC pass in the old and new pass manager (...
This class represents a function call, abstracting a target machine's calling convention.
static CallInst * Create(FunctionType *Ty, Value *F, const Twine &NameStr="", InsertPosition InsertBefore=nullptr)
@ ICMP_SLT
signed less than
static Constant * getPointerCast(Constant *C, Type *Ty)
Create a BitCast, AddrSpaceCast, or a PtrToInt cast constant expression.
static Constant * getPointerBitCastOrAddrSpaceCast(Constant *C, Type *Ty)
Create a BitCast or AddrSpaceCast for a pointer type depending on the address space.
This is the shared class of boolean and integer constants.
IntegerType * getIntegerType() const
Variant of the getType() method to always return an IntegerType, which reduces the amount of casting ...
static ConstantInt * getTrue(LLVMContext &Context)
bool isZero() const
This is just a convenience method to make client code smaller for a common code.
int64_t getSExtValue() const
Return the constant as a 64-bit integer value after it has been sign extended as appropriate for the ...
This is an important base class in LLVM.
static Constant * getNullValue(Type *Ty)
Constructor to create a '0' constant of arbitrary type.
This class represents an Operation in the Expression.
uint64_t getNumOperands() const
A parsed version of the target data layout string in and methods for querying it.
ValueT lookup(const_arg_type_t< KeyT > Val) const
lookup - Return the entry for the specified key, or a default constructed value if no such entry exis...
std::pair< iterator, bool > insert(const std::pair< KeyT, ValueT > &KV)
Implements a dense probed hash-table based set.
Analysis pass which computes a DominatorTree.
Concrete subclass of DominatorTreeBase that is used to compute a normal dominator tree.
Instruction * findNearestCommonDominator(Instruction *I1, Instruction *I2) const
Find the nearest instruction I that dominates both I1 and I2, in the sense that a result produced bef...
An instruction for ordering other memory operations.
AtomicOrdering getOrdering() const
Returns the ordering constraint of this fence instruction.
A proxy from a FunctionAnalysisManager to an SCC.
A handy container for a FunctionType+Callee-pointer pair, which can be passed around as a single enti...
const BasicBlock & getEntryBlock() const
const BasicBlock & front() const
LLVMContext & getContext() const
getContext - Return a reference to the LLVMContext associated with this function.
Argument * getArg(unsigned i) const
bool hasFnAttribute(Attribute::AttrKind Kind) const
Return true if the function has the attribute.
bool isDeclaration() const
Return true if the primary definition of this global value is outside of the current translation unit...
bool hasLocalLinkage() const
Module * getParent()
Get the module that this global value is contained inside of...
PointerType * getType() const
Global values are always pointers.
@ PrivateLinkage
Like Internal, but omit from symbol table.
@ InternalLinkage
Rename collisions when linking (static functions).
const Constant * getInitializer() const
getInitializer - Return the initializer for this global variable.
void setInitializer(Constant *InitVal)
setInitializer - Sets the initializer for this global variable, removing any existing initializer if ...
void eraseFromParent()
eraseFromParent - This method unlinks 'this' from the containing module and deletes it.
InsertPoint - A saved insertion point.
AllocaInst * CreateAlloca(Type *Ty, unsigned AddrSpace, Value *ArraySize=nullptr, const Twine &Name="")
void SetInsertPoint(BasicBlock *TheBB)
This specifies that created instructions should be appended to the end of the specified block.
Value * CreateAddrSpaceCast(Value *V, Type *DestTy, const Twine &Name="")
This provides a uniform API for creating instructions and inserting them into a basic block: either a...
An analysis over an "outer" IR unit that provides access to an analysis manager over an "inner" IR un...
bool isLifetimeStartOrEnd() const LLVM_READONLY
Return true if the instruction is a llvm.lifetime.start or llvm.lifetime.end marker.
bool mayWriteToMemory() const LLVM_READONLY
Return true if this instruction may modify memory.
const DebugLoc & getDebugLoc() const
Return the debug location for this node as a DebugLoc.
const Module * getModule() const
Return the module owning the function this instruction belongs to or nullptr it the function does not...
const Instruction * getPrevNonDebugInstruction(bool SkipPseudoOp=false) const
Return a pointer to the previous non-debug instruction in the same basic block as 'this',...
InstListType::iterator eraseFromParent()
This method unlinks 'this' from the containing basic block and deletes it.
const Function * getFunction() const
Return the function this instruction belongs to.
bool mayHaveSideEffects() const LLVM_READONLY
Return true if the instruction may have side effects.
bool mayReadFromMemory() const LLVM_READONLY
Return true if this instruction may read memory.
const Instruction * getNextNonDebugInstruction(bool SkipPseudoOp=false) const
Return a pointer to the next non-debug instruction in the same basic block as 'this',...
void setDebugLoc(DebugLoc Loc)
Set the debug location information for this instruction.
void setSuccessor(unsigned Idx, BasicBlock *BB)
Update the specified successor to point at the provided block.
void moveBefore(Instruction *MovePos)
Unlink this instruction from its current basic block and insert it into the basic block that MovePos ...
A node in the call graph.
An SCC of the call graph.
A lazily constructed view of the call graph of a module.
An instruction for reading from memory.
StringRef getString() const
void eraseFromParent()
This method unlinks 'this' from the containing function and deletes it.
A Module instance is used to store all the information related to an LLVM module.
const std::string & getTargetTriple() const
Get the target triple which is a string describing the target host.
MutableArrayRef - Represent a mutable reference to an array (0 or more elements consecutively in memo...
iterator_range< op_iterator > operands()
An interface to create LLVM-IR for OpenMP directives.
static std::pair< int32_t, int32_t > readThreadBoundsForKernel(const Triple &T, Function &Kernel)
}
void addAttributes(omp::RuntimeFunction FnID, Function &Fn)
Add attributes known for FnID to Fn.
IRBuilder<>::InsertPoint InsertPointTy
Type used throughout for insertion points.
static std::pair< int32_t, int32_t > readTeamBoundsForKernel(const Triple &T, Function &Kernel)
Read/write a bounds on teams for Kernel.
PreservedAnalyses run(LazyCallGraph::SCC &C, CGSCCAnalysisManager &AM, LazyCallGraph &CG, CGSCCUpdateResult &UR)
PreservedAnalyses run(Module &M, ModuleAnalysisManager &AM)
static PoisonValue * get(Type *T)
Static factory methods - Return an 'poison' object of the specified type.
A set of analyses that are preserved following a run of a transformation pass.
static PreservedAnalyses none()
Convenience factory function for the empty preserved set.
static PreservedAnalyses all()
Construct a special preserved set that preserves all passes.
static ReturnInst * Create(LLVMContext &C, Value *retVal=nullptr, InsertPosition InsertBefore=nullptr)
A vector that has set insertion semantics.
size_type size() const
Determine the number of elements in the SetVector.
const value_type & back() const
Return the last element of the SetVector.
size_type count(const key_type &key) const
Count the number of elements of a given key in the SetVector.
bool insert(const value_type &X)
Insert a new element into the SetVector.
A templated base class for SmallPtrSet which provides the typesafe interface that is common across al...
size_type count(ConstPtrType Ptr) const
count - Return 1 if the specified pointer is in the set, 0 otherwise.
std::pair< iterator, bool > insert(PtrType Ptr)
Inserts Ptr if and only if there is no element in the container equal to Ptr.
SmallPtrSet - This class implements a set which is optimized for holding SmallSize or less elements.
A SetVector that performs no allocations if smaller than a certain size.
This class consists of common code factored out of the SmallVector class to reduce code duplication b...
void assign(size_type NumElts, ValueParamT Elt)
reference emplace_back(ArgTypes &&... Args)
void append(ItTy in_start, ItTy in_end)
Add the specified range to the end of the SmallVector.
void push_back(const T &Elt)
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small.
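A small sketch, not taken from this pass, of how the SetVector, SmallPtrSet, and SmallVector operations listed above commonly combine in a worklist:
#include "llvm/ADT/SetVector.h"
#include "llvm/ADT/SmallPtrSet.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/IR/Function.h"

void collect(llvm::Function *F, llvm::SetVector<llvm::Function *> &Worklist,
             llvm::SmallPtrSetImpl<llvm::Function *> &Visited,
             llvm::SmallVectorImpl<llvm::Function *> &Order) {
  // SetVector::insert returns true only for new elements, so the worklist
  // stays duplicate-free while preserving insertion order.
  if (Worklist.insert(F))
    Order.push_back(Worklist.back());
  // SmallPtrSet::insert returns {iterator, bool}; the bool flags new entries.
  if (Visited.insert(F).second) {
    // first time this function is seen
  }
}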
An instruction for storing to memory.
StringRef - Represent a constant reference to a string, i.e.
bool starts_with(StringRef Prefix) const
Check if this string starts with the given Prefix.
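For example (a minimal sketch; the "__kmpc_" prefix is the usual OpenMP RTL naming convention, used here purely for illustration):
#include "llvm/ADT/StringRef.h"

bool looksLikeOmpRuntimeCall(llvm::StringRef Name) {
  // starts_with performs a plain prefix comparison, no allocation involved.
  return Name.starts_with("__kmpc_");
}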
Triple - Helper class for working with autoconf configuration names.
The instances of the Type class are immutable: once they are created, they are never changed.
PointerType * getPointerTo(unsigned AddrSpace=0) const
Return a pointer to the current type.
unsigned getPointerAddressSpace() const
Get the address space of this pointer or pointer vector type.
static Type * getVoidTy(LLVMContext &C)
static IntegerType * getInt16Ty(LLVMContext &C)
static IntegerType * getInt8Ty(LLVMContext &C)
static IntegerType * getInt32Ty(LLVMContext &C)
static UndefValue * get(Type *T)
Static factory methods - Return an 'undef' object of the specified type.
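A brief sketch of the static Type getters together with the UndefValue and PoisonValue factories listed above:
#include "llvm/IR/Constants.h"
#include "llvm/IR/LLVMContext.h"
#include "llvm/IR/Type.h"

llvm::Constant *makePlaceholder(llvm::LLVMContext &Ctx) {
  llvm::Type *I32 = llvm::Type::getInt32Ty(Ctx);
  (void)llvm::UndefValue::get(I32);   // an unspecified value of i32 type
  return llvm::PoisonValue::get(I32); // poison propagates through most uses
}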
A Use represents the edge between a Value definition and its users.
bool replaceUsesOfWith(Value *From, Value *To)
Replace uses of one Value with another.
LLVM Value Representation.
Type * getType() const
All values are typed, get the type of this value.
void setName(const Twine &Name)
Change the name of the value.
bool hasOneUse() const
Return true if there is exactly one use of this value.
void replaceAllUsesWith(Value *V)
Change all uses of this to point to a new Value.
iterator_range< user_iterator > users()
const Value * stripPointerCasts() const
Strip off pointer casts, all-zero GEPs and address space casts.
unsigned getNumUses() const
This method computes the number of uses of this Value.
StringRef getName() const
Return a constant reference to the value's name.
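As a sketch of the use-related Value API above (not code from this file):
#include "llvm/IR/Value.h"

void replaceEverywhere(llvm::Value &Old, llvm::Value &New) {
  // replaceAllUsesWith requires the two values to have the same type.
  if (&Old != &New && Old.getType() == New.getType() && Old.getNumUses() > 0)
    Old.replaceAllUsesWith(&New);
}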
An efficient, type-erasing, non-owning reference to a callable.
const ParentTy * getParent() const
self_iterator getIterator()
NodeTy * getNextNode()
Get the next node, or nullptr for the list tail.
A raw_ostream that writes to an std::string.
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
GlobalVariable * getKernelEnvironementGVFromKernelInitCB(CallBase *KernelInitCB)
ConstantStruct * getKernelEnvironementFromKernelInitCB(CallBase *KernelInitCB)
bool isValidAtPosition(const ValueAndContext &VAC, InformationCache &InfoCache)
Return true if the value of VAC is valid at the position of VAC, that is, a constant,...
bool isPotentiallyAffectedByBarrier(Attributor &A, const Instruction &I, const AbstractAttribute &QueryingAA)
Return true if I is potentially affected by a barrier.
bool isNoSyncInst(Attributor &A, const Instruction &I, const AbstractAttribute &QueryingAA)
Return true if I is a nosync instruction.
constexpr char Args[]
Key for Kernel::Metadata::mArgs.
E & operator^=(E &LHS, E RHS)
@ C
The default llvm calling convention, compatible with C.
initializer< Ty > init(const Ty &Val)
std::optional< const char * > toString(const std::optional< DWARFFormValue > &V)
Take an optional DWARFFormValue and try to extract a string value from it.
PointerTypeMap run(const Module &M)
Compute the PointerTypeMap for the module M.
constexpr uint64_t PointerSize
aarch64 pointer size.
bool isOpenMPDevice(Module &M)
Helper to determine if M is an OpenMP target offloading device module.
bool containsOpenMP(Module &M)
Helper to determine if M contains OpenMP.
InternalControlVar
IDs for all Internal Control Variables (ICVs).
RuntimeFunction
IDs for all omp runtime library (RTL) functions.
KernelSet getDeviceKernels(Module &M)
Get OpenMP device kernels in M.
Function * Kernel
Summary of a kernel (=entry point for target offloading).
@ OMP_TGT_EXEC_MODE_GENERIC_SPMD
@ OMP_TGT_EXEC_MODE_GENERIC
bool isOpenMPKernel(Function &Fn)
Return true iff Fn is an OpenMP GPU kernel; Fn has the "kernel" attribute.
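A hedged sketch of how the omp:: helpers above fit together; the header path is assumed to be llvm/Transforms/IPO/OpenMPOpt.h, where these declarations live in recent LLVM trees.
#include "llvm/IR/Module.h"
#include "llvm/Transforms/IPO/OpenMPOpt.h"

void forEachDeviceKernel(llvm::Module &M) {
  using namespace llvm;
  if (!omp::containsOpenMP(M) || !omp::isOpenMPDevice(M))
    return; // host module, or no OpenMP metadata at all
  for (Function *Kernel : omp::getDeviceKernels(M)) {
    if (omp::isOpenMPKernel(*Kernel)) {
      // Kernel is a target region entry point; inspect or annotate it here.
    }
  }
}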
DiagnosticInfoOptimizationBase::Argument NV
const_iterator begin(StringRef path, Style style=Style::native)
Get begin iterator over path.
const_iterator end(StringRef path)
Get end iterator over path.
This is an optimization pass for GlobalISel generic memory operations.
auto drop_begin(T &&RangeOrContainer, size_t N=1)
Return a range covering RangeOrContainer with the first N elements excluded.
bool all_of(R &&range, UnaryPredicate P)
Provide wrappers to std::all_of which take ranges instead of having to pass begin/end explicitly.
auto size(R &&Range, std::enable_if_t< std::is_base_of< std::random_access_iterator_tag, typename std::iterator_traits< decltype(Range.begin())>::iterator_category >::value, void > *=nullptr)
Get the size of a range.
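For instance, a self-contained sketch of the range helpers above:
#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/SmallVector.h"

bool restArePositive(const llvm::SmallVectorImpl<int> &Vals) {
  // drop_begin skips the first element; all_of takes the range directly.
  return llvm::all_of(llvm::drop_begin(Vals), [](int V) { return V > 0; });
}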
bool succ_empty(const Instruction *I)
std::string to_string(const T &Value)
bool operator!=(uint64_t V1, const APInt &V2)
Value * GetPointerBaseWithConstantOffset(Value *Ptr, int64_t &Offset, const DataLayout &DL, bool AllowNonInbounds=true)
Analyze the specified pointer to see if it can be expressed as a base pointer plus a constant offset.
const Value * getUnderlyingObject(const Value *V, unsigned MaxLookup=6)
This method strips off any GEP address adjustments, pointer casts or llvm.threadlocal....
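A minimal sketch of the second helper, getUnderlyingObject, which this kind of analysis typically uses to find the allocation behind a pointer:
#include "llvm/Analysis/ValueTracking.h"

const llvm::Value *allocationBehind(const llvm::Value *Ptr) {
  // Walks through GEPs, casts, and similar adjustments up to a default depth.
  return llvm::getUnderlyingObject(Ptr);
}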
bool operator==(const AddressRangeValuePair &LHS, const AddressRangeValuePair &RHS)
@ FullLTOPostLink
Full LTO postlink (backend compile) phase.
@ ThinLTOPreLink
ThinLTO prelink (summary) phase.
raw_ostream & dbgs()
dbgs() - This returns a reference to a raw_ostream for debugging messages.
bool operator&=(SparseBitVector< ElementSize > *LHS, const SparseBitVector< ElementSize > &RHS)
auto count(R &&Range, const E &Element)
Wrapper function around std::count to count the number of times an element Element occurs in the give...
BasicBlock * SplitBlock(BasicBlock *Old, BasicBlock::iterator SplitPt, DominatorTree *DT, LoopInfo *LI=nullptr, MemorySSAUpdater *MSSAU=nullptr, const Twine &BBName="", bool Before=false)
Split the specified block at the specified instruction.
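Sketch of splitting a block immediately before an instruction, using the iterator-based overload above with the analysis updaters left at their defaults:
#include "llvm/IR/BasicBlock.h"
#include "llvm/IR/Instruction.h"
#include "llvm/Transforms/Utils/BasicBlockUtils.h"

llvm::BasicBlock *splitBefore(llvm::Instruction &I, llvm::DominatorTree *DT) {
  // Everything from I onward moves into the newly created successor block.
  return llvm::SplitBlock(I.getParent(), I.getIterator(), DT);
}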
auto predecessors(const MachineBasicBlock *BB)
Constant * ConstantFoldInsertValueInstruction(Constant *Agg, Constant *Val, ArrayRef< unsigned > Idxs)
ConstantFoldInsertValueInstruction - Attempt to constant fold an insertvalue instruction with the spe...
Implement std::hash so that hash_code can be used in STL containers.
An abstract interface for address space information.
An abstract attribute for getting assumption information.
An abstract state for querying live call edges.
virtual const SetVector< Function * > & getOptimisticEdges() const =0
Get the optimistic edges.
bool IsReachedFromAlignedBarrierOnly
bool isExecutedByInitialThreadOnly(const Instruction &I) const
Check if an instruction is executed only by the initial thread.
static AAExecutionDomain & createForPosition(const IRPosition &IRP, Attributor &A)
Create an abstract attribute view for the position IRP.
virtual ExecutionDomainTy getFunctionExecutionDomain() const =0
virtual ExecutionDomainTy getExecutionDomain(const BasicBlock &) const =0
virtual bool isExecutedInAlignedRegion(Attributor &A, const Instruction &I) const =0
Check if the instruction I is executed in an aligned region, that is, the synchronizing effects befor...
virtual bool isNoOpFence(const FenceInst &FI) const =0
Helper function to determine if FI is a no-op given the information about its execution from ExecDoma...
static const char ID
Unique ID (due to the unique address)
An abstract interface for indirect call information interference.
An abstract interface for liveness abstract attribute.
An abstract interface for all memory location attributes (readnone/argmemonly/inaccessiblememonly/ina...
AccessKind
Simple enum to distinguish read/write/read-write accesses.
StateType::base_t MemoryLocationsKind
static bool isAlignedBarrier(const CallBase &CB, bool ExecutedAligned)
Helper function to determine if CB is an aligned (GPU) barrier.
An abstract Attribute for determining the necessity of the convergent attribute.
An abstract attribute for getting all assumption underlying objects.
Base struct for all "concrete attribute" deductions.
virtual ChangeStatus manifest(Attributor &A)
Hook for the Attributor to trigger the manifestation of the information represented by the abstract a...
virtual void initialize(Attributor &A)
Initialize the state with the information in the Attributor A.
virtual const std::string getAsStr(Attributor *A) const =0
This function should return the "summarized" assumed state as string.
virtual ChangeStatus updateImpl(Attributor &A)=0
The actual update/transfer function which has to be implemented by the derived classes.
virtual void trackStatistics() const =0
Hook to enable custom statistic tracking, called after manifest that resulted in a change if statisti...
virtual const char * getIdAddr() const =0
This function should return the address of the ID of the AbstractAttribute.
An interface to query the internal state of an abstract attribute.
virtual ChangeStatus indicatePessimisticFixpoint()=0
Indicate that the abstract state should converge to the pessimistic state.
virtual bool isAtFixpoint() const =0
Return if this abstract state is fixed, thus does not need to be updated if information changes as it...
virtual bool isValidState() const =0
Return if this abstract state is in a valid state.
virtual ChangeStatus indicateOptimisticFixpoint()=0
Indicate that the abstract state should converge to the optimistic state.
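To make the lifecycle hooks above concrete, here is a deliberately rough sketch of a minimal "concrete attribute". It is assumption-heavy: AAMyFact is invented, and a real attribute additionally needs a createForPosition factory and an out-of-line definition of ID; it is not code from this pass.
#include "llvm/Transforms/IPO/Attributor.h"
using namespace llvm;

struct AAMyFact : public StateWrapper<BooleanState, AbstractAttribute> {
  using Base = StateWrapper<BooleanState, AbstractAttribute>;
  AAMyFact(const IRPosition &IRP, Attributor &A) : Base(IRP) {}

  void initialize(Attributor &A) override { /* seed the state from the IR */ }
  ChangeStatus updateImpl(Attributor &A) override {
    // Transfer function: query other AAs; fall back to the pessimistic state
    // (indicatePessimisticFixpoint) when the optimistic assumption breaks.
    return ChangeStatus::UNCHANGED;
  }
  ChangeStatus manifest(Attributor &A) override {
    // Write the deduced fact back into the IR once the fixpoint is reached.
    return ChangeStatus::UNCHANGED;
  }
  const std::string getAsStr(Attributor *A) const override { return "my-fact"; }
  void trackStatistics() const override {}
  const char *getIdAddr() const override { return &ID; }
  static const char ID;
};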
Wrapper for FunctionAnalysisManager.
Configuration for the Attributor.
std::function< void(Attributor &A, const Function &F)> InitializationCallback
Callback function to be invoked on internal functions marked live.
std::optional< unsigned > MaxFixpointIterations
Maximum number of iterations to run until fixpoint.
bool RewriteSignatures
Flag to determine if we rewrite function signatures.
OptimizationRemarkGetter OREGetter
IPOAmendableCBTy IPOAmendableCB
bool IsModulePass
Is the user of the Attributor a module pass or not.
bool DefaultInitializeLiveInternals
Flag to determine if we want to initialize all default AAs for an internal function marked live.
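A short, hedged sketch of setting the configuration fields above; it assumes a CallGraphUpdater is available for the AttributorConfig constructor, which is how the struct is typically built.
#include "llvm/Transforms/IPO/Attributor.h"

llvm::AttributorConfig makeConfig(llvm::CallGraphUpdater &CGUpdater) {
  llvm::AttributorConfig AC(CGUpdater);
  AC.IsModulePass = true;        // driven from a module pass
  AC.RewriteSignatures = false;  // keep function signatures intact
  AC.DefaultInitializeLiveInternals = true;
  AC.MaxFixpointIterations = 32; // std::optional<unsigned>
  return AC;
}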
The fixpoint analysis framework that orchestrates the attribute deduction.
static bool isInternalizable(Function &F)
Returns true if the function F can be internalized.
std::function< std::optional< Constant * >(const GlobalVariable &, const AbstractAttribute *, bool &)> GlobalVariableSimplifictionCallbackTy
Register CB as a simplification callback.
std::function< bool(Attributor &, const AbstractAttribute *)> VirtualUseCallbackTy
static bool internalizeFunctions(SmallPtrSetImpl< Function * > &FnSet, DenseMap< Function *, Function * > &FnMap)
Make copies of each function in the set FnSet such that the copied version has internal linkage after...
std::function< std::optional< Value * >(const IRPosition &, const AbstractAttribute *, bool &)> SimplifictionCallbackTy
Register CB as a simplification callback.
Simple wrapper for a single bit (boolean) state.
Support structure for SCC passes to communicate updates to the call graph back to the CGSCC pass manager...
Helper to describe and deal with positions in the LLVM-IR.
static const IRPosition callsite_returned(const CallBase &CB)
Create a position describing the returned value of CB.
static const IRPosition returned(const Function &F, const CallBaseContext *CBContext=nullptr)
Create a position describing the returned value of F.
static const IRPosition value(const Value &V, const CallBaseContext *CBContext=nullptr)
Create a position describing the value of V.
static const IRPosition inst(const Instruction &I, const CallBaseContext *CBContext=nullptr)
Create a position describing the instruction I.
@ IRP_ARGUMENT
An attribute for a function argument.
@ IRP_RETURNED
An attribute for the function return value.
@ IRP_CALL_SITE
An attribute for a call site (function scope).
@ IRP_CALL_SITE_RETURNED
An attribute for a call site return value.
@ IRP_FUNCTION
An attribute for a function (scope).
@ IRP_FLOAT
A position that is not associated with a spot suitable for attributes.
@ IRP_CALL_SITE_ARGUMENT
An attribute for a call site argument.
@ IRP_INVALID
An invalid position.
static const IRPosition function(const Function &F, const CallBaseContext *CBContext=nullptr)
Create a position describing the function scope of F.
Kind getPositionKind() const
Return the associated position kind.
static const IRPosition callsite_function(const CallBase &CB)
Create a position describing the function scope of CB.
Function * getAnchorScope() const
Return the Function surrounding the anchor value.
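As a small sketch of the IRPosition factories above (not from this file), positions for a callee and one of its call sites can be created like this:
#include "llvm/IR/InstrTypes.h"
#include "llvm/Transforms/IPO/Attributor.h"

void describe(llvm::Function &Callee, llvm::CallBase &CB) {
  using llvm::IRPosition;
  IRPosition FnPos = IRPosition::function(Callee);         // IRP_FUNCTION
  IRPosition RetPos = IRPosition::returned(Callee);        // IRP_RETURNED
  IRPosition CSPos = IRPosition::callsite_function(CB);    // IRP_CALL_SITE
  IRPosition CSRetPos = IRPosition::callsite_returned(CB); // IRP_CALL_SITE_RETURNED
  // getAnchorScope() recovers the surrounding Function, e.g. from FnPos.
  (void)FnPos.getAnchorScope();
  (void)RetPos; (void)CSPos; (void)CSRetPos;
}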
bool isValidState() const override
See AbstractState::isValidState() NOTE: For now we simply pretend that the worst possible state is in...
ChangeStatus indicatePessimisticFixpoint() override
See AbstractState::indicatePessimisticFixpoint(...)
Direction
An enum for the direction of the loop.
This struct is a compact representation of a valid (power of two) or undefined (0) alignment.
Description of an LLVM-IR insertion point (IP) and a debug/source location (filename,...
Helper to tie an abstract state implementation to an abstract attribute.
StateType & getState() override
See AbstractAttribute::getState(...).
Defines various target-specific GPU grid values that must be consistent between host RTL (plugin),...