#include "llvm/IR/IntrinsicsAMDGPU.h"
#include "llvm/IR/IntrinsicsNVPTX.h"

#define DEBUG_TYPE "openmp-opt"
69 "openmp-opt-disable",
cl::desc(
"Disable OpenMP specific optimizations."),
73 "openmp-opt-enable-merging",
79 cl::desc(
"Disable function internalization."),
90 "openmp-hide-memory-transfer-latency",
91 cl::desc(
"[WIP] Tries to hide the latency of host to device memory"
96 "openmp-opt-disable-deglobalization",
97 cl::desc(
"Disable OpenMP optimizations involving deglobalization."),
101 "openmp-opt-disable-spmdization",
102 cl::desc(
"Disable OpenMP optimizations involving SPMD-ization."),
106 "openmp-opt-disable-folding",
111 "openmp-opt-disable-state-machine-rewrite",
112 cl::desc(
"Disable OpenMP optimizations that replace the state machine."),
116 "openmp-opt-disable-barrier-elimination",
117 cl::desc(
"Disable OpenMP optimizations that eliminate barriers."),
121 "openmp-opt-print-module-after",
122 cl::desc(
"Print the current module after OpenMP optimizations."),
126 "openmp-opt-print-module-before",
127 cl::desc(
"Print the current module before OpenMP optimizations."),
131 "openmp-opt-inline-device",
142 cl::desc(
"Maximal number of attributor iterations."),
147 cl::desc(
"Maximum amount of shared memory to use."),
148 cl::init(std::numeric_limits<unsigned>::max()));
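// The option fragments above belong to static cl::opt declarations. As an
// illustrative sketch (not part of the original file), a full declaration of
// this kind typically looks as follows; the variable name DisableOpenMPOptX is
// a placeholder, only the flag string and description are taken from above.
#if 0
static cl::opt<bool> DisableOpenMPOptX(
    "openmp-opt-disable", cl::desc("Disable OpenMP specific optimizations."),
    cl::Hidden, cl::init(false));
#endif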
151 "Number of OpenMP runtime calls deduplicated");
153 "Number of OpenMP parallel regions deleted");
155 "Number of OpenMP runtime functions identified");
157 "Number of OpenMP runtime function uses identified");
159 "Number of OpenMP target region entry points (=kernels) identified");
161 "Number of non-OpenMP target region kernels identified");
163 "Number of OpenMP target region entry points (=kernels) executed in "
164 "SPMD-mode instead of generic-mode");
165STATISTIC(NumOpenMPTargetRegionKernelsWithoutStateMachine,
166 "Number of OpenMP target region entry points (=kernels) executed in "
167 "generic-mode without a state machines");
168STATISTIC(NumOpenMPTargetRegionKernelsCustomStateMachineWithFallback,
169 "Number of OpenMP target region entry points (=kernels) executed in "
170 "generic-mode with customized state machines with fallback");
171STATISTIC(NumOpenMPTargetRegionKernelsCustomStateMachineWithoutFallback,
172 "Number of OpenMP target region entry points (=kernels) executed in "
173 "generic-mode with customized state machines without fallback");
175 NumOpenMPParallelRegionsReplacedInGPUStateMachine,
176 "Number of OpenMP parallel regions replaced with ID in GPU state machines");
178 "Number of OpenMP parallel regions merged");
180 "Amount of memory pushed to shared memory");
181STATISTIC(NumBarriersEliminated,
"Number of redundant barriers eliminated");
#define KERNEL_ENVIRONMENT_IDX(MEMBER, IDX)                                    \
  constexpr const unsigned MEMBER##Idx = IDX;

#undef KERNEL_ENVIRONMENT_IDX

#define KERNEL_ENVIRONMENT_CONFIGURATION_IDX(MEMBER, IDX)                      \
  constexpr const unsigned MEMBER##Idx = IDX;

#undef KERNEL_ENVIRONMENT_CONFIGURATION_IDX

#define KERNEL_ENVIRONMENT_GETTER(MEMBER, RETURNTYPE)                          \
  RETURNTYPE *get##MEMBER##FromKernelEnvironment(ConstantStruct *KernelEnvC) { \
    return cast<RETURNTYPE>(KernelEnvC->getAggregateElement(MEMBER##Idx));     \
  }

#undef KERNEL_ENVIRONMENT_GETTER

#define KERNEL_ENVIRONMENT_CONFIGURATION_GETTER(MEMBER)                        \
  ConstantInt *get##MEMBER##FromKernelEnvironment(                             \
      ConstantStruct *KernelEnvC) {                                            \
    ConstantStruct *ConfigC =                                                  \
        getConfigurationFromKernelEnvironment(KernelEnvC);                     \
    return dyn_cast<ConstantInt>(ConfigC->getAggregateElement(MEMBER##Idx));   \
  }

#undef KERNEL_ENVIRONMENT_CONFIGURATION_GETTER
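// The macros above stamp out index constants and typed accessors for the
// members of the per-kernel environment/configuration structs. Sketch of what
// one instantiation expands to (the member name ExecMode is taken from its
// uses further below; the invocation itself is reconstructed, not quoted):
#if 0
KERNEL_ENVIRONMENT_CONFIGURATION_GETTER(ExecMode)
// expands to roughly:
ConstantInt *getExecModeFromKernelEnvironment(ConstantStruct *KernelEnvC) {
  ConstantStruct *ConfigC = getConfigurationFromKernelEnvironment(KernelEnvC);
  return dyn_cast<ConstantInt>(ConfigC->getAggregateElement(ExecModeIdx));
}
#endif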
  constexpr const int InitKernelEnvironmentArgNo = 0;
  return cast<GlobalVariable>(
struct AAHeapToShared;

        OpenMPPostLink(OpenMPPostLink) {
    const Triple T(OMPBuilder.M.getTargetTriple());
    switch (T.getArch()) {
      assert(OMPBuilder.Config.IsTargetDevice &&
             "OpenMP AMDGPU/NVPTX is only prepared to deal with device code.");
      OMPBuilder.Config.IsGPU = true;
      OMPBuilder.Config.IsGPU = false;
    OMPBuilder.initialize();
    initializeRuntimeFunctions(M);
    initializeInternalControlVars();
  struct InternalControlVarInfo {

  struct RuntimeFunctionInfo {

    void clearUsesMap() { UsesMap.clear(); }

    operator bool() const { return Declaration; }

    UseVector &getOrCreateUseVector(Function *F) {
      std::shared_ptr<UseVector> &UV = UsesMap[F];
        UV = std::make_shared<UseVector>();

    const UseVector *getUseVector(Function &F) const {
      auto I = UsesMap.find(&F);
      if (I != UsesMap.end())
        return I->second.get();

    size_t getNumFunctionsWithUses() const { return UsesMap.size(); }

    size_t getNumArgs() const { return ArgumentTypes.size(); }

      UseVector &UV = getOrCreateUseVector(F);

      while (!ToBeDeleted.empty()) {

    decltype(UsesMap)::iterator begin() { return UsesMap.begin(); }
    decltype(UsesMap)::iterator end() { return UsesMap.end(); }
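    // RuntimeFunctionInfo collects, per function, every Use of one OpenMP
    // runtime function. Callers typically iterate those uses through
    // foreachUse with a (Use &, Function &) callback, as the passes further
    // below do. Minimal usage sketch (hypothetical callback, not part of the
    // original; the meaning of the return value is an assumption about the
    // callback contract):
#if 0
    RFI.foreachUse(SCC, [&](Use &U, Function &Caller) {
      if (CallInst *CI = OpenMPOpt::getCallIfRegularCall(U, &RFI))
        dbgs() << "call in " << Caller.getName() << ": " << *CI << "\n";
      return false; // assumed: false leaves the use in place
    });
#endif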
                  RuntimeFunction::OMPRTL___last>
                  InternalControlVar::ICV___last>
  void initializeInternalControlVars() {
#define ICV_RT_SET(_Name, RTL)                                                 \
    auto &ICV = ICVs[_Name];                                                   \
#define ICV_RT_GET(Name, RTL)                                                  \
    auto &ICV = ICVs[Name];                                                    \
#define ICV_DATA_ENV(Enum, _Name, _EnvVarName, Init)                           \
    auto &ICV = ICVs[Enum];                                                    \
    ICV.InitKind = Init;                                                       \
    ICV.EnvVarName = _EnvVarName;                                              \
    switch (ICV.InitKind) {                                                    \
    case ICV_IMPLEMENTATION_DEFINED:                                           \
      ICV.InitValue = nullptr;                                                 \
      ICV.InitValue = ConstantInt::get(                                        \
          Type::getInt32Ty(OMPBuilder.Int32->getContext()), 0);                \
      ICV.InitValue = ConstantInt::getFalse(OMPBuilder.Int1->getContext());    \
#include "llvm/Frontend/OpenMP/OMPKinds.def"
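// OMPKinds.def instantiates the ICV_* macros above once per internal control
// variable, filling the ICVs table. A sketch of one hypothetical entry and its
// effect (the concrete enum/name/env-var values are placeholders, not quoted
// from OMPKinds.def):
#if 0
ICV_DATA_ENV(ICV_nthreads, "nthreads", "OMP_NUM_THREADS",
             ICV_IMPLEMENTATION_DEFINED)
// expands (per the macro body above) into code that sets
//   ICVs[ICV_nthreads].InitKind   = ICV_IMPLEMENTATION_DEFINED
//   ICVs[ICV_nthreads].EnvVarName = "OMP_NUM_THREADS"
//   ICVs[ICV_nthreads].InitValue  = nullptr
#endif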
  static bool declMatchesRTFTypes(Function *F, Type *RTFRetType,
    if (F->getReturnType() != RTFRetType)
    if (F->arg_size() != RTFArgTypes.size())
    auto *RTFTyIt = RTFArgTypes.begin();
      if (Arg.getType() != *RTFTyIt)

  unsigned collectUses(RuntimeFunctionInfo &RFI, bool CollectStats = true) {
    unsigned NumUses = 0;
    if (!RFI.Declaration)
      NumOpenMPRuntimeFunctionsIdentified += 1;
      NumOpenMPRuntimeFunctionUsesIdentified += RFI.Declaration->getNumUses();
    for (Use &U : RFI.Declaration->uses()) {
      if (Instruction *UserI = dyn_cast<Instruction>(U.getUser())) {
        if (!CGSCC || CGSCC->empty() || CGSCC->contains(UserI->getFunction())) {
          RFI.getOrCreateUseVector(UserI->getFunction()).push_back(&U);
        RFI.getOrCreateUseVector(nullptr).push_back(&U);

      auto &RFI = RFIs[RTF];
      collectUses(RFI, false);
  void recollectUses() {
    for (int Idx = 0; Idx < RFIs.size(); ++Idx)

    RuntimeFunctionInfo &RFI = RFIs[Fn];
    if (RFI.Declaration && RFI.Declaration->isDeclaration())

  void initializeRuntimeFunctions(Module &M) {

#define OMP_TYPE(VarName, ...)                                                 \
  Type *VarName = OMPBuilder.VarName;                                          \

#define OMP_ARRAY_TYPE(VarName, ...)                                           \
  ArrayType *VarName##Ty = OMPBuilder.VarName##Ty;                             \
  PointerType *VarName##PtrTy = OMPBuilder.VarName##PtrTy;                     \
  (void)VarName##PtrTy;

#define OMP_FUNCTION_TYPE(VarName, ...)                                        \
  FunctionType *VarName = OMPBuilder.VarName;                                  \
  PointerType *VarName##Ptr = OMPBuilder.VarName##Ptr;                         \

#define OMP_STRUCT_TYPE(VarName, ...)                                          \
  StructType *VarName = OMPBuilder.VarName;                                    \
  PointerType *VarName##Ptr = OMPBuilder.VarName##Ptr;                         \

#define OMP_RTL(_Enum, _Name, _IsVarArg, _ReturnType, ...)                     \
    SmallVector<Type *, 8> ArgsTypes({__VA_ARGS__});                           \
    Function *F = M.getFunction(_Name);                                        \
    RTLFunctions.insert(F);                                                    \
    if (declMatchesRTFTypes(F, OMPBuilder._ReturnType, ArgsTypes)) {           \
      RuntimeFunctionIDMap[F] = _Enum;                                         \
      auto &RFI = RFIs[_Enum];                                                 \
      RFI.IsVarArg = _IsVarArg;                                                \
      RFI.ReturnType = OMPBuilder._ReturnType;                                 \
      RFI.ArgumentTypes = std::move(ArgsTypes);                                \
      RFI.Declaration = F;                                                     \
      unsigned NumUses = collectUses(RFI);                                     \
      dbgs() << TAG << RFI.Name << (RFI.Declaration ? "" : " not")             \
      if (RFI.Declaration)                                                     \
        dbgs() << TAG << "-> got " << NumUses << " uses in "                   \
               << RFI.getNumFunctionsWithUses()                                \
               << " different functions.\n";                                   \
#include "llvm/Frontend/OpenMP/OMPKinds.def"

    for (StringRef Prefix : {"__kmpc", "_ZN4ompx", "omp_"})
      if (F.hasFnAttribute(Attribute::NoInline) &&
          F.getName().starts_with(Prefix) &&
          !F.hasFnAttribute(Attribute::OptimizeNone))
        F.removeFnAttr(Attribute::NoInline);
  bool OpenMPPostLink = false;
template <typename Ty, bool InsertInvalidates = true>
  bool contains(const Ty &Elem) const { return Set.contains(Elem); }
  bool insert(const Ty &Elem) {
    if (InsertInvalidates)
    return Set.insert(Elem);

  const Ty &operator[](int Idx) const { return Set[Idx]; }
  bool operator==(const BooleanStateWithSetVector &RHS) const {
    return BooleanState::operator==(RHS) && Set == RHS.Set;
  bool operator!=(const BooleanStateWithSetVector &RHS) const {
    return !(*this == RHS);

  bool empty() const { return Set.empty(); }
  size_t size() const { return Set.size(); }

  BooleanStateWithSetVector &operator^=(const BooleanStateWithSetVector &RHS) {
    BooleanState::operator^=(RHS);
    Set.insert(RHS.Set.begin(), RHS.Set.end());

  typename decltype(Set)::iterator begin() { return Set.begin(); }
  typename decltype(Set)::iterator end() { return Set.end(); }

template <typename Ty, bool InsertInvalidates = true>
using BooleanStateWithPtrSetVector =
    BooleanStateWithSetVector<Ty *, InsertInvalidates>;
  bool IsAtFixpoint = false;

  BooleanStateWithPtrSetVector<CallBase, false> ReachedKnownParallelRegions;

  BooleanStateWithPtrSetVector<CallBase> ReachedUnknownParallelRegions;

  BooleanStateWithPtrSetVector<Instruction, false> SPMDCompatibilityTracker;

  bool IsKernelEntry = false;

  BooleanStateWithPtrSetVector<Function, false> ReachingKernelEntries;

  BooleanStateWithSetVector<uint8_t> ParallelLevels;

  bool NestedParallelism = false;

  KernelInfoState() = default;
  KernelInfoState(bool BestState) {

  bool isAtFixpoint() const override { return IsAtFixpoint; }

    ParallelLevels.indicatePessimisticFixpoint();
    ReachingKernelEntries.indicatePessimisticFixpoint();
    SPMDCompatibilityTracker.indicatePessimisticFixpoint();
    ReachedKnownParallelRegions.indicatePessimisticFixpoint();
    ReachedUnknownParallelRegions.indicatePessimisticFixpoint();
    NestedParallelism = true;

    ParallelLevels.indicateOptimisticFixpoint();
    ReachingKernelEntries.indicateOptimisticFixpoint();
    SPMDCompatibilityTracker.indicateOptimisticFixpoint();
    ReachedKnownParallelRegions.indicateOptimisticFixpoint();
    ReachedUnknownParallelRegions.indicateOptimisticFixpoint();

  KernelInfoState &getAssumed() { return *this; }
  const KernelInfoState &getAssumed() const { return *this; }

  bool operator==(const KernelInfoState &RHS) const {
    if (SPMDCompatibilityTracker != RHS.SPMDCompatibilityTracker)
    if (ReachedKnownParallelRegions != RHS.ReachedKnownParallelRegions)
    if (ReachedUnknownParallelRegions != RHS.ReachedUnknownParallelRegions)
    if (ReachingKernelEntries != RHS.ReachingKernelEntries)
    if (ParallelLevels != RHS.ParallelLevels)
    if (NestedParallelism != RHS.NestedParallelism)

  bool mayContainParallelRegion() {
    return !ReachedKnownParallelRegions.empty() ||
           !ReachedUnknownParallelRegions.empty();

  static KernelInfoState getBestState() { return KernelInfoState(true); }

  static KernelInfoState getBestState(KernelInfoState &KIS) {
    return getBestState();

  static KernelInfoState getWorstState() { return KernelInfoState(false); }

  KernelInfoState operator^=(const KernelInfoState &KIS) {
    if (KIS.KernelInitCB) {
      if (KernelInitCB && KernelInitCB != KIS.KernelInitCB)
      KernelInitCB = KIS.KernelInitCB;
    if (KIS.KernelDeinitCB) {
      if (KernelDeinitCB && KernelDeinitCB != KIS.KernelDeinitCB)
      KernelDeinitCB = KIS.KernelDeinitCB;
    if (KIS.KernelEnvC) {
      if (KernelEnvC && KernelEnvC != KIS.KernelEnvC)
      KernelEnvC = KIS.KernelEnvC;
    SPMDCompatibilityTracker ^= KIS.SPMDCompatibilityTracker;
    ReachedKnownParallelRegions ^= KIS.ReachedKnownParallelRegions;
    ReachedUnknownParallelRegions ^= KIS.ReachedUnknownParallelRegions;
    NestedParallelism |= KIS.NestedParallelism;

  KernelInfoState operator&=(const KernelInfoState &KIS) {
    return (*this ^= KIS);
  OffloadArray() = default;

    if (!Array.getAllocatedType()->isArrayTy())
    if (!getValues(Array, Before))
    this->Array = &Array;

  static const unsigned DeviceIDArgNum = 1;
  static const unsigned BasePtrsArgNum = 3;
  static const unsigned PtrsArgNum = 4;
  static const unsigned SizesArgNum = 5;
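  // These indices name argument positions of the
  // __tgt_target_data_begin_mapper call that getValuesInOffloadArrays()
  // inspects below. A hedged sketch of how such a call site is laid out
  // (argument names are illustrative; only the positions used here are relied
  // upon):
  //
  //   __tgt_target_data_begin_mapper(loc, device_id /*1*/, num_args,
  //                                  base_ptrs /*3*/, ptrs /*4*/,
  //                                  sizes /*5*/, ...);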
    const uint64_t NumValues = Array.getAllocatedType()->getArrayNumElements();
    StoredValues.assign(NumValues, nullptr);
    LastAccesses.assign(NumValues, nullptr);

      if (BB != Before.getParent())
      if (!isa<StoreInst>(&I))
      auto *S = cast<StoreInst>(&I);
      LastAccesses[Idx] = S;

    const unsigned NumValues = StoredValues.size();
    for (unsigned I = 0; I < NumValues; ++I) {
      if (!StoredValues[I] || !LastAccesses[I])
  using OptimizationRemarkGetter =

            OptimizationRemarkGetter OREGetter,
            OMPInformationCache &OMPInfoCache, Attributor &A)
        OREGetter(OREGetter), OMPInfoCache(OMPInfoCache), A(A) {}

  bool remarksEnabled() {
    auto &Ctx = M.getContext();
    return Ctx.getDiagHandlerPtr()->isAnyRemarkEnabled(DEBUG_TYPE);

  bool run(bool IsModulePass) {
    bool Changed = false;

      Changed |= runAttributor(IsModulePass);
      OMPInfoCache.recollectUses();
      Changed |= rewriteDeviceCodeStateMachine();

      if (remarksEnabled())
        analysisGlobalization();

      Changed |= runAttributor(IsModulePass);
      OMPInfoCache.recollectUses();

      Changed |= deleteParallelRegions();

      Changed |= hideMemTransfersLatency();
      Changed |= deduplicateRuntimeCalls();
      if (mergeParallelRegions()) {
        deduplicateRuntimeCalls();

      if (OMPInfoCache.OpenMPPostLink)
        Changed |= removeRuntimeSymbols();
  void printICVs() const {

    for (auto ICV : ICVs) {
      auto ICVInfo = OMPInfoCache.ICVs[ICV];
        return ORA << "OpenMP ICV " << ore::NV("OpenMPICV", ICVInfo.Name)
                   << (ICVInfo.InitValue
                           ? toString(ICVInfo.InitValue->getValue(), 10, true)
                           : "IMPLEMENTATION_DEFINED");
        emitRemark<OptimizationRemarkAnalysis>(F, "OpenMPICVTracker", Remark);

  void printKernels() const {
      return ORA << "OpenMP GPU kernel "
                 << ore::NV("OpenMPGPUKernel", F->getName()) << "\n";
      emitRemark<OptimizationRemarkAnalysis>(F, "OpenMPGPU", Remark);

  static CallInst *getCallIfRegularCall(
      Use &U, OMPInformationCache::RuntimeFunctionInfo *RFI = nullptr) {
    CallInst *CI = dyn_cast<CallInst>(U.getUser());

  static CallInst *getCallIfRegularCall(
      Value &V, OMPInformationCache::RuntimeFunctionInfo *RFI = nullptr) {
    CallInst *CI = dyn_cast<CallInst>(&V);
  bool mergeParallelRegions() {
    const unsigned CallbackCalleeOperand = 2;
    const unsigned CallbackFirstArgOperand = 3;

    OMPInformationCache::RuntimeFunctionInfo &RFI =
        OMPInfoCache.RFIs[OMPRTL___kmpc_fork_call];

    if (!RFI.Declaration)

    OMPInformationCache::RuntimeFunctionInfo UnmergableCallsInfo[] = {
        OMPInfoCache.RFIs[OMPRTL___kmpc_push_proc_bind],
        OMPInfoCache.RFIs[OMPRTL___kmpc_push_num_threads],

    bool Changed = false;

    BasicBlock *StartBB = nullptr, *EndBB = nullptr;
    auto BodyGenCB = [&](InsertPointTy AllocaIP, InsertPointTy CodeGenIP) {
      BasicBlock *CGStartBB = CodeGenIP.getBlock();
          SplitBlock(CGStartBB, &*CodeGenIP.getPoint(), DT, LI);
      assert(StartBB != nullptr && "StartBB should not be null");
      assert(EndBB != nullptr && "EndBB should not be null");
      EndBB->getTerminator()->setSuccessor(0, CGEndBB);

    auto PrivCB = [&](InsertPointTy AllocaIP, InsertPointTy CodeGenIP, Value &,
                      Value &Inner, Value *&ReplacementValue) -> InsertPointTy {
      ReplacementValue = &Inner;

    auto FiniCB = [&](InsertPointTy CodeGenIP) { return Error::success(); };
    auto CreateSequentialRegion = [&](Function *OuterFn,
          SplitBlock(ParentBB, SeqEndI->getNextNode(), DT, LI);
          SplitBlock(ParentBB, SeqStartI, DT, LI, nullptr, "seq.par.merged");
             "Expected a different CFG");

      auto BodyGenCB = [&](InsertPointTy AllocaIP, InsertPointTy CodeGenIP) {
        BasicBlock *CGStartBB = CodeGenIP.getBlock();
            SplitBlock(CGStartBB, &*CodeGenIP.getPoint(), DT, LI);
        assert(SeqStartBB != nullptr && "SeqStartBB should not be null");
        assert(SeqEndBB != nullptr && "SeqEndBB should not be null");

      auto FiniCB = [&](InsertPointTy CodeGenIP) { return Error::success(); };

        for (User *Usr : I.users()) {
            OutsideUsers.insert(&UsrI);

        if (OutsideUsers.empty())

            I.getType(), DL.getAllocaAddrSpace(), nullptr,
            I.getName() + ".seq.output.alloc", OuterFn->front().begin());

        new StoreInst(&I, AllocaI, SeqStartBB->getTerminator()->getIterator());

                             I.getName() + ".seq.output.load",

          InsertPointTy(ParentBB, ParentBB->end()), DL);
          OMPInfoCache.OMPBuilder.createMaster(Loc, BodyGenCB, FiniCB));
          OMPInfoCache.OMPBuilder.createBarrier(SeqAfterIP, OMPD_parallel));
      assert(MergableCIs.size() > 1 && "Assumed multiple mergable CIs");

        OR << "Parallel region merged with parallel region"
           << (MergableCIs.size() > 2 ? "s" : "") << " at ";
          if (CI != MergableCIs.back())
      emitRemark<OptimizationRemark>(MergableCIs.front(), "OMP150", Remark);

                        << " parallel regions in " << OriginalFn->getName()

      EndBB = SplitBlock(BB, MergableCIs.back()->getNextNode(), DT, LI);
          SplitBlock(EndBB, &*EndBB->getFirstInsertionPt(), DT, LI);
      assert(BB->getUniqueSuccessor() == StartBB && "Expected a different CFG");
      const DebugLoc DL = BB->getTerminator()->getDebugLoc();

      for (auto *It = MergableCIs.begin(), *End = MergableCIs.end() - 1;
        CreateSequentialRegion(OriginalFn, BB, ForkCI->getNextNode(),

          cantFail(OMPInfoCache.OMPBuilder.createParallel(
              Loc, AllocaIP, BodyGenCB, PrivCB, FiniCB, nullptr, nullptr,
              OMP_PROC_BIND_default, false));

      OMPInfoCache.OMPBuilder.finalize(OriginalFn);

      for (auto *CI : MergableCIs) {
        FunctionType *FT = OMPInfoCache.OMPBuilder.ParallelTask;
        for (unsigned U = CallbackFirstArgOperand, E = CI->arg_size(); U < E;
        for (unsigned U = CallbackFirstArgOperand, E = CI->arg_size(); U < E;
              U - (CallbackFirstArgOperand - CallbackCalleeOperand), A);
        if (CI != MergableCIs.back()) {
            cantFail(OMPInfoCache.OMPBuilder.createBarrier(

      assert(OutlinedFn != OriginalFn && "Outlining failed");
      CGUpdater.registerOutlinedFunction(*OriginalFn, *OutlinedFn);
      CGUpdater.reanalyzeFunction(*OriginalFn);

      NumOpenMPParallelRegionsMerged += MergableCIs.size();
      CallInst *CI = getCallIfRegularCall(U, &RFI);
    RFI.foreachUse(SCC, DetectPRsCB);

    for (auto &It : BB2PRMap) {
      auto &CIs = It.getSecond();

      auto IsMergable = [&](Instruction &I, bool IsBeforeMergableRegion) {
        if (I.isTerminator())
        if (!isa<CallInst>(&I))
        if (IsBeforeMergableRegion) {
          if (!CalledFunction)
          for (const auto &RFI : UnmergableCallsInfo) {
            if (CalledFunction == RFI.Declaration)
        if (!isa<IntrinsicInst>(CI))

        if (CIs.count(&I)) {
        if (IsMergable(I, MergableCIs.empty()))
          for (; It != End; ++It) {
            if (CIs.count(&SkipI)) {
                                << " due to " << I << "\n");

        if (MergableCIs.size() > 1) {
          MergableCIsVector.push_back(MergableCIs);
                            << " parallel regions in block " << BB->getName()
        MergableCIs.clear();

      if (!MergableCIsVector.empty()) {
        for (auto &MergableCIs : MergableCIsVector)
          Merge(MergableCIs, BB);
        MergableCIsVector.clear();

      OMPInfoCache.recollectUsesForFunction(OMPRTL___kmpc_fork_call);
      OMPInfoCache.recollectUsesForFunction(OMPRTL___kmpc_barrier);
      OMPInfoCache.recollectUsesForFunction(OMPRTL___kmpc_master);
      OMPInfoCache.recollectUsesForFunction(OMPRTL___kmpc_end_master);
  bool deleteParallelRegions() {
    const unsigned CallbackCalleeOperand = 2;

    OMPInformationCache::RuntimeFunctionInfo &RFI =
        OMPInfoCache.RFIs[OMPRTL___kmpc_fork_call];

    if (!RFI.Declaration)

    bool Changed = false;
      CallInst *CI = getCallIfRegularCall(U);
      auto *Fn = dyn_cast<Function>(
      if (!Fn->onlyReadsMemory())
      if (!Fn->hasFnAttribute(Attribute::WillReturn))

        return OR << "Removing parallel region with no side-effects.";
      emitRemark<OptimizationRemark>(CI, "OMP160", Remark);

      ++NumOpenMPParallelRegionsDeleted;

    RFI.foreachUse(SCC, DeleteCallCB);
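    // A parallel region is deletable here when the outlined callback passed to
    // __kmpc_fork_call provably has no effect. Hedged IR-level sketch of the
    // pattern (names illustrative):
    //
    //   call void @__kmpc_fork_call(ptr @loc, i32 0, ptr @outlined)
    //   ; @outlined is readonly + willreturn, so the whole call is removed and
    //   ; NumOpenMPParallelRegionsDeleted is bumped.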
  bool deduplicateRuntimeCalls() {
    bool Changed = false;

        OMPRTL_omp_get_num_threads,
        OMPRTL_omp_in_parallel,
        OMPRTL_omp_get_cancellation,
        OMPRTL_omp_get_supported_active_levels,
        OMPRTL_omp_get_level,
        OMPRTL_omp_get_ancestor_thread_num,
        OMPRTL_omp_get_team_size,
        OMPRTL_omp_get_active_level,
        OMPRTL_omp_in_final,
        OMPRTL_omp_get_proc_bind,
        OMPRTL_omp_get_num_places,
        OMPRTL_omp_get_num_procs,
        OMPRTL_omp_get_place_num,
        OMPRTL_omp_get_partition_num_places,
        OMPRTL_omp_get_partition_place_nums};

    collectGlobalThreadIdArguments(GTIdArgs);
                      << " global thread ID arguments\n");

      for (auto DeduplicableRuntimeCallID : DeduplicableRuntimeCallIDs)
        Changed |= deduplicateRuntimeCalls(
            *F, OMPInfoCache.RFIs[DeduplicableRuntimeCallID]);

      Value *GTIdArg = nullptr;
        if (GTIdArgs.count(&Arg)) {
      Changed |= deduplicateRuntimeCalls(
          *F, OMPInfoCache.RFIs[OMPRTL___kmpc_global_thread_num], GTIdArg);

  bool removeRuntimeSymbols() {
      if (GV->getNumUses() >= 1)
      GV->eraseFromParent();
  bool hideMemTransfersLatency() {
    auto &RFI = OMPInfoCache.RFIs[OMPRTL___tgt_target_data_begin_mapper];
    bool Changed = false;
      auto *RTCall = getCallIfRegularCall(U, &RFI);

      OffloadArray OffloadArrays[3];
      if (!getValuesInOffloadArrays(*RTCall, OffloadArrays))

      LLVM_DEBUG(dumpValuesInOffloadArrays(OffloadArrays));

      bool WasSplit = false;
      Instruction *WaitMovementPoint = canBeMovedDownwards(*RTCall);
      if (WaitMovementPoint)
        WasSplit = splitTargetDataBeginRTC(*RTCall, *WaitMovementPoint);

      Changed |= WasSplit;

    if (OMPInfoCache.runtimeFnsAvailable(
            {OMPRTL___tgt_target_data_begin_mapper_issue,
             OMPRTL___tgt_target_data_begin_mapper_wait}))
      RFI.foreachUse(SCC, SplitMemTransfers);
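    // The transformation sketched by splitTargetDataBeginRTC below: a single
    // synchronous __tgt_target_data_begin_mapper call is split into an "issue"
    // call where the mapper originally was and a "wait" call at the computed
    // WaitMovementPoint, so the host-to-device copy can overlap other work.
    // Hedged pseudo-IR sketch (operand lists abbreviated):
    //
    //   %handle = alloca %async.info.ty
    //   call void @__tgt_target_data_begin_mapper_issue(..., ptr %handle)
    //   ; ... independent host code runs here ...
    //   call void @__tgt_target_data_begin_mapper_wait(i64 %device_id,
    //                                                  ptr %handle)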
  void analysisGlobalization() {
    auto &RFI = OMPInfoCache.RFIs[OMPRTL___kmpc_alloc_shared];

    auto CheckGlobalization = [&](Use &U, Function &Decl) {
      if (CallInst *CI = getCallIfRegularCall(U, &RFI)) {
                 << "Found thread data sharing on the GPU. "
                 << "Expect degraded performance due to data globalization.";
        emitRemark<OptimizationRemarkMissed>(CI, "OMP112", Remark);

    RFI.foreachUse(SCC, CheckGlobalization);
  bool getValuesInOffloadArrays(CallInst &RuntimeCall,
    assert(OAs.size() == 3 && "Need space for three offload arrays!");

    Value *BasePtrsArg =

    if (!isa<AllocaInst>(V))
    auto *BasePtrsArray = cast<AllocaInst>(V);
    if (!OAs[0].initialize(*BasePtrsArray, RuntimeCall))

    if (!isa<AllocaInst>(V))
    auto *PtrsArray = cast<AllocaInst>(V);
    if (!OAs[1].initialize(*PtrsArray, RuntimeCall))

    if (isa<GlobalValue>(V))
      return isa<Constant>(V);
    if (!isa<AllocaInst>(V))
    auto *SizesArray = cast<AllocaInst>(V);
    if (!OAs[2].initialize(*SizesArray, RuntimeCall))

    assert(OAs.size() == 3 && "There are three offload arrays to debug!");

    std::string ValuesStr;
    std::string Separator = " --- ";

    for (auto *BP : OAs[0].StoredValues) {
    LLVM_DEBUG(dbgs() << "\t\toffload_baseptrs: " << ValuesStr << "\n");

    for (auto *P : OAs[1].StoredValues) {

    for (auto *S : OAs[2].StoredValues) {
    LLVM_DEBUG(dbgs() << "\t\toffload_sizes: " << ValuesStr << "\n");
    bool IsWorthIt = false;

    return RuntimeCall.getParent()->getTerminator();

  bool splitTargetDataBeginRTC(CallInst &RuntimeCall,
    auto &IRBuilder = OMPInfoCache.OMPBuilder;
                       Entry.getFirstNonPHIOrDbgOrAlloca());
        IRBuilder.AsyncInfo, nullptr, "handle");

        M, OMPRTL___tgt_target_data_begin_mapper_issue);

    for (auto &Arg : RuntimeCall.args())
      Args.push_back(Arg.get());
    Args.push_back(Handle);

    OMPInfoCache.setCallingConvention(IssueDecl, IssueCallsite);

        M, OMPRTL___tgt_target_data_begin_mapper_wait);

    Value *WaitParams[2] = {
                                           OffloadArray::DeviceIDArgNum),
        WaitDecl, WaitParams, "", WaitMovementPoint.getIterator());
    OMPInfoCache.setCallingConvention(WaitDecl, WaitCallsite);
  static Value *combinedIdentStruct(Value *CurrentIdent, Value *NextIdent,
                                    bool GlobalOnly, bool &SingleChoice) {
    if (CurrentIdent == NextIdent)
      return CurrentIdent;

    if (!GlobalOnly || isa<GlobalValue>(NextIdent)) {
      SingleChoice = !CurrentIdent;

  getCombinedIdentFromCallUsesIn(OMPInformationCache::RuntimeFunctionInfo &RFI,
    bool SingleChoice = true;
    Value *Ident = nullptr;
      CallInst *CI = getCallIfRegularCall(U, &RFI);
      if (!CI || &F != &Caller)
                                  true, SingleChoice);
    RFI.foreachUse(SCC, CombineIdentStruct);

    if (!Ident || !SingleChoice) {
      if (!OMPInfoCache.OMPBuilder.getInsertionPoint().getBlock())
            &F.getEntryBlock(), F.getEntryBlock().begin()));
          OMPInfoCache.OMPBuilder.getOrCreateDefaultSrcLocStr(SrcLocStrSize);
      Ident = OMPInfoCache.OMPBuilder.getOrCreateIdent(Loc, SrcLocStrSize);
  bool deduplicateRuntimeCalls(Function &F,
                               OMPInformationCache::RuntimeFunctionInfo &RFI,
                               Value *ReplVal = nullptr) {
    auto *UV = RFI.getUseVector(F);
    if (!UV || UV->size() + (ReplVal != nullptr) < 2)

        dbgs() << TAG << "Deduplicate " << UV->size() << " uses of " << RFI.Name
               << (ReplVal ? " with an existing value\n" : "\n") << "\n");

    assert((!ReplVal || (isa<Argument>(ReplVal) &&
                         cast<Argument>(ReplVal)->getParent() == &F)) &&
           "Unexpected replacement value!");

    auto CanBeMoved = [this](CallBase &CB) {
      unsigned NumArgs = CB.arg_size();
      if (CB.getArgOperand(0)->getType() != OMPInfoCache.OMPBuilder.IdentPtr)
      for (unsigned U = 1; U < NumArgs; ++U)
        if (isa<Instruction>(CB.getArgOperand(U)))

      for (Use *U : *UV) {
        if (CallInst *CI = getCallIfRegularCall(*U, &RFI)) {
          if (!CanBeMoved(*CI))

      assert(IP && "Expected insertion point!");
      cast<Instruction>(ReplVal)->moveBefore(IP);

    if (CallBase *CI = dyn_cast<CallBase>(ReplVal)) {
      Value *Ident = getCombinedIdentFromCallUsesIn(RFI, F,

    bool Changed = false;
      CallInst *CI = getCallIfRegularCall(U, &RFI);
      if (!CI || CI == ReplVal || &F != &Caller)

        return OR << "OpenMP runtime call "
                  << ore::NV("OpenMPOptRuntime", RFI.Name) << " deduplicated.";
        emitRemark<OptimizationRemark>(CI, "OMP170", Remark);
        emitRemark<OptimizationRemark>(&F, "OMP170", Remark);

      ++NumOpenMPRuntimeCallsDeduplicated;

    RFI.foreachUse(SCC, ReplaceAndDeleteCB);
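    // Effect of the deduplication above, as a hedged source-level sketch: when
    // a function contains several calls to a deduplicable, side-effect-free
    // OpenMP runtime query, all but one are deleted and their users are
    // rewired to the surviving call (or to a known thread-id argument).
    //
    //   %tid0 = call i32 @__kmpc_global_thread_num(ptr @loc)
    //   ...
    //   %tid1 = call i32 @__kmpc_global_thread_num(ptr @loc) ; removed,
    //                                                        ; uses -> %tid0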
    if (!F.hasLocalLinkage())
    for (Use &U : F.uses()) {
      if (CallInst *CI = getCallIfRegularCall(U)) {
        if (CI == &RefCI || GTIdArgs.count(ArgOp) ||
            getCallIfRegularCall(
                *ArgOp, &OMPInfoCache.RFIs[OMPRTL___kmpc_global_thread_num]))

    auto AddUserArgs = [&](Value &GTId) {
      for (Use &U : GTId.uses())
        if (CallInst *CI = dyn_cast<CallInst>(U.getUser()))
            if (CallArgOpIsGTId(*Callee, U.getOperandNo(), *CI))

    OMPInformationCache::RuntimeFunctionInfo &GlobThreadNumRFI =
        OMPInfoCache.RFIs[OMPRTL___kmpc_global_thread_num];

    GlobThreadNumRFI.foreachUse(SCC, [&](Use &U, Function &F) {
      if (CallInst *CI = getCallIfRegularCall(U, &GlobThreadNumRFI))

    for (unsigned U = 0; U < GTIdArgs.size(); ++U)
      AddUserArgs(*GTIdArgs[U]);
    return getUniqueKernelFor(*I.getFunction());

  bool rewriteDeviceCodeStateMachine();

  template <typename RemarkKind, typename RemarkCallBack>
                  RemarkCallBack &&RemarkCB) const {
    auto &ORE = OREGetter(F);

          return RemarkCB(RemarkKind(DEBUG_TYPE, RemarkName, I))
                 << " [" << RemarkName << "]";
        [&]() { return RemarkCB(RemarkKind(DEBUG_TYPE, RemarkName, I)); });

  template <typename RemarkKind, typename RemarkCallBack>
                  RemarkCallBack &&RemarkCB) const {
    auto &ORE = OREGetter(F);

          return RemarkCB(RemarkKind(DEBUG_TYPE, RemarkName, F))
                 << " [" << RemarkName << "]";
        [&]() { return RemarkCB(RemarkKind(DEBUG_TYPE, RemarkName, F)); });

  OptimizationRemarkGetter OREGetter;

  OMPInformationCache &OMPInfoCache;

  bool runAttributor(bool IsModulePass) {
    registerAAs(IsModulePass);

                      << " functions, result: " << Changed << ".\n");

    if (Changed == ChangeStatus::CHANGED)
      OMPInfoCache.invalidateAnalyses();

    return Changed == ChangeStatus::CHANGED;

  void registerAAs(bool IsModulePass);
  if (OMPInfoCache.CGSCC && !OMPInfoCache.CGSCC->empty() &&
      !OMPInfoCache.CGSCC->contains(&F))

  std::optional<Kernel> &CachedKernel = UniqueKernelMap[&F];
    return *CachedKernel;

    return *CachedKernel;

  CachedKernel = nullptr;
  if (!F.hasLocalLinkage()) {

      return ORA << "Potentially unknown OpenMP target region caller.";
    emitRemark<OptimizationRemarkAnalysis>(&F, "OMP100", Remark);

  auto GetUniqueKernelForUse = [&](const Use &U) -> Kernel {
    if (auto *Cmp = dyn_cast<ICmpInst>(U.getUser())) {
      if (Cmp->isEquality())
        return getUniqueKernelFor(*Cmp);

    if (auto *CB = dyn_cast<CallBase>(U.getUser())) {
      if (CB->isCallee(&U))
        return getUniqueKernelFor(*CB);

      OMPInformationCache::RuntimeFunctionInfo &KernelParallelRFI =
          OMPInfoCache.RFIs[OMPRTL___kmpc_parallel_51];
      if (OpenMPOpt::getCallIfRegularCall(*U.getUser(), &KernelParallelRFI))
        return getUniqueKernelFor(*CB);

  OMPInformationCache::foreachUse(F, [&](const Use &U) {
    PotentialKernels.insert(GetUniqueKernelForUse(U));

  if (PotentialKernels.size() == 1)
    K = *PotentialKernels.begin();

  UniqueKernelMap[&F] = K;
bool OpenMPOpt::rewriteDeviceCodeStateMachine() {
  OMPInformationCache::RuntimeFunctionInfo &KernelParallelRFI =
      OMPInfoCache.RFIs[OMPRTL___kmpc_parallel_51];

  bool Changed = false;
  if (!KernelParallelRFI)

    bool UnknownUse = false;
    bool KernelParallelUse = false;
    unsigned NumDirectCalls = 0;

    OMPInformationCache::foreachUse(*F, [&](Use &U) {
      if (auto *CB = dyn_cast<CallBase>(U.getUser()))
        if (CB->isCallee(&U)) {

      if (isa<ICmpInst>(U.getUser())) {
        ToBeReplacedStateMachineUses.push_back(&U);

          OpenMPOpt::getCallIfRegularCall(*U.getUser(), &KernelParallelRFI);
      const unsigned int WrapperFunctionArgNo = 6;
      if (!KernelParallelUse && CI &&
        KernelParallelUse = true;
        ToBeReplacedStateMachineUses.push_back(&U);

    if (!KernelParallelUse)

    if (UnknownUse || NumDirectCalls != 1 ||
        ToBeReplacedStateMachineUses.size() > 2) {
        return ORA << "Parallel region is used in "
                   << (UnknownUse ? "unknown" : "unexpected")
                   << " ways. Will not attempt to rewrite the state machine.";
      emitRemark<OptimizationRemarkAnalysis>(F, "OMP101", Remark);

        return ORA << "Parallel region is not called from a unique kernel. "
                      "Will not attempt to rewrite the state machine.";
      emitRemark<OptimizationRemarkAnalysis>(F, "OMP102", Remark);

    for (Use *U : ToBeReplacedStateMachineUses)
                                               ID, U->get()->getType()));

    ++NumOpenMPParallelRegionsReplacedInGPUStateMachine;
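    // Summary of the rewrite above, as a hedged sketch: when an outlined
    // parallel region is only reached through __kmpc_parallel_51 from a single
    // kernel, the function-pointer argument that the generic-mode state
    // machine would have to call indirectly is replaced by a unique global
    // "ID", so the custom state machine can compare against it and call the
    // region directly (names illustrative):
    //
    //   call void @__kmpc_parallel_51(..., ptr @__omp_outlined__wrapper, ...)
    //     ; becomes
    //   call void @__kmpc_parallel_51(..., ptr @__omp_outlined__wrapper.ID, ...)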
struct AAICVTracker : public StateWrapper<BooleanState, AbstractAttribute> {

  bool isAssumedTracked() const { return getAssumed(); }

  bool isKnownTracked() const { return getAssumed(); }

    return std::nullopt;

  virtual std::optional<Value *>

  const std::string getName() const override { return "AAICVTracker"; }

  const char *getIdAddr() const override { return &ID; }

  static const char ID;
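// What the AAICVTracker family below reasons about, as a hedged source-level
// sketch: if the value of an internal control variable (ICV) at a program
// point is uniquely determined by a prior setter, later getter calls can be
// folded to that value.
//
//   omp_set_num_threads(4);     // setter: ICV value known to be 4 here
//   ...                         // no intervening calls that may change it
//   n = omp_get_max_threads();  // getter may be replaced by the constant 4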
2291struct AAICVTrackerFunction :
public AAICVTracker {
2293 : AAICVTracker(IRP,
A) {}
2296 const std::string getAsStr(
Attributor *)
const override {
2297 return "ICVTrackerFunction";
2301 void trackStatistics()
const override {}
2305 return ChangeStatus::UNCHANGED;
2310 InternalControlVar::ICV___last>
2311 ICVReplacementValuesMap;
2318 auto &OMPInfoCache =
static_cast<OMPInformationCache &
>(
A.getInfoCache());
2321 auto &SetterRFI = OMPInfoCache.RFIs[OMPInfoCache.ICVs[ICV].Setter];
2323 auto &ValuesMap = ICVReplacementValuesMap[ICV];
2325 CallInst *CI = OpenMPOpt::getCallIfRegularCall(U);
2331 if (ValuesMap.insert(std::make_pair(CI, CI->
getArgOperand(0))).second)
2332 HasChanged = ChangeStatus::CHANGED;
2338 std::optional<Value *> ReplVal = getValueForCall(
A,
I, ICV);
2339 if (ReplVal && ValuesMap.insert(std::make_pair(&
I, *ReplVal)).second)
2340 HasChanged = ChangeStatus::CHANGED;
2346 SetterRFI.foreachUse(TrackValues,
F);
2348 bool UsedAssumedInformation =
false;
2349 A.checkForAllInstructions(CallCheck, *
this, {Instruction::Call},
2350 UsedAssumedInformation,
2356 if (HasChanged == ChangeStatus::CHANGED)
2357 ValuesMap.try_emplace(Entry);
2368 const auto *CB = dyn_cast<CallBase>(&
I);
2369 if (!CB || CB->hasFnAttr(
"no_openmp") ||
2370 CB->hasFnAttr(
"no_openmp_routines"))
2371 return std::nullopt;
2373 auto &OMPInfoCache =
static_cast<OMPInformationCache &
>(
A.getInfoCache());
2374 auto &GetterRFI = OMPInfoCache.RFIs[OMPInfoCache.ICVs[ICV].Getter];
2375 auto &SetterRFI = OMPInfoCache.RFIs[OMPInfoCache.ICVs[ICV].Setter];
2376 Function *CalledFunction = CB->getCalledFunction();
2379 if (CalledFunction ==
nullptr)
2381 if (CalledFunction == GetterRFI.Declaration)
2382 return std::nullopt;
2383 if (CalledFunction == SetterRFI.Declaration) {
2384 if (ICVReplacementValuesMap[ICV].
count(&
I))
2385 return ICVReplacementValuesMap[ICV].
lookup(&
I);
2394 const auto *ICVTrackingAA =
A.getAAFor<AAICVTracker>(
2397 if (ICVTrackingAA->isAssumedTracked()) {
2398 std::optional<Value *> URV =
2399 ICVTrackingAA->getUniqueReplacementValue(ICV);
2410 std::optional<Value *>
2412 return std::nullopt;
2419 const auto &ValuesMap = ICVReplacementValuesMap[ICV];
2420 if (ValuesMap.count(
I))
2421 return ValuesMap.lookup(
I);
2427 std::optional<Value *> ReplVal;
2429 while (!Worklist.
empty()) {
2431 if (!Visited.
insert(CurrInst).second)
2439 if (ValuesMap.count(CurrInst)) {
2440 std::optional<Value *> NewReplVal = ValuesMap.lookup(CurrInst);
2443 ReplVal = NewReplVal;
2449 if (ReplVal != NewReplVal)
2455 std::optional<Value *> NewReplVal = getValueForCall(
A, *CurrInst, ICV);
2461 ReplVal = NewReplVal;
2467 if (ReplVal != NewReplVal)
2472 if (CurrBB ==
I->getParent() && ReplVal)
2477 if (
const Instruction *Terminator = Pred->getTerminator())
2485struct AAICVTrackerFunctionReturned : AAICVTracker {
2487 : AAICVTracker(IRP,
A) {}
2490 const std::string getAsStr(
Attributor *)
const override {
2491 return "ICVTrackerFunctionReturned";
2495 void trackStatistics()
const override {}
2499 return ChangeStatus::UNCHANGED;
2504 InternalControlVar::ICV___last>
2505 ICVReplacementValuesMap;
2508 std::optional<Value *>
2510 return ICVReplacementValuesMap[ICV];
2515 const auto *ICVTrackingAA =
A.getAAFor<AAICVTracker>(
2518 if (!ICVTrackingAA->isAssumedTracked())
2519 return indicatePessimisticFixpoint();
2522 std::optional<Value *> &ReplVal = ICVReplacementValuesMap[ICV];
2523 std::optional<Value *> UniqueICVValue;
2526 std::optional<Value *> NewReplVal =
2527 ICVTrackingAA->getReplacementValue(ICV, &
I,
A);
2530 if (UniqueICVValue && UniqueICVValue != NewReplVal)
2533 UniqueICVValue = NewReplVal;
2538 bool UsedAssumedInformation =
false;
2539 if (!
A.checkForAllInstructions(CheckReturnInst, *
this, {Instruction::Ret},
2540 UsedAssumedInformation,
2542 UniqueICVValue =
nullptr;
2544 if (UniqueICVValue == ReplVal)
2547 ReplVal = UniqueICVValue;
2548 Changed = ChangeStatus::CHANGED;
2555struct AAICVTrackerCallSite : AAICVTracker {
2557 : AAICVTracker(IRP,
A) {}
2560 assert(getAnchorScope() &&
"Expected anchor function");
2564 auto &OMPInfoCache =
static_cast<OMPInformationCache &
>(
A.getInfoCache());
2566 auto ICVInfo = OMPInfoCache.ICVs[ICV];
2567 auto &Getter = OMPInfoCache.RFIs[ICVInfo.Getter];
2568 if (Getter.Declaration == getAssociatedFunction()) {
2569 AssociatedICV = ICVInfo.Kind;
2575 indicatePessimisticFixpoint();
2579 if (!ReplVal || !*ReplVal)
2580 return ChangeStatus::UNCHANGED;
2583 A.deleteAfterManifest(*getCtxI());
2585 return ChangeStatus::CHANGED;
2589 const std::string getAsStr(
Attributor *)
const override {
2590 return "ICVTrackerCallSite";
2594 void trackStatistics()
const override {}
2597 std::optional<Value *> ReplVal;
2600 const auto *ICVTrackingAA =
A.getAAFor<AAICVTracker>(
2604 if (!ICVTrackingAA->isAssumedTracked())
2605 return indicatePessimisticFixpoint();
2607 std::optional<Value *> NewReplVal =
2608 ICVTrackingAA->getReplacementValue(AssociatedICV, getCtxI(),
A);
2610 if (ReplVal == NewReplVal)
2611 return ChangeStatus::UNCHANGED;
2613 ReplVal = NewReplVal;
2614 return ChangeStatus::CHANGED;
2619 std::optional<Value *>
2625struct AAICVTrackerCallSiteReturned : AAICVTracker {
2627 : AAICVTracker(IRP,
A) {}
2630 const std::string getAsStr(
Attributor *)
const override {
2631 return "ICVTrackerCallSiteReturned";
2635 void trackStatistics()
const override {}
2639 return ChangeStatus::UNCHANGED;
2644 InternalControlVar::ICV___last>
2645 ICVReplacementValuesMap;
2649 std::optional<Value *>
2651 return ICVReplacementValuesMap[ICV];
2656 const auto *ICVTrackingAA =
A.getAAFor<AAICVTracker>(
2658 DepClassTy::REQUIRED);
2661 if (!ICVTrackingAA->isAssumedTracked())
2662 return indicatePessimisticFixpoint();
2665 std::optional<Value *> &ReplVal = ICVReplacementValuesMap[ICV];
2666 std::optional<Value *> NewReplVal =
2667 ICVTrackingAA->getUniqueReplacementValue(ICV);
2669 if (ReplVal == NewReplVal)
2672 ReplVal = NewReplVal;
2673 Changed = ChangeStatus::CHANGED;
2681static bool hasFunctionEndAsUniqueSuccessor(
const BasicBlock *BB) {
2687 return hasFunctionEndAsUniqueSuccessor(
Successor);
2694 ~AAExecutionDomainFunction() {
delete RPOT; }
2698 assert(
F &&
"Expected anchor function");
2703 unsigned TotalBlocks = 0, InitialThreadBlocks = 0, AlignedBlocks = 0;
2704 for (
auto &It : BEDMap) {
2708 InitialThreadBlocks += It.getSecond().IsExecutedByInitialThreadOnly;
2709 AlignedBlocks += It.getSecond().IsReachedFromAlignedBarrierOnly &&
2710 It.getSecond().IsReachingAlignedBarrierOnly;
2712 return "[AAExecutionDomain] " + std::to_string(InitialThreadBlocks) +
"/" +
2713 std::to_string(AlignedBlocks) +
" of " +
2714 std::to_string(TotalBlocks) +
2715 " executed by initial thread / aligned";
2727 << BB.
getName() <<
" is executed by a single thread.\n";
2737 auto HandleAlignedBarrier = [&](
CallBase *CB) {
2738 const ExecutionDomainTy &ED = CB ? CEDMap[{CB, PRE}] : BEDMap[
nullptr];
2739 if (!ED.IsReachedFromAlignedBarrierOnly ||
2740 ED.EncounteredNonLocalSideEffect)
2742 if (!ED.EncounteredAssumes.empty() && !
A.isModulePass())
2753 DeletedBarriers.
insert(CB);
2754 A.deleteAfterManifest(*CB);
2755 ++NumBarriersEliminated;
2757 }
else if (!ED.AlignedBarriers.empty()) {
2760 ED.AlignedBarriers.end());
2762 while (!Worklist.
empty()) {
2764 if (!Visited.
insert(LastCB))
2768 if (!hasFunctionEndAsUniqueSuccessor(LastCB->
getParent()))
2770 if (!DeletedBarriers.
count(LastCB)) {
2771 ++NumBarriersEliminated;
2772 A.deleteAfterManifest(*LastCB);
2778 const ExecutionDomainTy &LastED = CEDMap[{LastCB, PRE}];
2779 Worklist.
append(LastED.AlignedBarriers.begin(),
2780 LastED.AlignedBarriers.end());
2786 if (!ED.EncounteredAssumes.empty() && (CB || !ED.AlignedBarriers.empty()))
2787 for (
auto *AssumeCB : ED.EncounteredAssumes)
2788 A.deleteAfterManifest(*AssumeCB);
2791 for (
auto *CB : AlignedBarriers)
2792 HandleAlignedBarrier(CB);
2796 HandleAlignedBarrier(
nullptr);
2808 mergeInPredecessorBarriersAndAssumptions(
Attributor &
A, ExecutionDomainTy &ED,
2809 const ExecutionDomainTy &PredED);
2814 bool mergeInPredecessor(
Attributor &
A, ExecutionDomainTy &ED,
2815 const ExecutionDomainTy &PredED,
2816 bool InitialEdgeOnly =
false);
2819 bool handleCallees(
Attributor &
A, ExecutionDomainTy &EntryBBED);
2829 assert(BB.
getParent() == getAnchorScope() &&
"Block is out of scope!");
2830 return BEDMap.lookup(&BB).IsExecutedByInitialThreadOnly;
2835 assert(
I.getFunction() == getAnchorScope() &&
2836 "Instruction is out of scope!");
2840 bool ForwardIsOk =
true;
2846 auto *CB = dyn_cast<CallBase>(CurI);
2849 if (CB != &
I && AlignedBarriers.contains(
const_cast<CallBase *
>(CB)))
2851 const auto &It = CEDMap.find({CB, PRE});
2852 if (It == CEDMap.end())
2854 if (!It->getSecond().IsReachingAlignedBarrierOnly)
2855 ForwardIsOk =
false;
2859 if (!CurI && !BEDMap.lookup(
I.getParent()).IsReachingAlignedBarrierOnly)
2860 ForwardIsOk =
false;
2865 auto *CB = dyn_cast<CallBase>(CurI);
2868 if (CB != &
I && AlignedBarriers.contains(
const_cast<CallBase *
>(CB)))
2870 const auto &It = CEDMap.find({CB, POST});
2871 if (It == CEDMap.end())
2873 if (It->getSecond().IsReachedFromAlignedBarrierOnly)
2886 return BEDMap.lookup(
nullptr).IsReachedFromAlignedBarrierOnly;
2888 return BEDMap.lookup(PredBB).IsReachedFromAlignedBarrierOnly;
2900 "No request should be made against an invalid state!");
2901 return BEDMap.lookup(&BB);
2903 std::pair<ExecutionDomainTy, ExecutionDomainTy>
2906 "No request should be made against an invalid state!");
2907 return {CEDMap.lookup({&CB, PRE}), CEDMap.lookup({&CB, POST})};
2911 "No request should be made against an invalid state!");
2912 return InterProceduralED;
2926 if (!Cmp || !
Cmp->isTrueWhenEqual() || !
Cmp->isEquality())
2934 if (
C->isAllOnesValue()) {
2935 auto *CB = dyn_cast<CallBase>(
Cmp->getOperand(0));
2936 auto &OMPInfoCache =
static_cast<OMPInformationCache &
>(
A.getInfoCache());
2937 auto &RFI = OMPInfoCache.RFIs[OMPRTL___kmpc_target_init];
2938 CB = CB ? OpenMPOpt::getCallIfRegularCall(*CB, &RFI) : nullptr;
2944 KernelInfo::getExecModeFromKernelEnvironment(KernelEnvC);
2950 if (
auto *
II = dyn_cast<IntrinsicInst>(
Cmp->getOperand(0)))
2951 if (
II->getIntrinsicID() == Intrinsic::nvvm_read_ptx_sreg_tid_x)
2955 if (
auto *
II = dyn_cast<IntrinsicInst>(
Cmp->getOperand(0)))
2956 if (
II->getIntrinsicID() == Intrinsic::amdgcn_workitem_id_x)
2964 ExecutionDomainTy InterProceduralED;
2976 static bool setAndRecord(
bool &R,
bool V) {
2987void AAExecutionDomainFunction::mergeInPredecessorBarriersAndAssumptions(
2988 Attributor &
A, ExecutionDomainTy &ED,
const ExecutionDomainTy &PredED) {
2989 for (
auto *EA : PredED.EncounteredAssumes)
2990 ED.addAssumeInst(
A, *EA);
2992 for (
auto *AB : PredED.AlignedBarriers)
2993 ED.addAlignedBarrier(
A, *AB);
2996bool AAExecutionDomainFunction::mergeInPredecessor(
2997 Attributor &
A, ExecutionDomainTy &ED,
const ExecutionDomainTy &PredED,
2998 bool InitialEdgeOnly) {
3000 bool Changed =
false;
3002 setAndRecord(ED.IsExecutedByInitialThreadOnly,
3003 InitialEdgeOnly || (PredED.IsExecutedByInitialThreadOnly &&
3004 ED.IsExecutedByInitialThreadOnly));
3006 Changed |= setAndRecord(ED.IsReachedFromAlignedBarrierOnly,
3007 ED.IsReachedFromAlignedBarrierOnly &&
3008 PredED.IsReachedFromAlignedBarrierOnly);
3009 Changed |= setAndRecord(ED.EncounteredNonLocalSideEffect,
3010 ED.EncounteredNonLocalSideEffect |
3011 PredED.EncounteredNonLocalSideEffect);
3013 if (ED.IsReachedFromAlignedBarrierOnly)
3014 mergeInPredecessorBarriersAndAssumptions(
A, ED, PredED);
3016 ED.clearAssumeInstAndAlignedBarriers();
3020bool AAExecutionDomainFunction::handleCallees(
Attributor &
A,
3021 ExecutionDomainTy &EntryBBED) {
3026 DepClassTy::OPTIONAL);
3027 if (!EDAA || !EDAA->getState().isValidState())
3030 EDAA->getExecutionDomain(*cast<CallBase>(ACS.getInstruction())));
3034 ExecutionDomainTy ExitED;
3035 bool AllCallSitesKnown;
3036 if (
A.checkForAllCallSites(PredForCallSite, *
this,
3038 AllCallSitesKnown)) {
3039 for (
const auto &[CSInED, CSOutED] : CallSiteEDs) {
3040 mergeInPredecessor(
A, EntryBBED, CSInED);
3041 ExitED.IsReachingAlignedBarrierOnly &=
3042 CSOutED.IsReachingAlignedBarrierOnly;
3049 EntryBBED.IsExecutedByInitialThreadOnly =
false;
3050 EntryBBED.IsReachedFromAlignedBarrierOnly =
true;
3051 EntryBBED.EncounteredNonLocalSideEffect =
false;
3052 ExitED.IsReachingAlignedBarrierOnly =
false;
3054 EntryBBED.IsExecutedByInitialThreadOnly =
false;
3055 EntryBBED.IsReachedFromAlignedBarrierOnly =
false;
3056 EntryBBED.EncounteredNonLocalSideEffect =
true;
3057 ExitED.IsReachingAlignedBarrierOnly =
false;
3061 bool Changed =
false;
3062 auto &FnED = BEDMap[
nullptr];
3063 Changed |= setAndRecord(FnED.IsReachedFromAlignedBarrierOnly,
3064 FnED.IsReachedFromAlignedBarrierOnly &
3065 EntryBBED.IsReachedFromAlignedBarrierOnly);
3066 Changed |= setAndRecord(FnED.IsReachingAlignedBarrierOnly,
3067 FnED.IsReachingAlignedBarrierOnly &
3068 ExitED.IsReachingAlignedBarrierOnly);
3069 Changed |= setAndRecord(FnED.IsExecutedByInitialThreadOnly,
3070 EntryBBED.IsExecutedByInitialThreadOnly);
3076 bool Changed =
false;
3081 auto HandleAlignedBarrier = [&](
CallBase &CB, ExecutionDomainTy &ED) {
3082 Changed |= AlignedBarriers.insert(&CB);
3084 auto &CallInED = CEDMap[{&CB, PRE}];
3085 Changed |= mergeInPredecessor(
A, CallInED, ED);
3086 CallInED.IsReachingAlignedBarrierOnly =
true;
3088 ED.EncounteredNonLocalSideEffect =
false;
3089 ED.IsReachedFromAlignedBarrierOnly =
true;
3091 ED.clearAssumeInstAndAlignedBarriers();
3092 ED.addAlignedBarrier(
A, CB);
3093 auto &CallOutED = CEDMap[{&CB, POST}];
3094 Changed |= mergeInPredecessor(
A, CallOutED, ED);
3098 A.getAAFor<
AAIsDead>(*
this, getIRPosition(), DepClassTy::OPTIONAL);
3105 for (
auto &RIt : *RPOT) {
3108 bool IsEntryBB = &BB == &EntryBB;
3111 bool AlignedBarrierLastInBlock = IsEntryBB && IsKernel;
3112 bool IsExplicitlyAligned = IsEntryBB && IsKernel;
3113 ExecutionDomainTy ED;
3116 Changed |= handleCallees(
A, ED);
3120 if (LivenessAA && LivenessAA->isAssumedDead(&BB))
3124 if (LivenessAA && LivenessAA->isEdgeDead(PredBB, &BB))
3126 bool InitialEdgeOnly = isInitialThreadOnlyEdge(
3127 A, dyn_cast<BranchInst>(PredBB->getTerminator()), BB);
3128 mergeInPredecessor(
A, ED, BEDMap[PredBB], InitialEdgeOnly);
3135 bool UsedAssumedInformation;
3136 if (
A.isAssumedDead(
I, *
this, LivenessAA, UsedAssumedInformation,
3137 false, DepClassTy::OPTIONAL,
3143 if (
auto *
II = dyn_cast<IntrinsicInst>(&
I)) {
3144 if (
auto *AI = dyn_cast_or_null<AssumeInst>(
II)) {
3145 ED.addAssumeInst(
A, *AI);
3149 if (
II->isAssumeLikeIntrinsic())
3153 if (
auto *FI = dyn_cast<FenceInst>(&
I)) {
3154 if (!ED.EncounteredNonLocalSideEffect) {
3156 if (ED.IsReachedFromAlignedBarrierOnly)
3161 case AtomicOrdering::NotAtomic:
3163 case AtomicOrdering::Unordered:
3165 case AtomicOrdering::Monotonic:
3167 case AtomicOrdering::Acquire:
3169 case AtomicOrdering::Release:
3171 case AtomicOrdering::AcquireRelease:
3173 case AtomicOrdering::SequentiallyConsistent:
3177 NonNoOpFences.insert(FI);
3180 auto *CB = dyn_cast<CallBase>(&
I);
3182 bool IsAlignedBarrier =
3186 AlignedBarrierLastInBlock &= IsNoSync;
3187 IsExplicitlyAligned &= IsNoSync;
3193 if (IsAlignedBarrier) {
3194 HandleAlignedBarrier(*CB, ED);
3195 AlignedBarrierLastInBlock =
true;
3196 IsExplicitlyAligned =
true;
3201 if (isa<MemIntrinsic>(&
I)) {
3202 if (!ED.EncounteredNonLocalSideEffect &&
3204 ED.EncounteredNonLocalSideEffect =
true;
3206 ED.IsReachedFromAlignedBarrierOnly =
false;
3214 auto &CallInED = CEDMap[{CB, PRE}];
3215 Changed |= mergeInPredecessor(
A, CallInED, ED);
3221 if (!IsNoSync && Callee && !
Callee->isDeclaration()) {
3224 if (EDAA && EDAA->getState().isValidState()) {
3227 CalleeED.IsReachedFromAlignedBarrierOnly;
3228 AlignedBarrierLastInBlock = ED.IsReachedFromAlignedBarrierOnly;
3229 if (IsNoSync || !CalleeED.IsReachedFromAlignedBarrierOnly)
3230 ED.EncounteredNonLocalSideEffect |=
3231 CalleeED.EncounteredNonLocalSideEffect;
3233 ED.EncounteredNonLocalSideEffect =
3234 CalleeED.EncounteredNonLocalSideEffect;
3235 if (!CalleeED.IsReachingAlignedBarrierOnly) {
3237 setAndRecord(CallInED.IsReachingAlignedBarrierOnly,
false);
3240 if (CalleeED.IsReachedFromAlignedBarrierOnly)
3241 mergeInPredecessorBarriersAndAssumptions(
A, ED, CalleeED);
3242 auto &CallOutED = CEDMap[{CB, POST}];
3243 Changed |= mergeInPredecessor(
A, CallOutED, ED);
3248 ED.IsReachedFromAlignedBarrierOnly =
false;
3249 Changed |= setAndRecord(CallInED.IsReachingAlignedBarrierOnly,
false);
3252 AlignedBarrierLastInBlock &= ED.IsReachedFromAlignedBarrierOnly;
3254 auto &CallOutED = CEDMap[{CB, POST}];
3255 Changed |= mergeInPredecessor(
A, CallOutED, ED);
3258 if (!
I.mayHaveSideEffects() && !
I.mayReadFromMemory())
3272 if (MemAA && MemAA->getState().isValidState() &&
3273 MemAA->checkForAllAccessesToMemoryKind(
3278 auto &InfoCache =
A.getInfoCache();
3279 if (!
I.mayHaveSideEffects() && InfoCache.isOnlyUsedByAssume(
I))
3282 if (
auto *LI = dyn_cast<LoadInst>(&
I))
3283 if (LI->hasMetadata(LLVMContext::MD_invariant_load))
3286 if (!ED.EncounteredNonLocalSideEffect &&
3288 ED.EncounteredNonLocalSideEffect =
true;
3291 bool IsEndAndNotReachingAlignedBarriersOnly =
false;
3292 if (!isa<UnreachableInst>(BB.getTerminator()) &&
3293 !BB.getTerminator()->getNumSuccessors()) {
3295 Changed |= mergeInPredecessor(
A, InterProceduralED, ED);
3297 auto &FnED = BEDMap[
nullptr];
3298 if (IsKernel && !IsExplicitlyAligned)
3299 FnED.IsReachingAlignedBarrierOnly =
false;
3300 Changed |= mergeInPredecessor(
A, FnED, ED);
3302 if (!FnED.IsReachingAlignedBarrierOnly) {
3303 IsEndAndNotReachingAlignedBarriersOnly =
true;
3304 SyncInstWorklist.
push_back(BB.getTerminator());
3305 auto &BBED = BEDMap[&BB];
3306 Changed |= setAndRecord(BBED.IsReachingAlignedBarrierOnly,
false);
3310 ExecutionDomainTy &StoredED = BEDMap[&BB];
3311 ED.IsReachingAlignedBarrierOnly = StoredED.IsReachingAlignedBarrierOnly &
3312 !IsEndAndNotReachingAlignedBarriersOnly;
3318 if (ED.IsExecutedByInitialThreadOnly !=
3319 StoredED.IsExecutedByInitialThreadOnly ||
3320 ED.IsReachedFromAlignedBarrierOnly !=
3321 StoredED.IsReachedFromAlignedBarrierOnly ||
3322 ED.EncounteredNonLocalSideEffect !=
3323 StoredED.EncounteredNonLocalSideEffect)
3327 StoredED = std::move(ED);
3333 while (!SyncInstWorklist.
empty()) {
3336 bool HitAlignedBarrierOrKnownEnd =
false;
3338 auto *CB = dyn_cast<CallBase>(CurInst);
3341 auto &CallOutED = CEDMap[{CB, POST}];
3342 Changed |= setAndRecord(CallOutED.IsReachingAlignedBarrierOnly,
false);
3343 auto &CallInED = CEDMap[{CB, PRE}];
3344 HitAlignedBarrierOrKnownEnd =
3345 AlignedBarriers.count(CB) || !CallInED.IsReachingAlignedBarrierOnly;
3346 if (HitAlignedBarrierOrKnownEnd)
3348 Changed |= setAndRecord(CallInED.IsReachingAlignedBarrierOnly,
false);
3350 if (HitAlignedBarrierOrKnownEnd)
3354 if (LivenessAA && LivenessAA->isEdgeDead(PredBB, SyncBB))
3356 if (!Visited.
insert(PredBB))
3358 auto &PredED = BEDMap[PredBB];
3359 if (setAndRecord(PredED.IsReachingAlignedBarrierOnly,
false)) {
3361 SyncInstWorklist.
push_back(PredBB->getTerminator());
3364 if (SyncBB != &EntryBB)
3367 setAndRecord(InterProceduralED.IsReachingAlignedBarrierOnly,
false);
  return Changed ? ChangeStatus::CHANGED : ChangeStatus::UNCHANGED;
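// Net effect of the execution-domain analysis above, as a hedged sketch: a
// barrier that is only reached from other aligned barriers, with no
// intervening non-local side effects, is redundant and can be dropped.
//
//   call void @__kmpc_barrier_simple_spmd(ptr @loc, i32 %tid)
//   ; ... only thread-local code ...
//   call void @__kmpc_barrier_simple_spmd(ptr @loc, i32 %tid) ; eliminated,
//                                                  ; ++NumBarriersEliminated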
3375struct AAHeapToShared :
public StateWrapper<BooleanState, AbstractAttribute> {
3380 static AAHeapToShared &createForPosition(
const IRPosition &IRP,
3384 virtual bool isAssumedHeapToShared(
CallBase &CB)
const = 0;
3388 virtual bool isAssumedHeapToSharedRemovedFree(
CallBase &CB)
const = 0;
3391 const std::string
getName()
const override {
return "AAHeapToShared"; }
3394 const char *getIdAddr()
const override {
return &
ID; }
3403 static const char ID;
3406struct AAHeapToSharedFunction :
public AAHeapToShared {
3408 : AAHeapToShared(IRP,
A) {}
3410 const std::string getAsStr(
Attributor *)
const override {
3411 return "[AAHeapToShared] " + std::to_string(MallocCalls.size()) +
3412 " malloc calls eligible.";
3416 void trackStatistics()
const override {}
3420 void findPotentialRemovedFreeCalls(
Attributor &
A) {
3421 auto &OMPInfoCache =
static_cast<OMPInformationCache &
>(
A.getInfoCache());
3422 auto &FreeRFI = OMPInfoCache.RFIs[OMPRTL___kmpc_free_shared];
3424 PotentialRemovedFreeCalls.clear();
3428 for (
auto *U : CB->
users()) {
3430 if (
C &&
C->getCalledFunction() == FreeRFI.Declaration)
3434 if (FreeCalls.
size() != 1)
3437 PotentialRemovedFreeCalls.insert(FreeCalls.
front());
3443 indicatePessimisticFixpoint();
3447 auto &OMPInfoCache =
static_cast<OMPInformationCache &
>(
A.getInfoCache());
3448 auto &RFI = OMPInfoCache.RFIs[OMPRTL___kmpc_alloc_shared];
3449 if (!RFI.Declaration)
3454 bool &) -> std::optional<Value *> {
return nullptr; };
3457 for (
User *U : RFI.Declaration->
users())
3458 if (
CallBase *CB = dyn_cast<CallBase>(U)) {
3461 MallocCalls.insert(CB);
3466 findPotentialRemovedFreeCalls(
A);
3469 bool isAssumedHeapToShared(
CallBase &CB)
const override {
3470 return isValidState() && MallocCalls.count(&CB);
3473 bool isAssumedHeapToSharedRemovedFree(
CallBase &CB)
const override {
3474 return isValidState() && PotentialRemovedFreeCalls.count(&CB);
3478 if (MallocCalls.empty())
3479 return ChangeStatus::UNCHANGED;
3481 auto &OMPInfoCache =
static_cast<OMPInformationCache &
>(
A.getInfoCache());
3482 auto &FreeCall = OMPInfoCache.RFIs[OMPRTL___kmpc_free_shared];
3486 DepClassTy::OPTIONAL);
3491 if (HS &&
HS->isAssumedHeapToStack(*CB))
3496 for (
auto *U : CB->
users()) {
3498 if (
C &&
C->getCalledFunction() == FreeCall.Declaration)
3501 if (FreeCalls.
size() != 1)
3508 <<
" with shared memory."
3509 <<
" Shared memory usage is limited to "
3515 <<
" with " << AllocSize->getZExtValue()
3516 <<
" bytes of shared memory\n");
3522 Type *Int8ArrTy = ArrayType::get(Int8Ty, AllocSize->getZExtValue());
3527 static_cast<unsigned>(AddressSpace::Shared));
3529 SharedMem, PointerType::getUnqual(
M->getContext()));
3532 return OR <<
"Replaced globalized variable with "
3533 <<
ore::NV(
"SharedMemory", AllocSize->getZExtValue())
3534 << (AllocSize->isOne() ?
" byte " :
" bytes ")
3535 <<
"of shared memory.";
3541 "HeapToShared on allocation without alignment attribute");
3542 SharedMem->setAlignment(*Alignment);
3545 A.deleteAfterManifest(*CB);
3546 A.deleteAfterManifest(*FreeCalls.
front());
3548 SharedMemoryUsed += AllocSize->getZExtValue();
3549 NumBytesMovedToSharedMemory = SharedMemoryUsed;
3550 Changed = ChangeStatus::CHANGED;
3557 if (MallocCalls.empty())
3558 return indicatePessimisticFixpoint();
3559 auto &OMPInfoCache =
static_cast<OMPInformationCache &
>(
A.getInfoCache());
3560 auto &RFI = OMPInfoCache.RFIs[OMPRTL___kmpc_alloc_shared];
3561 if (!RFI.Declaration)
3562 return ChangeStatus::UNCHANGED;
3566 auto NumMallocCalls = MallocCalls.size();
3569 for (
User *U : RFI.Declaration->
users()) {
3570 if (
CallBase *CB = dyn_cast<CallBase>(U)) {
3571 if (CB->getCaller() !=
F)
3573 if (!MallocCalls.count(CB))
3575 if (!isa<ConstantInt>(CB->getArgOperand(0))) {
3576 MallocCalls.remove(CB);
3581 if (!ED || !ED->isExecutedByInitialThreadOnly(*CB))
3582 MallocCalls.remove(CB);
3586 findPotentialRemovedFreeCalls(
A);
3588 if (NumMallocCalls != MallocCalls.size())
3589 return ChangeStatus::CHANGED;
3591 return ChangeStatus::UNCHANGED;
  unsigned SharedMemoryUsed = 0;
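  // The AAHeapToShared transformation above, as a hedged IR-level sketch: an
  // __kmpc_alloc_shared with a constant size, allocated by the initial thread
  // and freed exactly once, is replaced by a static buffer in GPU shared
  // memory (addrspace(3)), and the paired __kmpc_free_shared is deleted.
  //
  //   %p = call ptr @__kmpc_alloc_shared(i64 16)
  //   ...
  //   call void @__kmpc_free_shared(ptr %p, i64 16)
  //     ; becomes
  //   @x_shared = internal addrspace(3) global [16 x i8] poison
  //   ; uses of %p are rewired to an address-space cast of @x_shared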
3602struct AAKernelInfo :
public StateWrapper<KernelInfoState, AbstractAttribute> {
3608 static bool requiresCalleeForCallBase() {
return false; }
3611 void trackStatistics()
const override {}
3614 const std::string getAsStr(
Attributor *)
const override {
3615 if (!isValidState())
3617 return std::string(SPMDCompatibilityTracker.isAssumed() ?
"SPMD"
3619 std::string(SPMDCompatibilityTracker.isAtFixpoint() ?
" [FIX]"
3621 std::string(
" #PRs: ") +
3622 (ReachedKnownParallelRegions.isValidState()
3623 ? std::to_string(ReachedKnownParallelRegions.size())
3625 ", #Unknown PRs: " +
3626 (ReachedUnknownParallelRegions.isValidState()
3629 ", #Reaching Kernels: " +
3630 (ReachingKernelEntries.isValidState()
3634 (ParallelLevels.isValidState()
3637 ", NestedPar: " + (NestedParallelism ?
"yes" :
"no");
3644 const std::string
getName()
const override {
return "AAKernelInfo"; }
3647 const char *getIdAddr()
const override {
return &
ID; }
3654 static const char ID;
3659struct AAKernelInfoFunction : AAKernelInfo {
3661 : AAKernelInfo(IRP,
A) {}
3666 return GuardedInstructions;
3669 void setConfigurationOfKernelEnvironment(
ConstantStruct *ConfigC) {
3671 KernelEnvC, ConfigC, {KernelInfo::ConfigurationIdx});
3672 assert(NewKernelEnvC &&
"Failed to create new kernel environment");
3673 KernelEnvC = cast<ConstantStruct>(NewKernelEnvC);
3676#define KERNEL_ENVIRONMENT_CONFIGURATION_SETTER(MEMBER) \
3677 void set##MEMBER##OfKernelEnvironment(ConstantInt *NewVal) { \
3678 ConstantStruct *ConfigC = \
3679 KernelInfo::getConfigurationFromKernelEnvironment(KernelEnvC); \
3680 Constant *NewConfigC = ConstantFoldInsertValueInstruction( \
3681 ConfigC, NewVal, {KernelInfo::MEMBER##Idx}); \
3682 assert(NewConfigC && "Failed to create new configuration environment"); \
3683 setConfigurationOfKernelEnvironment(cast<ConstantStruct>(NewConfigC)); \
3694#undef KERNEL_ENVIRONMENT_CONFIGURATION_SETTER
 3701 auto &OMPInfoCache = static_cast<OMPInformationCache &>(A.getInfoCache());
3705 OMPInformationCache::RuntimeFunctionInfo &InitRFI =
3706 OMPInfoCache.RFIs[OMPRTL___kmpc_target_init];
3707 OMPInformationCache::RuntimeFunctionInfo &DeinitRFI =
3708 OMPInfoCache.RFIs[OMPRTL___kmpc_target_deinit];
 3712 auto StoreCallBase = [](Use &U,
3713 OMPInformationCache::RuntimeFunctionInfo &RFI,
3715 CallBase *CB = OpenMPOpt::getCallIfRegularCall(U, &RFI);
3717 "Unexpected use of __kmpc_target_init or __kmpc_target_deinit!");
3719 "Multiple uses of __kmpc_target_init or __kmpc_target_deinit!");
3725 StoreCallBase(U, InitRFI, KernelInitCB);
3729 DeinitRFI.foreachUse(
3731 StoreCallBase(U, DeinitRFI, KernelDeinitCB);
3737 if (!KernelInitCB || !KernelDeinitCB)
3741 ReachingKernelEntries.insert(Fn);
 3742 IsKernelEntry = true;
3750 KernelConfigurationSimplifyCB =
3752 bool &UsedAssumedInformation) -> std::optional<Constant *> {
3753 if (!isAtFixpoint()) {
 3756 UsedAssumedInformation = true;
 3757 A.recordDependence(*this, *AA, DepClassTy::OPTIONAL);
3762 A.registerGlobalVariableSimplificationCallback(
3763 *KernelEnvGV, KernelConfigurationSimplifyCB);
3766 bool CanChangeToSPMD = OMPInfoCache.runtimeFnsAvailable(
3767 {OMPRTL___kmpc_get_hardware_thread_id_in_block,
3768 OMPRTL___kmpc_barrier_simple_spmd});
3772 KernelInfo::getExecModeFromKernelEnvironment(KernelEnvC);
3777 SPMDCompatibilityTracker.indicateOptimisticFixpoint();
3781 SPMDCompatibilityTracker.indicatePessimisticFixpoint();
3783 setExecModeOfKernelEnvironment(AssumedExecModeC);
3790 setMinThreadsOfKernelEnvironment(ConstantInt::get(Int32Ty, MinThreads));
 3792 setMaxThreadsOfKernelEnvironment(ConstantInt::get(Int32Ty, MaxThreads));
3793 auto [MinTeams, MaxTeams] =
3796 setMinTeamsOfKernelEnvironment(ConstantInt::get(Int32Ty, MinTeams));
3798 setMaxTeamsOfKernelEnvironment(ConstantInt::get(Int32Ty, MaxTeams));
3801 KernelInfo::getMayUseNestedParallelismFromKernelEnvironment(KernelEnvC);
3802 ConstantInt *AssumedMayUseNestedParallelismC = ConstantInt::get(
3804 setMayUseNestedParallelismOfKernelEnvironment(
3805 AssumedMayUseNestedParallelismC);
3809 KernelInfo::getUseGenericStateMachineFromKernelEnvironment(
 3812 ConstantInt::get(UseGenericStateMachineC->getIntegerType(), false);
3813 setUseGenericStateMachineOfKernelEnvironment(
3814 AssumedUseGenericStateMachineC);
3820 if (!OMPInfoCache.RFIs[RFKind].Declaration)
3822 A.registerVirtualUseCallback(*OMPInfoCache.RFIs[RFKind].Declaration, CB);
 3826 auto AddDependence = [](Attributor &A, const AAKernelInfo *KI,
3829 A.recordDependence(*KI, *QueryingAA, DepClassTy::OPTIONAL);
3843 if (SPMDCompatibilityTracker.isValidState())
 3844 return AddDependence(A, this, QueryingAA);
 3846 if (!ReachedKnownParallelRegions.isValidState())
 3847 return AddDependence(A, this, QueryingAA);
3853 RegisterVirtualUse(OMPRTL___kmpc_get_hardware_num_threads_in_block,
3854 CustomStateMachineUseCB);
3855 RegisterVirtualUse(OMPRTL___kmpc_get_warp_size, CustomStateMachineUseCB);
3856 RegisterVirtualUse(OMPRTL___kmpc_barrier_simple_generic,
3857 CustomStateMachineUseCB);
3858 RegisterVirtualUse(OMPRTL___kmpc_kernel_parallel,
3859 CustomStateMachineUseCB);
3860 RegisterVirtualUse(OMPRTL___kmpc_kernel_end_parallel,
3861 CustomStateMachineUseCB);
3865 if (SPMDCompatibilityTracker.isAtFixpoint())
3872 if (!SPMDCompatibilityTracker.isValidState())
 3873 return AddDependence(A, this, QueryingAA);
3876 RegisterVirtualUse(OMPRTL___kmpc_get_hardware_thread_id_in_block,
3885 if (!SPMDCompatibilityTracker.isValidState())
 3886 return AddDependence(A, this, QueryingAA);
 3887 if (SPMDCompatibilityTracker.empty())
 3888 return AddDependence(A, this, QueryingAA);
 3889 if (!mayContainParallelRegion())
 3890 return AddDependence(A, this, QueryingAA);
3893 RegisterVirtualUse(OMPRTL___kmpc_barrier_simple_spmd, SPMDBarrierUseCB);
3897 static std::string sanitizeForGlobalName(std::string S) {
 3901 return !((C >= 'a' && C <= 'z') || (C >= 'A' && C <= 'Z') ||
 3902 (C >= '0' && C <= '9') || C == '_');
3913 if (!KernelInitCB || !KernelDeinitCB)
3914 return ChangeStatus::UNCHANGED;
 3918 bool HasBuiltStateMachine = true;
 3919 if (!changeToSPMDMode(A, Changed)) {
 3921 HasBuiltStateMachine = buildCustomStateMachine(A, Changed);
 3923 HasBuiltStateMachine = false;
3930 KernelInfo::getUseGenericStateMachineFromKernelEnvironment(
3931 ExistingKernelEnvC);
3932 if (!HasBuiltStateMachine)
3933 setUseGenericStateMachineOfKernelEnvironment(
3934 OldUseGenericStateMachineVal);
3941 Changed = ChangeStatus::CHANGED;
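/// Guard non-SPMD-compatible instructions after SPMD-ization: only the thread
/// with id 0 executes each guarded region, the other threads wait at a simple
/// SPMD barrier, and values used outside the region are broadcast via shared
/// memory.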
 3947 void insertInstructionGuardsHelper(Attributor &A) {
 3948 auto &OMPInfoCache = static_cast<OMPInformationCache &>(A.getInfoCache());
 3950 auto CreateGuardedRegion = [&](Instruction *RegionStartI,
 3984 DT, LI, MSU, "region.guarded.end");
 3987 MSU, "region.barrier");
 3990 DT, LI, MSU, "region.exit");
 3992 SplitBlock(ParentBB, RegionStartI, DT, LI, MSU, "region.guarded");
 3995 "Expected a different CFG");
 3998 ParentBB, ParentBB->getTerminator(), DT, LI, MSU, "region.check.tid");
4001 A.registerManifestAddedBasicBlock(*RegionEndBB);
4002 A.registerManifestAddedBasicBlock(*RegionBarrierBB);
4003 A.registerManifestAddedBasicBlock(*RegionExitBB);
4004 A.registerManifestAddedBasicBlock(*RegionStartBB);
4005 A.registerManifestAddedBasicBlock(*RegionCheckTidBB);
 4007 bool HasBroadcastValues = false;
 4012 for (Use &U : I.uses()) {
 4018 if (OutsideUses.empty())
 4021 HasBroadcastValues = true;
 4026 M, I.getType(), false,
 4028 sanitizeForGlobalName(
 4029 (I.getName() + ".guarded.output.alloc").str()),
 4031 static_cast<unsigned>(AddressSpace::Shared));
 4038 I.getType(), SharedMem, I.getName() + ".guarded.output.load",
 4042 for (Use *U : OutsideUses)
 4043 A.changeUseAfterManifest(*U, *LoadI);
 4046 auto &OMPInfoCache = static_cast<OMPInformationCache &>(A.getInfoCache());
 4052 InsertPointTy(ParentBB, ParentBB->end()), DL);
4053 OMPInfoCache.OMPBuilder.updateToLocation(Loc);
4056 OMPInfoCache.OMPBuilder.getOrCreateSrcLocStr(Loc, SrcLocStrSize);
4058 OMPInfoCache.OMPBuilder.getOrCreateIdent(SrcLocStr, SrcLocStrSize);
 4064 InsertPointTy(RegionCheckTidBB, RegionCheckTidBB->end()), DL);
4065 OMPInfoCache.OMPBuilder.updateToLocation(LocRegionCheckTid);
4067 OMPInfoCache.OMPBuilder.getOrCreateRuntimeFunction(
4068 M, OMPRTL___kmpc_get_hardware_thread_id_in_block);
4070 OMPInfoCache.OMPBuilder.Builder.CreateCall(HardwareTidFn, {});
4072 OMPInfoCache.setCallingConvention(HardwareTidFn, Tid);
4073 Value *TidCheck = OMPInfoCache.OMPBuilder.Builder.CreateIsNull(Tid);
4074 OMPInfoCache.OMPBuilder.Builder
4075 .CreateCondBr(TidCheck, RegionStartBB, RegionBarrierBB)
4081 OMPInfoCache.OMPBuilder.getOrCreateRuntimeFunction(
4082 M, OMPRTL___kmpc_barrier_simple_spmd);
4083 OMPInfoCache.OMPBuilder.updateToLocation(InsertPointTy(
4086 OMPInfoCache.OMPBuilder.Builder.CreateCall(BarrierFn, {Ident, Tid});
4088 OMPInfoCache.setCallingConvention(BarrierFn, Barrier);
4091 if (HasBroadcastValues) {
4096 OMPInfoCache.setCallingConvention(BarrierFn, Barrier);
4100 auto &AllocSharedRFI = OMPInfoCache.RFIs[OMPRTL___kmpc_alloc_shared];
 4102 for (Instruction *GuardedI : SPMDCompatibilityTracker) {
 4104 if (!Visited.insert(BB).second)
 4110 while (++IP != IPEnd) {
 4111 if (!IP->mayHaveSideEffects() && !IP->mayReadFromMemory())
 4114 if (OpenMPOpt::getCallIfRegularCall(*I, &AllocSharedRFI))
 4116 if (!I->user_empty() || !SPMDCompatibilityTracker.contains(I)) {
 4117 LastEffect = nullptr;
 4124 for (auto &Reorder : Reorders)
 4130 for (Instruction *GuardedI : SPMDCompatibilityTracker) {
 4132 auto *CalleeAA = A.lookupAAFor<AAKernelInfo>(
 4135 assert(CalleeAA != nullptr && "Expected Callee AAKernelInfo");
 4136 auto &CalleeAAFunction = *cast<AAKernelInfoFunction>(CalleeAA);
 4138 if (CalleeAAFunction.getGuardedInstructions().contains(GuardedI))
 4141 Instruction *GuardedRegionStart = nullptr, *GuardedRegionEnd = nullptr;
 4145 if (SPMDCompatibilityTracker.contains(&I)) {
 4146 CalleeAAFunction.getGuardedInstructions().insert(&I);
 4147 if (GuardedRegionStart)
 4148 GuardedRegionEnd = &I;
 4150 GuardedRegionStart = GuardedRegionEnd = &I;
 4157 if (GuardedRegionStart) {
 4159 std::make_pair(GuardedRegionStart, GuardedRegionEnd));
 4160 GuardedRegionStart = nullptr;
 4161 GuardedRegionEnd = nullptr;
 4166 for (auto &GR : GuardedRegions)
4167 CreateGuardedRegion(GR.first, GR.second);
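/// Used when the kernel has no parallel region to guard: route every thread
/// except the main one around the (former generic-mode) kernel body so that a
/// single thread per workgroup executes the user code.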
 4170 void forceSingleThreadPerWorkgroupHelper(Attributor &A) {
4179 auto &Ctx = getAnchorValue().getContext();
 4186 KernelInitCB->getNextNode(), "main.thread.user_code");
4191 A.registerManifestAddedBasicBlock(*InitBB);
4192 A.registerManifestAddedBasicBlock(*UserCodeBB);
4193 A.registerManifestAddedBasicBlock(*ReturnBB);
 4202 auto &OMPInfoCache = static_cast<OMPInformationCache &>(A.getInfoCache());
4204 OMPInfoCache.OMPBuilder.getOrCreateRuntimeFunction(
4205 M, OMPRTL___kmpc_get_hardware_thread_id_in_block);
4210 OMPInfoCache.setCallingConvention(ThreadIdInBlockFn, ThreadIdInBlock);
 4216 ConstantInt::get(ThreadIdInBlock->getType(), 0),
4217 "thread.is_main", InitBB);
 4223 auto &OMPInfoCache = static_cast<OMPInformationCache &>(A.getInfoCache());
4225 if (!SPMDCompatibilityTracker.isAssumed()) {
 4226 for (Instruction *NonCompatibleI : SPMDCompatibilityTracker) {
 4227 if (!NonCompatibleI)
 4231 if (auto *CB = dyn_cast<CallBase>(NonCompatibleI))
 4232 if (OMPInfoCache.RTLFunctions.contains(CB->getCalledFunction()))
 4236 ORA << "Value has potential side effects preventing SPMD-mode "
 4238 if (isa<CallBase>(NonCompatibleI)) {
 4239 ORA << ". Add `[[omp::assume(\"ompx_spmd_amenable\")]]` to "
 4240 "the called function to override";
 4248 << *NonCompatibleI << "\n");
4260 Kernel = CB->getCaller();
4268 KernelInfo::getExecModeFromKernelEnvironment(ExistingKernelEnvC);
4274 Changed = ChangeStatus::CHANGED;
4278 if (mayContainParallelRegion())
 4279 insertInstructionGuardsHelper(A);
 4281 forceSingleThreadPerWorkgroupHelper(A);
4286 "Initially non-SPMD kernel has SPMD exec mode!");
4287 setExecModeOfKernelEnvironment(
4291 ++NumOpenMPTargetRegionKernelsSPMD;
 4294 return OR << "Transformed generic-mode kernel to SPMD-mode.";
4306 if (!ReachedKnownParallelRegions.isValidState())
 4309 auto &OMPInfoCache = static_cast<OMPInformationCache &>(A.getInfoCache());
4310 if (!OMPInfoCache.runtimeFnsAvailable(
4311 {OMPRTL___kmpc_get_hardware_num_threads_in_block,
4312 OMPRTL___kmpc_get_warp_size, OMPRTL___kmpc_barrier_simple_generic,
4313 OMPRTL___kmpc_kernel_parallel, OMPRTL___kmpc_kernel_end_parallel}))
4324 KernelInfo::getUseGenericStateMachineFromKernelEnvironment(
4325 ExistingKernelEnvC);
4327 KernelInfo::getExecModeFromKernelEnvironment(ExistingKernelEnvC);
 4332 if (UseStateMachineC->isZero() ||
4336 Changed = ChangeStatus::CHANGED;
4339 setUseGenericStateMachineOfKernelEnvironment(
4346 if (!mayContainParallelRegion()) {
4347 ++NumOpenMPTargetRegionKernelsWithoutStateMachine;
 4350 return OR << "Removing unused state machine from generic-mode kernel.";
4358 if (ReachedUnknownParallelRegions.empty()) {
4359 ++NumOpenMPTargetRegionKernelsCustomStateMachineWithoutFallback;
 4362 return OR << "Rewriting generic-mode kernel with a customized state "
4367 ++NumOpenMPTargetRegionKernelsCustomStateMachineWithFallback;
 4370 return OR << "Generic-mode kernel is executed with a customized state "
4371 "machine that requires a fallback.";
 4376 for (CallBase *UnknownParallelRegionCB : ReachedUnknownParallelRegions) {
 4377 if (!UnknownParallelRegionCB)
 4380 return ORA << "Call may contain unknown parallel regions. Use "
 4381 << "`[[omp::assume(\"omp_no_parallelism\")]]` to "
4419 auto &Ctx = getAnchorValue().getContext();
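// Sketch of the worker state machine the code below emits (simplified; block
// names match the BasicBlocks created here, exact bounds computation assumed):
//
//   IsWorker = (KernelInitCB != -1);
//   if (IsWorker) {
//     if (KernelInitCB >= BlockHwSize - WarpSize) return;   // excess hardware threads
//     do {                                                  // StateMachineBeginBB
//       __kmpc_barrier_simple_generic(...);
//       void *WorkFn;
//       bool IsActive = __kmpc_kernel_parallel(&WorkFn);
//       if (!WorkFn) return;                                // StateMachineFinishedBB
//       if (IsActive) {                                     // StateMachineIsActiveCheckBB
//         if (WorkFn == <ParallelRegion0>) ParallelRegion0(...);
//         else if (WorkFn == <ParallelRegion1>) ParallelRegion1(...);
//         else ((WorkFnTy *)WorkFn)(...);                   // fallback, only if unknown regions remain
//         __kmpc_kernel_end_parallel(...);                  // StateMachineEndParallelBB
//       }
//       __kmpc_barrier_simple_generic(...);                 // StateMachineDoneBarrierBB
//     } while (true);
//   }
//   // The main thread falls through to the user code.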
4423 BasicBlock *InitBB = KernelInitCB->getParent();
 4425 KernelInitCB->getNextNode(), "thread.user_code.check");
 4429 Ctx, "worker_state_machine.begin", Kernel, UserCodeEntryBB);
 4431 Ctx, "worker_state_machine.finished", Kernel, UserCodeEntryBB);
 4433 Ctx, "worker_state_machine.is_active.check", Kernel, UserCodeEntryBB);
 4436 Kernel, UserCodeEntryBB);
 4439 Kernel, UserCodeEntryBB);
 4441 Ctx, "worker_state_machine.done.barrier", Kernel, UserCodeEntryBB);
4442 A.registerManifestAddedBasicBlock(*InitBB);
4443 A.registerManifestAddedBasicBlock(*UserCodeEntryBB);
4444 A.registerManifestAddedBasicBlock(*IsWorkerCheckBB);
4445 A.registerManifestAddedBasicBlock(*StateMachineBeginBB);
4446 A.registerManifestAddedBasicBlock(*StateMachineFinishedBB);
4447 A.registerManifestAddedBasicBlock(*StateMachineIsActiveCheckBB);
4448 A.registerManifestAddedBasicBlock(*StateMachineIfCascadeCurrentBB);
4449 A.registerManifestAddedBasicBlock(*StateMachineEndParallelBB);
4450 A.registerManifestAddedBasicBlock(*StateMachineDoneBarrierBB);
4452 const DebugLoc &DLoc = KernelInitCB->getDebugLoc();
4458 ConstantInt::get(KernelInitCB->getType(), -1),
4459 "thread.is_worker", InitBB);
4465 OMPInfoCache.OMPBuilder.getOrCreateRuntimeFunction(
4466 M, OMPRTL___kmpc_get_hardware_num_threads_in_block);
4468 OMPInfoCache.OMPBuilder.getOrCreateRuntimeFunction(
4469 M, OMPRTL___kmpc_get_warp_size);
4472 OMPInfoCache.setCallingConvention(BlockHwSizeFn, BlockHwSize);
4476 OMPInfoCache.setCallingConvention(WarpSizeFn, WarpSize);
 4479 BlockHwSize, WarpSize, "block.size", IsWorkerCheckBB);
4483 "thread.is_main_or_worker", IsWorkerCheckBB);
4486 IsMainOrWorker, IsWorkerCheckBB);
4490 Type *VoidPtrTy = PointerType::getUnqual(Ctx);
 4492 new AllocaInst(VoidPtrTy, DL.getAllocaAddrSpace(), nullptr,
4496 OMPInfoCache.OMPBuilder.updateToLocation(
 4499 StateMachineBeginBB->end()),
4502 Value *Ident = KernelInfo::getIdentFromKernelEnvironment(KernelEnvC);
4503 Value *GTid = KernelInitCB;
4506 OMPInfoCache.OMPBuilder.getOrCreateRuntimeFunction(
4507 M, OMPRTL___kmpc_barrier_simple_generic);
4510 OMPInfoCache.setCallingConvention(BarrierFn, Barrier);
 4514 (unsigned int)AddressSpace::Generic) {
 4516 WorkFnAI, PointerType::get(Ctx, (unsigned int)AddressSpace::Generic),
 4517 WorkFnAI->getName() + ".generic", StateMachineBeginBB);
4522 OMPInfoCache.OMPBuilder.getOrCreateRuntimeFunction(
4523 M, OMPRTL___kmpc_kernel_parallel);
 4525 KernelParallelFn, {WorkFnAI}, "worker.is_active", StateMachineBeginBB);
4526 OMPInfoCache.setCallingConvention(KernelParallelFn, IsActiveWorker);
4529 StateMachineBeginBB);
4539 StateMachineBeginBB);
4540 IsDone->setDebugLoc(DLoc);
4542 IsDone, StateMachineBeginBB)
4546 StateMachineDoneBarrierBB, IsActiveWorker,
4547 StateMachineIsActiveCheckBB)
4553 const unsigned int WrapperFunctionArgNo = 6;
 4558 for (int I = 0, E = ReachedKnownParallelRegions.size(); I < E; ++I) {
 4559 auto *CB = ReachedKnownParallelRegions[I];
4560 auto *ParallelRegion = dyn_cast<Function>(
4561 CB->getArgOperand(WrapperFunctionArgNo)->stripPointerCasts());
 4563 Ctx, "worker_state_machine.parallel_region.execute", Kernel,
4564 StateMachineEndParallelBB);
4566 ->setDebugLoc(DLoc);
4572 Kernel, StateMachineEndParallelBB);
4573 A.registerManifestAddedBasicBlock(*PRExecuteBB);
4574 A.registerManifestAddedBasicBlock(*PRNextBB);
 4579 if (I + 1 < E || !ReachedUnknownParallelRegions.empty()) {
4582 "worker.check_parallel_region", StateMachineIfCascadeCurrentBB);
4590 StateMachineIfCascadeCurrentBB)
4592 StateMachineIfCascadeCurrentBB = PRNextBB;
4598 if (!ReachedUnknownParallelRegions.empty()) {
 4599 StateMachineIfCascadeCurrentBB->setName(
4600 "worker_state_machine.parallel_region.fallback.execute");
4602 StateMachineIfCascadeCurrentBB)
4603 ->setDebugLoc(DLoc);
4606 StateMachineIfCascadeCurrentBB)
4610 OMPInfoCache.OMPBuilder.getOrCreateRuntimeFunction(
4611 M, OMPRTL___kmpc_kernel_end_parallel);
4614 OMPInfoCache.setCallingConvention(EndParallelFn, EndParallel);
4620 ->setDebugLoc(DLoc);
4630 KernelInfoState StateBefore = getState();
4636 struct UpdateKernelEnvCRAII {
4637 AAKernelInfoFunction &AA;
4639 UpdateKernelEnvCRAII(AAKernelInfoFunction &AA) : AA(AA) {}
4641 ~UpdateKernelEnvCRAII() {
4648 if (!AA.isValidState()) {
4649 AA.KernelEnvC = ExistingKernelEnvC;
4653 if (!AA.ReachedKnownParallelRegions.isValidState())
4654 AA.setUseGenericStateMachineOfKernelEnvironment(
4655 KernelInfo::getUseGenericStateMachineFromKernelEnvironment(
4656 ExistingKernelEnvC));
4658 if (!AA.SPMDCompatibilityTracker.isValidState())
4659 AA.setExecModeOfKernelEnvironment(
4660 KernelInfo::getExecModeFromKernelEnvironment(ExistingKernelEnvC));
4663 KernelInfo::getMayUseNestedParallelismFromKernelEnvironment(
4665 ConstantInt *NewMayUseNestedParallelismC = ConstantInt::get(
 4666 MayUseNestedParallelismC->getIntegerType(), AA.NestedParallelism);
4667 AA.setMayUseNestedParallelismOfKernelEnvironment(
4668 NewMayUseNestedParallelismC);
 4675 if (isa<CallBase>(I))
 4678 if (!I.mayWriteToMemory())
 4680 if (auto *SI = dyn_cast<StoreInst>(&I)) {
 4683 DepClassTy::OPTIONAL);
 4686 DepClassTy::OPTIONAL);
 4687 if (UnderlyingObjsAA &&
 4688 UnderlyingObjsAA->forallUnderlyingObjects([&](Value &Obj) {
 4689 if (AA::isAssumedThreadLocalObject(A, Obj, *this))
 4693 auto *CB = dyn_cast<CallBase>(&Obj);
 4694 return CB && HS && HS->isAssumedHeapToStack(*CB);
 4700 SPMDCompatibilityTracker.insert(&I);
 4704 bool UsedAssumedInformationInCheckRWInst = false;
 4705 if (!SPMDCompatibilityTracker.isAtFixpoint())
 4706 if (!A.checkForAllReadWriteInstructions(
 4707 CheckRWInst, *this, UsedAssumedInformationInCheckRWInst))
 4710 bool UsedAssumedInformationFromReachingKernels = false;
 4711 if (!IsKernelEntry) {
 4712 updateParallelLevels(A);
 4714 bool AllReachingKernelsKnown = true;
 4715 updateReachingKernelEntries(A, AllReachingKernelsKnown);
4716 UsedAssumedInformationFromReachingKernels = !AllReachingKernelsKnown;
4718 if (!SPMDCompatibilityTracker.empty()) {
4719 if (!ParallelLevels.isValidState())
4720 SPMDCompatibilityTracker.indicatePessimisticFixpoint();
4721 else if (!ReachingKernelEntries.isValidState())
4722 SPMDCompatibilityTracker.indicatePessimisticFixpoint();
 4728 for (auto *Kernel : ReachingKernelEntries) {
 4729 auto *CBAA = A.getAAFor<AAKernelInfo>(
 4731 if (CBAA && CBAA->SPMDCompatibilityTracker.isValidState() &&
 4732 CBAA->SPMDCompatibilityTracker.isAssumed())
 4736 if (!CBAA || !CBAA->SPMDCompatibilityTracker.isAtFixpoint())
 4737 UsedAssumedInformationFromReachingKernels = true;
 4739 if (SPMD != 0 && Generic != 0)
 4740 SPMDCompatibilityTracker.indicatePessimisticFixpoint();
 4746 bool AllParallelRegionStatesWereFixed = true;
 4747 bool AllSPMDStatesWereFixed = true;
 4749 auto &CB = cast<CallBase>(I);
 4750 auto *CBAA = A.getAAFor<AAKernelInfo>(
4754 getState() ^= CBAA->getState();
4755 AllSPMDStatesWereFixed &= CBAA->SPMDCompatibilityTracker.isAtFixpoint();
4756 AllParallelRegionStatesWereFixed &=
4757 CBAA->ReachedKnownParallelRegions.isAtFixpoint();
4758 AllParallelRegionStatesWereFixed &=
4759 CBAA->ReachedUnknownParallelRegions.isAtFixpoint();
 4763 bool UsedAssumedInformationInCheckCallInst = false;
 4764 if (!A.checkForAllCallLikeInstructions(
 4765 CheckCallInst, *this, UsedAssumedInformationInCheckCallInst)) {
 4767 << "Failed to visit all call-like instructions!\n";);
4768 return indicatePessimisticFixpoint();
4773 if (!UsedAssumedInformationInCheckCallInst &&
4774 AllParallelRegionStatesWereFixed) {
4775 ReachedKnownParallelRegions.indicateOptimisticFixpoint();
4776 ReachedUnknownParallelRegions.indicateOptimisticFixpoint();
4781 if (!UsedAssumedInformationInCheckRWInst &&
4782 !UsedAssumedInformationInCheckCallInst &&
4783 !UsedAssumedInformationFromReachingKernels && AllSPMDStatesWereFixed)
4784 SPMDCompatibilityTracker.indicateOptimisticFixpoint();
4786 return StateBefore == getState() ? ChangeStatus::UNCHANGED
4787 : ChangeStatus::CHANGED;
4793 bool &AllReachingKernelsKnown) {
 4797 assert(Caller && "Caller is nullptr");
 4799 auto *CAA = A.getOrCreateAAFor<AAKernelInfo>(
 4801 if (CAA && CAA->ReachingKernelEntries.isValidState()) {
 4802 ReachingKernelEntries ^= CAA->ReachingKernelEntries;
 4808 ReachingKernelEntries.indicatePessimisticFixpoint();
 4813 if (!A.checkForAllCallSites(PredCallSite, *this,
 4815 AllReachingKernelsKnown))
4816 ReachingKernelEntries.indicatePessimisticFixpoint();
 4821 auto &OMPInfoCache = static_cast<OMPInformationCache &>(A.getInfoCache());
4822 OMPInformationCache::RuntimeFunctionInfo &Parallel51RFI =
4823 OMPInfoCache.RFIs[OMPRTL___kmpc_parallel_51];
 4828 assert(Caller && "Caller is nullptr");
4832 if (CAA && CAA->ParallelLevels.isValidState()) {
4838 if (Caller == Parallel51RFI.Declaration) {
4839 ParallelLevels.indicatePessimisticFixpoint();
4843 ParallelLevels ^= CAA->ParallelLevels;
4850 ParallelLevels.indicatePessimisticFixpoint();
 4855 bool AllCallSitesKnown = true;
 4856 if (!A.checkForAllCallSites(PredCallSite, *this,
4859 ParallelLevels.indicatePessimisticFixpoint();
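/// Call-site variant of AAKernelInfo: classifies the callee (known OpenMP
/// runtime function vs. arbitrary call), records reached parallel regions,
/// and flags calls that are incompatible with SPMD execution.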
4866struct AAKernelInfoCallSite : AAKernelInfo {
 4868 : AAKernelInfo(IRP, A) {}
 4872 AAKernelInfo::initialize(A);
 4874 CallBase &CB = cast<CallBase>(getAssociatedValue());
 4879 if (AssumptionAA && AssumptionAA->hasAssumption("ompx_spmd_amenable")) {
 4880 indicateOptimisticFixpoint();
 4888 indicateOptimisticFixpoint();
 4897 auto &OMPInfoCache = static_cast<OMPInformationCache &>(A.getInfoCache());
 4898 const auto &It = OMPInfoCache.RuntimeFunctionIDMap.find(Callee);
 4899 if (It == OMPInfoCache.RuntimeFunctionIDMap.end()) {
 4901 if (!Callee || !A.isFunctionIPOAmendable(*Callee)) {
 4905 if (!AssumptionAA ||
 4906 !(AssumptionAA->hasAssumption("omp_no_openmp") ||
 4907 AssumptionAA->hasAssumption("omp_no_parallelism")))
 4908 ReachedUnknownParallelRegions.insert(&CB);
4912 if (!SPMDCompatibilityTracker.isAtFixpoint()) {
4913 SPMDCompatibilityTracker.indicatePessimisticFixpoint();
4914 SPMDCompatibilityTracker.insert(&CB);
4919 indicateOptimisticFixpoint();
4925 if (NumCallees > 1) {
4926 indicatePessimisticFixpoint();
4933 case OMPRTL___kmpc_is_spmd_exec_mode:
4934 case OMPRTL___kmpc_distribute_static_fini:
4935 case OMPRTL___kmpc_for_static_fini:
4936 case OMPRTL___kmpc_global_thread_num:
4937 case OMPRTL___kmpc_get_hardware_num_threads_in_block:
4938 case OMPRTL___kmpc_get_hardware_num_blocks:
4939 case OMPRTL___kmpc_single:
4940 case OMPRTL___kmpc_end_single:
4941 case OMPRTL___kmpc_master:
4942 case OMPRTL___kmpc_end_master:
4943 case OMPRTL___kmpc_barrier:
4944 case OMPRTL___kmpc_nvptx_parallel_reduce_nowait_v2:
4945 case OMPRTL___kmpc_nvptx_teams_reduce_nowait_v2:
4946 case OMPRTL___kmpc_error:
4947 case OMPRTL___kmpc_flush:
4948 case OMPRTL___kmpc_get_hardware_thread_id_in_block:
4949 case OMPRTL___kmpc_get_warp_size:
4950 case OMPRTL_omp_get_thread_num:
4951 case OMPRTL_omp_get_num_threads:
4952 case OMPRTL_omp_get_max_threads:
4953 case OMPRTL_omp_in_parallel:
4954 case OMPRTL_omp_get_dynamic:
4955 case OMPRTL_omp_get_cancellation:
4956 case OMPRTL_omp_get_nested:
4957 case OMPRTL_omp_get_schedule:
4958 case OMPRTL_omp_get_thread_limit:
4959 case OMPRTL_omp_get_supported_active_levels:
4960 case OMPRTL_omp_get_max_active_levels:
4961 case OMPRTL_omp_get_level:
4962 case OMPRTL_omp_get_ancestor_thread_num:
4963 case OMPRTL_omp_get_team_size:
4964 case OMPRTL_omp_get_active_level:
4965 case OMPRTL_omp_in_final:
4966 case OMPRTL_omp_get_proc_bind:
4967 case OMPRTL_omp_get_num_places:
4968 case OMPRTL_omp_get_num_procs:
4969 case OMPRTL_omp_get_place_proc_ids:
4970 case OMPRTL_omp_get_place_num:
4971 case OMPRTL_omp_get_partition_num_places:
4972 case OMPRTL_omp_get_partition_place_nums:
4973 case OMPRTL_omp_get_wtime:
4975 case OMPRTL___kmpc_distribute_static_init_4:
4976 case OMPRTL___kmpc_distribute_static_init_4u:
4977 case OMPRTL___kmpc_distribute_static_init_8:
4978 case OMPRTL___kmpc_distribute_static_init_8u:
4979 case OMPRTL___kmpc_for_static_init_4:
4980 case OMPRTL___kmpc_for_static_init_4u:
4981 case OMPRTL___kmpc_for_static_init_8:
4982 case OMPRTL___kmpc_for_static_init_8u: {
4984 unsigned ScheduleArgOpNo = 2;
4985 auto *ScheduleTypeCI =
4987 unsigned ScheduleTypeVal =
4988 ScheduleTypeCI ? ScheduleTypeCI->getZExtValue() : 0;
4990 case OMPScheduleType::UnorderedStatic:
4991 case OMPScheduleType::UnorderedStaticChunked:
4992 case OMPScheduleType::OrderedDistribute:
4993 case OMPScheduleType::OrderedDistributeChunked:
4996 SPMDCompatibilityTracker.indicatePessimisticFixpoint();
4997 SPMDCompatibilityTracker.insert(&CB);
5001 case OMPRTL___kmpc_target_init:
5004 case OMPRTL___kmpc_target_deinit:
5005 KernelDeinitCB = &CB;
5007 case OMPRTL___kmpc_parallel_51:
 5008 if (!handleParallel51(A, CB))
 5009 indicatePessimisticFixpoint();
5011 case OMPRTL___kmpc_omp_task:
5013 SPMDCompatibilityTracker.indicatePessimisticFixpoint();
5014 SPMDCompatibilityTracker.insert(&CB);
5015 ReachedUnknownParallelRegions.insert(&CB);
5017 case OMPRTL___kmpc_alloc_shared:
5018 case OMPRTL___kmpc_free_shared:
5024 SPMDCompatibilityTracker.indicatePessimisticFixpoint();
5025 SPMDCompatibilityTracker.insert(&CB);
5031 indicateOptimisticFixpoint();
 5035 A.getAAFor<AACallEdges>(*this, getIRPosition(), DepClassTy::OPTIONAL);
5036 if (!AACE || !AACE->getState().isValidState() || AACE->hasUnknownCallee()) {
5037 CheckCallee(getAssociatedFunction(), 1);
5040 const auto &OptimisticEdges = AACE->getOptimisticEdges();
 5041 for (auto *Callee : OptimisticEdges) {
5042 CheckCallee(Callee, OptimisticEdges.size());
 5053 auto &OMPInfoCache = static_cast<OMPInformationCache &>(A.getInfoCache());
 5054 KernelInfoState StateBefore = getState();
 5056 auto CheckCallee = [&](Function *F, int NumCallees) {
 5057 const auto &It = OMPInfoCache.RuntimeFunctionIDMap.find(F);
 5061 if (It == OMPInfoCache.RuntimeFunctionIDMap.end()) {
 5064 A.getAAFor<AAKernelInfo>(*this, FnPos, DepClassTy::REQUIRED);
5066 return indicatePessimisticFixpoint();
5067 if (getState() == FnAA->getState())
5068 return ChangeStatus::UNCHANGED;
5069 getState() = FnAA->getState();
5070 return ChangeStatus::CHANGED;
5073 return indicatePessimisticFixpoint();
5075 CallBase &CB = cast<CallBase>(getAssociatedValue());
5076 if (It->getSecond() == OMPRTL___kmpc_parallel_51) {
 5077 if (!handleParallel51(A, CB))
 5078 return indicatePessimisticFixpoint();
5079 return StateBefore == getState() ? ChangeStatus::UNCHANGED
5080 : ChangeStatus::CHANGED;
5086 (It->getSecond() == OMPRTL___kmpc_alloc_shared ||
5087 It->getSecond() == OMPRTL___kmpc_free_shared) &&
5088 "Expected a __kmpc_alloc_shared or __kmpc_free_shared runtime call");
 5092 auto *HeapToSharedAA = A.getAAFor<AAHeapToShared>(
5100 case OMPRTL___kmpc_alloc_shared:
5101 if ((!HeapToStackAA || !HeapToStackAA->isAssumedHeapToStack(CB)) &&
5102 (!HeapToSharedAA || !HeapToSharedAA->isAssumedHeapToShared(CB)))
5103 SPMDCompatibilityTracker.insert(&CB);
5105 case OMPRTL___kmpc_free_shared:
5106 if ((!HeapToStackAA ||
5107 !HeapToStackAA->isAssumedHeapToStackRemovedFree(CB)) &&
5109 !HeapToSharedAA->isAssumedHeapToSharedRemovedFree(CB)))
5110 SPMDCompatibilityTracker.insert(&CB);
5113 SPMDCompatibilityTracker.indicatePessimisticFixpoint();
5114 SPMDCompatibilityTracker.insert(&CB);
5116 return ChangeStatus::CHANGED;
 5120 A.getAAFor<AACallEdges>(*this, getIRPosition(), DepClassTy::OPTIONAL);
 5121 if (!AACE || !AACE->getState().isValidState() || AACE->hasUnknownCallee()) {
 5122 if (Function *F = getAssociatedFunction())
 5126 for (auto *Callee : OptimisticEdges) {
 5127 CheckCallee(Callee, OptimisticEdges.size());
5133 return StateBefore == getState() ? ChangeStatus::UNCHANGED
5134 : ChangeStatus::CHANGED;
5140 const unsigned int NonWrapperFunctionArgNo = 5;
5141 const unsigned int WrapperFunctionArgNo = 6;
5142 auto ParallelRegionOpArgNo = SPMDCompatibilityTracker.isAssumed()
5143 ? NonWrapperFunctionArgNo
5144 : WrapperFunctionArgNo;
5146 auto *ParallelRegion = dyn_cast<Function>(
5148 if (!ParallelRegion)
5151 ReachedKnownParallelRegions.insert(&CB);
 5153 auto *FnAA = A.getAAFor<AAKernelInfo>(
5155 NestedParallelism |= !FnAA || !FnAA->getState().isValidState() ||
5156 !FnAA->ReachedKnownParallelRegions.empty() ||
5157 !FnAA->ReachedKnownParallelRegions.isValidState() ||
5158 !FnAA->ReachedUnknownParallelRegions.isValidState() ||
5159 !FnAA->ReachedUnknownParallelRegions.empty();
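/// Abstract attribute that tries to fold selected OpenMP runtime calls to
/// constants (e.g. __kmpc_is_spmd_exec_mode) once enough is known about the
/// reaching kernels.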
 5164struct AAFoldRuntimeCall
 5165 : public StateWrapper<BooleanState, AbstractAttribute> {
 5171 void trackStatistics() const override {}
 5174 static AAFoldRuntimeCall &createForPosition(const IRPosition &IRP,
 5178 const std::string getName() const override { return "AAFoldRuntimeCall"; }
 5181 const char *getIdAddr() const override { return &ID; }
5189 static const char ID;
5192struct AAFoldRuntimeCallCallSiteReturned : AAFoldRuntimeCall {
 5194 : AAFoldRuntimeCall(IRP, A) {}
 5197 const std::string getAsStr(Attributor *) const override {
 5198 if (!isValidState())
 5201 std::string Str("simplified value: ");
 5203 if (!SimplifiedValue)
 5204 return Str + std::string("none");
 5206 if (!*SimplifiedValue)
 5207 return Str + std::string("nullptr");
 5209 if (ConstantInt *CI = dyn_cast<ConstantInt>(*SimplifiedValue))
 5210 return Str + std::to_string(CI->getSExtValue());
 5212 return Str + std::string("unknown");
5217 indicatePessimisticFixpoint();
 5221 auto &OMPInfoCache = static_cast<OMPInformationCache &>(A.getInfoCache());
5222 const auto &It = OMPInfoCache.RuntimeFunctionIDMap.find(Callee);
5223 assert(It != OMPInfoCache.RuntimeFunctionIDMap.end() &&
5224 "Expected a known OpenMP runtime function");
5226 RFKind = It->getSecond();
5228 CallBase &CB = cast<CallBase>(getAssociatedValue());
5229 A.registerSimplificationCallback(
5232 bool &UsedAssumedInformation) -> std::optional<Value *> {
5233 assert((isValidState() ||
 5234 (SimplifiedValue && *SimplifiedValue == nullptr)) &&
 5235 "Unexpected invalid state!");
 5237 if (!isAtFixpoint()) {
 5238 UsedAssumedInformation = true;
 5240 A.recordDependence(*this, *AA, DepClassTy::OPTIONAL);
5242 return SimplifiedValue;
5249 case OMPRTL___kmpc_is_spmd_exec_mode:
 5250 Changed |= foldIsSPMDExecMode(A);
 5252 case OMPRTL___kmpc_parallel_level:
 5253 Changed |= foldParallelLevel(A);
 5255 case OMPRTL___kmpc_get_hardware_num_threads_in_block:
 5256 Changed = Changed | foldKernelFnAttribute(A, "omp_target_thread_limit");
 5258 case OMPRTL___kmpc_get_hardware_num_blocks:
 5259 Changed = Changed | foldKernelFnAttribute(A, "omp_target_num_teams");
5271 if (SimplifiedValue && *SimplifiedValue) {
 5274 A.deleteAfterManifest(I);
 5278 if (auto *C = dyn_cast<ConstantInt>(*SimplifiedValue))
 5279 return OR << "Replacing OpenMP runtime call "
 5281 << ore::NV("FoldedValue", C->getZExtValue()) << ".";
 5282 return OR << "Replacing OpenMP runtime call "
 5290 << **SimplifiedValue << "\n");
 5292 Changed = ChangeStatus::CHANGED;
 5299 SimplifiedValue = nullptr;
5300 return AAFoldRuntimeCall::indicatePessimisticFixpoint();
5306 std::optional<Value *> SimplifiedValueBefore = SimplifiedValue;
5308 unsigned AssumedSPMDCount = 0, KnownSPMDCount = 0;
5309 unsigned AssumedNonSPMDCount = 0, KnownNonSPMDCount = 0;
 5310 auto *CallerKernelInfoAA = A.getAAFor<AAKernelInfo>(
5313 if (!CallerKernelInfoAA ||
5314 !CallerKernelInfoAA->ReachingKernelEntries.isValidState())
5315 return indicatePessimisticFixpoint();
 5317 for (Kernel K : CallerKernelInfoAA->ReachingKernelEntries) {
5319 DepClassTy::REQUIRED);
5321 if (!AA || !AA->isValidState()) {
 5322 SimplifiedValue = nullptr;
5323 return indicatePessimisticFixpoint();
5326 if (AA->SPMDCompatibilityTracker.isAssumed()) {
5327 if (AA->SPMDCompatibilityTracker.isAtFixpoint())
5332 if (AA->SPMDCompatibilityTracker.isAtFixpoint())
5333 ++KnownNonSPMDCount;
5335 ++AssumedNonSPMDCount;
5339 if ((AssumedSPMDCount + KnownSPMDCount) &&
5340 (AssumedNonSPMDCount + KnownNonSPMDCount))
5341 return indicatePessimisticFixpoint();
5343 auto &Ctx = getAnchorValue().getContext();
5344 if (KnownSPMDCount || AssumedSPMDCount) {
5345 assert(KnownNonSPMDCount == 0 && AssumedNonSPMDCount == 0 &&
5346 "Expected only SPMD kernels!");
 5350 } else if (KnownNonSPMDCount || AssumedNonSPMDCount) {
5351 assert(KnownSPMDCount == 0 && AssumedSPMDCount == 0 &&
5352 "Expected only non-SPMD kernels!");
 5360 assert(!SimplifiedValue && "SimplifiedValue should be none");
5363 return SimplifiedValue == SimplifiedValueBefore ? ChangeStatus::UNCHANGED
5364 : ChangeStatus::CHANGED;
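// Fold __kmpc_parallel_level: only possible if every kernel reaching this
// call site agrees on whether it executes in SPMD or generic mode.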
5369 std::optional<Value *> SimplifiedValueBefore = SimplifiedValue;
 5371 auto *CallerKernelInfoAA = A.getAAFor<AAKernelInfo>(
5374 if (!CallerKernelInfoAA ||
5375 !CallerKernelInfoAA->ParallelLevels.isValidState())
5376 return indicatePessimisticFixpoint();
5378 if (!CallerKernelInfoAA->ReachingKernelEntries.isValidState())
5379 return indicatePessimisticFixpoint();
5381 if (CallerKernelInfoAA->ReachingKernelEntries.empty()) {
5382 assert(!SimplifiedValue &&
5383 "SimplifiedValue should keep none at this point");
5384 return ChangeStatus::UNCHANGED;
5387 unsigned AssumedSPMDCount = 0, KnownSPMDCount = 0;
5388 unsigned AssumedNonSPMDCount = 0, KnownNonSPMDCount = 0;
 5389 for (Kernel K : CallerKernelInfoAA->ReachingKernelEntries) {
5391 DepClassTy::REQUIRED);
5392 if (!AA || !AA->SPMDCompatibilityTracker.isValidState())
5393 return indicatePessimisticFixpoint();
5395 if (AA->SPMDCompatibilityTracker.isAssumed()) {
5396 if (AA->SPMDCompatibilityTracker.isAtFixpoint())
5401 if (AA->SPMDCompatibilityTracker.isAtFixpoint())
5402 ++KnownNonSPMDCount;
5404 ++AssumedNonSPMDCount;
5408 if ((AssumedSPMDCount + KnownSPMDCount) &&
5409 (AssumedNonSPMDCount + KnownNonSPMDCount))
5410 return indicatePessimisticFixpoint();
5412 auto &Ctx = getAnchorValue().getContext();
5416 if (AssumedSPMDCount || KnownSPMDCount) {
5417 assert(KnownNonSPMDCount == 0 && AssumedNonSPMDCount == 0 &&
5418 "Expected only SPMD kernels!");
5421 assert(KnownSPMDCount == 0 && AssumedSPMDCount == 0 &&
5422 "Expected only non-SPMD kernels!");
5425 return SimplifiedValue == SimplifiedValueBefore ? ChangeStatus::UNCHANGED
5426 : ChangeStatus::CHANGED;
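// Fold a hardware query to the value of a kernel function attribute (e.g.
// "omp_target_thread_limit") if all reaching kernels carry the same value.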
5431 int32_t CurrentAttrValue = -1;
5432 std::optional<Value *> SimplifiedValueBefore = SimplifiedValue;
 5434 auto *CallerKernelInfoAA = A.getAAFor<AAKernelInfo>(
5437 if (!CallerKernelInfoAA ||
5438 !CallerKernelInfoAA->ReachingKernelEntries.isValidState())
5439 return indicatePessimisticFixpoint();
 5442 for (Kernel K : CallerKernelInfoAA->ReachingKernelEntries) {
 5443 int32_t NextAttrVal = K->getFnAttributeAsParsedInteger(Attr, -1);
5445 if (NextAttrVal == -1 ||
5446 (CurrentAttrValue != -1 && CurrentAttrValue != NextAttrVal))
5447 return indicatePessimisticFixpoint();
5448 CurrentAttrValue = NextAttrVal;
5451 if (CurrentAttrValue != -1) {
5452 auto &Ctx = getAnchorValue().getContext();
5456 return SimplifiedValue == SimplifiedValueBefore ? ChangeStatus::UNCHANGED
5457 : ChangeStatus::CHANGED;
5463 std::optional<Value *> SimplifiedValue;
5473 auto &RFI = OMPInfoCache.RFIs[RF];
5475 CallInst *CI = OpenMPOpt::getCallIfRegularCall(U, &RFI);
5478 A.getOrCreateAAFor<AAFoldRuntimeCall>(
 5480 DepClassTy::NONE, false,
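/// Register all abstract attributes seeded by OpenMP-opt: AAKernelInfo for
/// kernel init call sites, folding candidates for selected runtime calls, and
/// AAICVTracker for ICV getter uses.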
5486void OpenMPOpt::registerAAs(bool IsModulePass) {
 5496 A.getOrCreateAAFor<AAKernelInfo>(
 5498 DepClassTy::NONE, false,
5502 OMPInformationCache::RuntimeFunctionInfo &InitRFI =
5503 OMPInfoCache.RFIs[OMPRTL___kmpc_target_init];
5504 InitRFI.foreachUse(SCC, CreateKernelInfoCB);
5506 registerFoldRuntimeCall(OMPRTL___kmpc_is_spmd_exec_mode);
5507 registerFoldRuntimeCall(OMPRTL___kmpc_parallel_level);
5508 registerFoldRuntimeCall(OMPRTL___kmpc_get_hardware_num_threads_in_block);
5509 registerFoldRuntimeCall(OMPRTL___kmpc_get_hardware_num_blocks);
 5514 for (int Idx = 0; Idx < OMPInfoCache.ICVs.size() - 1; ++Idx) {
5517 auto &GetterRFI = OMPInfoCache.RFIs[ICVInfo.Getter];
5520 CallInst *CI = OpenMPOpt::getCallIfRegularCall(U, &GetterRFI);
5524 auto &CB = cast<CallBase>(*CI);
5527 A.getOrCreateAAFor<AAICVTracker>(CBPos);
5531 GetterRFI.foreachUse(SCC, CreateAA);
 5540 for (auto *F : SCC) {
 5541 if (F->isDeclaration())
 5547 if (F->hasLocalLinkage()) {
 5549 const auto *CB = dyn_cast<CallBase>(U.getUser());
 5550 return CB && CB->isCallee(&U) &&
 5551 A.isRunOn(const_cast<Function *>(CB->getCaller()));
 5555 registerAAsForFunction(A, *F);
 5565 if (F.hasFnAttribute(Attribute::Convergent))
 5569 if (auto *LI = dyn_cast<LoadInst>(&I)) {
 5570 bool UsedAssumedInformation = false;
 5577 if (auto *CI = dyn_cast<CallBase>(&I)) {
 5582 if (auto *SI = dyn_cast<StoreInst>(&I)) {
 5588 if (auto *FI = dyn_cast<FenceInst>(&I)) {
 5592 if (auto *II = dyn_cast<IntrinsicInst>(&I)) {
 5593 if (II->getIntrinsicID() == Intrinsic::assume) {
5602const char AAICVTracker::ID = 0;
5603const char AAKernelInfo::ID = 0;
5605const char AAHeapToShared::ID = 0;
5606const char AAFoldRuntimeCall::ID = 0;
5608AAICVTracker &AAICVTracker::createForPosition(const IRPosition &IRP,
 5610 AAICVTracker *AA = nullptr;
 5618 AA = new (A.Allocator) AAICVTrackerFunctionReturned(IRP, A);
 5621 AA = new (A.Allocator) AAICVTrackerCallSiteReturned(IRP, A);
 5624 AA = new (A.Allocator) AAICVTrackerCallSite(IRP, A);
 5627 AA = new (A.Allocator) AAICVTrackerFunction(IRP, A);
 5636 AAExecutionDomainFunction *AA = nullptr;
 5646 "AAExecutionDomain can only be created for function position!");
 5648 AA = new (A.Allocator) AAExecutionDomainFunction(IRP, A);
5655AAHeapToShared &AAHeapToShared::createForPosition(const IRPosition &IRP,
 5657 AAHeapToSharedFunction *AA = nullptr;
 5667 "AAHeapToShared can only be created for function position!");
 5669 AA = new (A.Allocator) AAHeapToSharedFunction(IRP, A);
5676AAKernelInfo &AAKernelInfo::createForPosition(const IRPosition &IRP,
 5678 AAKernelInfo *AA = nullptr;
 5688 AA = new (A.Allocator) AAKernelInfoCallSite(IRP, A);
 5691 AA = new (A.Allocator) AAKernelInfoFunction(IRP, A);
5698AAFoldRuntimeCall &AAFoldRuntimeCall::createForPosition(const IRPosition &IRP,
 5700 AAFoldRuntimeCall *AA = nullptr;
 5709 llvm_unreachable("KernelInfo can only be created for call site position!");
 5711 AA = new (A.Allocator) AAFoldRuntimeCallCallSiteReturned(IRP, A);
 5732 if (Kernels.contains(&F))
 5734 for (const User *U : F.users())
 5735 if (!isa<BlockAddress>(U))
 5744 return ORA << "Could not internalize function. "
 5745 << "Some optimizations may not be possible. [OMP140]";
 5749 bool Changed = false;
 5757 if (!F.isDeclaration() && !Kernels.contains(&F) && IsCalled(F) &&
 5761 } else if (!F.hasLocalLinkage() && !F.hasFnAttribute(Attribute::Cold)) {
 5774 if (!F.isDeclaration() && !InternalizedMap.lookup(&F)) {
 5793 OMPInformationCache InfoCache(M, AG, Allocator, nullptr, PostLink);
 5795 unsigned MaxFixpointIterations =
 5807 return F.hasFnAttribute("kernel");
 5812 OpenMPOpt OMPOpt(SCC, CGUpdater, OREGetter, InfoCache, A);
 5813 Changed |= OMPOpt.run(true);
 5818 if (!F.isDeclaration() && !Kernels.contains(&F) &&
 5819 !F.hasFnAttribute(Attribute::NoInline))
5820 F.addFnAttr(Attribute::AlwaysInline);
 5850 Module &M = *C.begin()->getFunction().getParent();
 5873 OMPInformationCache InfoCache(*(Functions.back()->getParent()), AG, Allocator,
 5874 &Functions, PostLink);
 5876 unsigned MaxFixpointIterations =
 5890 OpenMPOpt OMPOpt(SCC, CGUpdater, OREGetter, InfoCache, A);
 5891 bool Changed = OMPOpt.run(false);
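// Device kernels are collected from the "nvvm.annotations" module metadata:
// operands whose kind string is "kernel" name the kernel functions.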
 5908 NamedMDNode *MD = M.getNamedMetadata("nvvm.annotations");
 5917 MDString *KindID = dyn_cast<MDString>(Op->getOperand(1));
 5918 if (!KindID || KindID->getString() != "kernel")
 5922 mdconst::dyn_extract_or_null<Function>(Op->getOperand(0));
 5929 ++NumOpenMPTargetRegionKernels;
 5930 Kernels.insert(KernelFn);
 5932 ++NumNonOpenMPTargetRegionKernels;
 5939 Metadata *MD = M.getModuleFlag("openmp");
 5947 Metadata *MD = M.getModuleFlag("openmp-device");
MachineBasicBlock MachineBasicBlock::iterator DebugLoc DL
Expand Atomic instructions
static cl::opt< unsigned > SetFixpointIterations("attributor-max-iterations", cl::Hidden, cl::desc("Maximal number of fixpoint iterations."), cl::init(32))
static const Function * getParent(const Value *V)
static GCRegistry::Add< ErlangGC > A("erlang", "erlang-compatible garbage collector")
This file provides interfaces used to manipulate a call graph, regardless if it is a "old style" Call...
This file provides interfaces used to build and manipulate a call graph, which is a very useful tool ...
This file contains the declarations for the subclasses of Constant, which represent the different fla...
dxil pretty DXIL Metadata Pretty Printer
Returns the sub type a function will return at a given Idx Should correspond to the result type of an ExtractValue instruction executed with just that one unsigned Idx
This file defines an array type that can be indexed using scoped enum values.
static void emitRemark(const Function &F, OptimizationRemarkEmitter &ORE, bool Skip)
static bool lookup(const GsymReader &GR, DataExtractor &Data, uint64_t &Offset, uint64_t BaseAddr, uint64_t Addr, SourceLocations &SrcLocs, llvm::Error &Err)
A Lookup helper functions.
This file provides utility analysis objects describing memory locations.
uint64_t IntrinsicInst * II
This file defines constans and helpers used when dealing with OpenMP.
This file defines constans that will be used by both host and device compilation.
static constexpr auto TAG
static cl::opt< bool > HideMemoryTransferLatency("openmp-hide-memory-transfer-latency", cl::desc("[WIP] Tries to hide the latency of host to device memory" " transfers"), cl::Hidden, cl::init(false))
static cl::opt< bool > DisableOpenMPOptStateMachineRewrite("openmp-opt-disable-state-machine-rewrite", cl::desc("Disable OpenMP optimizations that replace the state machine."), cl::Hidden, cl::init(false))
static cl::opt< bool > EnableParallelRegionMerging("openmp-opt-enable-merging", cl::desc("Enable the OpenMP region merging optimization."), cl::Hidden, cl::init(false))
static cl::opt< bool > PrintModuleAfterOptimizations("openmp-opt-print-module-after", cl::desc("Print the current module after OpenMP optimizations."), cl::Hidden, cl::init(false))
#define KERNEL_ENVIRONMENT_CONFIGURATION_GETTER(MEMBER)
#define KERNEL_ENVIRONMENT_CONFIGURATION_IDX(MEMBER, IDX)
#define KERNEL_ENVIRONMENT_CONFIGURATION_SETTER(MEMBER)
static cl::opt< bool > PrintOpenMPKernels("openmp-print-gpu-kernels", cl::init(false), cl::Hidden)
static cl::opt< bool > DisableOpenMPOptFolding("openmp-opt-disable-folding", cl::desc("Disable OpenMP optimizations involving folding."), cl::Hidden, cl::init(false))
static cl::opt< bool > PrintModuleBeforeOptimizations("openmp-opt-print-module-before", cl::desc("Print the current module before OpenMP optimizations."), cl::Hidden, cl::init(false))
static cl::opt< unsigned > SetFixpointIterations("openmp-opt-max-iterations", cl::Hidden, cl::desc("Maximal number of attributor iterations."), cl::init(256))
static cl::opt< bool > DisableInternalization("openmp-opt-disable-internalization", cl::desc("Disable function internalization."), cl::Hidden, cl::init(false))
static cl::opt< bool > PrintICVValues("openmp-print-icv-values", cl::init(false), cl::Hidden)
static cl::opt< bool > DisableOpenMPOptimizations("openmp-opt-disable", cl::desc("Disable OpenMP specific optimizations."), cl::Hidden, cl::init(false))
static cl::opt< unsigned > SharedMemoryLimit("openmp-opt-shared-limit", cl::Hidden, cl::desc("Maximum amount of shared memory to use."), cl::init(std::numeric_limits< unsigned >::max()))
static cl::opt< bool > EnableVerboseRemarks("openmp-opt-verbose-remarks", cl::desc("Enables more verbose remarks."), cl::Hidden, cl::init(false))
static cl::opt< bool > DisableOpenMPOptDeglobalization("openmp-opt-disable-deglobalization", cl::desc("Disable OpenMP optimizations involving deglobalization."), cl::Hidden, cl::init(false))
static cl::opt< bool > DisableOpenMPOptBarrierElimination("openmp-opt-disable-barrier-elimination", cl::desc("Disable OpenMP optimizations that eliminate barriers."), cl::Hidden, cl::init(false))
static cl::opt< bool > DeduceICVValues("openmp-deduce-icv-values", cl::init(false), cl::Hidden)
#define KERNEL_ENVIRONMENT_IDX(MEMBER, IDX)
#define KERNEL_ENVIRONMENT_GETTER(MEMBER, RETURNTYPE)
static cl::opt< bool > DisableOpenMPOptSPMDization("openmp-opt-disable-spmdization", cl::desc("Disable OpenMP optimizations involving SPMD-ization."), cl::Hidden, cl::init(false))
static cl::opt< bool > AlwaysInlineDeviceFunctions("openmp-opt-inline-device", cl::desc("Inline all applicable functions on the device."), cl::Hidden, cl::init(false))
FunctionAnalysisManager FAM
This file builds on the ADT/GraphTraits.h file to build a generic graph post order iterator.
static StringRef getName(Value *V)
assert(ImpDefSCC.getReg()==AMDGPU::SCC &&ImpDefSCC.isDef())
static bool contains(SmallPtrSetImpl< ConstantExpr * > &Cache, ConstantExpr *Expr, Constant *C)
This file implements a set that has insertion order iteration characteristics.
This file defines the SmallPtrSet class.
This file defines the SmallVector class.
This file defines the 'Statistic' class, which is designed to be an easy way to expose various metric...
#define STATISTIC(VARNAME, DESC)
static const int BlockSize
static void initialize(TargetLibraryInfoImpl &TLI, const Triple &T, ArrayRef< StringLiteral > StandardNames)
Initialize the set of available library functions based on the specified target triple.
static cl::opt< unsigned > MaxThreads("xcore-max-threads", cl::Optional, cl::desc("Maximum number of threads (for emulation thread-local storage)"), cl::Hidden, cl::value_desc("number"), cl::init(8))
This class represents a conversion between pointers from one address space to another.
an instruction to allocate memory on the stack
A container for analyses that lazily runs them and caches their results.
PassT::Result & getResult(IRUnitT &IR, ExtraArgTs... ExtraArgs)
Get the result of an analysis pass for a given IR unit.
This class represents an incoming formal argument to a Function.
ArrayRef - Represent a constant reference to an array (0 or more elements consecutively in memory),...
size_t size() const
size - Get the array size.
AttributeSet getParamAttrs(unsigned ArgNo) const
The attributes for the argument or parameter at the given index are returned.
LLVM Basic Block Representation.
iterator begin()
Instruction iterator methods.
const_iterator getFirstInsertionPt() const
Returns an iterator to the first instruction in this block that is suitable for inserting a non-PHI i...
reverse_iterator rbegin()
static BasicBlock * Create(LLVMContext &Context, const Twine &Name="", Function *Parent=nullptr, BasicBlock *InsertBefore=nullptr)
Creates a new BasicBlock.
BasicBlock * splitBasicBlock(iterator I, const Twine &BBName="", bool Before=false)
Split the basic block into two basic blocks at the specified instruction.
const BasicBlock * getUniqueSuccessor() const
Return the successor of this block if it has a unique successor.
InstListType::reverse_iterator reverse_iterator
const Function * getParent() const
Return the enclosing method, or null if none.
const Instruction * getTerminator() const LLVM_READONLY
Returns the terminator instruction if the block is well formed or null if the block is not well forme...
Conditional or Unconditional Branch instruction.
bool isConditional() const
static BranchInst * Create(BasicBlock *IfTrue, InsertPosition InsertBefore=nullptr)
BasicBlock * getSuccessor(unsigned i) const
Value * getCondition() const
Allocate memory in an ever growing pool, as if by bump-pointer.
Base class for all callable instructions (InvokeInst and CallInst) Holds everything related to callin...
void setCallingConv(CallingConv::ID CC)
Function * getCalledFunction() const
Returns the function called, or null if this is an indirect function invocation or the function signa...
bool doesNotAccessMemory(unsigned OpNo) const
bool isIndirectCall() const
Return true if the callsite is an indirect call.
bool isCallee(Value::const_user_iterator UI) const
Determine whether the passed iterator points to the callee operand's Use.
Value * getArgOperand(unsigned i) const
void setArgOperand(unsigned i, Value *v)
iterator_range< User::op_iterator > args()
Iteration adapter for range-for loops.
unsigned getArgOperandNo(const Use *U) const
Given a use for a arg operand, get the arg operand number that corresponds to it.
unsigned arg_size() const
AttributeList getAttributes() const
Return the attributes for this call.
void addParamAttr(unsigned ArgNo, Attribute::AttrKind Kind)
Adds the attribute to the indicated argument.
bool isArgOperand(const Use *U) const
bool hasOperandBundles() const
Return true if this User has any operand bundles.
Function * getCaller()
Helper to get the caller (the parent function).
Wrapper to unify "old style" CallGraph and "new style" LazyCallGraph.
void initialize(LazyCallGraph &LCG, LazyCallGraph::SCC &SCC, CGSCCAnalysisManager &AM, CGSCCUpdateResult &UR)
Initializers for usage outside of a CGSCC pass, inside a CGSCC pass in the old and new pass manager (...
This class represents a function call, abstracting a target machine's calling convention.
static CallInst * Create(FunctionType *Ty, Value *F, const Twine &NameStr="", InsertPosition InsertBefore=nullptr)
@ ICMP_SLT
signed less than
static Constant * getPointerCast(Constant *C, Type *Ty)
Create a BitCast, AddrSpaceCast, or a PtrToInt cast constant expression.
static Constant * getPointerBitCastOrAddrSpaceCast(Constant *C, Type *Ty)
Create a BitCast or AddrSpaceCast for a pointer type depending on the address space.
This is the shared class of boolean and integer constants.
IntegerType * getIntegerType() const
Variant of the getType() method to always return an IntegerType, which reduces the amount of casting ...
static ConstantInt * getTrue(LLVMContext &Context)
bool isZero() const
This is just a convenience method to make client code smaller for a common code.
int64_t getSExtValue() const
Return the constant as a 64-bit integer value after it has been sign extended as appropriate for the ...
This is an important base class in LLVM.
static Constant * getNullValue(Type *Ty)
Constructor to create a '0' constant of arbitrary type.
This class represents an Operation in the Expression.
uint64_t getNumOperands() const
A parsed version of the target data layout string in and methods for querying it.
ValueT lookup(const_arg_type_t< KeyT > Val) const
lookup - Return the entry for the specified key, or a default constructed value if no such entry exis...
std::pair< iterator, bool > insert(const std::pair< KeyT, ValueT > &KV)
Implements a dense probed hash-table based set.
Analysis pass which computes a DominatorTree.
Concrete subclass of DominatorTreeBase that is used to compute a normal dominator tree.
Instruction * findNearestCommonDominator(Instruction *I1, Instruction *I2) const
Find the nearest instruction I that dominates both I1 and I2, in the sense that a result produced bef...
static ErrorSuccess success()
Create a success value.
An instruction for ordering other memory operations.
AtomicOrdering getOrdering() const
Returns the ordering constraint of this fence instruction.
A proxy from a FunctionAnalysisManager to an SCC.
A handy container for a FunctionType+Callee-pointer pair, which can be passed around as a single enti...
const BasicBlock & getEntryBlock() const
const BasicBlock & front() const
LLVMContext & getContext() const
getContext - Return a reference to the LLVMContext associated with this function.
Argument * getArg(unsigned i) const
bool hasFnAttribute(Attribute::AttrKind Kind) const
Return true if the function has the attribute.
bool isDeclaration() const
Return true if the primary definition of this global value is outside of the current translation unit...
bool hasLocalLinkage() const
Module * getParent()
Get the module that this global value is contained inside of...
@ PrivateLinkage
Like Internal, but omit from symbol table.
@ InternalLinkage
Rename collisions when linking (static functions).
const Constant * getInitializer() const
getInitializer - Return the initializer for this global variable.
void setInitializer(Constant *InitVal)
setInitializer - Sets the initializer for this global variable, removing any existing initializer if ...
InsertPoint - A saved insertion point.
BasicBlock * getBlock() const
AllocaInst * CreateAlloca(Type *Ty, unsigned AddrSpace, Value *ArraySize=nullptr, const Twine &Name="")
void SetInsertPoint(BasicBlock *TheBB)
This specifies that created instructions should be appended to the end of the specified block.
Value * CreateAddrSpaceCast(Value *V, Type *DestTy, const Twine &Name="")
This provides a uniform API for creating instructions and inserting them into a basic block: either a...
An analysis over an "outer" IR unit that provides access to an analysis manager over an "inner" IR un...
bool isLifetimeStartOrEnd() const LLVM_READONLY
Return true if the instruction is a llvm.lifetime.start or llvm.lifetime.end marker.
bool mayWriteToMemory() const LLVM_READONLY
Return true if this instruction may modify memory.
const DebugLoc & getDebugLoc() const
Return the debug location for this node as a DebugLoc.
const Module * getModule() const
Return the module owning the function this instruction belongs to or nullptr it the function does not...
const Instruction * getPrevNonDebugInstruction(bool SkipPseudoOp=false) const
Return a pointer to the previous non-debug instruction in the same basic block as 'this',...
InstListType::iterator eraseFromParent()
This method unlinks 'this' from the containing basic block and deletes it.
const Function * getFunction() const
Return the function this instruction belongs to.
bool mayHaveSideEffects() const LLVM_READONLY
Return true if the instruction may have side effects.
bool mayReadFromMemory() const LLVM_READONLY
Return true if this instruction may read memory.
const Instruction * getNextNonDebugInstruction(bool SkipPseudoOp=false) const
Return a pointer to the next non-debug instruction in the same basic block as 'this',...
void setDebugLoc(DebugLoc Loc)
Set the debug location information for this instruction.
void setSuccessor(unsigned Idx, BasicBlock *BB)
Update the specified successor to point at the provided block.
void moveBefore(Instruction *MovePos)
Unlink this instruction from its current basic block and insert it into the basic block that MovePos ...
A node in the call graph.
An SCC of the call graph.
A lazily constructed view of the call graph of a module.
An instruction for reading from memory.
StringRef getString() const
void eraseFromParent()
This method unlinks 'this' from the containing function and deletes it.
A Module instance is used to store all the information related to an LLVM module.
const std::string & getTargetTriple() const
Get the target triple which is a string describing the target host.
MutableArrayRef - Represent a mutable reference to an array (0 or more elements consecutively in memo...
iterator_range< op_iterator > operands()
An interface to create LLVM-IR for OpenMP directives.
static std::pair< int32_t, int32_t > readThreadBoundsForKernel(const Triple &T, Function &Kernel)
}
void addAttributes(omp::RuntimeFunction FnID, Function &Fn)
Add attributes known for FnID to Fn.
IRBuilder<>::InsertPoint InsertPointTy
Type used throughout for insertion points.
static std::pair< int32_t, int32_t > readTeamBoundsForKernel(const Triple &T, Function &Kernel)
Read/write a bounds on teams for Kernel.
PreservedAnalyses run(LazyCallGraph::SCC &C, CGSCCAnalysisManager &AM, LazyCallGraph &CG, CGSCCUpdateResult &UR)
PreservedAnalyses run(Module &M, ModuleAnalysisManager &AM)
static PoisonValue * get(Type *T)
Static factory methods - Return an 'poison' object of the specified type.
A set of analyses that are preserved following a run of a transformation pass.
static PreservedAnalyses none()
Convenience factory function for the empty preserved set.
static PreservedAnalyses all()
Construct a special preserved set that preserves all passes.
static ReturnInst * Create(LLVMContext &C, Value *retVal=nullptr, InsertPosition InsertBefore=nullptr)
A vector that has set insertion semantics.
size_type size() const
Determine the number of elements in the SetVector.
const value_type & back() const
Return the last element of the SetVector.
size_type count(const key_type &key) const
Count the number of elements of a given key in the SetVector.
bool insert(const value_type &X)
Insert a new element into the SetVector.
A templated base class for SmallPtrSet which provides the typesafe interface that is common across al...
size_type count(ConstPtrType Ptr) const
count - Return 1 if the specified pointer is in the set, 0 otherwise.
std::pair< iterator, bool > insert(PtrType Ptr)
Inserts Ptr if and only if there is no element in the container equal to Ptr.
SmallPtrSet - This class implements a set which is optimized for holding SmallSize or less elements.
A SetVector that performs no allocations if smaller than a certain size.
This class consists of common code factored out of the SmallVector class to reduce code duplication b...
void assign(size_type NumElts, ValueParamT Elt)
reference emplace_back(ArgTypes &&... Args)
void append(ItTy in_start, ItTy in_end)
Add the specified range to the end of the SmallVector.
void push_back(const T &Elt)
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small.
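A minimal sketch combining SmallPtrSet and SmallVector as listed above; dedup is hypothetical.
#include "llvm/ADT/ArrayRef.h"
#include "llvm/ADT/SmallPtrSet.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/IR/Instruction.h"
using namespace llvm;

static SmallVector<Instruction *, 8> dedup(ArrayRef<Instruction *> In) {
  SmallPtrSet<Instruction *, 8> Seen;
  SmallVector<Instruction *, 8> Out;
  for (Instruction *I : In)
    if (Seen.insert(I).second) // .second is false if I was already present
      Out.push_back(I);        // keeps the first occurrence, in order
  return Out;
}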
An instruction for storing to memory.
StringRef - Represent a constant reference to a string, i.e.
bool starts_with(StringRef Prefix) const
Check if this string starts with the given Prefix.
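A minimal sketch of the StringRef query above; the "__kmpc_" prefix is only an illustrative choice, and looksLikeOMPRuntimeCallee is hypothetical.
#include "llvm/ADT/StringRef.h"
#include "llvm/IR/Function.h"
using namespace llvm;

static bool looksLikeOMPRuntimeCallee(const Function &F) {
  StringRef Name = F.getName();
  return Name.starts_with("__kmpc_"); // prefix check, no allocation
}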
Triple - Helper class for working with autoconf configuration names.
The instances of the Type class are immutable: once they are created, they are never changed.
unsigned getPointerAddressSpace() const
Get the address space of this pointer or pointer vector type.
static Type * getVoidTy(LLVMContext &C)
static IntegerType * getInt16Ty(LLVMContext &C)
static IntegerType * getInt8Ty(LLVMContext &C)
static IntegerType * getInt32Ty(LLVMContext &C)
static UndefValue * get(Type *T)
Static factory methods - Return an 'undef' object of the specified type.
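A minimal sketch of the Type factory methods listed above together with UndefValue::get/PoisonValue::get; makeSomeTypes is hypothetical.
#include "llvm/IR/Constants.h"
#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/LLVMContext.h"
#include "llvm/IR/Type.h"
using namespace llvm;

static void makeSomeTypes(LLVMContext &Ctx) {
  Type *VoidTy = Type::getVoidTy(Ctx);
  IntegerType *Int8 = Type::getInt8Ty(Ctx);
  IntegerType *Int16 = Type::getInt16Ty(Ctx);
  IntegerType *Int32 = Type::getInt32Ty(Ctx);
  Value *Undef = UndefValue::get(Int32);   // 'undef' constant of type i32
  Value *Poison = PoisonValue::get(Int16); // 'poison' constant of type i16
  (void)VoidTy; (void)Int8; (void)Undef; (void)Poison;
}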
A Use represents the edge between a Value definition and its users.
bool replaceUsesOfWith(Value *From, Value *To)
Replace uses of one Value with another.
LLVM Value Representation.
Type * getType() const
All values are typed, get the type of this value.
void setName(const Twine &Name)
Change the name of the value.
bool hasOneUse() const
Return true if there is exactly one use of this value.
void replaceAllUsesWith(Value *V)
Change all uses of this to point to a new Value.
iterator_range< user_iterator > users()
const Value * stripPointerCasts() const
Strip off pointer casts, all-zero GEPs and address space casts.
StringRef getName() const
Return a constant reference to the value's name.
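A minimal sketch of the Value API listed above; replaceAndRename is hypothetical.
#include "llvm/IR/User.h"
#include "llvm/IR/Value.h"
using namespace llvm;

static void replaceAndRename(Value &Old, Value &New) {
  if (Old.getType() != New.getType())
    return;
  New.setName(Old.getName());   // carry the name over
  Old.replaceAllUsesWith(&New); // rewrite every use of Old to use New
  for (User *U : New.users())   // visit the (possibly grown) user list
    (void)U;
  const Value *Stripped = New.stripPointerCasts(); // drop casts/zero GEPs
  (void)Stripped;
  (void)New.hasOneUse();
}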
An efficient, type-erasing, non-owning reference to a callable.
const ParentTy * getParent() const
self_iterator getIterator()
NodeTy * getNextNode()
Get the next node, or nullptr for the list tail.
A raw_ostream that writes to an std::string.
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
GlobalVariable * getKernelEnvironementGVFromKernelInitCB(CallBase *KernelInitCB)
ConstantStruct * getKernelEnvironementFromKernelInitCB(CallBase *KernelInitCB)
bool isValidAtPosition(const ValueAndContext &VAC, InformationCache &InfoCache)
Return true if the value of VAC is valid at the position of VAC, that is a constant,...
bool isPotentiallyAffectedByBarrier(Attributor &A, const Instruction &I, const AbstractAttribute &QueryingAA)
Return true if I is potentially affected by a barrier.
bool isNoSyncInst(Attributor &A, const Instruction &I, const AbstractAttribute &QueryingAA)
Return true if I is a nosync instruction.
constexpr char Args[]
Key for Kernel::Metadata::mArgs.
E & operator^=(E &LHS, E RHS)
@ C
The default llvm calling convention, compatible with C.
initializer< Ty > init(const Ty &Val)
PointerTypeMap run(const Module &M)
Compute the PointerTypeMap for the module M.
constexpr uint64_t PointerSize
aarch64 pointer size.
bool isOpenMPDevice(Module &M)
Helper to determine if M is an OpenMP target offloading device module.
bool containsOpenMP(Module &M)
Helper to determine if M contains OpenMP.
InternalControlVar
IDs for all Internal Control Variables (ICVs).
RuntimeFunction
IDs for all omp runtime library (RTL) functions.
KernelSet getDeviceKernels(Module &M)
Get OpenMP device kernels in M.
Function * Kernel
Summary of a kernel (=entry point for target offloading).
@ OMP_TGT_EXEC_MODE_GENERIC_SPMD
@ OMP_TGT_EXEC_MODE_GENERIC
bool isOpenMPKernel(Function &Fn)
Return true iff Fn is an OpenMP GPU kernel; Fn has the "kernel" attribute.
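A minimal sketch, assuming these helpers are declared in namespace llvm::omp in llvm/Transforms/IPO/OpenMPOpt.h; printDeviceKernels is hypothetical.
#include "llvm/IR/Module.h"
#include "llvm/Support/raw_ostream.h"
#include "llvm/Transforms/IPO/OpenMPOpt.h"
using namespace llvm;

static void printDeviceKernels(Module &M) {
  if (!omp::containsOpenMP(M) || !omp::isOpenMPDevice(M))
    return;
  for (Function *K : omp::getDeviceKernels(M)) // KernelSet of entry points
    if (omp::isOpenMPKernel(*K))
      errs() << "device kernel: " << K->getName() << "\n";
}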
DiagnosticInfoOptimizationBase::Argument NV
const_iterator begin(StringRef path, Style style=Style::native)
Get begin iterator over path.
const_iterator end(StringRef path)
Get end iterator over path.
This is an optimization pass for GlobalISel generic memory operations.
auto drop_begin(T &&RangeOrContainer, size_t N=1)
Return a range covering RangeOrContainer with the first N elements excluded.
bool all_of(R &&range, UnaryPredicate P)
Provide wrappers to std::all_of which take ranges instead of having to pass begin/end explicitly.
auto size(R &&Range, std::enable_if_t< std::is_base_of< std::random_access_iterator_tag, typename std::iterator_traits< decltype(Range.begin())>::iterator_category >::value, void > *=nullptr)
Get the size of a range.
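A minimal sketch of drop_begin/all_of from llvm/ADT/STLExtras.h; tailHasNoDebugInstrs is hypothetical.
#include "llvm/ADT/STLExtras.h"
#include "llvm/IR/BasicBlock.h"
#include "llvm/IR/Instruction.h"
using namespace llvm;

static bool tailHasNoDebugInstrs(BasicBlock &BB) {
  // Skip the first instruction, then require every remaining one to be
  // a real (non-debug, non-pseudo) instruction.
  return all_of(drop_begin(BB),
                [](const Instruction &I) { return !I.isDebugOrPseudoInst(); });
}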
bool succ_empty(const Instruction *I)
std::string to_string(const T &Value)
bool operator!=(uint64_t V1, const APInt &V2)
Value * GetPointerBaseWithConstantOffset(Value *Ptr, int64_t &Offset, const DataLayout &DL, bool AllowNonInbounds=true)
Analyze the specified pointer to see if it can be expressed as a base pointer plus a constant offset.
const Value * getUnderlyingObject(const Value *V, unsigned MaxLookup=6)
This method strips off any GEP address adjustments, pointer casts or llvm.threadlocal....
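A minimal sketch of the two pointer analyses listed above (both declared in llvm/Analysis/ValueTracking.h); analyzePointer is hypothetical.
#include "llvm/Analysis/ValueTracking.h"
#include "llvm/IR/DataLayout.h"
#include "llvm/IR/Value.h"
using namespace llvm;

static void analyzePointer(Value *Ptr, const DataLayout &DL) {
  int64_t Offset = 0;
  // Decompose Ptr into Base + constant Offset where possible (else Base == Ptr).
  Value *Base = GetPointerBaseWithConstantOffset(Ptr, Offset, DL);
  // Strip GEPs, casts, etc. to reach the underlying object.
  const Value *Obj = getUnderlyingObject(Ptr);
  (void)Base; (void)Obj;
}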
bool operator==(const AddressRangeValuePair &LHS, const AddressRangeValuePair &RHS)
@ FullLTOPostLink
Full LTO postlink (backend compile) phase.
@ ThinLTOPreLink
ThinLTO prelink (summary) phase.
raw_ostream & dbgs()
dbgs() - This returns a reference to a raw_ostream for debugging messages.
void cantFail(Error Err, const char *Msg=nullptr)
Report a fatal error if Err is a failure value.
bool operator&=(SparseBitVector< ElementSize > *LHS, const SparseBitVector< ElementSize > &RHS)
auto count(R &&Range, const E &Element)
Wrapper function around std::count to count the number of times an element Element occurs in the give...
BasicBlock * SplitBlock(BasicBlock *Old, BasicBlock::iterator SplitPt, DominatorTree *DT, LoopInfo *LI=nullptr, MemorySSAUpdater *MSSAU=nullptr, const Twine &BBName="", bool Before=false)
Split the specified block at the specified instruction.
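A minimal sketch of SplitBlock as declared above (from llvm/Transforms/Utils/BasicBlockUtils.h); splitAt is hypothetical.
#include "llvm/IR/BasicBlock.h"
#include "llvm/IR/Dominators.h"
#include "llvm/IR/Instruction.h"
#include "llvm/Transforms/Utils/BasicBlockUtils.h"
using namespace llvm;

static BasicBlock *splitAt(Instruction *SplitPt, DominatorTree *DT) {
  BasicBlock *Old = SplitPt->getParent();
  // Everything from SplitPt onward moves into the returned block; Old ends in
  // an unconditional branch to it, and DT is kept up to date.
  return SplitBlock(Old, SplitPt->getIterator(), DT);
}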
auto predecessors(const MachineBasicBlock *BB)
const char * toString(DWARFSectionKind Kind)
Constant * ConstantFoldInsertValueInstruction(Constant *Agg, Constant *Val, ArrayRef< unsigned > Idxs)
ConstantFoldInsertValueInstruction - Attempt to constant fold an insertvalue instruction with the spe...
Implement std::hash so that hash_code can be used in STL containers.
An abstract interface for address space information.
An abstract attribute for getting assumption information.
An abstract state for querying live call edges.
virtual const SetVector< Function * > & getOptimisticEdges() const =0
Get the optimistic edges.
bool IsReachedFromAlignedBarrierOnly
bool isExecutedByInitialThreadOnly(const Instruction &I) const
Check if an instruction is executed only by the initial thread.
static AAExecutionDomain & createForPosition(const IRPosition &IRP, Attributor &A)
Create an abstract attribute view for the position IRP.
virtual ExecutionDomainTy getFunctionExecutionDomain() const =0
virtual ExecutionDomainTy getExecutionDomain(const BasicBlock &) const =0
virtual bool isExecutedInAlignedRegion(Attributor &A, const Instruction &I) const =0
Check if the instruction I is executed in an aligned region, that is, the synchronizing effects befor...
virtual bool isNoOpFence(const FenceInst &FI) const =0
Helper function to determine if FI is a no-op given the information about its execution from ExecDoma...
static const char ID
Unique ID (due to the unique address)
An abstract interface for indirect call information interference.
An abstract interface for liveness abstract attribute.
An abstract interface for all memory location attributes (readnone/argmemonly/inaccessiblememonly/ina...
AccessKind
Simple enum to distinguish read/write/read-write accesses.
StateType::base_t MemoryLocationsKind
static bool isAlignedBarrier(const CallBase &CB, bool ExecutedAligned)
Helper function to determine if CB is an aligned (GPU) barrier.
An abstract Attribute for determining the necessity of the convergent attribute.
An abstract attribute for getting all assumption underlying objects.
Base struct for all "concrete attribute" deductions.
virtual ChangeStatus manifest(Attributor &A)
Hook for the Attributor to trigger the manifestation of the information represented by the abstract a...
virtual void initialize(Attributor &A)
Initialize the state with the information in the Attributor A.
virtual const std::string getAsStr(Attributor *A) const =0
This function should return the "summarized" assumed state as a string.
virtual ChangeStatus updateImpl(Attributor &A)=0
The actual update/transfer function which has to be implemented by the derived classes.
virtual void trackStatistics() const =0
Hook to enable custom statistic tracking, called after manifest that resulted in a change if statisti...
virtual const char * getIdAddr() const =0
This function should return the address of the ID of the AbstractAttribute.
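A minimal sketch of the hook set listed above, following the StateWrapper pattern the abstract attributes in this file use; AASketch is hypothetical, and it assumes the required overrides are exactly these hooks plus getName().
#include "llvm/Transforms/IPO/Attributor.h"
using namespace llvm;

struct AASketch : public StateWrapper<BooleanState, AbstractAttribute> {
  using Base = StateWrapper<BooleanState, AbstractAttribute>;
  AASketch(const IRPosition &IRP, Attributor &A) : Base(IRP) {}

  void initialize(Attributor &A) override {}        // seed assumed facts
  ChangeStatus updateImpl(Attributor &A) override {  // transfer function
    return ChangeStatus::UNCHANGED;
  }
  ChangeStatus manifest(Attributor &A) override {    // write results into the IR
    return ChangeStatus::UNCHANGED;
  }
  const std::string getAsStr(Attributor *A) const override { return "AASketch"; }
  const std::string getName() const override { return "AASketch"; }
  void trackStatistics() const override {}           // bump STATISTICs here
  const char *getIdAddr() const override { return &ID; }
  static const char ID;
};
const char AASketch::ID = 0;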
An interface to query the internal state of an abstract attribute.
virtual ChangeStatus indicatePessimisticFixpoint()=0
Indicate that the abstract state should converge to the pessimistic state.
virtual bool isAtFixpoint() const =0
Return if this abstract state is fixed, thus does not need to be updated if information changes as it...
virtual bool isValidState() const =0
Return if this abstract state is in a valid state.
virtual ChangeStatus indicateOptimisticFixpoint()=0
Indicate that the abstract state should converge to the optimistic state.
Wrapper for FunctionAnalysisManager.
Configuration for the Attributor.
std::function< void(Attributor &A, const Function &F)> InitializationCallback
Callback function to be invoked on internal functions marked live.
std::optional< unsigned > MaxFixpointIterations
Maximum number of iterations to run until fixpoint.
bool RewriteSignatures
Flag to determine if we rewrite function signatures.
OptimizationRemarkGetter OREGetter
IPOAmendableCBTy IPOAmendableCB
bool IsModulePass
Is the user of the Attributor a module pass or not.
bool DefaultInitializeLiveInternals
Flag to determine if we want to initialize all default AAs for an internal function marked live.
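A minimal sketch of wiring the AttributorConfig fields listed above into a run, assuming AttributorConfig is constructed from a CallGraphUpdater and that the caller already built Functions and InfoCache; the iteration cap of 32 is an arbitrary illustrative value.
#include "llvm/Transforms/IPO/Attributor.h"
#include "llvm/Transforms/Utils/CallGraphUpdater.h"
using namespace llvm;

static void runAttributorSketch(SetVector<Function *> &Functions,
                                InformationCache &InfoCache,
                                CallGraphUpdater &CGUpdater) {
  AttributorConfig AC(CGUpdater);
  AC.IsModulePass = true;                    // driven from a module pass
  AC.RewriteSignatures = false;              // keep function signatures stable
  AC.DefaultInitializeLiveInternals = false; // only seed the AAs we ask for
  AC.MaxFixpointIterations = 32;             // illustrative cap
  Attributor A(Functions, InfoCache, AC);
  (void)A.run();                             // iterate updateImpl()s to a fixpoint
}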
The fixpoint analysis framework that orchestrates the attribute deduction.
static bool isInternalizable(Function &F)
Returns true if the function F can be internalized.
std::function< std::optional< Constant * >(const GlobalVariable &, const AbstractAttribute *, bool &)> GlobalVariableSimplifictionCallbackTy
Register CB as a simplification callback.
std::function< bool(Attributor &, const AbstractAttribute *)> VirtualUseCallbackTy
static bool internalizeFunctions(SmallPtrSetImpl< Function * > &FnSet, DenseMap< Function *, Function * > &FnMap)
Make copies of each function in the set FnSet such that the copied version has internal linkage after...
std::function< std::optional< Value * >(const IRPosition &, const AbstractAttribute *, bool &)> SimplifictionCallbackTy
Register CB as a simplification callback.
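A minimal sketch of the internalization helpers declared above; internalizeCandidates is hypothetical.
#include "llvm/ADT/DenseMap.h"
#include "llvm/ADT/SmallPtrSet.h"
#include "llvm/IR/Module.h"
#include "llvm/Transforms/IPO/Attributor.h"
using namespace llvm;

static void internalizeCandidates(Module &M) {
  SmallPtrSet<Function *, 8> FnSet;
  for (Function &F : M)
    if (Attributor::isInternalizable(F))
      FnSet.insert(&F);
  DenseMap<Function *, Function *> FnMap; // original -> internalized copy
  if (Attributor::internalizeFunctions(FnSet, FnMap))
    for (auto &KV : FnMap)
      (void)KV; // KV.second is the copy with internal linkage
}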
Simple wrapper for a single bit (boolean) state.
Support structure for SCC passes to communicate updates the call graph back to the CGSCC pass manager...
Helper to describe and deal with positions in the LLVM-IR.
static const IRPosition callsite_returned(const CallBase &CB)
Create a position describing the returned value of CB.
static const IRPosition returned(const Function &F, const CallBaseContext *CBContext=nullptr)
Create a position describing the returned value of F.
static const IRPosition value(const Value &V, const CallBaseContext *CBContext=nullptr)
Create a position describing the value of V.
static const IRPosition inst(const Instruction &I, const CallBaseContext *CBContext=nullptr)
Create a position describing the instruction I.
@ IRP_ARGUMENT
An attribute for a function argument.
@ IRP_RETURNED
An attribute for the function return value.
@ IRP_CALL_SITE
An attribute for a call site (function scope).
@ IRP_CALL_SITE_RETURNED
An attribute for a call site return value.
@ IRP_FUNCTION
An attribute for a function (scope).
@ IRP_FLOAT
A position that is not associated with a spot suitable for attributes.
@ IRP_CALL_SITE_ARGUMENT
An attribute for a call site argument.
@ IRP_INVALID
An invalid position.
static const IRPosition function(const Function &F, const CallBaseContext *CBContext=nullptr)
Create a position describing the function scope of F.
Kind getPositionKind() const
Return the associated position kind.
static const IRPosition callsite_function(const CallBase &CB)
Create a position describing the function scope of CB.
Function * getAnchorScope() const
Return the Function surrounding the anchor value.
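A minimal sketch of the IRPosition factory functions listed above; positionsFor is hypothetical.
#include "llvm/IR/InstrTypes.h"
#include "llvm/Transforms/IPO/Attributor.h"
using namespace llvm;

static void positionsFor(Function &F, CallBase &CB, Value &V, Instruction &I) {
  IRPosition FnPos = IRPosition::function(F);           // function scope of F
  IRPosition RetPos = IRPosition::returned(F);          // return value of F
  IRPosition CSFn = IRPosition::callsite_function(CB);  // function scope of the call site
  IRPosition CSRet = IRPosition::callsite_returned(CB); // value returned by the call site
  IRPosition ValPos = IRPosition::value(V);             // the value V itself
  IRPosition InstPos = IRPosition::inst(I);             // the instruction I
  if (FnPos.getPositionKind() == IRPosition::IRP_FUNCTION)
    (void)FnPos.getAnchorScope();                       // the surrounding Function
  (void)RetPos; (void)CSFn; (void)CSRet; (void)ValPos; (void)InstPos;
}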
bool isValidState() const override
See AbstractState::isValidState() NOTE: For now we simply pretend that the worst possible state is in...
ChangeStatus indicatePessimisticFixpoint() override
See AbstractState::indicatePessimisticFixpoint(...)
Direction
An enum for the direction of the loop.
This struct is a compact representation of a valid (power of two) or undefined (0) alignment.
Description of a LLVM-IR insertion point (IP) and a debug/source location (filename,...
Helper to tie an abstract state implementation to an abstract attribute.
StateType & getState() override
See AbstractAttribute::getState(...).
Defines various target-specific GPU grid values that must be consistent between host RTL (plugin),...