#include "llvm/IR/IntrinsicsAMDGPU.h"
#include "llvm/IR/IntrinsicsNVPTX.h"

#define DEBUG_TYPE "openmp-opt"
71 "openmp-opt-disable",
cl::desc(
"Disable OpenMP specific optimizations."),
75 "openmp-opt-enable-merging",
81 cl::desc(
"Disable function internalization."),
92 "openmp-hide-memory-transfer-latency",
93 cl::desc(
"[WIP] Tries to hide the latency of host to device memory"
98 "openmp-opt-disable-deglobalization",
99 cl::desc(
"Disable OpenMP optimizations involving deglobalization."),
103 "openmp-opt-disable-spmdization",
104 cl::desc(
"Disable OpenMP optimizations involving SPMD-ization."),
108 "openmp-opt-disable-folding",
113 "openmp-opt-disable-state-machine-rewrite",
114 cl::desc(
"Disable OpenMP optimizations that replace the state machine."),
118 "openmp-opt-disable-barrier-elimination",
119 cl::desc(
"Disable OpenMP optimizations that eliminate barriers."),
123 "openmp-opt-print-module-after",
124 cl::desc(
"Print the current module after OpenMP optimizations."),
128 "openmp-opt-print-module-before",
129 cl::desc(
"Print the current module before OpenMP optimizations."),
133 "openmp-opt-inline-device",
144 cl::desc(
"Maximal number of attributor iterations."),
149 cl::desc(
"Maximum amount of shared memory to use."),
150 cl::init(std::numeric_limits<unsigned>::max()));
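// Illustrative only: these flags are consumed when the pass runs in the
// regular pipeline, so they are typically toggled from the command line. A
// minimal sketch (the flag spellings come from the declarations above; the
// exact pipeline invocation is an assumption, not taken from this file):
//
//   opt -passes=openmp-opt -openmp-opt-disable-spmdization \
//       -openmp-opt-print-module-after module.ll -S -o module.opt.ll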
153 "Number of OpenMP runtime calls deduplicated");
155 "Number of OpenMP parallel regions deleted");
157 "Number of OpenMP runtime functions identified");
159 "Number of OpenMP runtime function uses identified");
161 "Number of OpenMP target region entry points (=kernels) identified");
163 "Number of non-OpenMP target region kernels identified");
165 "Number of OpenMP target region entry points (=kernels) executed in "
166 "SPMD-mode instead of generic-mode");
167STATISTIC(NumOpenMPTargetRegionKernelsWithoutStateMachine,
168 "Number of OpenMP target region entry points (=kernels) executed in "
169 "generic-mode without a state machines");
170STATISTIC(NumOpenMPTargetRegionKernelsCustomStateMachineWithFallback,
171 "Number of OpenMP target region entry points (=kernels) executed in "
172 "generic-mode with customized state machines with fallback");
173STATISTIC(NumOpenMPTargetRegionKernelsCustomStateMachineWithoutFallback,
174 "Number of OpenMP target region entry points (=kernels) executed in "
175 "generic-mode with customized state machines without fallback");
177 NumOpenMPParallelRegionsReplacedInGPUStateMachine,
178 "Number of OpenMP parallel regions replaced with ID in GPU state machines");
180 "Number of OpenMP parallel regions merged");
182 "Amount of memory pushed to shared memory");
183STATISTIC(NumBarriersEliminated,
"Number of redundant barriers eliminated");
namespace KernelInfo {

#define KERNEL_ENVIRONMENT_IDX(MEMBER, IDX)                                    \
  constexpr const unsigned MEMBER##Idx = IDX;

#undef KERNEL_ENVIRONMENT_IDX

#define KERNEL_ENVIRONMENT_CONFIGURATION_IDX(MEMBER, IDX)                      \
  constexpr const unsigned MEMBER##Idx = IDX;

#undef KERNEL_ENVIRONMENT_CONFIGURATION_IDX

#define KERNEL_ENVIRONMENT_GETTER(MEMBER, RETURNTYPE)                          \
  RETURNTYPE *get##MEMBER##FromKernelEnvironment(ConstantStruct *KernelEnvC) { \
    return cast<RETURNTYPE>(KernelEnvC->getAggregateElement(MEMBER##Idx));     \
  }

#undef KERNEL_ENVIRONMENT_GETTER

#define KERNEL_ENVIRONMENT_CONFIGURATION_GETTER(MEMBER)                        \
  ConstantInt *get##MEMBER##FromKernelEnvironment(                             \
      ConstantStruct *KernelEnvC) {                                            \
    ConstantStruct *ConfigC =                                                  \
        getConfigurationFromKernelEnvironment(KernelEnvC);                     \
    return dyn_cast<ConstantInt>(ConfigC->getAggregateElement(MEMBER##Idx));   \
  }

#undef KERNEL_ENVIRONMENT_CONFIGURATION_GETTER

GlobalVariable *
getKernelEnvironementGVFromKernelInitCB(CallBase *KernelInitCB) {
  constexpr const int InitKernelEnvironmentArgNo = 0;
  return cast<GlobalVariable>(
      KernelInitCB->getArgOperand(InitKernelEnvironmentArgNo)
          ->stripPointerCasts());
}

} // namespace KernelInfo
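// Illustrative only: for a configuration member such as ExecMode, the
// KERNEL_ENVIRONMENT_CONFIGURATION_GETTER macro above expands roughly to the
// following (a sketch of the expansion, not a separate definition):
//
//   ConstantInt *getExecModeFromKernelEnvironment(ConstantStruct *KernelEnvC) {
//     ConstantStruct *ConfigC =
//         getConfigurationFromKernelEnvironment(KernelEnvC);
//     return dyn_cast<ConstantInt>(ConfigC->getAggregateElement(ExecModeIdx));
//   }
//
// which is how calls such as KernelInfo::getExecModeFromKernelEnvironment(...)
// used later in this file are produced.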
struct AAHeapToShared;

/// OpenMP-specific information cache. It stores the runtime function and
/// internal control variable descriptions needed by the Attributor runs.
struct OMPInformationCache : public InformationCache {
  OMPInformationCache(Module &M, AnalysisGetter &AG,
                      BumpPtrAllocator &Allocator, SetVector<Function *> *CGSCC,
                      bool OpenMPPostLink)
      : InformationCache(M, AG, Allocator, CGSCC), OMPBuilder(M),
        OpenMPPostLink(OpenMPPostLink) {

    const Triple T(OMPBuilder.M.getTargetTriple());
    switch (T.getArch()) {
    case llvm::Triple::nvptx:
    case llvm::Triple::nvptx64:
    case llvm::Triple::amdgcn:
      assert(OMPBuilder.Config.IsTargetDevice &&
             "OpenMP AMDGPU/NVPTX is only prepared to deal with device code.");
      OMPBuilder.Config.IsGPU = true;
      break;
    default:
      OMPBuilder.Config.IsGPU = false;
      break;
    }
    OMPBuilder.initialize();
    initializeRuntimeFunctions(M);
    initializeInternalControlVars();
  }
  /// Generic information that describes an internal control variable.
  struct InternalControlVarInfo {
    InternalControlVar Kind;   // The kind, as described by the enum.
    StringRef Name;            // The name of the ICV.
    StringRef EnvVarName;      // Environment variable associated with this ICV.
    ICVInitValue InitKind;     // Initial value kind.
    ConstantInt *InitValue;    // Initial value.
    RuntimeFunction Setter;    // Setter RTL function associated with this ICV.
    RuntimeFunction Getter;    // Getter RTL function associated with this ICV.
    InternalControlVar Clause; // RTL function of the override clause.
  };

  /// Generic information that describes a runtime function.
  struct RuntimeFunctionInfo {
    RuntimeFunction Kind;                 // The kind, as described by the enum.
    StringRef Name;                       // The name of the function.
    bool IsVarArg;                        // Flag to indicate a variadic function.
    Type *ReturnType;                     // The return type of the function.
    SmallVector<Type *, 8> ArgumentTypes; // The argument types of the function.
    Function *Declaration = nullptr;      // The declaration if available.

    /// Uses of this runtime function per function containing the use.
    using UseVector = SmallVector<Use *, 16>;

    /// Clear UsesMap for this runtime function.
    void clearUsesMap() { UsesMap.clear(); }

    /// Boolean conversion that is true if the runtime function was found.
    operator bool() const { return Declaration; }

    /// Return the vector of uses in function \p F.
    UseVector &getOrCreateUseVector(Function *F) {
      std::shared_ptr<UseVector> &UV = UsesMap[F];
      if (!UV)
        UV = std::make_shared<UseVector>();
      return *UV;
    }

    /// Return the vector of uses in function \p F or `nullptr` if there are
    /// none.
    const UseVector *getUseVector(Function &F) const {
      auto I = UsesMap.find(&F);
      if (I != UsesMap.end())
        return I->second.get();
      return nullptr;
    }

    /// Return how many functions contain uses of this runtime function.
    size_t getNumFunctionsWithUses() const { return UsesMap.size(); }

    /// Return the number of arguments (or the minimal number for variadic
    /// functions).
    size_t getNumArgs() const { return ArgumentTypes.size(); }

    /// Run the callback \p CB on each use within function \p F and forget the
    /// use if the callback returns true.
    void foreachUse(function_ref<bool(Use &, Function &)> CB, Function *F) {
      SmallVector<unsigned, 8> ToBeDeleted;
      unsigned Idx = 0;
      UseVector &UV = getOrCreateUseVector(F);
      for (Use *U : UV) {
        if (CB(*U, *F))
          ToBeDeleted.push_back(Idx);
        ++Idx;
      }
      // Remove the to-be-deleted indices in reverse order as prior
      // modifications will not modify the smaller indices.
      while (!ToBeDeleted.empty()) {
        unsigned Idx = ToBeDeleted.pop_back_val();
        UV[Idx] = UV.back();
        UV.pop_back();
      }
    }

    /// Map from functions to all uses of this runtime function contained in
    /// them.
    DenseMap<Function *, std::shared_ptr<UseVector>> UsesMap;

    /// Iterators for the uses of this runtime function, keyed by function.
    decltype(UsesMap)::iterator begin() { return UsesMap.begin(); }
    decltype(UsesMap)::iterator end() { return UsesMap.end(); }
  };

  /// An OpenMP-IR-Builder instance.
  OpenMPIRBuilder OMPBuilder;

  /// Map from runtime function kind to the runtime function description.
  EnumeratedArray<RuntimeFunctionInfo, RuntimeFunction,
                  RuntimeFunction::OMPRTL___last>
      RFIs;

  /// Map from ICV kind to the ICV description.
  EnumeratedArray<InternalControlVarInfo, InternalControlVar,
                  InternalControlVar::ICV___last>
      ICVs;
  /// Initialize the ICV map with knowledge about the OpenMP runtime.
  void initializeInternalControlVars() {
#define ICV_RT_SET(_Name, RTL)                                                 \
  {                                                                            \
    auto &ICV = ICVs[_Name];                                                   \
    ICV.Setter = RTL;                                                          \
  }
#define ICV_RT_GET(Name, RTL)                                                  \
  {                                                                            \
    auto &ICV = ICVs[Name];                                                    \
    ICV.Getter = RTL;                                                          \
  }
#define ICV_DATA_ENV(Enum, _Name, _EnvVarName, Init)                           \
  {                                                                            \
    auto &ICV = ICVs[Enum];                                                    \
    ICV.InitKind = Init;                                                       \
    ICV.EnvVarName = _EnvVarName;                                              \
    switch (ICV.InitKind) {                                                    \
    case ICV_IMPLEMENTATION_DEFINED:                                           \
      ICV.InitValue = nullptr;                                                 \
      break;                                                                   \
    case ICV_ZERO:                                                             \
      ICV.InitValue = ConstantInt::get(                                        \
          Type::getInt32Ty(OMPBuilder.Int32->getContext()), 0);                \
      break;                                                                   \
    case ICV_FALSE:                                                            \
      ICV.InitValue = ConstantInt::getFalse(OMPBuilder.Int1->getContext());    \
      break;                                                                   \
    }                                                                          \
  }
#include "llvm/Frontend/OpenMP/OMPKinds.def"
  }
  /// Returns true if the function declaration \p F matches the runtime
  /// function types, that is, return type \p RTFRetType and argument types
  /// \p RTFArgTypes.
  static bool declMatchesRTFTypes(Function *F, Type *RTFRetType,
                                  SmallVector<Type *, 8> &RTFArgTypes) {
    if (!F)
      return false;
    if (F->getReturnType() != RTFRetType)
      return false;
    if (F->arg_size() != RTFArgTypes.size())
      return false;

    auto *RTFTyIt = RTFArgTypes.begin();
    for (Argument &Arg : F->args()) {
      if (Arg.getType() != *RTFTyIt)
        return false;
      ++RTFTyIt;
    }

    return true;
  }
  /// Helper to collect the uses of the runtime function described by \p RFI.
  unsigned collectUses(RuntimeFunctionInfo &RFI, bool CollectStats = true) {
    unsigned NumUses = 0;
    if (!RFI.Declaration)
      return NumUses;

    if (CollectStats) {
      NumOpenMPRuntimeFunctionsIdentified += 1;
      NumOpenMPRuntimeFunctionUsesIdentified += RFI.Declaration->getNumUses();
    }

    for (Use &U : RFI.Declaration->uses()) {
      if (Instruction *UserI = dyn_cast<Instruction>(U.getUser())) {
        if (!CGSCC || CGSCC->empty() || CGSCC->contains(UserI->getFunction())) {
          RFI.getOrCreateUseVector(UserI->getFunction()).push_back(&U);
          ++NumUses;
        }
      } else {
        RFI.getOrCreateUseVector(nullptr).push_back(&U);
        ++NumUses;
      }
    }
    return NumUses;
  }

  // Helper function to recollect uses of a single runtime function.
  void recollectUsesForFunction(RuntimeFunction RTF) {
    auto &RFI = RFIs[RTF];
    RFI.clearUsesMap();
    collectUses(RFI, /*CollectStats*/ false);
  }

  // Helper function to recollect uses of all runtime functions.
  void recollectUses() {
    for (int Idx = 0; Idx < RFIs.size(); ++Idx)
      recollectUsesForFunction(static_cast<RuntimeFunction>(Idx));
  }

  /// Return true if none of the runtime functions in \p Fns is an unresolved
  /// external declaration, i.e. all of them are usable by the pass.
  bool runtimeFnsAvailable(ArrayRef<RuntimeFunction> Fns) {
    for (RuntimeFunction Fn : Fns) {
      RuntimeFunctionInfo &RFI = RFIs[Fn];

      if (RFI.Declaration && RFI.Declaration->isDeclaration())
        return false;
    }
    return true;
  }
  /// Helper to initialize all runtime function information for those defined
  /// in OpenMPKinds.def.
  void initializeRuntimeFunctions(Module &M) {

    // Helper macros for handling __VA_ARGS__ in OMP_RTL.
#define OMP_TYPE(VarName, ...)                                                 \
  Type *VarName = OMPBuilder.VarName;                                          \
  (void)VarName;

#define OMP_ARRAY_TYPE(VarName, ...)                                           \
  ArrayType *VarName##Ty = OMPBuilder.VarName##Ty;                             \
  (void)VarName##Ty;                                                           \
  PointerType *VarName##PtrTy = OMPBuilder.VarName##PtrTy;                     \
  (void)VarName##PtrTy;

#define OMP_FUNCTION_TYPE(VarName, ...)                                        \
  FunctionType *VarName = OMPBuilder.VarName;                                  \
  (void)VarName;                                                               \
  PointerType *VarName##Ptr = OMPBuilder.VarName##Ptr;                         \
  (void)VarName##Ptr;

#define OMP_STRUCT_TYPE(VarName, ...)                                          \
  StructType *VarName = OMPBuilder.VarName;                                    \
  (void)VarName;                                                               \
  PointerType *VarName##Ptr = OMPBuilder.VarName##Ptr;                         \
  (void)VarName##Ptr;

#define OMP_RTL(_Enum, _Name, _IsVarArg, _ReturnType, ...)                     \
  {                                                                            \
    SmallVector<Type *, 8> ArgsTypes({__VA_ARGS__});                           \
    Function *F = M.getFunction(_Name);                                        \
    RTLFunctions.insert(F);                                                    \
    if (declMatchesRTFTypes(F, OMPBuilder._ReturnType, ArgsTypes)) {           \
      RuntimeFunctionIDMap[F] = _Enum;                                         \
      auto &RFI = RFIs[_Enum];                                                 \
      RFI.Kind = _Enum;                                                        \
      RFI.Name = _Name;                                                        \
      RFI.IsVarArg = _IsVarArg;                                                \
      RFI.ReturnType = OMPBuilder._ReturnType;                                 \
      RFI.ArgumentTypes = std::move(ArgsTypes);                                \
      RFI.Declaration = F;                                                     \
      unsigned NumUses = collectUses(RFI);                                     \
      (void)NumUses;                                                           \
      LLVM_DEBUG({                                                             \
        dbgs() << TAG << RFI.Name << (RFI.Declaration ? "" : " not")           \
               << " found\n";                                                  \
        if (RFI.Declaration)                                                   \
          dbgs() << TAG << "-> got " << NumUses << " uses in "                 \
                 << RFI.getNumFunctionsWithUses()                              \
                 << " different functions.\n";                                 \
      });                                                                      \
    }                                                                          \
  }
#include "llvm/Frontend/OpenMP/OMPKinds.def"

    // Remove the `noinline` attribute from `__kmpc`, `ompx::`, and `omp_`
    // functions, except if `optnone` is present.
    for (Function &F : M)
      for (StringRef Prefix : {"__kmpc", "_ZN4ompx", "omp_"})
        if (F.hasFnAttribute(Attribute::NoInline) &&
            F.getName().starts_with(Prefix) &&
            !F.hasFnAttribute(Attribute::OptimizeNone))
          F.removeFnAttr(Attribute::NoInline);
  }
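// Illustrative only: for a runtime function such as __kmpc_barrier, the
// OMP_RTL expansion above behaves roughly like this hand-written sketch (the
// argument and return types are assumptions matching OMPKinds.def):
//
//   SmallVector<Type *, 8> ArgsTypes({OMPBuilder.IdentPtr, OMPBuilder.Int32});
//   Function *F = M.getFunction("__kmpc_barrier");
//   RTLFunctions.insert(F);
//   if (declMatchesRTFTypes(F, OMPBuilder.Void, ArgsTypes)) {
//     RuntimeFunctionIDMap[F] = OMPRTL___kmpc_barrier;
//     auto &RFI = RFIs[OMPRTL___kmpc_barrier];
//     RFI.Declaration = F;
//     collectUses(RFI);
//   }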
  /// Flag that indicates if LLVM is used post link.
  bool OpenMPPostLink = false;
};

template <typename Ty, bool InsertInvalidates = true>
struct BooleanStateWithSetVector : public BooleanState {
  bool contains(const Ty &Elem) const { return Set.contains(Elem); }
  bool insert(const Ty &Elem) {
    if (InsertInvalidates)
      BooleanState::indicatePessimisticFixpoint();
    return Set.insert(Elem);
  }

  const Ty &operator[](int Idx) const { return Set[Idx]; }
  bool operator==(const BooleanStateWithSetVector &RHS) const {
    return BooleanState::operator==(RHS) && Set == RHS.Set;
  }
  bool operator!=(const BooleanStateWithSetVector &RHS) const {
    return !(*this == RHS);
  }

  bool empty() const { return Set.empty(); }
  size_t size() const { return Set.size(); }

  /// "Clamp" this state with \p RHS.
  BooleanStateWithSetVector &operator^=(const BooleanStateWithSetVector &RHS) {
    BooleanState::operator^=(RHS);
    Set.insert(RHS.Set.begin(), RHS.Set.end());
    return *this;
  }

private:
  /// A set to keep track of elements.
  SetVector<Ty> Set;

public:
  typename decltype(Set)::iterator begin() { return Set.begin(); }
  typename decltype(Set)::iterator end() { return Set.end(); }
};

template <typename Ty, bool InsertInvalidates = true>
using BooleanStateWithPtrSetVector =
    BooleanStateWithSetVector<Ty *, InsertInvalidates>;
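// Illustrative only: the wrapper couples a BooleanState (assumed/known
// validity) with a set, so an abstract attribute can accumulate facts and
// signal pessimism in one object. A usage sketch with a hypothetical call
// site CB:
//
//   BooleanStateWithPtrSetVector<CallBase> Reached;
//   Reached.insert(CB); // records CB; invalidates the boolean state if
//                       // InsertInvalidates is true
//   if (!Reached.empty())
//     for (CallBase *ReachedCB : Reached)
//       (void)ReachedCB;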
struct KernelInfoState : AbstractState {
  /// Flag to track if we reached a fixpoint.
  bool IsAtFixpoint = false;

  /// The parallel regions (identified by the outlined parallel functions) that
  /// can be reached from the associated function.
  BooleanStateWithPtrSetVector<CallBase, /* InsertInvalidates */ false>
      ReachedKnownParallelRegions;

  /// State to track what parallel regions we might reach.
  BooleanStateWithPtrSetVector<CallBase> ReachedUnknownParallelRegions;

  /// State to track if we are in SPMD-mode, assumed or known, and why we
  /// decided we cannot be.
  BooleanStateWithPtrSetVector<Instruction, false> SPMDCompatibilityTracker;

  /// The __kmpc_target_init call in this kernel, if any.
  CallBase *KernelInitCB = nullptr;

  /// The __kmpc_target_deinit call in this kernel, if any.
  CallBase *KernelDeinitCB = nullptr;

  /// The constant kernel environment as taken from and passed to
  /// __kmpc_target_init.
  ConstantStruct *KernelEnvC = nullptr;

  /// Flag to indicate if the associated function is a kernel entry point.
  bool IsKernelEntry = false;

  /// State to track what kernel entries can reach the associated function.
  BooleanStateWithPtrSetVector<Function, false> ReachingKernelEntries;

  /// State to track the parallel levels we might be in.
  BooleanStateWithSetVector<uint8_t> ParallelLevels;

  /// Flag that indicates if the kernel has nested parallelism.
  bool NestedParallelism = false;

  /// Abstract State interface
  ///{

  KernelInfoState() = default;
  KernelInfoState(bool BestState) {
    if (!BestState)
      indicatePessimisticFixpoint();
  }

  /// See AbstractState::isValidState(...)
  bool isValidState() const override { return true; }

  /// See AbstractState::isAtFixpoint(...)
  bool isAtFixpoint() const override { return IsAtFixpoint; }

  /// See AbstractState::indicatePessimisticFixpoint(...)
  ChangeStatus indicatePessimisticFixpoint() override {
    IsAtFixpoint = true;
    ParallelLevels.indicatePessimisticFixpoint();
    ReachingKernelEntries.indicatePessimisticFixpoint();
    SPMDCompatibilityTracker.indicatePessimisticFixpoint();
    ReachedKnownParallelRegions.indicatePessimisticFixpoint();
    ReachedUnknownParallelRegions.indicatePessimisticFixpoint();
    NestedParallelism = true;
    return ChangeStatus::CHANGED;
  }

  /// See AbstractState::indicateOptimisticFixpoint(...)
  ChangeStatus indicateOptimisticFixpoint() override {
    IsAtFixpoint = true;
    ParallelLevels.indicateOptimisticFixpoint();
    ReachingKernelEntries.indicateOptimisticFixpoint();
    SPMDCompatibilityTracker.indicateOptimisticFixpoint();
    ReachedKnownParallelRegions.indicateOptimisticFixpoint();
    ReachedUnknownParallelRegions.indicateOptimisticFixpoint();
    return ChangeStatus::UNCHANGED;
  }

  /// Return the assumed state.
  KernelInfoState &getAssumed() { return *this; }
  const KernelInfoState &getAssumed() const { return *this; }

  bool operator==(const KernelInfoState &RHS) const {
    if (SPMDCompatibilityTracker != RHS.SPMDCompatibilityTracker)
      return false;
    if (ReachedKnownParallelRegions != RHS.ReachedKnownParallelRegions)
      return false;
    if (ReachedUnknownParallelRegions != RHS.ReachedUnknownParallelRegions)
      return false;
    if (ReachingKernelEntries != RHS.ReachingKernelEntries)
      return false;
    if (ParallelLevels != RHS.ParallelLevels)
      return false;
    if (NestedParallelism != RHS.NestedParallelism)
      return false;
    return true;
  }

  /// Returns true if this kernel contains any OpenMP parallel regions.
  bool mayContainParallelRegion() {
    return !ReachedKnownParallelRegions.empty() ||
           !ReachedUnknownParallelRegions.empty();
  }

  /// Return empty set as the best state of potential values.
  static KernelInfoState getBestState() { return KernelInfoState(true); }

  static KernelInfoState getBestState(KernelInfoState &KIS) {
    return getBestState();
  }

  /// Return full set as the worst state of potential values.
  static KernelInfoState getWorstState() { return KernelInfoState(false); }

  /// "Clamp" this state with \p KIS.
  KernelInfoState operator^=(const KernelInfoState &KIS) {
    // Do not merge two different _init and _deinit call sites.
    if (KIS.KernelInitCB) {
      if (KernelInitCB && KernelInitCB != KIS.KernelInitCB)
        llvm_unreachable("Kernel that calls another kernel violates OpenMP-Opt "
                         "assumptions.");
      KernelInitCB = KIS.KernelInitCB;
    }
    if (KIS.KernelDeinitCB) {
      if (KernelDeinitCB && KernelDeinitCB != KIS.KernelDeinitCB)
        llvm_unreachable("Kernel that calls another kernel violates OpenMP-Opt "
                         "assumptions.");
      KernelDeinitCB = KIS.KernelDeinitCB;
    }
    if (KIS.KernelEnvC) {
      if (KernelEnvC && KernelEnvC != KIS.KernelEnvC)
        llvm_unreachable("Kernel that calls another kernel violates OpenMP-Opt "
                         "assumptions.");
      KernelEnvC = KIS.KernelEnvC;
    }
    SPMDCompatibilityTracker ^= KIS.SPMDCompatibilityTracker;
    ReachedKnownParallelRegions ^= KIS.ReachedKnownParallelRegions;
    ReachedUnknownParallelRegions ^= KIS.ReachedUnknownParallelRegions;
    NestedParallelism |= KIS.NestedParallelism;
    return *this;
  }

  KernelInfoState operator&=(const KernelInfoState &KIS) {
    return (*this ^= KIS);
  }

  ///}
};
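// Illustrative only: operator^= above is the merge ("clamp") applied when
// kernel information flows across call edges. A sketch with hypothetical
// states:
//
//   KernelInfoState Caller = KernelInfoState::getBestState();
//   KernelInfoState Callee;
//   Callee.NestedParallelism = true;  // callee observed nested parallelism
//   Caller ^= Callee;                 // caller now assumes nested parallelism
//                                     // and unions the reached parallel regions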
/// Used to map the values physically (in the IR) stored in an offload
/// array, to a vector in memory.
struct OffloadArray {
  AllocaInst *Array = nullptr;             // Physical array (in the IR).
  SmallVector<Value *, 8> StoredValues;    // Mapped values.
  SmallVector<StoreInst *, 8> LastAccesses; // Last stores made to the array.

  OffloadArray() = default;

  /// Initializes the OffloadArray with the values stored in \p Array before
  /// instruction \p Before is reached. Returns false if initialization fails.
  bool initialize(AllocaInst &Array, Instruction &Before) {
    if (!Array.getAllocatedType()->isArrayTy())
      return false;

    if (!getValues(Array, Before))
      return false;

    this->Array = &Array;
    return true;
  }

  static const unsigned DeviceIDArgNum = 1;
  static const unsigned BasePtrsArgNum = 3;
  static const unsigned PtrsArgNum = 4;
  static const unsigned SizesArgNum = 5;

private:
  /// Traverses the BasicBlock where \p Array is, collecting the stores made to
  /// \p Array before the instruction \p Before is reached.
  bool getValues(AllocaInst &Array, Instruction &Before) {
    // Initialize containers.
    const uint64_t NumValues = Array.getAllocatedType()->getArrayNumElements();
    StoredValues.assign(NumValues, nullptr);
    LastAccesses.assign(NumValues, nullptr);

    // This assumes the instruction \p Before is in the same BasicBlock as
    // \p Array.
    BasicBlock *BB = Array.getParent();
    if (BB != Before.getParent())
      return false;

    const DataLayout &DL = Array.getModule()->getDataLayout();
    const unsigned int PointerSize = DL.getPointerSize();

    for (Instruction &I : *BB) {
      if (&I == &Before)
        break;

      if (!isa<StoreInst>(&I))
        continue;

      auto *S = cast<StoreInst>(&I);
      int64_t Offset = -1;
      auto *Dst =
          GetPointerBaseWithConstantOffset(S->getPointerOperand(), Offset, DL);
      if (Dst == &Array) {
        int64_t Idx = Offset / PointerSize;
        StoredValues[Idx] = getUnderlyingObject(S->getValueOperand());
        LastAccesses[Idx] = S;
      }
    }

    return isFilled();
  }

  /// Returns true if all values in StoredValues and LastAccesses are non-null.
  bool isFilled() {
    const unsigned NumValues = StoredValues.size();
    for (unsigned I = 0; I < NumValues; ++I) {
      if (!StoredValues[I] || !LastAccesses[I])
        return false;
    }
    return true;
  }
};
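// Illustrative only: the *ArgNum constants above index into a
// __tgt_target_data_begin_mapper call, which (simplified) has the shape
//
//   call void @__tgt_target_data_begin_mapper(%ident, %device_id, %num_args,
//                                             %base_ptrs, %ptrs, %sizes, ...)
//
// so OffloadArray::initialize is applied to the allocas feeding the base
// pointers, pointers, and sizes operands (argument positions 3, 4, and 5).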
struct OpenMPOpt {

  using OptimizationRemarkGetter =
      function_ref<OptimizationRemarkEmitter &(Function *)>;

  OpenMPOpt(SmallVectorImpl<Function *> &SCC, CallGraphUpdater &CGUpdater,
            OptimizationRemarkGetter OREGetter,
            OMPInformationCache &OMPInfoCache, Attributor &A)
      : M(*(*SCC.begin())->getParent()), SCC(SCC), CGUpdater(CGUpdater),
        OREGetter(OREGetter), OMPInfoCache(OMPInfoCache), A(A) {}

  /// Check if any remarks are enabled for openmp-opt.
  bool remarksEnabled() {
    auto &Ctx = M.getContext();
    return Ctx.getDiagHandlerPtr()->isAnyRemarkEnabled(DEBUG_TYPE);
  }

  /// Run all OpenMP optimizations on the underlying SCC.
  bool run(bool IsModulePass) {
    if (SCC.empty())
      return false;

    bool Changed = false;

    if (IsModulePass) {
      Changed |= runAttributor(IsModulePass);

      // Recollect uses, in case Attributor deleted any.
      OMPInfoCache.recollectUses();

      Changed |= rewriteDeviceCodeStateMachine();

      if (remarksEnabled())
        analysisGlobalization();
    } else {
      Changed |= runAttributor(IsModulePass);

      // Recollect uses, in case Attributor deleted any.
      OMPInfoCache.recollectUses();

      Changed |= deleteParallelRegions();

      if (HideMemoryTransferLatency)
        Changed |= hideMemTransfersLatency();
      Changed |= deduplicateRuntimeCalls();
      if (EnableParallelRegionMerging) {
        if (mergeParallelRegions()) {
          deduplicateRuntimeCalls();
          Changed = true;
        }
      }
    }

    if (OMPInfoCache.OpenMPPostLink)
      Changed |= removeRuntimeSymbols();

    return Changed;
  }
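  // Note: the module-pass path above focuses on device-code rewrites (custom
  // state machines, globalization analysis), while the CGSCC path performs the
  // host-side cleanups (parallel-region deletion and merging, runtime-call
  // deduplication, hiding memory-transfer latency). Both paths run the
  // Attributor first and recollect runtime-function uses afterwards.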
1004 void printICVs()
const {
1009 for (
auto ICV : ICVs) {
1010 auto ICVInfo = OMPInfoCache.ICVs[ICV];
1012 return ORA <<
"OpenMP ICV " <<
ore::NV(
"OpenMPICV", ICVInfo.Name)
1014 << (ICVInfo.InitValue
1015 ?
toString(ICVInfo.InitValue->getValue(), 10,
true)
1016 :
"IMPLEMENTATION_DEFINED");
1019 emitRemark<OptimizationRemarkAnalysis>(
F,
"OpenMPICVTracker",
Remark);
1025 void printKernels()
const {
1031 return ORA <<
"OpenMP GPU kernel "
1032 <<
ore::NV(
"OpenMPGPUKernel",
F->getName()) <<
"\n";
1035 emitRemark<OptimizationRemarkAnalysis>(
F,
"OpenMPGPU",
Remark);
1041 static CallInst *getCallIfRegularCall(
1042 Use &U, OMPInformationCache::RuntimeFunctionInfo *RFI =
nullptr) {
1043 CallInst *CI = dyn_cast<CallInst>(
U.getUser());
1053 static CallInst *getCallIfRegularCall(
1054 Value &V, OMPInformationCache::RuntimeFunctionInfo *RFI =
nullptr) {
1055 CallInst *CI = dyn_cast<CallInst>(&V);
1065 bool mergeParallelRegions() {
1066 const unsigned CallbackCalleeOperand = 2;
1067 const unsigned CallbackFirstArgOperand = 3;
1071 OMPInformationCache::RuntimeFunctionInfo &RFI =
1072 OMPInfoCache.RFIs[OMPRTL___kmpc_fork_call];
1074 if (!RFI.Declaration)
1078 OMPInformationCache::RuntimeFunctionInfo UnmergableCallsInfo[] = {
1079 OMPInfoCache.RFIs[OMPRTL___kmpc_push_proc_bind],
1080 OMPInfoCache.RFIs[OMPRTL___kmpc_push_num_threads],
1083 bool Changed =
false;
1089 BasicBlock *StartBB =
nullptr, *EndBB =
nullptr;
1090 auto BodyGenCB = [&](InsertPointTy AllocaIP, InsertPointTy CodeGenIP) {
1091 BasicBlock *CGStartBB = CodeGenIP.getBlock();
1093 SplitBlock(CGStartBB, &*CodeGenIP.getPoint(), DT, LI);
1094 assert(StartBB !=
nullptr &&
"StartBB should not be null");
1096 assert(EndBB !=
nullptr &&
"EndBB should not be null");
1097 EndBB->getTerminator()->setSuccessor(0, CGEndBB);
1101 auto PrivCB = [&](InsertPointTy AllocaIP, InsertPointTy CodeGenIP,
Value &,
1102 Value &Inner,
Value *&ReplacementValue) -> InsertPointTy {
1103 ReplacementValue = &Inner;
1107 auto FiniCB = [&](InsertPointTy CodeGenIP) {
return Error::success(); };
1111 auto CreateSequentialRegion = [&](
Function *OuterFn,
1119 SplitBlock(ParentBB, SeqEndI->getNextNode(), DT, LI);
1123 SplitBlock(ParentBB, SeqStartI, DT, LI,
nullptr,
"seq.par.merged");
1126 "Expected a different CFG");
1130 auto BodyGenCB = [&](InsertPointTy AllocaIP, InsertPointTy CodeGenIP) {
1131 BasicBlock *CGStartBB = CodeGenIP.getBlock();
1133 SplitBlock(CGStartBB, &*CodeGenIP.getPoint(), DT, LI);
1134 assert(SeqStartBB !=
nullptr &&
"SeqStartBB should not be null");
1136 assert(SeqEndBB !=
nullptr &&
"SeqEndBB should not be null");
1140 auto FiniCB = [&](InsertPointTy CodeGenIP) {
return Error::success(); };
1146 for (
User *Usr :
I.users()) {
1154 OutsideUsers.
insert(&UsrI);
1157 if (OutsideUsers.
empty())
1164 I.getType(),
DL.getAllocaAddrSpace(),
nullptr,
1165 I.getName() +
".seq.output.alloc", OuterFn->
front().
begin());
1169 new StoreInst(&
I, AllocaI, SeqStartBB->getTerminator()->getIterator());
1175 I.getName() +
".seq.output.load",
1182 InsertPointTy(ParentBB, ParentBB->
end()),
DL);
1184 OMPInfoCache.OMPBuilder.createMaster(Loc, BodyGenCB, FiniCB));
1186 OMPInfoCache.OMPBuilder.createBarrier(SeqAfterIP, OMPD_parallel));
1205 assert(MergableCIs.
size() > 1 &&
"Assumed multiple mergable CIs");
1208 OR <<
"Parallel region merged with parallel region"
1209 << (MergableCIs.
size() > 2 ?
"s" :
"") <<
" at ";
1212 if (CI != MergableCIs.
back())
1218 emitRemark<OptimizationRemark>(MergableCIs.
front(),
"OMP150",
Remark);
1222 <<
" parallel regions in " << OriginalFn->
getName()
1226 EndBB =
SplitBlock(BB, MergableCIs.
back()->getNextNode(), DT, LI);
1228 SplitBlock(EndBB, &*EndBB->getFirstInsertionPt(), DT, LI);
1232 assert(BB->getUniqueSuccessor() == StartBB &&
"Expected a different CFG");
1233 const DebugLoc DL = BB->getTerminator()->getDebugLoc();
1238 for (
auto *It = MergableCIs.
begin(), *
End = MergableCIs.
end() - 1;
1247 CreateSequentialRegion(OriginalFn, BB, ForkCI->
getNextNode(),
1259 cantFail(OMPInfoCache.OMPBuilder.createParallel(
1260 Loc, AllocaIP, BodyGenCB, PrivCB, FiniCB,
nullptr,
nullptr,
1261 OMP_PROC_BIND_default,
false));
1265 OMPInfoCache.OMPBuilder.finalize(OriginalFn);
1272 for (
auto *CI : MergableCIs) {
1274 FunctionType *FT = OMPInfoCache.OMPBuilder.ParallelTask;
1278 for (
unsigned U = CallbackFirstArgOperand, E = CI->
arg_size(); U < E;
1288 for (
unsigned U = CallbackFirstArgOperand, E = CI->
arg_size(); U < E;
1292 U - (CallbackFirstArgOperand - CallbackCalleeOperand), A);
1295 if (CI != MergableCIs.back()) {
1298 cantFail(OMPInfoCache.OMPBuilder.createBarrier(
1307 assert(OutlinedFn != OriginalFn &&
"Outlining failed");
1308 CGUpdater.registerOutlinedFunction(*OriginalFn, *OutlinedFn);
1309 CGUpdater.reanalyzeFunction(*OriginalFn);
1311 NumOpenMPParallelRegionsMerged += MergableCIs.size();
1319 CallInst *CI = getCallIfRegularCall(U, &RFI);
1326 RFI.foreachUse(SCC, DetectPRsCB);
1332 for (
auto &It : BB2PRMap) {
1333 auto &CIs = It.getSecond();
1348 auto IsMergable = [&](
Instruction &
I,
bool IsBeforeMergableRegion) {
1351 if (
I.isTerminator())
1354 if (!isa<CallInst>(&
I))
1358 if (IsBeforeMergableRegion) {
1360 if (!CalledFunction)
1367 for (
const auto &RFI : UnmergableCallsInfo) {
1368 if (CalledFunction == RFI.Declaration)
1376 if (!isa<IntrinsicInst>(CI))
1387 if (CIs.count(&
I)) {
1393 if (IsMergable(
I, MergableCIs.
empty()))
1398 for (; It !=
End; ++It) {
1400 if (CIs.count(&SkipI)) {
1402 <<
" due to " <<
I <<
"\n");
1409 if (MergableCIs.
size() > 1) {
1410 MergableCIsVector.
push_back(MergableCIs);
1412 <<
" parallel regions in block " << BB->
getName()
1417 MergableCIs.
clear();
1420 if (!MergableCIsVector.
empty()) {
1423 for (
auto &MergableCIs : MergableCIsVector)
1424 Merge(MergableCIs, BB);
1425 MergableCIsVector.clear();
1432 OMPInfoCache.recollectUsesForFunction(OMPRTL___kmpc_fork_call);
1433 OMPInfoCache.recollectUsesForFunction(OMPRTL___kmpc_barrier);
1434 OMPInfoCache.recollectUsesForFunction(OMPRTL___kmpc_master);
1435 OMPInfoCache.recollectUsesForFunction(OMPRTL___kmpc_end_master);
  /// Try to delete parallel regions if possible.
  bool deleteParallelRegions() {
    const unsigned CallbackCalleeOperand = 2;

    OMPInformationCache::RuntimeFunctionInfo &RFI =
        OMPInfoCache.RFIs[OMPRTL___kmpc_fork_call];

    if (!RFI.Declaration)
      return false;

    bool Changed = false;
    auto DeleteCallCB = [&](Use &U, Function &) {
      CallInst *CI = getCallIfRegularCall(U);
      if (!CI)
        return false;
      auto *Fn = dyn_cast<Function>(
          CI->getArgOperand(CallbackCalleeOperand)->stripPointerCasts());
      if (!Fn)
        return false;
      if (!Fn->onlyReadsMemory())
        return false;
      if (!Fn->hasFnAttribute(Attribute::WillReturn))
        return false;

      auto Remark = [&](OptimizationRemark OR) {
        return OR << "Removing parallel region with no side-effects.";
      };
      emitRemark<OptimizationRemark>(CI, "OMP160", Remark);

      CI->eraseFromParent();
      Changed = true;
      ++NumOpenMPParallelRegionsDeleted;
      return true;
    };

    RFI.foreachUse(SCC, DeleteCallCB);

    return Changed;
  }
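  // Illustrative only: deleteParallelRegions erases __kmpc_fork_call sites
  // whose outlined callee is read-only and guaranteed to return. A sketch in
  // IR (hypothetical function names):
  //
  //   ; before
  //   call void (...) @__kmpc_fork_call(ptr @ident, i32 0, ptr @outlined.noop)
  //   ; after: the call is gone, because @outlined.noop only reads memory and
  //   ; carries the willreturn attribute, so the region has no observable effect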
1485 bool deduplicateRuntimeCalls() {
1486 bool Changed =
false;
1489 OMPRTL_omp_get_num_threads,
1490 OMPRTL_omp_in_parallel,
1491 OMPRTL_omp_get_cancellation,
1492 OMPRTL_omp_get_supported_active_levels,
1493 OMPRTL_omp_get_level,
1494 OMPRTL_omp_get_ancestor_thread_num,
1495 OMPRTL_omp_get_team_size,
1496 OMPRTL_omp_get_active_level,
1497 OMPRTL_omp_in_final,
1498 OMPRTL_omp_get_proc_bind,
1499 OMPRTL_omp_get_num_places,
1500 OMPRTL_omp_get_num_procs,
1501 OMPRTL_omp_get_place_num,
1502 OMPRTL_omp_get_partition_num_places,
1503 OMPRTL_omp_get_partition_place_nums};
1507 collectGlobalThreadIdArguments(GTIdArgs);
1509 <<
" global thread ID arguments\n");
1512 for (
auto DeduplicableRuntimeCallID : DeduplicableRuntimeCallIDs)
1513 Changed |= deduplicateRuntimeCalls(
1514 *
F, OMPInfoCache.RFIs[DeduplicableRuntimeCallID]);
1518 Value *GTIdArg =
nullptr;
1520 if (GTIdArgs.
count(&Arg)) {
1524 Changed |= deduplicateRuntimeCalls(
1525 *
F, OMPInfoCache.RFIs[OMPRTL___kmpc_global_thread_num], GTIdArg);
1532 bool removeRuntimeSymbols() {
1538 if (
GV->getNumUses() >= 1)
1542 GV->eraseFromParent();
1554 bool hideMemTransfersLatency() {
1555 auto &RFI = OMPInfoCache.RFIs[OMPRTL___tgt_target_data_begin_mapper];
1556 bool Changed =
false;
1558 auto *RTCall = getCallIfRegularCall(U, &RFI);
1562 OffloadArray OffloadArrays[3];
1563 if (!getValuesInOffloadArrays(*RTCall, OffloadArrays))
1566 LLVM_DEBUG(dumpValuesInOffloadArrays(OffloadArrays));
1569 bool WasSplit =
false;
1570 Instruction *WaitMovementPoint = canBeMovedDownwards(*RTCall);
1571 if (WaitMovementPoint)
1572 WasSplit = splitTargetDataBeginRTC(*RTCall, *WaitMovementPoint);
1574 Changed |= WasSplit;
1577 if (OMPInfoCache.runtimeFnsAvailable(
1578 {OMPRTL___tgt_target_data_begin_mapper_issue,
1579 OMPRTL___tgt_target_data_begin_mapper_wait}))
1580 RFI.foreachUse(SCC, SplitMemTransfers);
1585 void analysisGlobalization() {
1586 auto &RFI = OMPInfoCache.RFIs[OMPRTL___kmpc_alloc_shared];
1588 auto CheckGlobalization = [&](
Use &
U,
Function &Decl) {
1589 if (
CallInst *CI = getCallIfRegularCall(U, &RFI)) {
1592 <<
"Found thread data sharing on the GPU. "
1593 <<
"Expect degraded performance due to data globalization.";
1595 emitRemark<OptimizationRemarkMissed>(CI,
"OMP112",
Remark);
1601 RFI.foreachUse(SCC, CheckGlobalization);
1606 bool getValuesInOffloadArrays(
CallInst &RuntimeCall,
1608 assert(OAs.
size() == 3 &&
"Need space for three offload arrays!");
1618 Value *BasePtrsArg =
1627 if (!isa<AllocaInst>(V))
1629 auto *BasePtrsArray = cast<AllocaInst>(V);
1630 if (!OAs[0].
initialize(*BasePtrsArray, RuntimeCall))
1635 if (!isa<AllocaInst>(V))
1637 auto *PtrsArray = cast<AllocaInst>(V);
1638 if (!OAs[1].
initialize(*PtrsArray, RuntimeCall))
1644 if (isa<GlobalValue>(V))
1645 return isa<Constant>(V);
1646 if (!isa<AllocaInst>(V))
1649 auto *SizesArray = cast<AllocaInst>(V);
1650 if (!OAs[2].
initialize(*SizesArray, RuntimeCall))
1661 assert(OAs.
size() == 3 &&
"There are three offload arrays to debug!");
1664 std::string ValuesStr;
1666 std::string Separator =
" --- ";
1668 for (
auto *BP : OAs[0].StoredValues) {
1672 LLVM_DEBUG(
dbgs() <<
"\t\toffload_baseptrs: " << ValuesStr <<
"\n");
1675 for (
auto *
P : OAs[1].StoredValues) {
1682 for (
auto *S : OAs[2].StoredValues) {
1686 LLVM_DEBUG(
dbgs() <<
"\t\toffload_sizes: " << ValuesStr <<
"\n");
1696 bool IsWorthIt =
false;
1715 return RuntimeCall.
getParent()->getTerminator();
1719 bool splitTargetDataBeginRTC(
CallInst &RuntimeCall,
1724 auto &
IRBuilder = OMPInfoCache.OMPBuilder;
1728 Entry.getFirstNonPHIOrDbgOrAlloca());
1730 IRBuilder.AsyncInfo,
nullptr,
"handle");
1738 M, OMPRTL___tgt_target_data_begin_mapper_issue);
1742 for (
auto &Arg : RuntimeCall.
args())
1743 Args.push_back(Arg.get());
1744 Args.push_back(Handle);
1748 OMPInfoCache.setCallingConvention(IssueDecl, IssueCallsite);
1754 M, OMPRTL___tgt_target_data_begin_mapper_wait);
1756 Value *WaitParams[2] = {
1758 OffloadArray::DeviceIDArgNum),
1762 WaitDecl, WaitParams,
"", WaitMovementPoint.
getIterator());
1763 OMPInfoCache.setCallingConvention(WaitDecl, WaitCallsite);
  /// Helper to combine two ident struct values, preferring a single global
  /// identifier if possible. \p SingleChoice is cleared once more than one
  /// candidate has been seen.
  static Value *combinedIdentStruct(Value *CurrentIdent, Value *NextIdent,
                                    bool GlobalOnly, bool &SingleChoice) {
    if (CurrentIdent == NextIdent)
      return CurrentIdent;

    // TODO: Figure out how to actually combine multiple debug locations. For
    //       now we just keep an existing one if there is a single choice.
    if (!GlobalOnly || isa<GlobalValue>(NextIdent)) {
      SingleChoice = !CurrentIdent;
      return NextIdent;
    }
    return nullptr;
  }
1788 getCombinedIdentFromCallUsesIn(OMPInformationCache::RuntimeFunctionInfo &RFI,
1790 bool SingleChoice =
true;
1791 Value *Ident =
nullptr;
1793 CallInst *CI = getCallIfRegularCall(U, &RFI);
1794 if (!CI || &
F != &Caller)
1797 true, SingleChoice);
1800 RFI.foreachUse(SCC, CombineIdentStruct);
1802 if (!Ident || !SingleChoice) {
1805 if (!OMPInfoCache.OMPBuilder.getInsertionPoint().getBlock())
1807 &
F.getEntryBlock(),
F.getEntryBlock().begin()));
1812 OMPInfoCache.OMPBuilder.getOrCreateDefaultSrcLocStr(SrcLocStrSize);
1813 Ident = OMPInfoCache.OMPBuilder.getOrCreateIdent(Loc, SrcLocStrSize);
1820 bool deduplicateRuntimeCalls(
Function &
F,
1821 OMPInformationCache::RuntimeFunctionInfo &RFI,
1822 Value *ReplVal =
nullptr) {
1823 auto *UV = RFI.getUseVector(
F);
1824 if (!UV || UV->size() + (ReplVal !=
nullptr) < 2)
1828 dbgs() <<
TAG <<
"Deduplicate " << UV->size() <<
" uses of " << RFI.Name
1829 << (ReplVal ?
" with an existing value\n" :
"\n") <<
"\n");
1831 assert((!ReplVal || (isa<Argument>(ReplVal) &&
1832 cast<Argument>(ReplVal)->
getParent() == &
F)) &&
1833 "Unexpected replacement value!");
1836 auto CanBeMoved = [
this](
CallBase &CB) {
1837 unsigned NumArgs = CB.arg_size();
1840 if (CB.getArgOperand(0)->getType() != OMPInfoCache.OMPBuilder.IdentPtr)
1842 for (
unsigned U = 1;
U < NumArgs; ++
U)
1843 if (isa<Instruction>(CB.getArgOperand(U)))
1854 for (
Use *U : *UV) {
1855 if (
CallInst *CI = getCallIfRegularCall(*U, &RFI)) {
1860 if (!CanBeMoved(*CI))
1868 assert(IP &&
"Expected insertion point!");
1869 cast<Instruction>(ReplVal)->moveBefore(IP->
getIterator());
1875 if (
CallBase *CI = dyn_cast<CallBase>(ReplVal)) {
1878 Value *Ident = getCombinedIdentFromCallUsesIn(RFI,
F,
1884 bool Changed =
false;
1886 CallInst *CI = getCallIfRegularCall(U, &RFI);
1887 if (!CI || CI == ReplVal || &
F != &Caller)
1892 return OR <<
"OpenMP runtime call "
1893 <<
ore::NV(
"OpenMPOptRuntime", RFI.Name) <<
" deduplicated.";
1896 emitRemark<OptimizationRemark>(CI,
"OMP170",
Remark);
1898 emitRemark<OptimizationRemark>(&
F,
"OMP170",
Remark);
1902 ++NumOpenMPRuntimeCallsDeduplicated;
1906 RFI.foreachUse(SCC, ReplaceAndDeleteCB);
1920 if (!
F.hasLocalLinkage())
1922 for (
Use &U :
F.uses()) {
1923 if (
CallInst *CI = getCallIfRegularCall(U)) {
1925 if (CI == &RefCI || GTIdArgs.
count(ArgOp) ||
1926 getCallIfRegularCall(
1927 *ArgOp, &OMPInfoCache.RFIs[OMPRTL___kmpc_global_thread_num]))
1936 auto AddUserArgs = [&](
Value >Id) {
1937 for (
Use &U : GTId.uses())
1938 if (
CallInst *CI = dyn_cast<CallInst>(
U.getUser()))
1941 if (CallArgOpIsGTId(*Callee,
U.getOperandNo(), *CI))
1946 OMPInformationCache::RuntimeFunctionInfo &GlobThreadNumRFI =
1947 OMPInfoCache.RFIs[OMPRTL___kmpc_global_thread_num];
1949 GlobThreadNumRFI.foreachUse(SCC, [&](
Use &U,
Function &
F) {
1950 if (
CallInst *CI = getCallIfRegularCall(U, &GlobThreadNumRFI))
1958 for (
unsigned U = 0;
U < GTIdArgs.
size(); ++
U)
1959 AddUserArgs(*GTIdArgs[U]);
1974 return getUniqueKernelFor(*
I.getFunction());
1979 bool rewriteDeviceCodeStateMachine();
1995 template <
typename RemarkKind,
typename RemarkCallBack>
1997 RemarkCallBack &&RemarkCB)
const {
1999 auto &ORE = OREGetter(
F);
2003 return RemarkCB(RemarkKind(
DEBUG_TYPE, RemarkName,
I))
2004 <<
" [" << RemarkName <<
"]";
2008 [&]() {
return RemarkCB(RemarkKind(
DEBUG_TYPE, RemarkName,
I)); });
2012 template <
typename RemarkKind,
typename RemarkCallBack>
2014 RemarkCallBack &&RemarkCB)
const {
2015 auto &ORE = OREGetter(
F);
2019 return RemarkCB(RemarkKind(
DEBUG_TYPE, RemarkName,
F))
2020 <<
" [" << RemarkName <<
"]";
2024 [&]() {
return RemarkCB(RemarkKind(
DEBUG_TYPE, RemarkName,
F)); });
2038 OptimizationRemarkGetter OREGetter;
2041 OMPInformationCache &OMPInfoCache;
2047 bool runAttributor(
bool IsModulePass) {
2051 registerAAs(IsModulePass);
2056 <<
" functions, result: " << Changed <<
".\n");
2058 if (Changed == ChangeStatus::CHANGED)
2059 OMPInfoCache.invalidateAnalyses();
2061 return Changed == ChangeStatus::CHANGED;
2068 void registerAAs(
bool IsModulePass);
2077 if (OMPInfoCache.CGSCC && !OMPInfoCache.CGSCC->empty() &&
2078 !OMPInfoCache.CGSCC->contains(&
F))
2083 std::optional<Kernel> &CachedKernel = UniqueKernelMap[&
F];
2085 return *CachedKernel;
2092 return *CachedKernel;
2095 CachedKernel =
nullptr;
2096 if (!
F.hasLocalLinkage()) {
2100 return ORA <<
"Potentially unknown OpenMP target region caller.";
2102 emitRemark<OptimizationRemarkAnalysis>(&
F,
"OMP100",
Remark);
2108 auto GetUniqueKernelForUse = [&](
const Use &
U) ->
Kernel {
2109 if (
auto *Cmp = dyn_cast<ICmpInst>(
U.getUser())) {
2111 if (
Cmp->isEquality())
2112 return getUniqueKernelFor(*Cmp);
2115 if (
auto *CB = dyn_cast<CallBase>(
U.getUser())) {
2117 if (CB->isCallee(&U))
2118 return getUniqueKernelFor(*CB);
2120 OMPInformationCache::RuntimeFunctionInfo &KernelParallelRFI =
2121 OMPInfoCache.RFIs[OMPRTL___kmpc_parallel_51];
2123 if (OpenMPOpt::getCallIfRegularCall(*
U.getUser(), &KernelParallelRFI))
2124 return getUniqueKernelFor(*CB);
2133 OMPInformationCache::foreachUse(
F, [&](
const Use &U) {
2134 PotentialKernels.
insert(GetUniqueKernelForUse(U));
2138 if (PotentialKernels.
size() == 1)
2139 K = *PotentialKernels.
begin();
2142 UniqueKernelMap[&
F] =
K;
2147bool OpenMPOpt::rewriteDeviceCodeStateMachine() {
2148 OMPInformationCache::RuntimeFunctionInfo &KernelParallelRFI =
2149 OMPInfoCache.RFIs[OMPRTL___kmpc_parallel_51];
2151 bool Changed =
false;
2152 if (!KernelParallelRFI)
2163 bool UnknownUse =
false;
2164 bool KernelParallelUse =
false;
2165 unsigned NumDirectCalls = 0;
2168 OMPInformationCache::foreachUse(*
F, [&](
Use &U) {
2169 if (
auto *CB = dyn_cast<CallBase>(
U.getUser()))
2170 if (CB->isCallee(&U)) {
2175 if (isa<ICmpInst>(
U.getUser())) {
2176 ToBeReplacedStateMachineUses.push_back(&U);
2182 OpenMPOpt::getCallIfRegularCall(*
U.getUser(), &KernelParallelRFI);
2183 const unsigned int WrapperFunctionArgNo = 6;
2184 if (!KernelParallelUse && CI &&
2186 KernelParallelUse = true;
2187 ToBeReplacedStateMachineUses.push_back(&U);
2195 if (!KernelParallelUse)
2201 if (UnknownUse || NumDirectCalls != 1 ||
2202 ToBeReplacedStateMachineUses.
size() > 2) {
2204 return ORA <<
"Parallel region is used in "
2205 << (UnknownUse ?
"unknown" :
"unexpected")
2206 <<
" ways. Will not attempt to rewrite the state machine.";
2208 emitRemark<OptimizationRemarkAnalysis>(
F,
"OMP101",
Remark);
2217 return ORA <<
"Parallel region is not called from a unique kernel. "
2218 "Will not attempt to rewrite the state machine.";
2220 emitRemark<OptimizationRemarkAnalysis>(
F,
"OMP102",
Remark);
2236 for (
Use *U : ToBeReplacedStateMachineUses)
2238 ID,
U->get()->getType()));
2240 ++NumOpenMPParallelRegionsReplacedInGPUStateMachine;
2249struct AAICVTracker :
public StateWrapper<BooleanState, AbstractAttribute> {
2254 bool isAssumedTracked()
const {
return getAssumed(); }
2257 bool isKnownTracked()
const {
return getAssumed(); }
2266 return std::nullopt;
2272 virtual std::optional<Value *>
2280 const std::string
getName()
const override {
return "AAICVTracker"; }
2283 const char *getIdAddr()
const override {
return &
ID; }
2290 static const char ID;
2293struct AAICVTrackerFunction :
public AAICVTracker {
2295 : AAICVTracker(IRP,
A) {}
2298 const std::string getAsStr(
Attributor *)
const override {
2299 return "ICVTrackerFunction";
2303 void trackStatistics()
const override {}
2307 return ChangeStatus::UNCHANGED;
2312 InternalControlVar::ICV___last>
2313 ICVReplacementValuesMap;
2320 auto &OMPInfoCache =
static_cast<OMPInformationCache &
>(
A.getInfoCache());
2323 auto &SetterRFI = OMPInfoCache.RFIs[OMPInfoCache.ICVs[ICV].Setter];
2325 auto &ValuesMap = ICVReplacementValuesMap[ICV];
2327 CallInst *CI = OpenMPOpt::getCallIfRegularCall(U);
2333 if (ValuesMap.insert(std::make_pair(CI, CI->
getArgOperand(0))).second)
2334 HasChanged = ChangeStatus::CHANGED;
2340 std::optional<Value *> ReplVal = getValueForCall(
A,
I, ICV);
2341 if (ReplVal && ValuesMap.insert(std::make_pair(&
I, *ReplVal)).second)
2342 HasChanged = ChangeStatus::CHANGED;
2348 SetterRFI.foreachUse(TrackValues,
F);
2350 bool UsedAssumedInformation =
false;
2351 A.checkForAllInstructions(CallCheck, *
this, {Instruction::Call},
2352 UsedAssumedInformation,
2358 if (HasChanged == ChangeStatus::CHANGED)
2359 ValuesMap.try_emplace(Entry);
2370 const auto *CB = dyn_cast<CallBase>(&
I);
2371 if (!CB || CB->hasFnAttr(
"no_openmp") ||
2372 CB->hasFnAttr(
"no_openmp_routines"))
2373 return std::nullopt;
2375 auto &OMPInfoCache =
static_cast<OMPInformationCache &
>(
A.getInfoCache());
2376 auto &GetterRFI = OMPInfoCache.RFIs[OMPInfoCache.ICVs[ICV].Getter];
2377 auto &SetterRFI = OMPInfoCache.RFIs[OMPInfoCache.ICVs[ICV].Setter];
2378 Function *CalledFunction = CB->getCalledFunction();
2381 if (CalledFunction ==
nullptr)
2383 if (CalledFunction == GetterRFI.Declaration)
2384 return std::nullopt;
2385 if (CalledFunction == SetterRFI.Declaration) {
2386 if (ICVReplacementValuesMap[ICV].
count(&
I))
2387 return ICVReplacementValuesMap[ICV].
lookup(&
I);
2396 const auto *ICVTrackingAA =
A.getAAFor<AAICVTracker>(
2399 if (ICVTrackingAA->isAssumedTracked()) {
2400 std::optional<Value *> URV =
2401 ICVTrackingAA->getUniqueReplacementValue(ICV);
2412 std::optional<Value *>
2414 return std::nullopt;
2421 const auto &ValuesMap = ICVReplacementValuesMap[ICV];
2422 if (ValuesMap.count(
I))
2423 return ValuesMap.lookup(
I);
2429 std::optional<Value *> ReplVal;
2431 while (!Worklist.
empty()) {
2433 if (!Visited.
insert(CurrInst).second)
2441 if (ValuesMap.count(CurrInst)) {
2442 std::optional<Value *> NewReplVal = ValuesMap.lookup(CurrInst);
2445 ReplVal = NewReplVal;
2451 if (ReplVal != NewReplVal)
2457 std::optional<Value *> NewReplVal = getValueForCall(
A, *CurrInst, ICV);
2463 ReplVal = NewReplVal;
2469 if (ReplVal != NewReplVal)
2474 if (CurrBB ==
I->getParent() && ReplVal)
2479 if (
const Instruction *Terminator = Pred->getTerminator())
2487struct AAICVTrackerFunctionReturned : AAICVTracker {
2489 : AAICVTracker(IRP,
A) {}
2492 const std::string getAsStr(
Attributor *)
const override {
2493 return "ICVTrackerFunctionReturned";
2497 void trackStatistics()
const override {}
2501 return ChangeStatus::UNCHANGED;
2506 InternalControlVar::ICV___last>
2507 ICVReplacementValuesMap;
2510 std::optional<Value *>
2512 return ICVReplacementValuesMap[ICV];
2517 const auto *ICVTrackingAA =
A.getAAFor<AAICVTracker>(
2520 if (!ICVTrackingAA->isAssumedTracked())
2521 return indicatePessimisticFixpoint();
2524 std::optional<Value *> &ReplVal = ICVReplacementValuesMap[ICV];
2525 std::optional<Value *> UniqueICVValue;
2528 std::optional<Value *> NewReplVal =
2529 ICVTrackingAA->getReplacementValue(ICV, &
I,
A);
2532 if (UniqueICVValue && UniqueICVValue != NewReplVal)
2535 UniqueICVValue = NewReplVal;
2540 bool UsedAssumedInformation =
false;
2541 if (!
A.checkForAllInstructions(CheckReturnInst, *
this, {Instruction::Ret},
2542 UsedAssumedInformation,
2544 UniqueICVValue =
nullptr;
2546 if (UniqueICVValue == ReplVal)
2549 ReplVal = UniqueICVValue;
2550 Changed = ChangeStatus::CHANGED;
2557struct AAICVTrackerCallSite : AAICVTracker {
2559 : AAICVTracker(IRP,
A) {}
2562 assert(getAnchorScope() &&
"Expected anchor function");
2566 auto &OMPInfoCache =
static_cast<OMPInformationCache &
>(
A.getInfoCache());
2568 auto ICVInfo = OMPInfoCache.ICVs[ICV];
2569 auto &Getter = OMPInfoCache.RFIs[ICVInfo.Getter];
2570 if (Getter.Declaration == getAssociatedFunction()) {
2571 AssociatedICV = ICVInfo.Kind;
2577 indicatePessimisticFixpoint();
2581 if (!ReplVal || !*ReplVal)
2582 return ChangeStatus::UNCHANGED;
2585 A.deleteAfterManifest(*getCtxI());
2587 return ChangeStatus::CHANGED;
2591 const std::string getAsStr(
Attributor *)
const override {
2592 return "ICVTrackerCallSite";
2596 void trackStatistics()
const override {}
2599 std::optional<Value *> ReplVal;
2602 const auto *ICVTrackingAA =
A.getAAFor<AAICVTracker>(
2606 if (!ICVTrackingAA->isAssumedTracked())
2607 return indicatePessimisticFixpoint();
2609 std::optional<Value *> NewReplVal =
2610 ICVTrackingAA->getReplacementValue(AssociatedICV, getCtxI(),
A);
2612 if (ReplVal == NewReplVal)
2613 return ChangeStatus::UNCHANGED;
2615 ReplVal = NewReplVal;
2616 return ChangeStatus::CHANGED;
2621 std::optional<Value *>
2627struct AAICVTrackerCallSiteReturned : AAICVTracker {
2629 : AAICVTracker(IRP,
A) {}
2632 const std::string getAsStr(
Attributor *)
const override {
2633 return "ICVTrackerCallSiteReturned";
2637 void trackStatistics()
const override {}
2641 return ChangeStatus::UNCHANGED;
2646 InternalControlVar::ICV___last>
2647 ICVReplacementValuesMap;
2651 std::optional<Value *>
2653 return ICVReplacementValuesMap[ICV];
2658 const auto *ICVTrackingAA =
A.getAAFor<AAICVTracker>(
2660 DepClassTy::REQUIRED);
2663 if (!ICVTrackingAA->isAssumedTracked())
2664 return indicatePessimisticFixpoint();
2667 std::optional<Value *> &ReplVal = ICVReplacementValuesMap[ICV];
2668 std::optional<Value *> NewReplVal =
2669 ICVTrackingAA->getUniqueReplacementValue(ICV);
2671 if (ReplVal == NewReplVal)
2674 ReplVal = NewReplVal;
2675 Changed = ChangeStatus::CHANGED;
static bool hasFunctionEndAsUniqueSuccessor(const BasicBlock *BB) {
  if (succ_empty(BB))
    return true;
  const BasicBlock *const Successor = BB->getUniqueSuccessor();
  if (!Successor)
    return false;
  return hasFunctionEndAsUniqueSuccessor(Successor);
}
2696 ~AAExecutionDomainFunction() {
delete RPOT; }
2700 assert(
F &&
"Expected anchor function");
2705 unsigned TotalBlocks = 0, InitialThreadBlocks = 0, AlignedBlocks = 0;
2706 for (
auto &It : BEDMap) {
2710 InitialThreadBlocks += It.getSecond().IsExecutedByInitialThreadOnly;
2711 AlignedBlocks += It.getSecond().IsReachedFromAlignedBarrierOnly &&
2712 It.getSecond().IsReachingAlignedBarrierOnly;
2714 return "[AAExecutionDomain] " + std::to_string(InitialThreadBlocks) +
"/" +
2715 std::to_string(AlignedBlocks) +
" of " +
2716 std::to_string(TotalBlocks) +
2717 " executed by initial thread / aligned";
2729 << BB.
getName() <<
" is executed by a single thread.\n";
2739 auto HandleAlignedBarrier = [&](
CallBase *CB) {
2740 const ExecutionDomainTy &ED = CB ? CEDMap[{CB, PRE}] : BEDMap[
nullptr];
2741 if (!ED.IsReachedFromAlignedBarrierOnly ||
2742 ED.EncounteredNonLocalSideEffect)
2744 if (!ED.EncounteredAssumes.empty() && !
A.isModulePass())
2755 DeletedBarriers.
insert(CB);
2756 A.deleteAfterManifest(*CB);
2757 ++NumBarriersEliminated;
2759 }
else if (!ED.AlignedBarriers.empty()) {
2762 ED.AlignedBarriers.end());
2764 while (!Worklist.
empty()) {
2766 if (!Visited.
insert(LastCB))
2770 if (!hasFunctionEndAsUniqueSuccessor(LastCB->
getParent()))
2772 if (!DeletedBarriers.
count(LastCB)) {
2773 ++NumBarriersEliminated;
2774 A.deleteAfterManifest(*LastCB);
2780 const ExecutionDomainTy &LastED = CEDMap[{LastCB, PRE}];
2781 Worklist.
append(LastED.AlignedBarriers.begin(),
2782 LastED.AlignedBarriers.end());
2788 if (!ED.EncounteredAssumes.empty() && (CB || !ED.AlignedBarriers.empty()))
2789 for (
auto *AssumeCB : ED.EncounteredAssumes)
2790 A.deleteAfterManifest(*AssumeCB);
2793 for (
auto *CB : AlignedBarriers)
2794 HandleAlignedBarrier(CB);
2798 HandleAlignedBarrier(
nullptr);
2810 mergeInPredecessorBarriersAndAssumptions(
Attributor &
A, ExecutionDomainTy &ED,
2811 const ExecutionDomainTy &PredED);
2816 bool mergeInPredecessor(
Attributor &
A, ExecutionDomainTy &ED,
2817 const ExecutionDomainTy &PredED,
2818 bool InitialEdgeOnly =
false);
2821 bool handleCallees(
Attributor &
A, ExecutionDomainTy &EntryBBED);
2831 assert(BB.
getParent() == getAnchorScope() &&
"Block is out of scope!");
2832 return BEDMap.lookup(&BB).IsExecutedByInitialThreadOnly;
2837 assert(
I.getFunction() == getAnchorScope() &&
2838 "Instruction is out of scope!");
2842 bool ForwardIsOk =
true;
2848 auto *CB = dyn_cast<CallBase>(CurI);
2851 if (CB != &
I && AlignedBarriers.contains(
const_cast<CallBase *
>(CB)))
2853 const auto &It = CEDMap.find({CB, PRE});
2854 if (It == CEDMap.end())
2856 if (!It->getSecond().IsReachingAlignedBarrierOnly)
2857 ForwardIsOk =
false;
2861 if (!CurI && !BEDMap.lookup(
I.getParent()).IsReachingAlignedBarrierOnly)
2862 ForwardIsOk =
false;
2867 auto *CB = dyn_cast<CallBase>(CurI);
2870 if (CB != &
I && AlignedBarriers.contains(
const_cast<CallBase *
>(CB)))
2872 const auto &It = CEDMap.find({CB, POST});
2873 if (It == CEDMap.end())
2875 if (It->getSecond().IsReachedFromAlignedBarrierOnly)
2888 return BEDMap.lookup(
nullptr).IsReachedFromAlignedBarrierOnly;
2890 return BEDMap.lookup(PredBB).IsReachedFromAlignedBarrierOnly;
2902 "No request should be made against an invalid state!");
2903 return BEDMap.lookup(&BB);
2905 std::pair<ExecutionDomainTy, ExecutionDomainTy>
2908 "No request should be made against an invalid state!");
2909 return {CEDMap.lookup({&CB, PRE}), CEDMap.lookup({&CB, POST})};
2913 "No request should be made against an invalid state!");
2914 return InterProceduralED;
2928 if (!Cmp || !
Cmp->isTrueWhenEqual() || !
Cmp->isEquality())
2936 if (
C->isAllOnesValue()) {
2937 auto *CB = dyn_cast<CallBase>(
Cmp->getOperand(0));
2938 auto &OMPInfoCache =
static_cast<OMPInformationCache &
>(
A.getInfoCache());
2939 auto &RFI = OMPInfoCache.RFIs[OMPRTL___kmpc_target_init];
2940 CB = CB ? OpenMPOpt::getCallIfRegularCall(*CB, &RFI) : nullptr;
2946 KernelInfo::getExecModeFromKernelEnvironment(KernelEnvC);
2952 if (
auto *
II = dyn_cast<IntrinsicInst>(
Cmp->getOperand(0)))
2953 if (
II->getIntrinsicID() == Intrinsic::nvvm_read_ptx_sreg_tid_x)
2957 if (
auto *
II = dyn_cast<IntrinsicInst>(
Cmp->getOperand(0)))
2958 if (
II->getIntrinsicID() == Intrinsic::amdgcn_workitem_id_x)
2966 ExecutionDomainTy InterProceduralED;
2978 static bool setAndRecord(
bool &R,
bool V) {
2989void AAExecutionDomainFunction::mergeInPredecessorBarriersAndAssumptions(
2990 Attributor &
A, ExecutionDomainTy &ED,
const ExecutionDomainTy &PredED) {
2991 for (
auto *EA : PredED.EncounteredAssumes)
2992 ED.addAssumeInst(
A, *EA);
2994 for (
auto *AB : PredED.AlignedBarriers)
2995 ED.addAlignedBarrier(
A, *AB);
2998bool AAExecutionDomainFunction::mergeInPredecessor(
2999 Attributor &
A, ExecutionDomainTy &ED,
const ExecutionDomainTy &PredED,
3000 bool InitialEdgeOnly) {
3002 bool Changed =
false;
3004 setAndRecord(ED.IsExecutedByInitialThreadOnly,
3005 InitialEdgeOnly || (PredED.IsExecutedByInitialThreadOnly &&
3006 ED.IsExecutedByInitialThreadOnly));
3008 Changed |= setAndRecord(ED.IsReachedFromAlignedBarrierOnly,
3009 ED.IsReachedFromAlignedBarrierOnly &&
3010 PredED.IsReachedFromAlignedBarrierOnly);
3011 Changed |= setAndRecord(ED.EncounteredNonLocalSideEffect,
3012 ED.EncounteredNonLocalSideEffect |
3013 PredED.EncounteredNonLocalSideEffect);
3015 if (ED.IsReachedFromAlignedBarrierOnly)
3016 mergeInPredecessorBarriersAndAssumptions(
A, ED, PredED);
3018 ED.clearAssumeInstAndAlignedBarriers();
3022bool AAExecutionDomainFunction::handleCallees(
Attributor &
A,
3023 ExecutionDomainTy &EntryBBED) {
3028 DepClassTy::OPTIONAL);
3029 if (!EDAA || !EDAA->getState().isValidState())
3032 EDAA->getExecutionDomain(*cast<CallBase>(ACS.getInstruction())));
3036 ExecutionDomainTy ExitED;
3037 bool AllCallSitesKnown;
3038 if (
A.checkForAllCallSites(PredForCallSite, *
this,
3040 AllCallSitesKnown)) {
3041 for (
const auto &[CSInED, CSOutED] : CallSiteEDs) {
3042 mergeInPredecessor(
A, EntryBBED, CSInED);
3043 ExitED.IsReachingAlignedBarrierOnly &=
3044 CSOutED.IsReachingAlignedBarrierOnly;
3051 EntryBBED.IsExecutedByInitialThreadOnly =
false;
3052 EntryBBED.IsReachedFromAlignedBarrierOnly =
true;
3053 EntryBBED.EncounteredNonLocalSideEffect =
false;
3054 ExitED.IsReachingAlignedBarrierOnly =
false;
3056 EntryBBED.IsExecutedByInitialThreadOnly =
false;
3057 EntryBBED.IsReachedFromAlignedBarrierOnly =
false;
3058 EntryBBED.EncounteredNonLocalSideEffect =
true;
3059 ExitED.IsReachingAlignedBarrierOnly =
false;
3063 bool Changed =
false;
3064 auto &FnED = BEDMap[
nullptr];
3065 Changed |= setAndRecord(FnED.IsReachedFromAlignedBarrierOnly,
3066 FnED.IsReachedFromAlignedBarrierOnly &
3067 EntryBBED.IsReachedFromAlignedBarrierOnly);
3068 Changed |= setAndRecord(FnED.IsReachingAlignedBarrierOnly,
3069 FnED.IsReachingAlignedBarrierOnly &
3070 ExitED.IsReachingAlignedBarrierOnly);
3071 Changed |= setAndRecord(FnED.IsExecutedByInitialThreadOnly,
3072 EntryBBED.IsExecutedByInitialThreadOnly);
3078 bool Changed =
false;
3083 auto HandleAlignedBarrier = [&](
CallBase &CB, ExecutionDomainTy &ED) {
3084 Changed |= AlignedBarriers.insert(&CB);
3086 auto &CallInED = CEDMap[{&CB, PRE}];
3087 Changed |= mergeInPredecessor(
A, CallInED, ED);
3088 CallInED.IsReachingAlignedBarrierOnly =
true;
3090 ED.EncounteredNonLocalSideEffect =
false;
3091 ED.IsReachedFromAlignedBarrierOnly =
true;
3093 ED.clearAssumeInstAndAlignedBarriers();
3094 ED.addAlignedBarrier(
A, CB);
3095 auto &CallOutED = CEDMap[{&CB, POST}];
3096 Changed |= mergeInPredecessor(
A, CallOutED, ED);
3100 A.getAAFor<
AAIsDead>(*
this, getIRPosition(), DepClassTy::OPTIONAL);
3107 for (
auto &RIt : *RPOT) {
3110 bool IsEntryBB = &BB == &EntryBB;
3113 bool AlignedBarrierLastInBlock = IsEntryBB && IsKernel;
3114 bool IsExplicitlyAligned = IsEntryBB && IsKernel;
3115 ExecutionDomainTy ED;
3118 Changed |= handleCallees(
A, ED);
3122 if (LivenessAA && LivenessAA->isAssumedDead(&BB))
3126 if (LivenessAA && LivenessAA->isEdgeDead(PredBB, &BB))
3128 bool InitialEdgeOnly = isInitialThreadOnlyEdge(
3129 A, dyn_cast<BranchInst>(PredBB->getTerminator()), BB);
3130 mergeInPredecessor(
A, ED, BEDMap[PredBB], InitialEdgeOnly);
3137 bool UsedAssumedInformation;
3138 if (
A.isAssumedDead(
I, *
this, LivenessAA, UsedAssumedInformation,
3139 false, DepClassTy::OPTIONAL,
3145 if (
auto *
II = dyn_cast<IntrinsicInst>(&
I)) {
3146 if (
auto *AI = dyn_cast_or_null<AssumeInst>(
II)) {
3147 ED.addAssumeInst(
A, *AI);
3151 if (
II->isAssumeLikeIntrinsic())
3155 if (
auto *FI = dyn_cast<FenceInst>(&
I)) {
3156 if (!ED.EncounteredNonLocalSideEffect) {
3158 if (ED.IsReachedFromAlignedBarrierOnly)
3163 case AtomicOrdering::NotAtomic:
3165 case AtomicOrdering::Unordered:
3167 case AtomicOrdering::Monotonic:
3169 case AtomicOrdering::Acquire:
3171 case AtomicOrdering::Release:
3173 case AtomicOrdering::AcquireRelease:
3175 case AtomicOrdering::SequentiallyConsistent:
3179 NonNoOpFences.insert(FI);
3182 auto *CB = dyn_cast<CallBase>(&
I);
3184 bool IsAlignedBarrier =
3188 AlignedBarrierLastInBlock &= IsNoSync;
3189 IsExplicitlyAligned &= IsNoSync;
3195 if (IsAlignedBarrier) {
3196 HandleAlignedBarrier(*CB, ED);
3197 AlignedBarrierLastInBlock =
true;
3198 IsExplicitlyAligned =
true;
3203 if (isa<MemIntrinsic>(&
I)) {
3204 if (!ED.EncounteredNonLocalSideEffect &&
3206 ED.EncounteredNonLocalSideEffect =
true;
3208 ED.IsReachedFromAlignedBarrierOnly =
false;
3216 auto &CallInED = CEDMap[{CB, PRE}];
3217 Changed |= mergeInPredecessor(
A, CallInED, ED);
3223 if (!IsNoSync && Callee && !
Callee->isDeclaration()) {
3226 if (EDAA && EDAA->getState().isValidState()) {
3229 CalleeED.IsReachedFromAlignedBarrierOnly;
3230 AlignedBarrierLastInBlock = ED.IsReachedFromAlignedBarrierOnly;
3231 if (IsNoSync || !CalleeED.IsReachedFromAlignedBarrierOnly)
3232 ED.EncounteredNonLocalSideEffect |=
3233 CalleeED.EncounteredNonLocalSideEffect;
3235 ED.EncounteredNonLocalSideEffect =
3236 CalleeED.EncounteredNonLocalSideEffect;
3237 if (!CalleeED.IsReachingAlignedBarrierOnly) {
3239 setAndRecord(CallInED.IsReachingAlignedBarrierOnly,
false);
3242 if (CalleeED.IsReachedFromAlignedBarrierOnly)
3243 mergeInPredecessorBarriersAndAssumptions(
A, ED, CalleeED);
3244 auto &CallOutED = CEDMap[{CB, POST}];
3245 Changed |= mergeInPredecessor(
A, CallOutED, ED);
3250 ED.IsReachedFromAlignedBarrierOnly =
false;
3251 Changed |= setAndRecord(CallInED.IsReachingAlignedBarrierOnly,
false);
3254 AlignedBarrierLastInBlock &= ED.IsReachedFromAlignedBarrierOnly;
3256 auto &CallOutED = CEDMap[{CB, POST}];
3257 Changed |= mergeInPredecessor(
A, CallOutED, ED);
3260 if (!
I.mayHaveSideEffects() && !
I.mayReadFromMemory())
3274 if (MemAA && MemAA->getState().isValidState() &&
3275 MemAA->checkForAllAccessesToMemoryKind(
3280 auto &InfoCache =
A.getInfoCache();
3281 if (!
I.mayHaveSideEffects() && InfoCache.isOnlyUsedByAssume(
I))
3284 if (
auto *LI = dyn_cast<LoadInst>(&
I))
3285 if (LI->hasMetadata(LLVMContext::MD_invariant_load))
3288 if (!ED.EncounteredNonLocalSideEffect &&
3290 ED.EncounteredNonLocalSideEffect =
true;
3293 bool IsEndAndNotReachingAlignedBarriersOnly =
false;
3294 if (!isa<UnreachableInst>(BB.getTerminator()) &&
3295 !BB.getTerminator()->getNumSuccessors()) {
3297 Changed |= mergeInPredecessor(
A, InterProceduralED, ED);
3299 auto &FnED = BEDMap[
nullptr];
3300 if (IsKernel && !IsExplicitlyAligned)
3301 FnED.IsReachingAlignedBarrierOnly =
false;
3302 Changed |= mergeInPredecessor(
A, FnED, ED);
3304 if (!FnED.IsReachingAlignedBarrierOnly) {
3305 IsEndAndNotReachingAlignedBarriersOnly =
true;
3306 SyncInstWorklist.
push_back(BB.getTerminator());
3307 auto &BBED = BEDMap[&BB];
3308 Changed |= setAndRecord(BBED.IsReachingAlignedBarrierOnly,
false);
3312 ExecutionDomainTy &StoredED = BEDMap[&BB];
3313 ED.IsReachingAlignedBarrierOnly = StoredED.IsReachingAlignedBarrierOnly &
3314 !IsEndAndNotReachingAlignedBarriersOnly;
3320 if (ED.IsExecutedByInitialThreadOnly !=
3321 StoredED.IsExecutedByInitialThreadOnly ||
3322 ED.IsReachedFromAlignedBarrierOnly !=
3323 StoredED.IsReachedFromAlignedBarrierOnly ||
3324 ED.EncounteredNonLocalSideEffect !=
3325 StoredED.EncounteredNonLocalSideEffect)
3329 StoredED = std::move(ED);
3335 while (!SyncInstWorklist.
empty()) {
3338 bool HitAlignedBarrierOrKnownEnd =
false;
3340 auto *CB = dyn_cast<CallBase>(CurInst);
3343 auto &CallOutED = CEDMap[{CB, POST}];
3344 Changed |= setAndRecord(CallOutED.IsReachingAlignedBarrierOnly,
false);
3345 auto &CallInED = CEDMap[{CB, PRE}];
3346 HitAlignedBarrierOrKnownEnd =
3347 AlignedBarriers.count(CB) || !CallInED.IsReachingAlignedBarrierOnly;
3348 if (HitAlignedBarrierOrKnownEnd)
3350 Changed |= setAndRecord(CallInED.IsReachingAlignedBarrierOnly,
false);
3352 if (HitAlignedBarrierOrKnownEnd)
3356 if (LivenessAA && LivenessAA->isEdgeDead(PredBB, SyncBB))
3358 if (!Visited.
insert(PredBB))
3360 auto &PredED = BEDMap[PredBB];
3361 if (setAndRecord(PredED.IsReachingAlignedBarrierOnly,
false)) {
3363 SyncInstWorklist.
push_back(PredBB->getTerminator());
3366 if (SyncBB != &EntryBB)
3369 setAndRecord(InterProceduralED.IsReachingAlignedBarrierOnly,
false);
3372 return Changed ? ChangeStatus::CHANGED : ChangeStatus::UNCHANGED;
3377struct AAHeapToShared :
public StateWrapper<BooleanState, AbstractAttribute> {
3382 static AAHeapToShared &createForPosition(
const IRPosition &IRP,
3386 virtual bool isAssumedHeapToShared(
CallBase &CB)
const = 0;
3390 virtual bool isAssumedHeapToSharedRemovedFree(
CallBase &CB)
const = 0;
3393 const std::string
getName()
const override {
return "AAHeapToShared"; }
3396 const char *getIdAddr()
const override {
return &
ID; }
3405 static const char ID;
3408struct AAHeapToSharedFunction :
public AAHeapToShared {
3410 : AAHeapToShared(IRP,
A) {}
3412 const std::string getAsStr(
Attributor *)
const override {
3413 return "[AAHeapToShared] " + std::to_string(MallocCalls.size()) +
3414 " malloc calls eligible.";
3418 void trackStatistics()
const override {}
3422 void findPotentialRemovedFreeCalls(
Attributor &
A) {
3423 auto &OMPInfoCache =
static_cast<OMPInformationCache &
>(
A.getInfoCache());
3424 auto &FreeRFI = OMPInfoCache.RFIs[OMPRTL___kmpc_free_shared];
3426 PotentialRemovedFreeCalls.clear();
3430 for (
auto *U : CB->
users()) {
3432 if (
C &&
C->getCalledFunction() == FreeRFI.Declaration)
3436 if (FreeCalls.
size() != 1)
3439 PotentialRemovedFreeCalls.insert(FreeCalls.
front());
3445 indicatePessimisticFixpoint();
3449 auto &OMPInfoCache =
static_cast<OMPInformationCache &
>(
A.getInfoCache());
3450 auto &RFI = OMPInfoCache.RFIs[OMPRTL___kmpc_alloc_shared];
3451 if (!RFI.Declaration)
3456 bool &) -> std::optional<Value *> {
return nullptr; };
3459 for (
User *U : RFI.Declaration->
users())
3460 if (
CallBase *CB = dyn_cast<CallBase>(U)) {
3463 MallocCalls.insert(CB);
3468 findPotentialRemovedFreeCalls(
A);
3471 bool isAssumedHeapToShared(
CallBase &CB)
const override {
3472 return isValidState() && MallocCalls.count(&CB);
3475 bool isAssumedHeapToSharedRemovedFree(
CallBase &CB)
const override {
3476 return isValidState() && PotentialRemovedFreeCalls.count(&CB);
3480 if (MallocCalls.empty())
3481 return ChangeStatus::UNCHANGED;
3483 auto &OMPInfoCache =
static_cast<OMPInformationCache &
>(
A.getInfoCache());
3484 auto &FreeCall = OMPInfoCache.RFIs[OMPRTL___kmpc_free_shared];
3488 DepClassTy::OPTIONAL);
3493 if (HS &&
HS->isAssumedHeapToStack(*CB))
3498 for (
auto *U : CB->
users()) {
3500 if (
C &&
C->getCalledFunction() == FreeCall.Declaration)
3503 if (FreeCalls.
size() != 1)
3510 <<
" with shared memory."
3511 <<
" Shared memory usage is limited to "
3517 <<
" with " << AllocSize->getZExtValue()
3518 <<
" bytes of shared memory\n");
3524 Type *Int8ArrTy = ArrayType::get(Int8Ty, AllocSize->getZExtValue());
3529 static_cast<unsigned>(AddressSpace::Shared));
3531 SharedMem, PointerType::getUnqual(
M->getContext()));
3534 return OR <<
"Replaced globalized variable with "
3535 <<
ore::NV(
"SharedMemory", AllocSize->getZExtValue())
3536 << (AllocSize->isOne() ?
" byte " :
" bytes ")
3537 <<
"of shared memory.";
3543 "HeapToShared on allocation without alignment attribute");
3544 SharedMem->setAlignment(*Alignment);
3547 A.deleteAfterManifest(*CB);
3548 A.deleteAfterManifest(*FreeCalls.
front());
3550 SharedMemoryUsed += AllocSize->getZExtValue();
3551 NumBytesMovedToSharedMemory = SharedMemoryUsed;
3552 Changed = ChangeStatus::CHANGED;
3559 if (MallocCalls.empty())
3560 return indicatePessimisticFixpoint();
3561 auto &OMPInfoCache =
static_cast<OMPInformationCache &
>(
A.getInfoCache());
3562 auto &RFI = OMPInfoCache.RFIs[OMPRTL___kmpc_alloc_shared];
3563 if (!RFI.Declaration)
3564 return ChangeStatus::UNCHANGED;
3568 auto NumMallocCalls = MallocCalls.size();
3571 for (
User *U : RFI.Declaration->
users()) {
3572 if (
CallBase *CB = dyn_cast<CallBase>(U)) {
3573 if (CB->getCaller() !=
F)
3575 if (!MallocCalls.count(CB))
3577 if (!isa<ConstantInt>(CB->getArgOperand(0))) {
3578 MallocCalls.remove(CB);
3583 if (!ED || !ED->isExecutedByInitialThreadOnly(*CB))
3584 MallocCalls.remove(CB);
3588 findPotentialRemovedFreeCalls(
A);
3590 if (NumMallocCalls != MallocCalls.size())
3591 return ChangeStatus::CHANGED;
3593 return ChangeStatus::UNCHANGED;
3601 unsigned SharedMemoryUsed = 0;
3604struct AAKernelInfo :
public StateWrapper<KernelInfoState, AbstractAttribute> {
3610 static bool requiresCalleeForCallBase() {
return false; }
3613 void trackStatistics()
const override {}
3616 const std::string getAsStr(
Attributor *)
const override {
3617 if (!isValidState())
3619 return std::string(SPMDCompatibilityTracker.isAssumed() ?
"SPMD"
3621 std::string(SPMDCompatibilityTracker.isAtFixpoint() ?
" [FIX]"
3623 std::string(
" #PRs: ") +
3624 (ReachedKnownParallelRegions.isValidState()
3625 ? std::to_string(ReachedKnownParallelRegions.size())
3627 ", #Unknown PRs: " +
3628 (ReachedUnknownParallelRegions.isValidState()
3631 ", #Reaching Kernels: " +
3632 (ReachingKernelEntries.isValidState()
3636 (ParallelLevels.isValidState()
3639 ", NestedPar: " + (NestedParallelism ?
"yes" :
"no");
3646 const std::string
getName()
const override {
return "AAKernelInfo"; }
3649 const char *getIdAddr()
const override {
return &
ID; }
3656 static const char ID;
3661struct AAKernelInfoFunction : AAKernelInfo {
3663 : AAKernelInfo(IRP,
A) {}
3668 return GuardedInstructions;
3671 void setConfigurationOfKernelEnvironment(
ConstantStruct *ConfigC) {
3673 KernelEnvC, ConfigC, {KernelInfo::ConfigurationIdx});
3674 assert(NewKernelEnvC &&
"Failed to create new kernel environment");
3675 KernelEnvC = cast<ConstantStruct>(NewKernelEnvC);
3678#define KERNEL_ENVIRONMENT_CONFIGURATION_SETTER(MEMBER) \
3679 void set##MEMBER##OfKernelEnvironment(ConstantInt *NewVal) { \
3680 ConstantStruct *ConfigC = \
3681 KernelInfo::getConfigurationFromKernelEnvironment(KernelEnvC); \
3682 Constant *NewConfigC = ConstantFoldInsertValueInstruction( \
3683 ConfigC, NewVal, {KernelInfo::MEMBER##Idx}); \
3684 assert(NewConfigC && "Failed to create new configuration environment"); \
3685 setConfigurationOfKernelEnvironment(cast<ConstantStruct>(NewConfigC)); \
3696#undef KERNEL_ENVIRONMENT_CONFIGURATION_SETTER
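For readability, here is what one expansion of the setter macro above looks like, with MEMBER substituted by ExecMode (one of the configuration members used later in this listing). This is just the macro body written out, not additional functionality.

void setExecModeOfKernelEnvironment(ConstantInt *NewVal) {
  ConstantStruct *ConfigC =
      KernelInfo::getConfigurationFromKernelEnvironment(KernelEnvC);
  Constant *NewConfigC = ConstantFoldInsertValueInstruction(
      ConfigC, NewVal, {KernelInfo::ExecModeIdx});
  assert(NewConfigC && "Failed to create new configuration environment");
  setConfigurationOfKernelEnvironment(cast<ConstantStruct>(NewConfigC));
}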
3703 auto &OMPInfoCache = static_cast<OMPInformationCache &>(A.getInfoCache());
3707 OMPInformationCache::RuntimeFunctionInfo &InitRFI =
3708 OMPInfoCache.RFIs[OMPRTL___kmpc_target_init];
3709 OMPInformationCache::RuntimeFunctionInfo &DeinitRFI =
3710 OMPInfoCache.RFIs[OMPRTL___kmpc_target_deinit];
3714 auto StoreCallBase = [](Use &U,
3715 OMPInformationCache::RuntimeFunctionInfo &RFI,
3717 CallBase *CB = OpenMPOpt::getCallIfRegularCall(U, &RFI);
3719 "Unexpected use of __kmpc_target_init or __kmpc_target_deinit!");
3721 "Multiple uses of __kmpc_target_init or __kmpc_target_deinit!");
3727 StoreCallBase(U, InitRFI, KernelInitCB);
3731 DeinitRFI.foreachUse(
3733 StoreCallBase(U, DeinitRFI, KernelDeinitCB);
3739 if (!KernelInitCB || !KernelDeinitCB)
3743 ReachingKernelEntries.insert(Fn);
3744 IsKernelEntry = true;
3752 KernelConfigurationSimplifyCB =
3754 bool &UsedAssumedInformation) -> std::optional<Constant *> {
3755 if (!isAtFixpoint()) {
3758 UsedAssumedInformation = true;
3759 A.recordDependence(*this, *AA, DepClassTy::OPTIONAL);
3764 A.registerGlobalVariableSimplificationCallback(
3765 *KernelEnvGV, KernelConfigurationSimplifyCB);
3768 bool CanChangeToSPMD = OMPInfoCache.runtimeFnsAvailable(
3769 {OMPRTL___kmpc_get_hardware_thread_id_in_block,
3770 OMPRTL___kmpc_barrier_simple_spmd});
3774 KernelInfo::getExecModeFromKernelEnvironment(KernelEnvC);
3779 SPMDCompatibilityTracker.indicateOptimisticFixpoint();
3783 SPMDCompatibilityTracker.indicatePessimisticFixpoint();
3785 setExecModeOfKernelEnvironment(AssumedExecModeC);
3792 setMinThreadsOfKernelEnvironment(ConstantInt::get(Int32Ty, MinThreads));
3794 setMaxThreadsOfKernelEnvironment(ConstantInt::get(Int32Ty, MaxThreads));
3795 auto [MinTeams, MaxTeams] =
3798 setMinTeamsOfKernelEnvironment(ConstantInt::get(Int32Ty, MinTeams));
3800 setMaxTeamsOfKernelEnvironment(ConstantInt::get(Int32Ty, MaxTeams));
3803 KernelInfo::getMayUseNestedParallelismFromKernelEnvironment(KernelEnvC);
3804 ConstantInt *AssumedMayUseNestedParallelismC = ConstantInt::get(
3806 setMayUseNestedParallelismOfKernelEnvironment(
3807 AssumedMayUseNestedParallelismC);
3811 KernelInfo::getUseGenericStateMachineFromKernelEnvironment(
3814 ConstantInt::get(UseGenericStateMachineC->getIntegerType(), false);
3815 setUseGenericStateMachineOfKernelEnvironment(
3816 AssumedUseGenericStateMachineC);
3822 if (!OMPInfoCache.RFIs[RFKind].Declaration)
3824 A.registerVirtualUseCallback(*OMPInfoCache.RFIs[RFKind].Declaration, CB);
3828 auto AddDependence = [](Attributor &A, const AAKernelInfo *KI,
3831 A.recordDependence(*KI, *QueryingAA, DepClassTy::OPTIONAL);
3845 if (SPMDCompatibilityTracker.isValidState())
3846 return AddDependence(A, this, QueryingAA);
3848 if (!ReachedKnownParallelRegions.isValidState())
3849 return AddDependence(A, this, QueryingAA);
3855 RegisterVirtualUse(OMPRTL___kmpc_get_hardware_num_threads_in_block,
3856 CustomStateMachineUseCB);
3857 RegisterVirtualUse(OMPRTL___kmpc_get_warp_size, CustomStateMachineUseCB);
3858 RegisterVirtualUse(OMPRTL___kmpc_barrier_simple_generic,
3859 CustomStateMachineUseCB);
3860 RegisterVirtualUse(OMPRTL___kmpc_kernel_parallel,
3861 CustomStateMachineUseCB);
3862 RegisterVirtualUse(OMPRTL___kmpc_kernel_end_parallel,
3863 CustomStateMachineUseCB);
3867 if (SPMDCompatibilityTracker.isAtFixpoint())
3874 if (!SPMDCompatibilityTracker.isValidState())
3875 return AddDependence(A, this, QueryingAA);
3878 RegisterVirtualUse(OMPRTL___kmpc_get_hardware_thread_id_in_block,
3887 if (!SPMDCompatibilityTracker.isValidState())
3888 return AddDependence(A, this, QueryingAA);
3889 if (SPMDCompatibilityTracker.empty())
3890 return AddDependence(A, this, QueryingAA);
3891 if (!mayContainParallelRegion())
3892 return AddDependence(A, this, QueryingAA);
3895 RegisterVirtualUse(OMPRTL___kmpc_barrier_simple_spmd, SPMDBarrierUseCB);
3899 static std::string sanitizeForGlobalName(std::string S) {
3903 return !((C >= 'a' && C <= 'z') || (C >= 'A' && C <= 'Z') ||
3904 (C >= '0' && C <= '9') || C == '_');
3915 if (!KernelInitCB || !KernelDeinitCB)
3916 return ChangeStatus::UNCHANGED;
3920 bool HasBuiltStateMachine = true;
3921 if (!changeToSPMDMode(A, Changed)) {
3923 HasBuiltStateMachine = buildCustomStateMachine(A, Changed);
3925 HasBuiltStateMachine = false;
3932 KernelInfo::getUseGenericStateMachineFromKernelEnvironment(
3933 ExistingKernelEnvC);
3934 if (!HasBuiltStateMachine)
3935 setUseGenericStateMachineOfKernelEnvironment(
3936 OldUseGenericStateMachineVal);
3943 Changed = ChangeStatus::CHANGED;
3949 void insertInstructionGuardsHelper(Attributor &A) {
3950 auto &OMPInfoCache = static_cast<OMPInformationCache &>(A.getInfoCache());
3952 auto CreateGuardedRegion = [&](Instruction *RegionStartI,
3986 DT, LI, MSU, "region.guarded.end");
3989 MSU, "region.barrier");
3992 DT, LI, MSU, "region.exit");
3994 SplitBlock(ParentBB, RegionStartI, DT, LI, MSU, "region.guarded");
3997 "Expected a different CFG");
4000 ParentBB, ParentBB->getTerminator(), DT, LI, MSU, "region.check.tid");
4003 A.registerManifestAddedBasicBlock(*RegionEndBB);
4004 A.registerManifestAddedBasicBlock(*RegionBarrierBB);
4005 A.registerManifestAddedBasicBlock(*RegionExitBB);
4006 A.registerManifestAddedBasicBlock(*RegionStartBB);
4007 A.registerManifestAddedBasicBlock(*RegionCheckTidBB);
4009 bool HasBroadcastValues = false;
4014 for (Use &U : I.uses()) {
4020 if (OutsideUses.empty())
4023 HasBroadcastValues = true;
4028 M, I.getType(), false,
4030 sanitizeForGlobalName(
4031 (I.getName() + ".guarded.output.alloc").str()),
4033 static_cast<unsigned>(AddressSpace::Shared));
4040 I.getType(), SharedMem, I.getName() + ".guarded.output.load",
4044 for (Use *U : OutsideUses)
4045 A.changeUseAfterManifest(*U, *LoadI);
4048 auto &OMPInfoCache = static_cast<OMPInformationCache &>(A.getInfoCache());
4054 InsertPointTy(ParentBB, ParentBB->end()), DL);
4055 OMPInfoCache.OMPBuilder.updateToLocation(Loc);
4058 OMPInfoCache.OMPBuilder.getOrCreateSrcLocStr(Loc, SrcLocStrSize);
4060 OMPInfoCache.OMPBuilder.getOrCreateIdent(SrcLocStr, SrcLocStrSize);
4066 InsertPointTy(RegionCheckTidBB, RegionCheckTidBB->end()), DL);
4067 OMPInfoCache.OMPBuilder.updateToLocation(LocRegionCheckTid);
4069 OMPInfoCache.OMPBuilder.getOrCreateRuntimeFunction(
4070 M, OMPRTL___kmpc_get_hardware_thread_id_in_block);
4072 OMPInfoCache.OMPBuilder.Builder.CreateCall(HardwareTidFn, {});
4074 OMPInfoCache.setCallingConvention(HardwareTidFn, Tid);
4075 Value *TidCheck = OMPInfoCache.OMPBuilder.Builder.CreateIsNull(Tid);
4076 OMPInfoCache.OMPBuilder.Builder
4077 .CreateCondBr(TidCheck, RegionStartBB, RegionBarrierBB)
4083 OMPInfoCache.OMPBuilder.getOrCreateRuntimeFunction(
4084 M, OMPRTL___kmpc_barrier_simple_spmd);
4085 OMPInfoCache.OMPBuilder.updateToLocation(InsertPointTy(
4088 OMPInfoCache.OMPBuilder.Builder.CreateCall(BarrierFn, {Ident, Tid});
4090 OMPInfoCache.setCallingConvention(BarrierFn, Barrier);
4093 if (HasBroadcastValues) {
4098 OMPInfoCache.setCallingConvention(BarrierFn, Barrier);
4102 auto &AllocSharedRFI = OMPInfoCache.RFIs[OMPRTL___kmpc_alloc_shared];
4104 for (Instruction *GuardedI : SPMDCompatibilityTracker) {
4106 if (!Visited.insert(BB).second)
4112 while (++IP != IPEnd) {
4113 if (!IP->mayHaveSideEffects() && !IP->mayReadFromMemory())
4116 if (OpenMPOpt::getCallIfRegularCall(*I, &AllocSharedRFI))
4118 if (!I->user_empty() || !SPMDCompatibilityTracker.contains(I)) {
4119 LastEffect = nullptr;
4126 for (auto &Reorder : Reorders)
4127 Reorder.first->moveBefore(Reorder.second->getIterator());
4132 for (Instruction *GuardedI : SPMDCompatibilityTracker) {
4134 auto *CalleeAA = A.lookupAAFor<AAKernelInfo>(
4137 assert(CalleeAA != nullptr && "Expected Callee AAKernelInfo");
4138 auto &CalleeAAFunction = *cast<AAKernelInfoFunction>(CalleeAA);
4140 if (CalleeAAFunction.getGuardedInstructions().contains(GuardedI))
4143 Instruction *GuardedRegionStart = nullptr, *GuardedRegionEnd = nullptr;
4147 if (SPMDCompatibilityTracker.contains(&I)) {
4148 CalleeAAFunction.getGuardedInstructions().insert(&I);
4149 if (GuardedRegionStart)
4150 GuardedRegionEnd = &I;
4152 GuardedRegionStart = GuardedRegionEnd = &I;
4159 if (GuardedRegionStart) {
4161 std::make_pair(GuardedRegionStart, GuardedRegionEnd));
4162 GuardedRegionStart = nullptr;
4163 GuardedRegionEnd = nullptr;
4168 for (auto &GR : GuardedRegions)
4169 CreateGuardedRegion(GR.first, GR.second);
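The control flow that CreateGuardedRegion stitches together can be pictured at the source level roughly as follows. The runtime prototypes and the broadcast slot are sketched from the block and global names above ("region.check.tid", "region.guarded", "region.barrier", ".guarded.output.alloc"/".guarded.output.load") and are illustrative rather than exact.

extern "C" int __kmpc_get_hardware_thread_id_in_block();       // assumed prototype
extern "C" void __kmpc_barrier_simple_spmd(void *Ident, int TId); // assumed prototype

static int BroadcastSlot; // ".guarded.output.alloc", placed in shared memory

void guardedRegionSketch(void *Ident) {
  int TId = __kmpc_get_hardware_thread_id_in_block(); // region.check.tid
  int Result;
  if (TId == 0) {                                     // region.guarded
    Result = 42;            // side-effecting work only the main thread may do
    BroadcastSlot = Result; // values live-out of the region are broadcast
  }
  __kmpc_barrier_simple_spmd(Ident, TId);             // region.barrier
  Result = BroadcastSlot;                             // ".guarded.output.load"
  (void)Result;                                       // region.exit: all threads continue
}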
4172 void forceSingleThreadPerWorkgroupHelper(Attributor &A) {
4181 auto &Ctx = getAnchorValue().getContext();
4188 KernelInitCB->getNextNode(), "main.thread.user_code");
4193 A.registerManifestAddedBasicBlock(*InitBB);
4194 A.registerManifestAddedBasicBlock(*UserCodeBB);
4195 A.registerManifestAddedBasicBlock(*ReturnBB);
4204 auto &OMPInfoCache = static_cast<OMPInformationCache &>(A.getInfoCache());
4206 OMPInfoCache.OMPBuilder.getOrCreateRuntimeFunction(
4207 M, OMPRTL___kmpc_get_hardware_thread_id_in_block);
4212 OMPInfoCache.setCallingConvention(ThreadIdInBlockFn, ThreadIdInBlock);
4218 ConstantInt::get(ThreadIdInBlock->getType(), 0),
4219 "thread.is_main", InitBB);
4225 auto &OMPInfoCache = static_cast<OMPInformationCache &>(A.getInfoCache());
4227 if (!SPMDCompatibilityTracker.isAssumed()) {
4228 for (Instruction *NonCompatibleI : SPMDCompatibilityTracker) {
4229 if (!NonCompatibleI)
4233 if (auto *CB = dyn_cast<CallBase>(NonCompatibleI))
4234 if (OMPInfoCache.RTLFunctions.contains(CB->getCalledFunction()))
4238 ORA << "Value has potential side effects preventing SPMD-mode "
4240 if (isa<CallBase>(NonCompatibleI)) {
4241 ORA << ". Add `[[omp::assume(\"ompx_spmd_amenable\")]]` to "
4242 "the called function to override";
4250 << *NonCompatibleI << "\n");
4262 Kernel = CB->getCaller();
4270 KernelInfo::getExecModeFromKernelEnvironment(ExistingKernelEnvC);
4276 Changed = ChangeStatus::CHANGED;
4280 if (mayContainParallelRegion())
4281 insertInstructionGuardsHelper(A);
4283 forceSingleThreadPerWorkgroupHelper(A);
4288 "Initially non-SPMD kernel has SPMD exec mode!");
4289 setExecModeOfKernelEnvironment(
4293 ++NumOpenMPTargetRegionKernelsSPMD;
4296 return OR << "Transformed generic-mode kernel to SPMD-mode.";
4308 if (!ReachedKnownParallelRegions.isValidState())
4311 auto &OMPInfoCache = static_cast<OMPInformationCache &>(A.getInfoCache());
4312 if (!OMPInfoCache.runtimeFnsAvailable(
4313 {OMPRTL___kmpc_get_hardware_num_threads_in_block,
4314 OMPRTL___kmpc_get_warp_size, OMPRTL___kmpc_barrier_simple_generic,
4315 OMPRTL___kmpc_kernel_parallel, OMPRTL___kmpc_kernel_end_parallel}))
4326 KernelInfo::getUseGenericStateMachineFromKernelEnvironment(
4327 ExistingKernelEnvC);
4329 KernelInfo::getExecModeFromKernelEnvironment(ExistingKernelEnvC);
4334 if (UseStateMachineC->isZero() ||
4338 Changed = ChangeStatus::CHANGED;
4341 setUseGenericStateMachineOfKernelEnvironment(
4348 if (!mayContainParallelRegion()) {
4349 ++NumOpenMPTargetRegionKernelsWithoutStateMachine;
4352 return OR << "Removing unused state machine from generic-mode kernel.";
4360 if (ReachedUnknownParallelRegions.empty()) {
4361 ++NumOpenMPTargetRegionKernelsCustomStateMachineWithoutFallback;
4364 return OR << "Rewriting generic-mode kernel with a customized state "
4369 ++NumOpenMPTargetRegionKernelsCustomStateMachineWithFallback;
4372 return OR << "Generic-mode kernel is executed with a customized state "
4373 "machine that requires a fallback.";
4378 for (CallBase *UnknownParallelRegionCB : ReachedUnknownParallelRegions) {
4379 if (!UnknownParallelRegionCB)
4382 return ORA << "Call may contain unknown parallel regions. Use "
4383 << "`[[omp::assume(\"omp_no_parallelism\")]]` to "
4421 auto &Ctx = getAnchorValue().getContext();
4425 BasicBlock *InitBB = KernelInitCB->getParent();
4427 KernelInitCB->getNextNode(), "thread.user_code.check");
4431 Ctx, "worker_state_machine.begin", Kernel, UserCodeEntryBB);
4433 Ctx, "worker_state_machine.finished", Kernel, UserCodeEntryBB);
4435 Ctx, "worker_state_machine.is_active.check", Kernel, UserCodeEntryBB);
4438 Kernel, UserCodeEntryBB);
4441 Kernel, UserCodeEntryBB);
4443 Ctx, "worker_state_machine.done.barrier", Kernel, UserCodeEntryBB);
4444 A.registerManifestAddedBasicBlock(*InitBB);
4445 A.registerManifestAddedBasicBlock(*UserCodeEntryBB);
4446 A.registerManifestAddedBasicBlock(*IsWorkerCheckBB);
4447 A.registerManifestAddedBasicBlock(*StateMachineBeginBB);
4448 A.registerManifestAddedBasicBlock(*StateMachineFinishedBB);
4449 A.registerManifestAddedBasicBlock(*StateMachineIsActiveCheckBB);
4450 A.registerManifestAddedBasicBlock(*StateMachineIfCascadeCurrentBB);
4451 A.registerManifestAddedBasicBlock(*StateMachineEndParallelBB);
4452 A.registerManifestAddedBasicBlock(*StateMachineDoneBarrierBB);
4454 const DebugLoc &DLoc = KernelInitCB->getDebugLoc();
4460 ConstantInt::get(KernelInitCB->getType(), -1),
4461 "thread.is_worker", InitBB);
4467 OMPInfoCache.OMPBuilder.getOrCreateRuntimeFunction(
4468 M, OMPRTL___kmpc_get_hardware_num_threads_in_block);
4470 OMPInfoCache.OMPBuilder.getOrCreateRuntimeFunction(
4471 M, OMPRTL___kmpc_get_warp_size);
4474 OMPInfoCache.setCallingConvention(BlockHwSizeFn, BlockHwSize);
4478 OMPInfoCache.setCallingConvention(WarpSizeFn, WarpSize);
4481 BlockHwSize, WarpSize,
"block.size", IsWorkerCheckBB);
4485 "thread.is_main_or_worker", IsWorkerCheckBB);
4488 IsMainOrWorker, IsWorkerCheckBB);
4492 Type *VoidPtrTy = PointerType::getUnqual(Ctx);
4494 new AllocaInst(VoidPtrTy, DL.getAllocaAddrSpace(), nullptr,
4498 OMPInfoCache.OMPBuilder.updateToLocation(
4501 StateMachineBeginBB->end()),
4504 Value *Ident = KernelInfo::getIdentFromKernelEnvironment(KernelEnvC);
4505 Value *GTid = KernelInitCB;
4508 OMPInfoCache.OMPBuilder.getOrCreateRuntimeFunction(
4509 M, OMPRTL___kmpc_barrier_simple_generic);
4512 OMPInfoCache.setCallingConvention(BarrierFn, Barrier);
4516 (unsigned int)AddressSpace::Generic) {
4518 WorkFnAI, PointerType::get(Ctx, (unsigned int)AddressSpace::Generic),
4519 WorkFnAI->getName() + ".generic", StateMachineBeginBB);
4524 OMPInfoCache.OMPBuilder.getOrCreateRuntimeFunction(
4525 M, OMPRTL___kmpc_kernel_parallel);
4527 KernelParallelFn, {WorkFnAI}, "worker.is_active", StateMachineBeginBB);
4528 OMPInfoCache.setCallingConvention(KernelParallelFn, IsActiveWorker);
4531 StateMachineBeginBB);
4541 StateMachineBeginBB);
4542 IsDone->setDebugLoc(DLoc);
4544 IsDone, StateMachineBeginBB)
4548 StateMachineDoneBarrierBB, IsActiveWorker,
4549 StateMachineIsActiveCheckBB)
4555 const unsigned int WrapperFunctionArgNo = 6;
4560 for (int I = 0, E = ReachedKnownParallelRegions.size(); I < E; ++I) {
4561 auto *CB = ReachedKnownParallelRegions[I];
4562 auto *ParallelRegion = dyn_cast<Function>(
4563 CB->getArgOperand(WrapperFunctionArgNo)->stripPointerCasts());
4565 Ctx, "worker_state_machine.parallel_region.execute", Kernel,
4566 StateMachineEndParallelBB);
4568 ->setDebugLoc(DLoc);
4574 Kernel, StateMachineEndParallelBB);
4575 A.registerManifestAddedBasicBlock(*PRExecuteBB);
4576 A.registerManifestAddedBasicBlock(*PRNextBB);
4581 if (I + 1 < E || !ReachedUnknownParallelRegions.empty()) {
4584 "worker.check_parallel_region", StateMachineIfCascadeCurrentBB);
4592 StateMachineIfCascadeCurrentBB)
4594 StateMachineIfCascadeCurrentBB = PRNextBB;
4600 if (!ReachedUnknownParallelRegions.empty()) {
4601 StateMachineIfCascadeCurrentBB->setName(
4602 "worker_state_machine.parallel_region.fallback.execute");
4604 StateMachineIfCascadeCurrentBB)
4605 ->setDebugLoc(DLoc);
4608 StateMachineIfCascadeCurrentBB)
4612 OMPInfoCache.OMPBuilder.getOrCreateRuntimeFunction(
4613 M, OMPRTL___kmpc_kernel_end_parallel);
4616 OMPInfoCache.setCallingConvention(EndParallelFn, EndParallel);
4622 ->setDebugLoc(DLoc);
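The generated state machine is easiest to read as the worker loop it corresponds to. The following approximation is assembled from the block names and runtime calls above; the runtime prototypes and the wrapper-function signature are assumptions about the device runtime, not taken from this file.

extern "C" void __kmpc_barrier_simple_generic(void *Ident, int TId); // assumed
extern "C" bool __kmpc_kernel_parallel(void **WorkFn);               // assumed
extern "C" void __kmpc_kernel_end_parallel();                        // assumed

using ParallelRegionFnTy = void (*)(unsigned short, unsigned);       // assumed

void workerStateMachineSketch(void *Ident, int TId) {
  void *WorkFn = nullptr;
  for (;;) {
    __kmpc_barrier_simple_generic(Ident, TId);   // wait for work
    bool IsActive = __kmpc_kernel_parallel(&WorkFn);
    if (!WorkFn)                                 // "worker_state_machine.finished"
      return;
    if (IsActive) {
      // If-cascade over ReachedKnownParallelRegions: direct calls to the
      // known wrapper functions, otherwise the indirect-call fallback below.
      reinterpret_cast<ParallelRegionFnTy>(WorkFn)(0, TId);
      __kmpc_kernel_end_parallel();
    }
    __kmpc_barrier_simple_generic(Ident, TId);   // "worker_state_machine.done.barrier"
  }
}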
4632 KernelInfoState StateBefore = getState();
4638 struct UpdateKernelEnvCRAII {
4639 AAKernelInfoFunction &AA;
4641 UpdateKernelEnvCRAII(AAKernelInfoFunction &AA) : AA(AA) {}
4643 ~UpdateKernelEnvCRAII() {
4650 if (!AA.isValidState()) {
4651 AA.KernelEnvC = ExistingKernelEnvC;
4655 if (!AA.ReachedKnownParallelRegions.isValidState())
4656 AA.setUseGenericStateMachineOfKernelEnvironment(
4657 KernelInfo::getUseGenericStateMachineFromKernelEnvironment(
4658 ExistingKernelEnvC));
4660 if (!AA.SPMDCompatibilityTracker.isValidState())
4661 AA.setExecModeOfKernelEnvironment(
4662 KernelInfo::getExecModeFromKernelEnvironment(ExistingKernelEnvC));
4665 KernelInfo::getMayUseNestedParallelismFromKernelEnvironment(
4667 ConstantInt *NewMayUseNestedParallelismC = ConstantInt::get(
4668 MayUseNestedParallelismC->getIntegerType(), AA.NestedParallelism);
4669 AA.setMayUseNestedParallelismOfKernelEnvironment(
4670 NewMayUseNestedParallelismC);
4677 if (isa<CallBase>(I))
4680 if (!I.mayWriteToMemory())
4682 if (auto *SI = dyn_cast<StoreInst>(&I)) {
4685 DepClassTy::OPTIONAL);
4688 DepClassTy::OPTIONAL);
4689 if (UnderlyingObjsAA &&
4690 UnderlyingObjsAA->forallUnderlyingObjects([&](Value &Obj) {
4691 if (AA::isAssumedThreadLocalObject(A, Obj, *this))
4695 auto *CB = dyn_cast<CallBase>(&Obj);
4696 return CB && HS && HS->isAssumedHeapToStack(*CB);
4702 SPMDCompatibilityTracker.insert(&I);
4706 bool UsedAssumedInformationInCheckRWInst = false;
4707 if (!SPMDCompatibilityTracker.isAtFixpoint())
4708 if (!A.checkForAllReadWriteInstructions(
4709 CheckRWInst, *this, UsedAssumedInformationInCheckRWInst))
4712 bool UsedAssumedInformationFromReachingKernels = false;
4713 if (!IsKernelEntry) {
4714 updateParallelLevels(A);
4716 bool AllReachingKernelsKnown = true;
4717 updateReachingKernelEntries(A, AllReachingKernelsKnown);
4718 UsedAssumedInformationFromReachingKernels = !AllReachingKernelsKnown;
4720 if (!SPMDCompatibilityTracker.empty()) {
4721 if (!ParallelLevels.isValidState())
4722 SPMDCompatibilityTracker.indicatePessimisticFixpoint();
4723 else if (!ReachingKernelEntries.isValidState())
4724 SPMDCompatibilityTracker.indicatePessimisticFixpoint();
4730 for (auto *Kernel : ReachingKernelEntries) {
4731 auto *CBAA = A.getAAFor<AAKernelInfo>(
4733 if (CBAA && CBAA->SPMDCompatibilityTracker.isValidState() &&
4734 CBAA->SPMDCompatibilityTracker.isAssumed())
4738 if (!CBAA || !CBAA->SPMDCompatibilityTracker.isAtFixpoint())
4739 UsedAssumedInformationFromReachingKernels = true;
4741 if (SPMD != 0 && Generic != 0)
4742 SPMDCompatibilityTracker.indicatePessimisticFixpoint();
4748 bool AllParallelRegionStatesWereFixed = true;
4749 bool AllSPMDStatesWereFixed = true;
4751 auto &CB = cast<CallBase>(I);
4752 auto *CBAA = A.getAAFor<AAKernelInfo>(
4756 getState() ^= CBAA->getState();
4757 AllSPMDStatesWereFixed &= CBAA->SPMDCompatibilityTracker.isAtFixpoint();
4758 AllParallelRegionStatesWereFixed &=
4759 CBAA->ReachedKnownParallelRegions.isAtFixpoint();
4760 AllParallelRegionStatesWereFixed &=
4761 CBAA->ReachedUnknownParallelRegions.isAtFixpoint();
4765 bool UsedAssumedInformationInCheckCallInst = false;
4766 if (!A.checkForAllCallLikeInstructions(
4767 CheckCallInst, *this, UsedAssumedInformationInCheckCallInst)) {
4769 << "Failed to visit all call-like instructions!\n";);
4770 return indicatePessimisticFixpoint();
4775 if (!UsedAssumedInformationInCheckCallInst &&
4776 AllParallelRegionStatesWereFixed) {
4777 ReachedKnownParallelRegions.indicateOptimisticFixpoint();
4778 ReachedUnknownParallelRegions.indicateOptimisticFixpoint();
4783 if (!UsedAssumedInformationInCheckRWInst &&
4784 !UsedAssumedInformationInCheckCallInst &&
4785 !UsedAssumedInformationFromReachingKernels && AllSPMDStatesWereFixed)
4786 SPMDCompatibilityTracker.indicateOptimisticFixpoint();
4788 return StateBefore == getState() ? ChangeStatus::UNCHANGED
4789 : ChangeStatus::CHANGED;
4795 bool &AllReachingKernelsKnown) {
4799 assert(Caller && "Caller is nullptr");
4801 auto *CAA = A.getOrCreateAAFor<AAKernelInfo>(
4803 if (CAA && CAA->ReachingKernelEntries.isValidState()) {
4804 ReachingKernelEntries ^= CAA->ReachingKernelEntries;
4810 ReachingKernelEntries.indicatePessimisticFixpoint();
4815 if (!A.checkForAllCallSites(PredCallSite, *this,
4817 AllReachingKernelsKnown))
4818 ReachingKernelEntries.indicatePessimisticFixpoint();
4823 auto &OMPInfoCache = static_cast<OMPInformationCache &>(A.getInfoCache());
4824 OMPInformationCache::RuntimeFunctionInfo &Parallel51RFI =
4825 OMPInfoCache.RFIs[OMPRTL___kmpc_parallel_51];
4830 assert(Caller && "Caller is nullptr");
4834 if (CAA && CAA->ParallelLevels.isValidState()) {
4840 if (Caller == Parallel51RFI.Declaration) {
4841 ParallelLevels.indicatePessimisticFixpoint();
4845 ParallelLevels ^= CAA->ParallelLevels;
4852 ParallelLevels.indicatePessimisticFixpoint();
4857 bool AllCallSitesKnown = true;
4858 if (!A.checkForAllCallSites(PredCallSite, *this,
4861 ParallelLevels.indicatePessimisticFixpoint();
4868struct AAKernelInfoCallSite : AAKernelInfo {
4870 : AAKernelInfo(IRP, A) {}
4874 AAKernelInfo::initialize(A);
4876 CallBase &CB = cast<CallBase>(getAssociatedValue());
4881 if (AssumptionAA && AssumptionAA->hasAssumption("ompx_spmd_amenable")) {
4882 indicateOptimisticFixpoint();
4890 indicateOptimisticFixpoint();
4899 auto &OMPInfoCache = static_cast<OMPInformationCache &>(A.getInfoCache());
4900 const auto &It = OMPInfoCache.RuntimeFunctionIDMap.find(Callee);
4901 if (It == OMPInfoCache.RuntimeFunctionIDMap.end()) {
4903 if (!Callee || !A.isFunctionIPOAmendable(*Callee)) {
4907 if (!AssumptionAA ||
4908 !(AssumptionAA->hasAssumption("omp_no_openmp") ||
4909 AssumptionAA->hasAssumption("omp_no_parallelism")))
4910 ReachedUnknownParallelRegions.insert(&CB);
4914 if (!SPMDCompatibilityTracker.isAtFixpoint()) {
4915 SPMDCompatibilityTracker.indicatePessimisticFixpoint();
4916 SPMDCompatibilityTracker.insert(&CB);
4921 indicateOptimisticFixpoint();
4927 if (NumCallees > 1) {
4928 indicatePessimisticFixpoint();
4935 case OMPRTL___kmpc_is_spmd_exec_mode:
4936 case OMPRTL___kmpc_distribute_static_fini:
4937 case OMPRTL___kmpc_for_static_fini:
4938 case OMPRTL___kmpc_global_thread_num:
4939 case OMPRTL___kmpc_get_hardware_num_threads_in_block:
4940 case OMPRTL___kmpc_get_hardware_num_blocks:
4941 case OMPRTL___kmpc_single:
4942 case OMPRTL___kmpc_end_single:
4943 case OMPRTL___kmpc_master:
4944 case OMPRTL___kmpc_end_master:
4945 case OMPRTL___kmpc_barrier:
4946 case OMPRTL___kmpc_nvptx_parallel_reduce_nowait_v2:
4947 case OMPRTL___kmpc_nvptx_teams_reduce_nowait_v2:
4948 case OMPRTL___kmpc_error:
4949 case OMPRTL___kmpc_flush:
4950 case OMPRTL___kmpc_get_hardware_thread_id_in_block:
4951 case OMPRTL___kmpc_get_warp_size:
4952 case OMPRTL_omp_get_thread_num:
4953 case OMPRTL_omp_get_num_threads:
4954 case OMPRTL_omp_get_max_threads:
4955 case OMPRTL_omp_in_parallel:
4956 case OMPRTL_omp_get_dynamic:
4957 case OMPRTL_omp_get_cancellation:
4958 case OMPRTL_omp_get_nested:
4959 case OMPRTL_omp_get_schedule:
4960 case OMPRTL_omp_get_thread_limit:
4961 case OMPRTL_omp_get_supported_active_levels:
4962 case OMPRTL_omp_get_max_active_levels:
4963 case OMPRTL_omp_get_level:
4964 case OMPRTL_omp_get_ancestor_thread_num:
4965 case OMPRTL_omp_get_team_size:
4966 case OMPRTL_omp_get_active_level:
4967 case OMPRTL_omp_in_final:
4968 case OMPRTL_omp_get_proc_bind:
4969 case OMPRTL_omp_get_num_places:
4970 case OMPRTL_omp_get_num_procs:
4971 case OMPRTL_omp_get_place_proc_ids:
4972 case OMPRTL_omp_get_place_num:
4973 case OMPRTL_omp_get_partition_num_places:
4974 case OMPRTL_omp_get_partition_place_nums:
4975 case OMPRTL_omp_get_wtime:
4977 case OMPRTL___kmpc_distribute_static_init_4:
4978 case OMPRTL___kmpc_distribute_static_init_4u:
4979 case OMPRTL___kmpc_distribute_static_init_8:
4980 case OMPRTL___kmpc_distribute_static_init_8u:
4981 case OMPRTL___kmpc_for_static_init_4:
4982 case OMPRTL___kmpc_for_static_init_4u:
4983 case OMPRTL___kmpc_for_static_init_8:
4984 case OMPRTL___kmpc_for_static_init_8u: {
4986 unsigned ScheduleArgOpNo = 2;
4987 auto *ScheduleTypeCI =
4989 unsigned ScheduleTypeVal =
4990 ScheduleTypeCI ? ScheduleTypeCI->getZExtValue() : 0;
4992 case OMPScheduleType::UnorderedStatic:
4993 case OMPScheduleType::UnorderedStaticChunked:
4994 case OMPScheduleType::OrderedDistribute:
4995 case OMPScheduleType::OrderedDistributeChunked:
4998 SPMDCompatibilityTracker.indicatePessimisticFixpoint();
4999 SPMDCompatibilityTracker.insert(&CB);
5003 case OMPRTL___kmpc_target_init:
5006 case OMPRTL___kmpc_target_deinit:
5007 KernelDeinitCB = &CB;
5009 case OMPRTL___kmpc_parallel_51:
5010 if (!handleParallel51(A, CB))
5011 indicatePessimisticFixpoint();
5013 case OMPRTL___kmpc_omp_task:
5015 SPMDCompatibilityTracker.indicatePessimisticFixpoint();
5016 SPMDCompatibilityTracker.insert(&CB);
5017 ReachedUnknownParallelRegions.insert(&CB);
5019 case OMPRTL___kmpc_alloc_shared:
5020 case OMPRTL___kmpc_free_shared:
5026 SPMDCompatibilityTracker.indicatePessimisticFixpoint();
5027 SPMDCompatibilityTracker.insert(&CB);
5033 indicateOptimisticFixpoint();
5037 A.getAAFor<AACallEdges>(*this, getIRPosition(), DepClassTy::OPTIONAL);
5038 if (!AACE || !AACE->getState().isValidState() || AACE->hasUnknownCallee()) {
5039 CheckCallee(getAssociatedFunction(), 1);
5042 const auto &OptimisticEdges = AACE->getOptimisticEdges();
5043 for (auto *Callee : OptimisticEdges) {
5044 CheckCallee(Callee, OptimisticEdges.size());
5055 auto &OMPInfoCache = static_cast<OMPInformationCache &>(A.getInfoCache());
5056 KernelInfoState StateBefore = getState();
5058 auto CheckCallee = [&](Function *F, int NumCallees) {
5059 const auto &It = OMPInfoCache.RuntimeFunctionIDMap.find(F);
5063 if (It == OMPInfoCache.RuntimeFunctionIDMap.end()) {
5066 A.getAAFor<AAKernelInfo>(*this, FnPos, DepClassTy::REQUIRED);
5068 return indicatePessimisticFixpoint();
5069 if (getState() == FnAA->getState())
5070 return ChangeStatus::UNCHANGED;
5071 getState() = FnAA->getState();
5072 return ChangeStatus::CHANGED;
5075 return indicatePessimisticFixpoint();
5077 CallBase &CB = cast<CallBase>(getAssociatedValue());
5078 if (It->getSecond() == OMPRTL___kmpc_parallel_51) {
5079 if (!handleParallel51(A, CB))
5080 return indicatePessimisticFixpoint();
5081 return StateBefore == getState() ? ChangeStatus::UNCHANGED
5082 : ChangeStatus::CHANGED;
5088 (It->getSecond() == OMPRTL___kmpc_alloc_shared ||
5089 It->getSecond() == OMPRTL___kmpc_free_shared) &&
5090 "Expected a __kmpc_alloc_shared or __kmpc_free_shared runtime call");
5094 auto *HeapToSharedAA = A.getAAFor<AAHeapToShared>(
5102 case OMPRTL___kmpc_alloc_shared:
5103 if ((!HeapToStackAA || !HeapToStackAA->isAssumedHeapToStack(CB)) &&
5104 (!HeapToSharedAA || !HeapToSharedAA->isAssumedHeapToShared(CB)))
5105 SPMDCompatibilityTracker.insert(&CB);
5107 case OMPRTL___kmpc_free_shared:
5108 if ((!HeapToStackAA ||
5109 !HeapToStackAA->isAssumedHeapToStackRemovedFree(CB)) &&
5111 !HeapToSharedAA->isAssumedHeapToSharedRemovedFree(CB)))
5112 SPMDCompatibilityTracker.insert(&CB);
5115 SPMDCompatibilityTracker.indicatePessimisticFixpoint();
5116 SPMDCompatibilityTracker.insert(&CB);
5118 return ChangeStatus::CHANGED;
5122 A.getAAFor<AACallEdges>(*this, getIRPosition(), DepClassTy::OPTIONAL);
5123 if (!AACE || !AACE->getState().isValidState() || AACE->hasUnknownCallee()) {
5124 if (Function *F = getAssociatedFunction())
5128 for (auto *Callee : OptimisticEdges) {
5129 CheckCallee(Callee, OptimisticEdges.size());
5135 return StateBefore == getState() ? ChangeStatus::UNCHANGED
5136 : ChangeStatus::CHANGED;
5142 const unsigned int NonWrapperFunctionArgNo = 5;
5143 const unsigned int WrapperFunctionArgNo = 6;
5144 auto ParallelRegionOpArgNo = SPMDCompatibilityTracker.isAssumed()
5145 ? NonWrapperFunctionArgNo
5146 : WrapperFunctionArgNo;
5148 auto *ParallelRegion = dyn_cast<Function>(
5150 if (!ParallelRegion)
5153 ReachedKnownParallelRegions.insert(&CB);
5155 auto *FnAA = A.getAAFor<AAKernelInfo>(
5157 NestedParallelism |= !FnAA || !FnAA->getState().isValidState() ||
5158 !FnAA->ReachedKnownParallelRegions.empty() ||
5159 !FnAA->ReachedKnownParallelRegions.isValidState() ||
5160 !FnAA->ReachedUnknownParallelRegions.isValidState() ||
5161 !FnAA->ReachedUnknownParallelRegions.empty();
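The argument indices used above (5 for the outlined function, 6 for its wrapper) line up with the device runtime entry point, whose prototype is approximately the following. This is an assumption recorded for orientation only; consult the OpenMP device runtime for the authoritative declaration.

#include <cstdint>

extern "C" void __kmpc_parallel_51(void *Ident, std::int32_t GlobalTId,
                                   std::int32_t IfExpr, std::int32_t NumThreads,
                                   std::int32_t ProcBind,
                                   void *Fn,        // NonWrapperFunctionArgNo == 5
                                   void *WrapperFn, // WrapperFunctionArgNo == 6
                                   void **Args, std::int64_t NumArgs);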
5166 struct AAFoldRuntimeCall
5167 : public StateWrapper<BooleanState, AbstractAttribute> {
5173 void trackStatistics() const override {}
5176 static AAFoldRuntimeCall &createForPosition(const IRPosition &IRP,
5180 const std::string getName() const override { return "AAFoldRuntimeCall"; }
5183 const char *getIdAddr() const override { return &ID; }
5191 static const char ID;
5194struct AAFoldRuntimeCallCallSiteReturned : AAFoldRuntimeCall {
5196 : AAFoldRuntimeCall(IRP, A) {}
5199 const std::string getAsStr(Attributor *) const override {
5200 if (!isValidState())
5203 std::string Str("simplified value: ");
5205 if (!SimplifiedValue)
5206 return Str + std::string("none");
5208 if (!*SimplifiedValue)
5209 return Str + std::string("nullptr");
5211 if (ConstantInt *CI = dyn_cast<ConstantInt>(*SimplifiedValue))
5212 return Str + std::to_string(CI->getSExtValue());
5214 return Str + std::string("unknown");
5219 indicatePessimisticFixpoint();
5223 auto &OMPInfoCache = static_cast<OMPInformationCache &>(A.getInfoCache());
5224 const auto &It = OMPInfoCache.RuntimeFunctionIDMap.find(Callee);
5225 assert(It != OMPInfoCache.RuntimeFunctionIDMap.end() &&
5226 "Expected a known OpenMP runtime function");
5228 RFKind = It->getSecond();
5230 CallBase &CB = cast<CallBase>(getAssociatedValue());
5231 A.registerSimplificationCallback(
5234 bool &UsedAssumedInformation) -> std::optional<Value *> {
5235 assert((isValidState() ||
5236 (SimplifiedValue && *SimplifiedValue == nullptr)) &&
5237 "Unexpected invalid state!");
5239 if (!isAtFixpoint()) {
5240 UsedAssumedInformation = true;
5242 A.recordDependence(*this, *AA, DepClassTy::OPTIONAL);
5244 return SimplifiedValue;
5251 case OMPRTL___kmpc_is_spmd_exec_mode:
5252 Changed |= foldIsSPMDExecMode(A);
5254 case OMPRTL___kmpc_parallel_level:
5255 Changed |= foldParallelLevel(A);
5257 case OMPRTL___kmpc_get_hardware_num_threads_in_block:
5258 Changed = Changed | foldKernelFnAttribute(A, "omp_target_thread_limit");
5260 case OMPRTL___kmpc_get_hardware_num_blocks:
5261 Changed = Changed | foldKernelFnAttribute(A, "omp_target_num_teams");
5273 if (SimplifiedValue && *SimplifiedValue) {
5276 A.deleteAfterManifest(I);
5280 if (auto *C = dyn_cast<ConstantInt>(*SimplifiedValue))
5281 return OR << "Replacing OpenMP runtime call "
5283 << ore::NV("FoldedValue", C->getZExtValue()) << ".";
5284 return OR << "Replacing OpenMP runtime call "
5292 << **SimplifiedValue << "\n");
5294 Changed = ChangeStatus::CHANGED;
5301 SimplifiedValue = nullptr;
5302 return AAFoldRuntimeCall::indicatePessimisticFixpoint();
5308 std::optional<Value *> SimplifiedValueBefore = SimplifiedValue;
5310 unsigned AssumedSPMDCount = 0, KnownSPMDCount = 0;
5311 unsigned AssumedNonSPMDCount = 0, KnownNonSPMDCount = 0;
5312 auto *CallerKernelInfoAA = A.getAAFor<AAKernelInfo>(
5315 if (!CallerKernelInfoAA ||
5316 !CallerKernelInfoAA->ReachingKernelEntries.isValidState())
5317 return indicatePessimisticFixpoint();
5319 for (Kernel K : CallerKernelInfoAA->ReachingKernelEntries) {
5321 DepClassTy::REQUIRED);
5323 if (!AA || !AA->isValidState()) {
5324 SimplifiedValue = nullptr;
5325 return indicatePessimisticFixpoint();
5328 if (AA->SPMDCompatibilityTracker.isAssumed()) {
5329 if (AA->SPMDCompatibilityTracker.isAtFixpoint())
5334 if (AA->SPMDCompatibilityTracker.isAtFixpoint())
5335 ++KnownNonSPMDCount;
5337 ++AssumedNonSPMDCount;
5341 if ((AssumedSPMDCount + KnownSPMDCount) &&
5342 (AssumedNonSPMDCount + KnownNonSPMDCount))
5343 return indicatePessimisticFixpoint();
5345 auto &Ctx = getAnchorValue().getContext();
5346 if (KnownSPMDCount || AssumedSPMDCount) {
5347 assert(KnownNonSPMDCount == 0 && AssumedNonSPMDCount == 0 &&
5348 "Expected only SPMD kernels!");
5352 } else if (KnownNonSPMDCount || AssumedNonSPMDCount) {
5353 assert(KnownSPMDCount == 0 && AssumedSPMDCount == 0 &&
5354 "Expected only non-SPMD kernels!");
5362 assert(!SimplifiedValue && "SimplifiedValue should be none");
5365 return SimplifiedValue == SimplifiedValueBefore ? ChangeStatus::UNCHANGED
5366 : ChangeStatus::CHANGED;
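The net effect of this fold, sketched at the device-source level: when every kernel that can reach the call site is (assumed) SPMD the query folds to 1, when none is it folds to 0, and a mix of both blocks the fold. The prototype below is an assumption about the device runtime.

extern "C" signed char __kmpc_is_spmd_exec_mode(); // assumed prototype

int pickSpecialization() {
  if (__kmpc_is_spmd_exec_mode()) // folds to a constant 1 or 0 when possible
    return 1;                     // SPMD-only path survives dead-code elimination
  return 0;                       // generic-only path survives otherwise
}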
5371 std::optional<Value *> SimplifiedValueBefore = SimplifiedValue;
5373 auto *CallerKernelInfoAA = A.getAAFor<AAKernelInfo>(
5376 if (!CallerKernelInfoAA ||
5377 !CallerKernelInfoAA->ParallelLevels.isValidState())
5378 return indicatePessimisticFixpoint();
5380 if (!CallerKernelInfoAA->ReachingKernelEntries.isValidState())
5381 return indicatePessimisticFixpoint();
5383 if (CallerKernelInfoAA->ReachingKernelEntries.empty()) {
5384 assert(!SimplifiedValue &&
5385 "SimplifiedValue should keep none at this point");
5386 return ChangeStatus::UNCHANGED;
5389 unsigned AssumedSPMDCount = 0, KnownSPMDCount = 0;
5390 unsigned AssumedNonSPMDCount = 0, KnownNonSPMDCount = 0;
5391 for (Kernel K : CallerKernelInfoAA->ReachingKernelEntries) {
5393 DepClassTy::REQUIRED);
5394 if (!AA || !AA->SPMDCompatibilityTracker.isValidState())
5395 return indicatePessimisticFixpoint();
5397 if (AA->SPMDCompatibilityTracker.isAssumed()) {
5398 if (AA->SPMDCompatibilityTracker.isAtFixpoint())
5403 if (AA->SPMDCompatibilityTracker.isAtFixpoint())
5404 ++KnownNonSPMDCount;
5406 ++AssumedNonSPMDCount;
5410 if ((AssumedSPMDCount + KnownSPMDCount) &&
5411 (AssumedNonSPMDCount + KnownNonSPMDCount))
5412 return indicatePessimisticFixpoint();
5414 auto &Ctx = getAnchorValue().getContext();
5418 if (AssumedSPMDCount || KnownSPMDCount) {
5419 assert(KnownNonSPMDCount == 0 && AssumedNonSPMDCount == 0 &&
5420 "Expected only SPMD kernels!");
5423 assert(KnownSPMDCount == 0 && AssumedSPMDCount == 0 &&
5424 "Expected only non-SPMD kernels!");
5427 return SimplifiedValue == SimplifiedValueBefore ? ChangeStatus::UNCHANGED
5428 : ChangeStatus::CHANGED;
5433 int32_t CurrentAttrValue = -1;
5434 std::optional<Value *> SimplifiedValueBefore = SimplifiedValue;
5436 auto *CallerKernelInfoAA = A.getAAFor<AAKernelInfo>(
5439 if (!CallerKernelInfoAA ||
5440 !CallerKernelInfoAA->ReachingKernelEntries.isValidState())
5441 return indicatePessimisticFixpoint();
5444 for (Kernel K : CallerKernelInfoAA->ReachingKernelEntries) {
5445 int32_t NextAttrVal = K->getFnAttributeAsParsedInteger(Attr, -1);
5447 if (NextAttrVal == -1 ||
5448 (CurrentAttrValue != -1 && CurrentAttrValue != NextAttrVal))
5449 return indicatePessimisticFixpoint();
5450 CurrentAttrValue = NextAttrVal;
5453 if (CurrentAttrValue != -1) {
5454 auto &Ctx = getAnchorValue().getContext();
5458 return SimplifiedValue == SimplifiedValueBefore ? ChangeStatus::UNCHANGED
5459 : ChangeStatus::CHANGED;
5465 std::optional<Value *> SimplifiedValue;
5475 auto &RFI = OMPInfoCache.RFIs[RF];
5477 CallInst *CI = OpenMPOpt::getCallIfRegularCall(U, &RFI);
5480 A.getOrCreateAAFor<AAFoldRuntimeCall>(
5482 DepClassTy::NONE, false,
5488 void OpenMPOpt::registerAAs(bool IsModulePass) {
5498 A.getOrCreateAAFor<AAKernelInfo>(
5500 DepClassTy::NONE, false,
5504 OMPInformationCache::RuntimeFunctionInfo &InitRFI =
5505 OMPInfoCache.RFIs[OMPRTL___kmpc_target_init];
5506 InitRFI.foreachUse(SCC, CreateKernelInfoCB);
5508 registerFoldRuntimeCall(OMPRTL___kmpc_is_spmd_exec_mode);
5509 registerFoldRuntimeCall(OMPRTL___kmpc_parallel_level);
5510 registerFoldRuntimeCall(OMPRTL___kmpc_get_hardware_num_threads_in_block);
5511 registerFoldRuntimeCall(OMPRTL___kmpc_get_hardware_num_blocks);
5516 for (int Idx = 0; Idx < OMPInfoCache.ICVs.size() - 1; ++Idx) {
5519 auto &GetterRFI = OMPInfoCache.RFIs[ICVInfo.Getter];
5522 CallInst *CI = OpenMPOpt::getCallIfRegularCall(U, &GetterRFI);
5526 auto &CB = cast<CallBase>(*CI);
5529 A.getOrCreateAAFor<AAICVTracker>(CBPos);
5533 GetterRFI.foreachUse(SCC, CreateAA);
5542 for (auto *F : SCC) {
5543 if (F->isDeclaration())
5549 if (F->hasLocalLinkage()) {
5551 const auto *CB = dyn_cast<CallBase>(U.getUser());
5552 return CB && CB->isCallee(&U) &&
5553 A.isRunOn(const_cast<Function *>(CB->getCaller()));
5557 registerAAsForFunction(A, *F);
5567 if (F.hasFnAttribute(Attribute::Convergent))
5571 if (auto *LI = dyn_cast<LoadInst>(&I)) {
5572 bool UsedAssumedInformation = false;
5579 if (auto *CI = dyn_cast<CallBase>(&I)) {
5584 if (auto *SI = dyn_cast<StoreInst>(&I)) {
5590 if (auto *FI = dyn_cast<FenceInst>(&I)) {
5594 if (auto *II = dyn_cast<IntrinsicInst>(&I)) {
5595 if (II->getIntrinsicID() == Intrinsic::assume) {
5604const char AAICVTracker::ID = 0;
5605const char AAKernelInfo::ID = 0;
5607const char AAHeapToShared::ID = 0;
5608const char AAFoldRuntimeCall::ID = 0;
5610 AAICVTracker &AAICVTracker::createForPosition(const IRPosition &IRP,
5612 AAICVTracker *AA = nullptr;
5620 AA = new (A.Allocator) AAICVTrackerFunctionReturned(IRP, A);
5623 AA = new (A.Allocator) AAICVTrackerCallSiteReturned(IRP, A);
5626 AA = new (A.Allocator) AAICVTrackerCallSite(IRP, A);
5629 AA = new (A.Allocator) AAICVTrackerFunction(IRP, A);
5638 AAExecutionDomainFunction *AA = nullptr;
5648 "AAExecutionDomain can only be created for function position!");
5650 AA = new (A.Allocator) AAExecutionDomainFunction(IRP, A);
5657 AAHeapToShared &AAHeapToShared::createForPosition(const IRPosition &IRP,
5659 AAHeapToSharedFunction *AA = nullptr;
5669 "AAHeapToShared can only be created for function position!");
5671 AA = new (A.Allocator) AAHeapToSharedFunction(IRP, A);
5678 AAKernelInfo &AAKernelInfo::createForPosition(const IRPosition &IRP,
5680 AAKernelInfo *AA = nullptr;
5690 AA = new (A.Allocator) AAKernelInfoCallSite(IRP, A);
5693 AA = new (A.Allocator) AAKernelInfoFunction(IRP, A);
5700 AAFoldRuntimeCall &AAFoldRuntimeCall::createForPosition(const IRPosition &IRP,
5702 AAFoldRuntimeCall *AA = nullptr;
5711 llvm_unreachable("KernelInfo can only be created for call site position!");
5713 AA = new (A.Allocator) AAFoldRuntimeCallCallSiteReturned(IRP, A);
5734 if (Kernels.contains(&F))
5736 for (const User *U : F.users())
5737 if (!isa<BlockAddress>(U))
5746 return ORA << "Could not internalize function. "
5747 << "Some optimizations may not be possible. [OMP140]";
5751 bool Changed = false;
5759 if (!F.isDeclaration() && !Kernels.contains(&F) && IsCalled(F) &&
5763 } else if (!F.hasLocalLinkage() && !F.hasFnAttribute(Attribute::Cold)) {
5776 if (!F.isDeclaration() && !InternalizedMap.lookup(&F)) {
5795 OMPInformationCache InfoCache(M, AG, Allocator, nullptr, PostLink);
5797 unsigned MaxFixpointIterations =
5809 return F.hasFnAttribute("kernel");
5814 OpenMPOpt OMPOpt(SCC, CGUpdater, OREGetter, InfoCache, A);
5815 Changed |= OMPOpt.run(true);
5820 if (!F.isDeclaration() && !Kernels.contains(&F) &&
5821 !F.hasFnAttribute(Attribute::NoInline))
5822 F.addFnAttr(Attribute::AlwaysInline);
5852 Module &M = *C.begin()->getFunction().getParent();
5875 OMPInformationCache InfoCache(*(Functions.back()->getParent()), AG, Allocator,
5876 &Functions, PostLink);
5878 unsigned MaxFixpointIterations =
5892 OpenMPOpt OMPOpt(SCC, CGUpdater, OREGetter, InfoCache, A);
5893 bool Changed = OMPOpt.run(false);
5909 switch (F.getCallingConv()) {
5924 auto ProcessKernel = [&](Function &KF) {
5925 if (SeenKernels.insert(&KF).second) {
5930 ++NumOpenMPTargetRegionKernels;
5931 Kernels.insert(&KF);
5933 ++NumNonOpenMPTargetRegionKernels;
5937 if (NamedMDNode *MD = M.getNamedMetadata("nvvm.annotations"))
5938 for (auto *Op : MD->operands()) {
5941 MDString *KindID = dyn_cast<MDString>(Op->getOperand(1));
5942 if (!KindID || KindID->getString() != "kernel")
5945 if (auto *KernelFn =
5946 mdconst::dyn_extract_or_null<Function>(Op->getOperand(0)))
5947 ProcessKernel(*KernelFn);
5958 Metadata *MD = M.getModuleFlag("openmp");
5966 Metadata *MD = M.getModuleFlag("openmp-device");
MachineBasicBlock MachineBasicBlock::iterator DebugLoc DL
Expand Atomic instructions
static cl::opt< unsigned > SetFixpointIterations("attributor-max-iterations", cl::Hidden, cl::desc("Maximal number of fixpoint iterations."), cl::init(32))
static const Function * getParent(const Value *V)
static GCRegistry::Add< ErlangGC > A("erlang", "erlang-compatible garbage collector")
This file provides interfaces used to manipulate a call graph, regardless if it is a "old style" Call...
This file provides interfaces used to build and manipulate a call graph, which is a very useful tool ...
This file contains the declarations for the subclasses of Constant, which represent the different fla...
dxil pretty DXIL Metadata Pretty Printer
Returns the sub type a function will return at a given Idx Should correspond to the result type of an ExtractValue instruction executed with just that one unsigned Idx
This file defines the DenseSet and SmallDenseSet classes.
This file defines an array type that can be indexed using scoped enum values.
static void emitRemark(const Function &F, OptimizationRemarkEmitter &ORE, bool Skip)
static bool lookup(const GsymReader &GR, DataExtractor &Data, uint64_t &Offset, uint64_t BaseAddr, uint64_t Addr, SourceLocations &SrcLocs, llvm::Error &Err)
A Lookup helper functions.
This file provides utility analysis objects describing memory locations.
uint64_t IntrinsicInst * II
This file defines constans and helpers used when dealing with OpenMP.
This file defines constans that will be used by both host and device compilation.
static constexpr auto TAG
static cl::opt< bool > HideMemoryTransferLatency("openmp-hide-memory-transfer-latency", cl::desc("[WIP] Tries to hide the latency of host to device memory" " transfers"), cl::Hidden, cl::init(false))
static cl::opt< bool > DisableOpenMPOptStateMachineRewrite("openmp-opt-disable-state-machine-rewrite", cl::desc("Disable OpenMP optimizations that replace the state machine."), cl::Hidden, cl::init(false))
static cl::opt< bool > EnableParallelRegionMerging("openmp-opt-enable-merging", cl::desc("Enable the OpenMP region merging optimization."), cl::Hidden, cl::init(false))
static cl::opt< bool > PrintModuleAfterOptimizations("openmp-opt-print-module-after", cl::desc("Print the current module after OpenMP optimizations."), cl::Hidden, cl::init(false))
#define KERNEL_ENVIRONMENT_CONFIGURATION_GETTER(MEMBER)
#define KERNEL_ENVIRONMENT_CONFIGURATION_IDX(MEMBER, IDX)
#define KERNEL_ENVIRONMENT_CONFIGURATION_SETTER(MEMBER)
static cl::opt< bool > PrintOpenMPKernels("openmp-print-gpu-kernels", cl::init(false), cl::Hidden)
static cl::opt< bool > DisableOpenMPOptFolding("openmp-opt-disable-folding", cl::desc("Disable OpenMP optimizations involving folding."), cl::Hidden, cl::init(false))
static bool isKernelCC(Function &F)
static cl::opt< bool > PrintModuleBeforeOptimizations("openmp-opt-print-module-before", cl::desc("Print the current module before OpenMP optimizations."), cl::Hidden, cl::init(false))
static cl::opt< unsigned > SetFixpointIterations("openmp-opt-max-iterations", cl::Hidden, cl::desc("Maximal number of attributor iterations."), cl::init(256))
static cl::opt< bool > DisableInternalization("openmp-opt-disable-internalization", cl::desc("Disable function internalization."), cl::Hidden, cl::init(false))
static cl::opt< bool > PrintICVValues("openmp-print-icv-values", cl::init(false), cl::Hidden)
static cl::opt< bool > DisableOpenMPOptimizations("openmp-opt-disable", cl::desc("Disable OpenMP specific optimizations."), cl::Hidden, cl::init(false))
static cl::opt< unsigned > SharedMemoryLimit("openmp-opt-shared-limit", cl::Hidden, cl::desc("Maximum amount of shared memory to use."), cl::init(std::numeric_limits< unsigned >::max()))
static cl::opt< bool > EnableVerboseRemarks("openmp-opt-verbose-remarks", cl::desc("Enables more verbose remarks."), cl::Hidden, cl::init(false))
static cl::opt< bool > DisableOpenMPOptDeglobalization("openmp-opt-disable-deglobalization", cl::desc("Disable OpenMP optimizations involving deglobalization."), cl::Hidden, cl::init(false))
static cl::opt< bool > DisableOpenMPOptBarrierElimination("openmp-opt-disable-barrier-elimination", cl::desc("Disable OpenMP optimizations that eliminate barriers."), cl::Hidden, cl::init(false))
static cl::opt< bool > DeduceICVValues("openmp-deduce-icv-values", cl::init(false), cl::Hidden)
#define KERNEL_ENVIRONMENT_IDX(MEMBER, IDX)
#define KERNEL_ENVIRONMENT_GETTER(MEMBER, RETURNTYPE)
static cl::opt< bool > DisableOpenMPOptSPMDization("openmp-opt-disable-spmdization", cl::desc("Disable OpenMP optimizations involving SPMD-ization."), cl::Hidden, cl::init(false))
static cl::opt< bool > AlwaysInlineDeviceFunctions("openmp-opt-inline-device", cl::desc("Inline all applicable functions on the device."), cl::Hidden, cl::init(false))
FunctionAnalysisManager FAM
This file builds on the ADT/GraphTraits.h file to build a generic graph post order iterator.
static StringRef getName(Value *V)
assert(ImpDefSCC.getReg()==AMDGPU::SCC &&ImpDefSCC.isDef())
static bool contains(SmallPtrSetImpl< ConstantExpr * > &Cache, ConstantExpr *Expr, Constant *C)
This file implements a set that has insertion order iteration characteristics.
This file defines the SmallPtrSet class.
This file defines the SmallVector class.
This file defines the 'Statistic' class, which is designed to be an easy way to expose various metric...
#define STATISTIC(VARNAME, DESC)
static const int BlockSize
static void initialize(TargetLibraryInfoImpl &TLI, const Triple &T, ArrayRef< StringLiteral > StandardNames)
Initialize the set of available library functions based on the specified target triple.
static cl::opt< unsigned > MaxThreads("xcore-max-threads", cl::Optional, cl::desc("Maximum number of threads (for emulation thread-local storage)"), cl::Hidden, cl::value_desc("number"), cl::init(8))
This class represents a conversion between pointers from one address space to another.
an instruction to allocate memory on the stack
A container for analyses that lazily runs them and caches their results.
PassT::Result & getResult(IRUnitT &IR, ExtraArgTs... ExtraArgs)
Get the result of an analysis pass for a given IR unit.
This class represents an incoming formal argument to a Function.
ArrayRef - Represent a constant reference to an array (0 or more elements consecutively in memory),...
size_t size() const
size - Get the array size.
AttributeSet getParamAttrs(unsigned ArgNo) const
The attributes for the argument or parameter at the given index are returned.
LLVM Basic Block Representation.
iterator begin()
Instruction iterator methods.
const_iterator getFirstInsertionPt() const
Returns an iterator to the first instruction in this block that is suitable for inserting a non-PHI i...
reverse_iterator rbegin()
static BasicBlock * Create(LLVMContext &Context, const Twine &Name="", Function *Parent=nullptr, BasicBlock *InsertBefore=nullptr)
Creates a new BasicBlock.
BasicBlock * splitBasicBlock(iterator I, const Twine &BBName="", bool Before=false)
Split the basic block into two basic blocks at the specified instruction.
const BasicBlock * getUniqueSuccessor() const
Return the successor of this block if it has a unique successor.
InstListType::reverse_iterator reverse_iterator
const Function * getParent() const
Return the enclosing method, or null if none.
const Instruction * getTerminator() const LLVM_READONLY
Returns the terminator instruction if the block is well formed or null if the block is not well forme...
Conditional or Unconditional Branch instruction.
bool isConditional() const
static BranchInst * Create(BasicBlock *IfTrue, InsertPosition InsertBefore=nullptr)
BasicBlock * getSuccessor(unsigned i) const
Value * getCondition() const
Allocate memory in an ever growing pool, as if by bump-pointer.
Base class for all callable instructions (InvokeInst and CallInst) Holds everything related to callin...
void setCallingConv(CallingConv::ID CC)
Function * getCalledFunction() const
Returns the function called, or null if this is an indirect function invocation or the function signa...
bool doesNotAccessMemory(unsigned OpNo) const
bool isIndirectCall() const
Return true if the callsite is an indirect call.
bool isCallee(Value::const_user_iterator UI) const
Determine whether the passed iterator points to the callee operand's Use.
Value * getArgOperand(unsigned i) const
void setArgOperand(unsigned i, Value *v)
iterator_range< User::op_iterator > args()
Iteration adapter for range-for loops.
unsigned getArgOperandNo(const Use *U) const
Given a use for a arg operand, get the arg operand number that corresponds to it.
unsigned arg_size() const
AttributeList getAttributes() const
Return the attributes for this call.
void addParamAttr(unsigned ArgNo, Attribute::AttrKind Kind)
Adds the attribute to the indicated argument.
bool isArgOperand(const Use *U) const
bool hasOperandBundles() const
Return true if this User has any operand bundles.
Function * getCaller()
Helper to get the caller (the parent function).
Wrapper to unify "old style" CallGraph and "new style" LazyCallGraph.
void initialize(LazyCallGraph &LCG, LazyCallGraph::SCC &SCC, CGSCCAnalysisManager &AM, CGSCCUpdateResult &UR)
Initializers for usage outside of a CGSCC pass, inside a CGSCC pass in the old and new pass manager (...
This class represents a function call, abstracting a target machine's calling convention.
static CallInst * Create(FunctionType *Ty, Value *F, const Twine &NameStr="", InsertPosition InsertBefore=nullptr)
@ ICMP_SLT
signed less than
static Constant * getPointerCast(Constant *C, Type *Ty)
Create a BitCast, AddrSpaceCast, or a PtrToInt cast constant expression.
static Constant * getPointerBitCastOrAddrSpaceCast(Constant *C, Type *Ty)
Create a BitCast or AddrSpaceCast for a pointer type depending on the address space.
This is the shared class of boolean and integer constants.
IntegerType * getIntegerType() const
Variant of the getType() method to always return an IntegerType, which reduces the amount of casting ...
static ConstantInt * getTrue(LLVMContext &Context)
bool isZero() const
This is just a convenience method to make client code smaller for a common code.
int64_t getSExtValue() const
Return the constant as a 64-bit integer value after it has been sign extended as appropriate for the ...
This is an important base class in LLVM.
static Constant * getNullValue(Type *Ty)
Constructor to create a '0' constant of arbitrary type.
This class represents an Operation in the Expression.
uint64_t getNumOperands() const
A parsed version of the target data layout string in and methods for querying it.
ValueT lookup(const_arg_type_t< KeyT > Val) const
lookup - Return the entry for the specified key, or a default constructed value if no such entry exis...
std::pair< iterator, bool > insert(const std::pair< KeyT, ValueT > &KV)
Implements a dense probed hash-table based set.
Analysis pass which computes a DominatorTree.
Concrete subclass of DominatorTreeBase that is used to compute a normal dominator tree.
Instruction * findNearestCommonDominator(Instruction *I1, Instruction *I2) const
Find the nearest instruction I that dominates both I1 and I2, in the sense that a result produced bef...
static ErrorSuccess success()
Create a success value.
An instruction for ordering other memory operations.
AtomicOrdering getOrdering() const
Returns the ordering constraint of this fence instruction.
A proxy from a FunctionAnalysisManager to an SCC.
A handy container for a FunctionType+Callee-pointer pair, which can be passed around as a single enti...
const BasicBlock & getEntryBlock() const
const BasicBlock & front() const
LLVMContext & getContext() const
getContext - Return a reference to the LLVMContext associated with this function.
Argument * getArg(unsigned i) const
bool hasFnAttribute(Attribute::AttrKind Kind) const
Return true if the function has the attribute.
bool isDeclaration() const
Return true if the primary definition of this global value is outside of the current translation unit...
bool hasLocalLinkage() const
Module * getParent()
Get the module that this global value is contained inside of...
@ PrivateLinkage
Like Internal, but omit from symbol table.
@ InternalLinkage
Rename collisions when linking (static functions).
const Constant * getInitializer() const
getInitializer - Return the initializer for this global variable.
void setInitializer(Constant *InitVal)
setInitializer - Sets the initializer for this global variable, removing any existing initializer if ...
InsertPoint - A saved insertion point.
BasicBlock * getBlock() const
AllocaInst * CreateAlloca(Type *Ty, unsigned AddrSpace, Value *ArraySize=nullptr, const Twine &Name="")
void SetInsertPoint(BasicBlock *TheBB)
This specifies that created instructions should be appended to the end of the specified block.
Value * CreateAddrSpaceCast(Value *V, Type *DestTy, const Twine &Name="")
This provides a uniform API for creating instructions and inserting them into a basic block: either a...
An analysis over an "outer" IR unit that provides access to an analysis manager over an "inner" IR un...
bool isLifetimeStartOrEnd() const LLVM_READONLY
Return true if the instruction is a llvm.lifetime.start or llvm.lifetime.end marker.
bool mayWriteToMemory() const LLVM_READONLY
Return true if this instruction may modify memory.
const DebugLoc & getDebugLoc() const
Return the debug location for this node as a DebugLoc.
const Module * getModule() const
Return the module owning the function this instruction belongs to or nullptr it the function does not...
const Instruction * getPrevNonDebugInstruction(bool SkipPseudoOp=false) const
Return a pointer to the previous non-debug instruction in the same basic block as 'this',...
InstListType::iterator eraseFromParent()
This method unlinks 'this' from the containing basic block and deletes it.
const Function * getFunction() const
Return the function this instruction belongs to.
bool mayHaveSideEffects() const LLVM_READONLY
Return true if the instruction may have side effects.
bool mayReadFromMemory() const LLVM_READONLY
Return true if this instruction may read memory.
const Instruction * getNextNonDebugInstruction(bool SkipPseudoOp=false) const
Return a pointer to the next non-debug instruction in the same basic block as 'this',...
void setDebugLoc(DebugLoc Loc)
Set the debug location information for this instruction.
void setSuccessor(unsigned Idx, BasicBlock *BB)
Update the specified successor to point at the provided block.
void moveBefore(Instruction *MovePos)
Unlink this instruction from its current basic block and insert it into the basic block that MovePos ...
A node in the call graph.
An SCC of the call graph.
A lazily constructed view of the call graph of a module.
An instruction for reading from memory.
StringRef getString() const
void eraseFromParent()
This method unlinks 'this' from the containing function and deletes it.
A Module instance is used to store all the information related to an LLVM module.
const std::string & getTargetTriple() const
Get the target triple which is a string describing the target host.
MutableArrayRef - Represent a mutable reference to an array (0 or more elements consecutively in memo...
An interface to create LLVM-IR for OpenMP directives.
static std::pair< int32_t, int32_t > readThreadBoundsForKernel(const Triple &T, Function &Kernel)
}
void addAttributes(omp::RuntimeFunction FnID, Function &Fn)
Add attributes known for FnID to Fn.
IRBuilder<>::InsertPoint InsertPointTy
Type used throughout for insertion points.
static std::pair< int32_t, int32_t > readTeamBoundsForKernel(const Triple &T, Function &Kernel)
Read/write a bounds on teams for Kernel.
PreservedAnalyses run(LazyCallGraph::SCC &C, CGSCCAnalysisManager &AM, LazyCallGraph &CG, CGSCCUpdateResult &UR)
PreservedAnalyses run(Module &M, ModuleAnalysisManager &AM)
static PoisonValue * get(Type *T)
Static factory methods - Return an 'poison' object of the specified type.
A set of analyses that are preserved following a run of a transformation pass.
static PreservedAnalyses none()
Convenience factory function for the empty preserved set.
static PreservedAnalyses all()
Construct a special preserved set that preserves all passes.
static ReturnInst * Create(LLVMContext &C, Value *retVal=nullptr, InsertPosition InsertBefore=nullptr)
A vector that has set insertion semantics.
size_type size() const
Determine the number of elements in the SetVector.
const value_type & back() const
Return the last element of the SetVector.
size_type count(const key_type &key) const
Count the number of elements of a given key in the SetVector.
bool insert(const value_type &X)
Insert a new element into the SetVector.
A templated base class for SmallPtrSet which provides the typesafe interface that is common across al...
size_type count(ConstPtrType Ptr) const
count - Return 1 if the specified pointer is in the set, 0 otherwise.
std::pair< iterator, bool > insert(PtrType Ptr)
Inserts Ptr if and only if there is no element in the container equal to Ptr.
SmallPtrSet - This class implements a set which is optimized for holding SmallSize or less elements.
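A small sketch of the SmallPtrSet insert/count idiom for a visited set (names are hypothetical):

  SmallPtrSet<const Instruction *, 16> Visited;  // llvm/ADT/SmallPtrSet.h
  if (Visited.insert(&I).second) {
    // &I was not in the set before; process it exactly once here.
  }
  unsigned Seen = Visited.count(&I);             // 1 if present, 0 otherwise
  (void)Seen;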
A SetVector that performs no allocations if smaller than a certain size.
This class consists of common code factored out of the SmallVector class to reduce code duplication b...
void assign(size_type NumElts, ValueParamT Elt)
reference emplace_back(ArgTypes &&... Args)
void append(ItTy in_start, ItTy in_end)
Add the specified range to the end of the SmallVector.
void push_back(const T &Elt)
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small.
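A brief sketch of these SmallVector mutators, e.g. when collecting call arguments (Callee, CB, and V are assumed to be in scope):

  SmallVector<Value *, 8> Args;                // llvm/ADT/SmallVector.h
  Args.push_back(Callee);                      // append one element
  Args.append(CB.arg_begin(), CB.arg_end());   // append a range of operands
  Args.emplace_back(V);                        // construct a new element in place
  Args.assign(2, nullptr);                     // replace contents with 2 nullptrs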
An instruction for storing to memory.
StringRef - Represent a constant reference to a string, i.e.
bool starts_with(StringRef Prefix) const
Check if this string starts with the given Prefix.
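For example, starts_with is a common way to recognize OpenMP runtime symbols by prefix (the exact prefixes checked here are illustrative):

  StringRef Name = F.getName();
  if (Name.starts_with("__kmpc_") || Name.starts_with("__tgt_")) {
    // likely an OpenMP runtime entry point
  }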
Triple - Helper class for working with autoconf configuration names.
The instances of the Type class are immutable: once they are created, they are never changed.
unsigned getPointerAddressSpace() const
Get the address space of this pointer or pointer vector type.
static Type * getVoidTy(LLVMContext &C)
static IntegerType * getInt16Ty(LLVMContext &C)
static IntegerType * getInt8Ty(LLVMContext &C)
static IntegerType * getInt32Ty(LLVMContext &C)
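A minimal sketch of the Type factory methods listed above (Ctx is an assumed LLVMContext, PtrTy an assumed pointer type):

  Type *VoidTy = Type::getVoidTy(Ctx);
  IntegerType *Int8Ty  = Type::getInt8Ty(Ctx);
  IntegerType *Int16Ty = Type::getInt16Ty(Ctx);
  IntegerType *Int32Ty = Type::getInt32Ty(Ctx);
  unsigned AS = PtrTy->getPointerAddressSpace();
  (void)AS;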
static UndefValue * get(Type *T)
Static factory methods - Return an 'undef' object of the specified type.
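Correspondingly, UndefValue::get and PoisonValue::get (listed earlier) produce the placeholder constants for a given type, for example:

  Value *Undef  = UndefValue::get(Int32Ty);   // 'undef' of i32
  Value *Poison = PoisonValue::get(Int32Ty);  // 'poison' of i32, preferred in new code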
A Use represents the edge between a Value definition and its users.
bool replaceUsesOfWith(Value *From, Value *To)
Replace uses of one Value with another.
LLVM Value Representation.
Type * getType() const
All values are typed, get the type of this value.
void setName(const Twine &Name)
Change the name of the value.
bool hasOneUse() const
Return true if there is exactly one use of this value.
void replaceAllUsesWith(Value *V)
Change all uses of this to point to a new Value.
iterator_range< user_iterator > users()
const Value * stripPointerCasts() const
Strip off pointer casts, all-zero GEPs and address space casts.
StringRef getName() const
Return a constant reference to the value's name.
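A hedged sketch combining these Value helpers when rewriting a value (OldV and NewV are hypothetical and assumed to have the same type):

  bool SingleUse = OldV->hasOneUse();      // cheap special case, if desired
  (void)SingleUse;
  NewV->setName(OldV->getName());
  OldV->replaceAllUsesWith(NewV);          // redirect every use of OldV to NewV
  for (User *U : NewV->users())
    (void)U;                               // walk the updated users
  const Value *Base = NewV->stripPointerCasts(); // see through casts / zero GEPs
  (void)Base;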
std::pair< iterator, bool > insert(const ValueT &V)
An efficient, type-erasing, non-owning reference to a callable.
const ParentTy * getParent() const
self_iterator getIterator()
NodeTy * getNextNode()
Get the next node, or nullptr for the list tail.
A raw_ostream that writes to an std::string.
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
GlobalVariable * getKernelEnvironementGVFromKernelInitCB(CallBase *KernelInitCB)
ConstantStruct * getKernelEnvironementFromKernelInitCB(CallBase *KernelInitCB)
bool isValidAtPosition(const ValueAndContext &VAC, InformationCache &InfoCache)
Return true if the value of VAC is valid at the position of VAC, that is a constant,...
bool isPotentiallyAffectedByBarrier(Attributor &A, const Instruction &I, const AbstractAttribute &QueryingAA)
Return true if I is potentially affected by a barrier.
bool isNoSyncInst(Attributor &A, const Instruction &I, const AbstractAttribute &QueryingAA)
Return true if I is a nosync instruction.
constexpr char Args[]
Key for Kernel::Metadata::mArgs.
E & operator^=(E &LHS, E RHS)
@ AMDGPU_KERNEL
Used for AMDGPU code object kernels.
@ SPIR_KERNEL
Used for SPIR kernel functions.
@ PTX_Kernel
Call to a PTX kernel. Passes all arguments in parameter space.
@ C
The default llvm calling convention, compatible with C.
initializer< Ty > init(const Ty &Val)
PointerTypeMap run(const Module &M)
Compute the PointerTypeMap for the module M.
constexpr uint64_t PointerSize
aarch64 pointer size.
bool isOpenMPDevice(Module &M)
Helper to determine if M is an OpenMP target offloading device module.
bool containsOpenMP(Module &M)
Helper to determine if M contains OpenMP.
InternalControlVar
IDs for all Internal Control Variables (ICVs).
RuntimeFunction
IDs for all omp runtime library (RTL) functions.
KernelSet getDeviceKernels(Module &M)
Get OpenMP device kernels in M.
Function * Kernel
Summary of a kernel (=entry point for target offloading).
@ OMP_TGT_EXEC_MODE_GENERIC_SPMD
@ OMP_TGT_EXEC_MODE_GENERIC
bool isOpenMPKernel(Function &Fn)
Return true iff Fn is an OpenMP GPU kernel; Fn has the "kernel" attribute.
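A small sketch of how these omp:: helpers are typically combined at the start of a module pass (the early-exit structure is illustrative):

  if (!omp::containsOpenMP(M))
    return PreservedAnalyses::all();          // nothing OpenMP-related to do
  bool IsDevice = omp::isOpenMPDevice(M);     // host vs. target offloading device
  (void)IsDevice;
  omp::KernelSet Kernels = omp::getDeviceKernels(M);
  for (Function *K : Kernels)
    if (omp::isOpenMPKernel(*K))
      (void)K;                                // K carries the "kernel" attribute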
DiagnosticInfoOptimizationBase::Argument NV
const_iterator begin(StringRef path LLVM_LIFETIME_BOUND, Style style=Style::native)
Get begin iterator over path.
const_iterator end(StringRef path LLVM_LIFETIME_BOUND)
Get end iterator over path.
This is an optimization pass for GlobalISel generic memory operations.
auto drop_begin(T &&RangeOrContainer, size_t N=1)
Return a range covering RangeOrContainer with the first N elements excluded.
bool all_of(R &&range, UnaryPredicate P)
Provide wrappers to std::all_of which take ranges instead of having to pass begin/end explicitly.
auto size(R &&Range, std::enable_if_t< std::is_base_of< std::random_access_iterator_tag, typename std::iterator_traits< decltype(Range.begin())>::iterator_category >::value, void > *=nullptr)
Get the size of a range.
bool succ_empty(const Instruction *I)
std::string to_string(const T &Value)
bool operator!=(uint64_t V1, const APInt &V2)
Value * GetPointerBaseWithConstantOffset(Value *Ptr, int64_t &Offset, const DataLayout &DL, bool AllowNonInbounds=true)
Analyze the specified pointer to see if it can be expressed as a base pointer plus a constant offset.
const Value * getUnderlyingObject(const Value *V, unsigned MaxLookup=6)
This method strips off any GEP address adjustments, pointer casts or llvm.threadlocal....
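For illustration, the two pointer-analysis helpers above compose as follows (Ptr and DL are assumed to be in scope):

  int64_t Offset = 0;
  Value *Base = GetPointerBaseWithConstantOffset(Ptr, Offset, DL); // Base + Offset == Ptr
  const Value *Obj = getUnderlyingObject(Ptr);  // strips GEPs and casts, bounded lookup
  (void)Base; (void)Obj;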
bool operator==(const AddressRangeValuePair &LHS, const AddressRangeValuePair &RHS)
@ FullLTOPostLink
Full LTO postlink (backend compile) phase.
@ ThinLTOPreLink
ThinLTO prelink (summary) phase.
raw_ostream & dbgs()
dbgs() - This returns a reference to a raw_ostream for debugging messages.
void cantFail(Error Err, const char *Msg=nullptr)
Report a fatal error if Err is a failure value.
bool operator&=(SparseBitVector< ElementSize > *LHS, const SparseBitVector< ElementSize > &RHS)
auto count(R &&Range, const E &Element)
Wrapper function around std::count to count the number of times an element Element occurs in the give...
BasicBlock * SplitBlock(BasicBlock *Old, BasicBlock::iterator SplitPt, DominatorTree *DT, LoopInfo *LI=nullptr, MemorySSAUpdater *MSSAU=nullptr, const Twine &BBName="", bool Before=false)
Split the specified block at the specified instruction.
auto predecessors(const MachineBasicBlock *BB)
const char * toString(DWARFSectionKind Kind)
Constant * ConstantFoldInsertValueInstruction(Constant *Agg, Constant *Val, ArrayRef< unsigned > Idxs)
ConstantFoldInsertValueInstruction - Attempt to constant fold an insertvalue instruction with the spe...
Implement std::hash so that hash_code can be used in STL containers.
An abstract interface for address space information.
An abstract attribute for getting assumption information.
An abstract state for querying live call edges.
virtual const SetVector< Function * > & getOptimisticEdges() const =0
Get the optimistic edges.
bool IsReachedFromAlignedBarrierOnly
bool isExecutedByInitialThreadOnly(const Instruction &I) const
Check if an instruction is executed only by the initial thread.
static AAExecutionDomain & createForPosition(const IRPosition &IRP, Attributor &A)
Create an abstract attribute view for the position IRP.
virtual ExecutionDomainTy getFunctionExecutionDomain() const =0
virtual ExecutionDomainTy getExecutionDomain(const BasicBlock &) const =0
virtual bool isExecutedInAlignedRegion(Attributor &A, const Instruction &I) const =0
Check if the instruction I is executed in an aligned region, that is, the synchronizing effects befor...
virtual bool isNoOpFence(const FenceInst &FI) const =0
Helper function to determine if FI is a no-op given the information about its execution from ExecDoma...
static const char ID
Unique ID (due to the unique address)
An abstract interface for indirect call information interference.
An abstract interface for liveness abstract attribute.
An abstract interface for all memory location attributes (readnone/argmemonly/inaccessiblememonly/ina...
AccessKind
Simple enum to distinguish read/write/read-write accesses.
StateType::base_t MemoryLocationsKind
static bool isAlignedBarrier(const CallBase &CB, bool ExecutedAligned)
Helper function to determine if CB is an aligned (GPU) barrier.
An abstract Attribute for determining the necessity of the convergent attribute.
An abstract attribute for getting all assumption underlying objects.
Base struct for all "concrete attribute" deductions.
virtual ChangeStatus manifest(Attributor &A)
Hook for the Attributor to trigger the manifestation of the information represented by the abstract a...
virtual void initialize(Attributor &A)
Initialize the state with the information in the Attributor A.
virtual const std::string getAsStr(Attributor *A) const =0
This function should return the "summarized" assumed state as string.
virtual ChangeStatus updateImpl(Attributor &A)=0
The actual update/transfer function which has to be implemented by the derived classes.
virtual void trackStatistics() const =0
Hook to enable custom statistic tracking, called after manifest that resulted in a change if statisti...
virtual const char * getIdAddr() const =0
This function should return the address of the ID of the AbstractAttribute.
An interface to query the internal state of an abstract attribute.
virtual ChangeStatus indicatePessimisticFixpoint()=0
Indicate that the abstract state should converge to the pessimistic state.
virtual bool isAtFixpoint() const =0
Return if this abstract state is fixed, thus does not need to be updated if information changes as it...
virtual bool isValidState() const =0
Return if this abstract state is in a valid state.
virtual ChangeStatus indicateOptimisticFixpoint()=0
Indicate that the abstract state should converge to the optimistic state.
Wrapper for FunctionAnalysisManager.
Configuration for the Attributor.
std::function< void(Attributor &A, const Function &F)> InitializationCallback
Callback function to be invoked on internal functions marked live.
std::optional< unsigned > MaxFixpointIterations
Maximum number of iterations to run until fixpoint.
bool RewriteSignatures
Flag to determine if we rewrite function signatures.
OptimizationRemarkGetter OREGetter
IPOAmendableCBTy IPOAmendableCB
bool IsModulePass
Is the user of the Attributor a module pass or not.
bool DefaultInitializeLiveInternals
Flag to determine if we want to initialize all default AAs for an internal function marked live.
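A rough, hedged sketch of how these AttributorConfig fields are populated before constructing an Attributor (assuming a CallGraphUpdater, a function SetVector, and an InformationCache are already set up; exact constructor signatures vary between LLVM versions):

  AttributorConfig AC(CGUpdater);
  AC.IsModulePass = true;
  AC.RewriteSignatures = false;
  AC.MaxFixpointIterations = 32;               // std::optional<unsigned>
  AC.DefaultInitializeLiveInternals = false;
  AC.OREGetter = OREGetter;                    // remark emitter per function
  AC.InitializationCallback = [](Attributor &A, const Function &F) {
    // seed abstract attributes for internal functions that become live
  };
  Attributor A(Functions, InfoCache, AC);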
The fixpoint analysis framework that orchestrates the attribute deduction.
static bool isInternalizable(Function &F)
Returns true if the function F can be internalized.
std::function< std::optional< Constant * >(const GlobalVariable &, const AbstractAttribute *, bool &)> GlobalVariableSimplifictionCallbackTy
Register CB as a simplification callback.
std::function< bool(Attributor &, const AbstractAttribute *)> VirtualUseCallbackTy
static bool internalizeFunctions(SmallPtrSetImpl< Function * > &FnSet, DenseMap< Function *, Function * > &FnMap)
Make copies of each function in the set FnSet such that the copied version has internal linkage after...
std::function< std::optional< Value * >(const IRPosition &, const AbstractAttribute *, bool &)> SimplifictionCallbackTy
Register CB as a simplification callback.
Simple wrapper for a single bit (boolean) state.
Support structure for SCC passes to communicate updates to the call graph back to the CGSCC pass manager...
Helper to describe and deal with positions in the LLVM-IR.
static const IRPosition callsite_returned(const CallBase &CB)
Create a position describing the returned value of CB.
static const IRPosition returned(const Function &F, const CallBaseContext *CBContext=nullptr)
Create a position describing the returned value of F.
static const IRPosition value(const Value &V, const CallBaseContext *CBContext=nullptr)
Create a position describing the value of V.
static const IRPosition inst(const Instruction &I, const CallBaseContext *CBContext=nullptr)
Create a position describing the instruction I.
@ IRP_ARGUMENT
An attribute for a function argument.
@ IRP_RETURNED
An attribute for the function return value.
@ IRP_CALL_SITE
An attribute for a call site (function scope).
@ IRP_CALL_SITE_RETURNED
An attribute for a call site return value.
@ IRP_FUNCTION
An attribute for a function (scope).
@ IRP_FLOAT
A position that is not associated with a spot suitable for attributes.
@ IRP_CALL_SITE_ARGUMENT
An attribute for a call site argument.
@ IRP_INVALID
An invalid position.
static const IRPosition function(const Function &F, const CallBaseContext *CBContext=nullptr)
Create a position describing the function scope of F.
Kind getPositionKind() const
Return the associated position kind.
static const IRPosition callsite_function(const CallBase &CB)
Create a position describing the function scope of CB.
Function * getAnchorScope() const
Return the Function surrounding the anchor value.
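A brief sketch of the IRPosition factories listed above (F, CB, and V are assumed to be in scope):

  const IRPosition FnPos  = IRPosition::function(F);           // function scope of F
  const IRPosition RetPos = IRPosition::returned(F);           // return value of F
  const IRPosition CSPos  = IRPosition::callsite_returned(CB); // return value at the call site
  const IRPosition ValPos = IRPosition::value(*V);             // a plain value
  if (FnPos.getPositionKind() == IRPosition::IRP_FUNCTION) {
    Function *Scope = FnPos.getAnchorScope();                  // == &F here
    (void)Scope;
  }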
bool isValidState() const override
See AbstractState::isValidState() NOTE: For now we simply pretend that the worst possible state is in...
ChangeStatus indicatePessimisticFixpoint() override
See AbstractState::indicatePessimisticFixpoint(...)
Direction
An enum for the direction of the loop.
This struct is a compact representation of a valid (power of two) or undefined (0) alignment.
Description of a LLVM-IR insertion point (IP) and a debug/source location (filename,...
Helper to tie an abstract state implementation to an abstract attribute.
StateType & getState() override
See AbstractAttribute::getState(...).
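To tie the AbstractAttribute hooks listed earlier to a concrete state, a deliberately simplified, hypothetical sketch (real attributes also provide createForPosition and classof-style helpers, omitted here):

  struct AAHypotheticalInfo
      : public StateWrapper<BooleanState, AbstractAttribute> {
    using Base = StateWrapper<BooleanState, AbstractAttribute>;
    AAHypotheticalInfo(const IRPosition &IRP, Attributor &A) : Base(IRP) {}

    void initialize(Attributor &A) override { /* seed the assumed state */ }
    ChangeStatus updateImpl(Attributor &A) override {
      return ChangeStatus::UNCHANGED;          // transfer function goes here
    }
    ChangeStatus manifest(Attributor &A) override {
      return ChangeStatus::UNCHANGED;          // write deduced facts into the IR
    }
    const std::string getAsStr(Attributor *A) const override {
      return "hypothetical-info";
    }
    void trackStatistics() const override {}
    const char *getIdAddr() const override { return &ID; }
    static const char ID;
  };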
Defines various target-specific GPU grid values that must be consistent between host RTL (plugin),...