51#include "llvm/IR/IntrinsicsAMDGPU.h"
52#include "llvm/IR/IntrinsicsNVPTX.h"
68#define DEBUG_TYPE "openmp-opt"
71 "openmp-opt-disable",
cl::desc(
"Disable OpenMP specific optimizations."),
75 "openmp-opt-enable-merging",
81 cl::desc(
"Disable function internalization."),
92 "openmp-hide-memory-transfer-latency",
93 cl::desc(
"[WIP] Tries to hide the latency of host to device memory"
98 "openmp-opt-disable-deglobalization",
99 cl::desc(
"Disable OpenMP optimizations involving deglobalization."),
103 "openmp-opt-disable-spmdization",
104 cl::desc(
"Disable OpenMP optimizations involving SPMD-ization."),
108 "openmp-opt-disable-folding",
113 "openmp-opt-disable-state-machine-rewrite",
114 cl::desc(
"Disable OpenMP optimizations that replace the state machine."),
118 "openmp-opt-disable-barrier-elimination",
119 cl::desc(
"Disable OpenMP optimizations that eliminate barriers."),
123 "openmp-opt-print-module-after",
124 cl::desc(
"Print the current module after OpenMP optimizations."),
128 "openmp-opt-print-module-before",
129 cl::desc(
"Print the current module before OpenMP optimizations."),
133 "openmp-opt-inline-device",
144 cl::desc(
"Maximal number of attributor iterations."),
149 cl::desc(
"Maximum amount of shared memory to use."),
150 cl::init(std::numeric_limits<unsigned>::max()));
153 "Number of OpenMP runtime calls deduplicated");
155 "Number of OpenMP parallel regions deleted");
157 "Number of OpenMP runtime functions identified");
159 "Number of OpenMP runtime function uses identified");
161 "Number of OpenMP target region entry points (=kernels) identified");
163 "Number of non-OpenMP target region kernels identified");
165 "Number of OpenMP target region entry points (=kernels) executed in "
166 "SPMD-mode instead of generic-mode");
/// Statistics counters for kernels that stay in generic mode, split by the
/// kind of state machine they end up with. Each counter's meaning is given by
/// its description string; the increment sites are not part of this excerpt.

/// Generic-mode kernels that run without a state machine.
167STATISTIC(NumOpenMPTargetRegionKernelsWithoutStateMachine,
168 "Number of OpenMP target region entry points (=kernels) executed in "
169 "generic-mode without a state machines");
/// Generic-mode kernels with a customized state machine that keeps a fallback.
170STATISTIC(NumOpenMPTargetRegionKernelsCustomStateMachineWithFallback,
171 "Number of OpenMP target region entry points (=kernels) executed in "
172 "generic-mode with customized state machines with fallback");
/// Generic-mode kernels with a customized state machine and no fallback.
173STATISTIC(NumOpenMPTargetRegionKernelsCustomStateMachineWithoutFallback,
174 "Number of OpenMP target region entry points (=kernels) executed in "
175 "generic-mode with customized state machines without fallback");
177 NumOpenMPParallelRegionsReplacedInGPUStateMachine,
178 "Number of OpenMP parallel regions replaced with ID in GPU state machines");
180 "Number of OpenMP parallel regions merged");
182 "Amount of memory pushed to shared memory");
/// Statistics counter for barriers removed as redundant (see the description
/// string); where it is incremented is not visible in this excerpt.
183STATISTIC(NumBarriersEliminated,
"Number of redundant barriers eliminated");
/// Helper macro: each use expands to a `constexpr unsigned` named
/// `<MEMBER>Idx` holding IDX. The getter macros further down index the kernel
/// environment aggregate with these `<MEMBER>Idx` constants.
211#define KERNEL_ENVIRONMENT_IDX(MEMBER, IDX) \
212 constexpr unsigned MEMBER##Idx = IDX;
217#undef KERNEL_ENVIRONMENT_IDX
/// Helper macro with the same expansion shape as KERNEL_ENVIRONMENT_IDX: it
/// defines a `constexpr unsigned <MEMBER>Idx = IDX`. The configuration getter
/// macro below uses such constants to index the configuration aggregate.
219#define KERNEL_ENVIRONMENT_CONFIGURATION_IDX(MEMBER, IDX) \
220 constexpr unsigned MEMBER##Idx = IDX;
230#undef KERNEL_ENVIRONMENT_CONFIGURATION_IDX
232#define KERNEL_ENVIRONMENT_GETTER(MEMBER, RETURNTYPE) \
233 RETURNTYPE *get##MEMBER##FromKernelEnvironment(ConstantStruct *KernelEnvC) { \
234 return cast<RETURNTYPE>(KernelEnvC->getAggregateElement(MEMBER##Idx)); \
240#undef KERNEL_ENVIRONMENT_GETTER
242#define KERNEL_ENVIRONMENT_CONFIGURATION_GETTER(MEMBER) \
243 ConstantInt *get##MEMBER##FromKernelEnvironment( \
244 ConstantStruct *KernelEnvC) { \
245 ConstantStruct *ConfigC = \
246 getConfigurationFromKernelEnvironment(KernelEnvC); \
247 return dyn_cast<ConstantInt>(ConfigC->getAggregateElement(MEMBER##Idx)); \
258#undef KERNEL_ENVIRONMENT_CONFIGURATION_GETTER
/// Compile-time constant 0. NOTE(review): the name suggests the argument
/// position of the kernel environment in the kernel-init runtime call; no use
/// is visible in this excerpt, so confirm against the callers.
262 constexpr int InitKernelEnvironmentArgNo = 0;
/// Forward declaration only; the definition is not part of this excerpt.
277struct AAHeapToShared;
284 OMPInformationCache(
Module &M, AnalysisGetter &AG,
288 OpenMPPostLink(OpenMPPostLink) {
291 const Triple
T(OMPBuilder.M.getTargetTriple());
292 switch (
T.getArch()) {
296 assert(OMPBuilder.Config.IsTargetDevice &&
297 "OpenMP AMDGPU/NVPTX is only prepared to deal with device code.");
298 OMPBuilder.Config.IsGPU =
true;
301 OMPBuilder.Config.IsGPU =
false;
304 OMPBuilder.initialize();
305 initializeRuntimeFunctions(M);
306 initializeInternalControlVars();
310 struct InternalControlVarInfo {
318 StringRef EnvVarName;
324 ConstantInt *InitValue;
337 struct RuntimeFunctionInfo {
/// Container type for the recorded uses of a runtime function: a SmallVector
/// of `Use *`.
358 using UseVector = SmallVector<Use *, 16>;
/// Empties UsesMap, discarding every per-function use vector recorded so far.
361 void clearUsesMap() { UsesMap.clear(); }
/// Boolean conversion: yields the truth value of the Declaration member, so
/// an info object without a declaration converts to false.
364 operator bool()
const {
return Declaration; }
367 UseVector &getOrCreateUseVector(Function *
F) {
368 std::shared_ptr<UseVector> &UV = UsesMap[
F];
370 UV = std::make_shared<UseVector>();
376 const UseVector *getUseVector(Function &
F)
const {
377 auto I = UsesMap.find(&
F);
378 if (
I != UsesMap.end())
379 return I->second.get();
/// Returns the number of entries currently stored in UsesMap, i.e. how many
/// distinct keys have a use vector.
384 size_t getNumFunctionsWithUses()
const {
return UsesMap.size(); }
/// Returns the number of entries in ArgumentTypes.
388 size_t getNumArgs()
const {
return ArgumentTypes.size(); }
393 void foreachUse(SmallVectorImpl<Function *> &SCC,
394 function_ref<
bool(Use &, Function &)> CB) {
395 for (Function *
F : SCC)
401 void foreachUse(function_ref<
bool(Use &, Function &)> CB, Function *
F) {
402 SmallVector<unsigned, 8> ToBeDeleted;
406 UseVector &UV = getOrCreateUseVector(
F);
416 while (!ToBeDeleted.
empty()) {
/// Maps a function to the (shared) vector of uses recorded for it. Use
/// collection also files uses under a nullptr key.
426 DenseMap<Function *, std::shared_ptr<UseVector>> UsesMap;
/// Iterator access to UsesMap: begin() and end() forward directly to the map,
/// which lets callers range over the per-function use vectors.
430 decltype(UsesMap)::iterator
begin() {
return UsesMap.begin(); }
431 decltype(UsesMap)::iterator
end() {
return UsesMap.end(); }
/// The OpenMPIRBuilder owned by this cache. The constructor configures its
/// IsGPU flag and calls initialize() on it; the runtime-function setup reads
/// its type members.
435 OpenMPIRBuilder OMPBuilder;
439 RuntimeFunction::OMPRTL___last>
/// Maps a function to its RuntimeFunction enumerator; filled by the OMP_RTL
/// expansion for declarations whose types match.
443 DenseMap<Function *, RuntimeFunction> RuntimeFunctionIDMap;
447 InternalControlVar::ICV___last>
452 void initializeInternalControlVars() {
453#define ICV_RT_SET(_Name, RTL) \
455 auto &ICV = ICVs[_Name]; \
458#define ICV_RT_GET(Name, RTL) \
460 auto &ICV = ICVs[Name]; \
463#define ICV_DATA_ENV(Enum, _Name, _EnvVarName, Init) \
465 auto &ICV = ICVs[Enum]; \
468 ICV.InitKind = Init; \
469 ICV.EnvVarName = _EnvVarName; \
470 switch (ICV.InitKind) { \
471 case ICV_IMPLEMENTATION_DEFINED: \
472 ICV.InitValue = nullptr; \
475 ICV.InitValue = ConstantInt::get( \
476 Type::getInt32Ty(OMPBuilder.Int32->getContext()), 0); \
479 ICV.InitValue = ConstantInt::getFalse(OMPBuilder.Int1->getContext()); \
485#include "llvm/Frontend/OpenMP/OMPKinds.def"
491 static bool declMatchesRTFTypes(Function *
F,
Type *RTFRetType,
498 if (
F->getReturnType() != RTFRetType)
500 if (
F->arg_size() != RTFArgTypes.
size())
503 auto *RTFTyIt = RTFArgTypes.
begin();
504 for (Argument &Arg :
F->args()) {
505 if (Arg.getType() != *RTFTyIt)
515 unsigned collectUses(RuntimeFunctionInfo &RFI,
bool CollectStats =
true) {
516 unsigned NumUses = 0;
517 if (!RFI.Declaration)
519 OMPBuilder.addAttributes(RFI.Kind, *RFI.Declaration);
522 NumOpenMPRuntimeFunctionsIdentified += 1;
523 NumOpenMPRuntimeFunctionUsesIdentified += RFI.Declaration->getNumUses();
527 for (Use &U : RFI.Declaration->uses()) {
529 if (!
CGSCC ||
CGSCC->empty() ||
CGSCC->contains(UserI->getFunction())) {
530 RFI.getOrCreateUseVector(UserI->getFunction()).push_back(&U);
534 RFI.getOrCreateUseVector(
nullptr).push_back(&U);
543 auto &RFI = RFIs[RTF];
545 collectUses(RFI,
false);
549 void recollectUses() {
550 for (
int Idx = 0; Idx < RFIs.size(); ++Idx)
555 void setCallingConvention(FunctionCallee Callee, CallInst *CI) {
570 RuntimeFunctionInfo &RFI = RFIs[Fn];
572 if (!RFI.Declaration || RFI.Declaration->isDeclaration())
580 void initializeRuntimeFunctions(
Module &M) {
583#define OMP_TYPE(VarName, ...) \
584 Type *VarName = OMPBuilder.VarName; \
587#define OMP_ARRAY_TYPE(VarName, ...) \
588 ArrayType *VarName##Ty = OMPBuilder.VarName##Ty; \
590 PointerType *VarName##PtrTy = OMPBuilder.VarName##PtrTy; \
591 (void)VarName##PtrTy;
593#define OMP_FUNCTION_TYPE(VarName, ...) \
594 FunctionType *VarName = OMPBuilder.VarName; \
596 PointerType *VarName##Ptr = OMPBuilder.VarName##Ptr; \
599#define OMP_STRUCT_TYPE(VarName, ...) \
600 StructType *VarName = OMPBuilder.VarName; \
602 PointerType *VarName##Ptr = OMPBuilder.VarName##Ptr; \
605#define OMP_RTL(_Enum, _Name, _IsVarArg, _ReturnType, ...) \
607 SmallVector<Type *, 8> ArgsTypes({__VA_ARGS__}); \
608 Function *F = M.getFunction(_Name); \
609 RTLFunctions.insert(F); \
610 if (declMatchesRTFTypes(F, OMPBuilder._ReturnType, ArgsTypes)) { \
611 RuntimeFunctionIDMap[F] = _Enum; \
612 auto &RFI = RFIs[_Enum]; \
615 RFI.IsVarArg = _IsVarArg; \
616 RFI.ReturnType = OMPBuilder._ReturnType; \
617 RFI.ArgumentTypes = std::move(ArgsTypes); \
618 RFI.Declaration = F; \
619 unsigned NumUses = collectUses(RFI); \
622 dbgs() << TAG << RFI.Name << (RFI.Declaration ? "" : " not") \
624 if (RFI.Declaration) \
625 dbgs() << TAG << "-> got " << NumUses << " uses in " \
626 << RFI.getNumFunctionsWithUses() \
627 << " different functions.\n"; \
631#include "llvm/Frontend/OpenMP/OMPKinds.def"
636 for (Function &
F : M) {
637 for (StringRef Prefix : {
"__kmpc",
"_ZN4ompx",
"omp_"})
638 if (
F.hasFnAttribute(Attribute::NoInline) &&
639 F.getName().starts_with(Prefix) &&
640 !
F.hasFnAttribute(Attribute::OptimizeNone))
641 F.removeFnAttr(Attribute::NoInline);
/// Set of functions looked up by name during runtime-function initialization
/// (every `M.getFunction(_Name)` result is inserted, matched or not).
649 DenseSet<const Function *> RTLFunctions;
/// Copied from the constructor argument of the same name; when set, run()
/// additionally calls removeRuntimeSymbols().
652 bool OpenMPPostLink =
false;
655template <
typename Ty,
bool InsertInval
idates = true>
/// Returns whether Elem is present in the underlying Set.
657 bool contains(
const Ty &Elem)
const {
return Set.contains(Elem); }
658 bool insert(
const Ty &Elem) {
659 if (InsertInvalidates)
660 BooleanState::indicatePessimisticFixpoint();
661 return Set.insert(Elem);
/// Read-only positional access into Set; Idx is forwarded unchecked to the
/// set's own operator[].
664 const Ty &operator[](
int Idx)
const {
return Set[Idx]; }
665 bool operator==(
const BooleanStateWithSetVector &
RHS)
const {
666 return BooleanState::operator==(
RHS) && Set ==
RHS.Set;
668 bool operator!=(
const BooleanStateWithSetVector &
RHS)
const {
669 return !(*
this ==
RHS);
/// Size queries forwarded to Set: empty() reports whether it holds no
/// elements, size() how many it holds.
672 bool empty()
const {
return Set.empty(); }
673 size_t size()
const {
return Set.size(); }
676 BooleanStateWithSetVector &
operator^=(
const BooleanStateWithSetVector &
RHS) {
677 BooleanState::operator^=(
RHS);
678 Set.insert_range(
RHS.Set);
/// Mutable and const iterator accessors; all four forward to the matching
/// begin()/end() of Set so the state can be used in range-based for loops.
687 typename decltype(Set)::iterator
begin() {
return Set.begin(); }
688 typename decltype(Set)::iterator
end() {
return Set.end(); }
689 typename decltype(Set)::const_iterator
begin()
const {
return Set.begin(); }
690 typename decltype(Set)::const_iterator
end()
const {
return Set.end(); }
693template <
typename Ty,
bool InsertInval
idates = true>
694using BooleanStateWithPtrSetVector =
695 BooleanStateWithSetVector<Ty *, InsertInvalidates>;
/// Data members of the kernel-info state. NOTE(review): the enclosing struct
/// header is outside this excerpt; presumably these belong to KernelInfoState,
/// whose constructor follows.

/// Value reported by isAtFixpoint().
699 bool IsAtFixpoint =
false;
/// Set of CallBase pointers; instantiated with InsertInvalidates = false.
/// Queried (together with the unknown set) by mayContainParallelRegion().
703 BooleanStateWithPtrSetVector<CallBase,
false>
704 ReachedKnownParallelRegions;
/// Set of CallBase pointers using the alias default (InsertInvalidates = true).
707 BooleanStateWithPtrSetVector<CallBase> ReachedUnknownParallelRegions;
/// Set of Instruction pointers; instantiated with InsertInvalidates = false.
712 BooleanStateWithPtrSetVector<Instruction, false> SPMDCompatibilityTracker;
/// Kernel init call, null until known; merged by operator^=.
716 CallBase *KernelInitCB =
nullptr;
/// Kernel environment constant, null until known; merged by operator^=.
720 ConstantStruct *KernelEnvC =
nullptr;
/// Kernel deinit call, null until known; merged by operator^=.
724 CallBase *KernelDeinitCB =
nullptr;
/// Defaults to false; writers are not visible in this excerpt.
727 bool IsKernelEntry =
false;
/// Set of Function pointers; instantiated with InsertInvalidates = false.
730 BooleanStateWithPtrSetVector<Function, false> ReachingKernelEntries;
/// Set of uint8_t values using the default InsertInvalidates = true.
735 BooleanStateWithSetVector<uint8_t> ParallelLevels;
/// Forced to true by the pessimistic fixpoint and OR-combined in operator^=.
738 bool NestedParallelism =
false;
/// Defaulted constructor: every member keeps its in-class initializer.
743 KernelInfoState() =
default;
744 KernelInfoState(
bool BestState) {
746 indicatePessimisticFixpoint();
/// Override that unconditionally reports the state as valid.
750 bool isValidState()
const override {
return true; }
/// Override returning the stored IsAtFixpoint flag.
753 bool isAtFixpoint()
const override {
return IsAtFixpoint; }
758 ParallelLevels.indicatePessimisticFixpoint();
759 ReachingKernelEntries.indicatePessimisticFixpoint();
760 SPMDCompatibilityTracker.indicatePessimisticFixpoint();
761 ReachedKnownParallelRegions.indicatePessimisticFixpoint();
762 ReachedUnknownParallelRegions.indicatePessimisticFixpoint();
763 NestedParallelism =
true;
764 return ChangeStatus::CHANGED;
770 ParallelLevels.indicateOptimisticFixpoint();
771 ReachingKernelEntries.indicateOptimisticFixpoint();
772 SPMDCompatibilityTracker.indicateOptimisticFixpoint();
773 ReachedKnownParallelRegions.indicateOptimisticFixpoint();
774 ReachedUnknownParallelRegions.indicateOptimisticFixpoint();
775 return ChangeStatus::UNCHANGED;
/// Mutable and const accessors for the assumed state; both simply return
/// this object itself.
779 KernelInfoState &getAssumed() {
return *
this; }
780 const KernelInfoState &getAssumed()
const {
return *
this; }
783 if (SPMDCompatibilityTracker !=
RHS.SPMDCompatibilityTracker)
785 if (ReachedKnownParallelRegions !=
RHS.ReachedKnownParallelRegions)
787 if (ReachedUnknownParallelRegions !=
RHS.ReachedUnknownParallelRegions)
789 if (ReachingKernelEntries !=
RHS.ReachingKernelEntries)
791 if (ParallelLevels !=
RHS.ParallelLevels)
793 if (NestedParallelism !=
RHS.NestedParallelism)
799 bool mayContainParallelRegion() {
800 return !ReachedKnownParallelRegions.empty() ||
801 !ReachedUnknownParallelRegions.empty();
/// Builds a state through the `bool BestState` constructor with `true`.
805 static KernelInfoState getBestState() {
return KernelInfoState(
true); }
807 static KernelInfoState getBestState(KernelInfoState &KIS) {
808 return getBestState();
/// Builds a state through the `bool BestState` constructor with `false`.
812 static KernelInfoState getWorstState() {
return KernelInfoState(
false); }
815 KernelInfoState
operator^=(
const KernelInfoState &KIS) {
817 if (KIS.KernelInitCB) {
818 if (KernelInitCB && KernelInitCB != KIS.KernelInitCB)
821 KernelInitCB = KIS.KernelInitCB;
823 if (KIS.KernelDeinitCB) {
824 if (KernelDeinitCB && KernelDeinitCB != KIS.KernelDeinitCB)
827 KernelDeinitCB = KIS.KernelDeinitCB;
829 if (KIS.KernelEnvC) {
830 if (KernelEnvC && KernelEnvC != KIS.KernelEnvC)
833 KernelEnvC = KIS.KernelEnvC;
835 SPMDCompatibilityTracker ^= KIS.SPMDCompatibilityTracker;
836 ReachedKnownParallelRegions ^= KIS.ReachedKnownParallelRegions;
837 ReachedUnknownParallelRegions ^= KIS.ReachedUnknownParallelRegions;
838 NestedParallelism |= KIS.NestedParallelism;
842 KernelInfoState
operator&=(
const KernelInfoState &KIS) {
843 return (*
this ^= KIS);
853 AllocaInst *Array =
nullptr;
855 SmallVector<Value *, 8> StoredValues;
857 SmallVector<StoreInst *, 8> LastAccesses;
/// Defaulted constructor: Array starts as nullptr and both vectors start
/// empty, per their in-class initializers.
859 OffloadArray() =
default;
865 bool initialize(AllocaInst &Array, Instruction &Before) {
866 if (!getValues(Array, Before))
869 this->Array = &Array;
/// Named argument positions. NOTE(review): presumably operand indices into
/// the `__tgt_target_data_begin_mapper` call (device id, base pointers,
/// pointers, sizes); only DeviceIDArgNum has a visible use, when building the
/// wait-call parameters. Confirm against the runtime signature.
873 static const unsigned DeviceIDArgNum = 1;
874 static const unsigned BasePtrsArgNum = 3;
875 static const unsigned PtrsArgNum = 4;
876 static const unsigned SizesArgNum = 5;
882 bool getValues(AllocaInst &Array, Instruction &Before) {
884 const DataLayout &
DL = Array.getDataLayout();
885 std::optional<TypeSize> ArraySize = Array.getAllocationSize(
DL);
886 if (!ArraySize || !ArraySize->isFixed())
889 const uint64_t NumValues = ArraySize->getFixedValue() /
PointerSize;
890 StoredValues.assign(NumValues,
nullptr);
891 LastAccesses.assign(NumValues,
nullptr);
899 for (Instruction &
I : *BB) {
913 if ((uint64_t)Idx < NumValues) {
915 LastAccesses[Idx] = S;
926 const unsigned NumValues = StoredValues.size();
927 for (
unsigned I = 0;
I < NumValues; ++
I) {
928 if (!StoredValues[
I] || !LastAccesses[
I])
938 using OptimizationRemarkGetter =
939 function_ref<OptimizationRemarkEmitter &(
Function *)>;
941 OpenMPOpt(SmallVectorImpl<Function *> &SCC, CallGraphUpdater &CGUpdater,
942 OptimizationRemarkGetter OREGetter,
943 OMPInformationCache &OMPInfoCache, Attributor &A)
944 : M(*(*SCC.
begin())->
getParent()), SCC(SCC), CGUpdater(CGUpdater),
945 OREGetter(OREGetter), OMPInfoCache(OMPInfoCache), A(A) {}
948 bool remarksEnabled() {
949 auto &Ctx = M.getContext();
950 return Ctx.getDiagHandlerPtr()->isAnyRemarkEnabled(
DEBUG_TYPE);
954 bool run(
bool IsModulePass) {
964 Changed |= runAttributor(IsModulePass);
967 OMPInfoCache.recollectUses();
970 Changed |= rewriteDeviceCodeStateMachine();
972 if (remarksEnabled())
973 analysisGlobalization();
980 Changed |= runAttributor(IsModulePass);
983 OMPInfoCache.recollectUses();
985 Changed |= deleteParallelRegions();
988 Changed |= hideMemTransfersLatency();
989 Changed |= deduplicateRuntimeCalls();
991 if (mergeParallelRegions()) {
992 deduplicateRuntimeCalls();
998 if (OMPInfoCache.OpenMPPostLink)
999 Changed |= removeRuntimeSymbols();
1006 void printICVs()
const {
1010 for (Function *
F : SCC) {
1011 for (
auto ICV : ICVs) {
1012 auto ICVInfo = OMPInfoCache.ICVs[ICV];
1013 auto Remark = [&](OptimizationRemarkAnalysis ORA) {
1014 return ORA <<
"OpenMP ICV " <<
ore::NV(
"OpenMPICV", ICVInfo.Name)
1016 << (ICVInfo.InitValue
1017 ?
toString(ICVInfo.InitValue->getValue(), 10,
true)
1018 :
"IMPLEMENTATION_DEFINED");
1027 void printKernels()
const {
1028 for (Function *
F : SCC) {
1032 auto Remark = [&](OptimizationRemarkAnalysis ORA) {
1033 return ORA <<
"OpenMP GPU kernel "
1034 <<
ore::NV(
"OpenMPGPUKernel",
F->getName()) <<
"\n";
1043 static CallInst *getCallIfRegularCall(
1044 Use &U, OMPInformationCache::RuntimeFunctionInfo *RFI =
nullptr) {
1055 static CallInst *getCallIfRegularCall(
1056 Value &V, OMPInformationCache::RuntimeFunctionInfo *RFI =
nullptr) {
1067 bool mergeParallelRegions() {
1068 const unsigned CallbackCalleeOperand = 2;
1069 const unsigned CallbackFirstArgOperand = 3;
1073 OMPInformationCache::RuntimeFunctionInfo &RFI =
1074 OMPInfoCache.RFIs[OMPRTL___kmpc_fork_call];
1076 if (!RFI.Declaration)
1080 OMPInformationCache::RuntimeFunctionInfo UnmergableCallsInfo[] = {
1081 OMPInfoCache.RFIs[OMPRTL___kmpc_push_proc_bind],
1082 OMPInfoCache.RFIs[OMPRTL___kmpc_push_num_threads],
1086 LoopInfo *LI =
nullptr;
1087 DominatorTree *DT =
nullptr;
1089 SmallDenseMap<BasicBlock *, SmallPtrSet<Instruction *, 4>> BB2PRMap;
1091 BasicBlock *StartBB =
nullptr, *EndBB =
nullptr;
1092 auto BodyGenCB = [&](InsertPointTy AllocaIP, InsertPointTy CodeGenIP,
1094 BasicBlock *CGStartBB = CodeGenIP.getBlock();
1096 SplitBlock(CGStartBB, &*CodeGenIP.getPoint(), DT, LI);
1097 assert(StartBB !=
nullptr &&
"StartBB should not be null");
1099 assert(EndBB !=
nullptr &&
"EndBB should not be null");
1100 EndBB->getTerminator()->setSuccessor(0, CGEndBB);
1104 auto PrivCB = [&](InsertPointTy AllocaIP, InsertPointTy CodeGenIP,
Value &,
1105 Value &Inner,
Value *&ReplacementValue) -> InsertPointTy {
1106 ReplacementValue = &Inner;
1110 auto FiniCB = [&](InsertPointTy CodeGenIP) {
return Error::success(); };
1114 auto CreateSequentialRegion = [&](
Function *OuterFn,
1120 BasicBlock *ParentBB = SeqStartI->getParent();
1122 SplitBlock(ParentBB, SeqEndI->getNextNode(), DT, LI);
1126 SplitBlock(ParentBB, SeqStartI, DT, LI,
nullptr,
"seq.par.merged");
1129 "Expected a different CFG");
1133 auto BodyGenCB = [&](InsertPointTy AllocaIP, InsertPointTy CodeGenIP,
1135 BasicBlock *CGStartBB = CodeGenIP.getBlock();
1137 SplitBlock(CGStartBB, &*CodeGenIP.getPoint(), DT, LI);
1138 assert(SeqStartBB !=
nullptr &&
"SeqStartBB should not be null");
1140 assert(SeqEndBB !=
nullptr &&
"SeqEndBB should not be null");
1144 auto FiniCB = [&](InsertPointTy CodeGenIP) {
return Error::success(); };
1148 for (Instruction &
I : *SeqStartBB) {
1149 SmallPtrSet<Instruction *, 4> OutsideUsers;
1150 for (User *Usr :
I.users()) {
1158 OutsideUsers.
insert(&UsrI);
1161 if (OutsideUsers.
empty())
1166 const DataLayout &
DL = M.getDataLayout();
1167 AllocaInst *AllocaI =
new AllocaInst(
1168 I.getType(),
DL.getAllocaAddrSpace(),
nullptr,
1173 new StoreInst(&
I, AllocaI, SeqStartBB->getTerminator()->getIterator());
1177 for (Instruction *UsrI : OutsideUsers) {
1178 LoadInst *LoadI =
new LoadInst(
I.getType(), AllocaI,
1179 I.getName() +
".seq.output.load",
1185 OpenMPIRBuilder::LocationDescription Loc(
1186 InsertPointTy(ParentBB, ParentBB->
end()),
DL);
1188 OMPInfoCache.OMPBuilder.createMaster(Loc, BodyGenCB, FiniCB));
1190 OMPInfoCache.OMPBuilder.createBarrier(SeqAfterIP, OMPD_parallel));
1205 auto Merge = [&](
const SmallVectorImpl<CallInst *> &MergableCIs,
1209 assert(MergableCIs.
size() > 1 &&
"Assumed multiple mergable CIs");
1211 auto Remark = [&](OptimizationRemark
OR) {
1212 OR <<
"Parallel region merged with parallel region"
1213 << (MergableCIs.
size() > 2 ?
"s" :
"") <<
" at ";
1216 if (CI != MergableCIs.
back())
1224 Function *OriginalFn = BB->getParent();
1226 <<
" parallel regions in " << OriginalFn->
getName()
1230 EndBB =
SplitBlock(BB, MergableCIs.
back()->getNextNode(), DT, LI);
1232 SplitBlock(EndBB, &*EndBB->getFirstInsertionPt(), DT, LI);
1236 assert(BB->getUniqueSuccessor() == StartBB &&
"Expected a different CFG");
1237 const DebugLoc DL = BB->getTerminator()->getDebugLoc();
1242 for (
auto *It = MergableCIs.
begin(), *End = MergableCIs.
end() - 1;
1251 CreateSequentialRegion(OriginalFn, BB, ForkCI->
getNextNode(),
1255 OpenMPIRBuilder::LocationDescription Loc(InsertPointTy(BB, BB->end()),
1257 IRBuilder<>::InsertPoint AllocaIP(
1263 cantFail(OMPInfoCache.OMPBuilder.createParallel(
1264 Loc, AllocaIP, {}, BodyGenCB, PrivCB, FiniCB,
1265 nullptr,
nullptr, OMP_PROC_BIND_default,
1270 OMPInfoCache.OMPBuilder.finalize(OriginalFn);
1276 SmallVector<Value *, 8>
Args;
1277 for (
auto *CI : MergableCIs) {
1279 FunctionType *FT = OMPInfoCache.OMPBuilder.ParallelTask;
1283 for (
unsigned U = CallbackFirstArgOperand,
E = CI->
arg_size(); U <
E;
1293 for (
unsigned U = CallbackFirstArgOperand,
E = CI->
arg_size(); U <
E;
1297 U - (CallbackFirstArgOperand - CallbackCalleeOperand), A);
1300 if (CI != MergableCIs.back()) {
1303 cantFail(OMPInfoCache.OMPBuilder.createBarrier(
1312 assert(OutlinedFn != OriginalFn &&
"Outlining failed");
1313 CGUpdater.registerOutlinedFunction(*OriginalFn, *OutlinedFn);
1314 CGUpdater.reanalyzeFunction(*OriginalFn);
1316 NumOpenMPParallelRegionsMerged += MergableCIs.size();
1324 CallInst *CI = getCallIfRegularCall(U, &RFI);
1331 RFI.foreachUse(SCC, DetectPRsCB);
1337 for (
auto &It : BB2PRMap) {
1338 auto &CIs = It.getSecond();
1353 auto IsMergable = [&](
Instruction &
I,
bool IsBeforeMergableRegion) {
1356 if (
I.isTerminator())
1363 if (IsBeforeMergableRegion) {
1365 if (!CalledFunction)
1372 for (
const auto &RFI : UnmergableCallsInfo) {
1373 if (CalledFunction == RFI.Declaration)
1388 for (
auto It = BB->
begin(), End = BB->
end(); It != End;) {
1392 if (CIs.count(&
I)) {
1398 if (IsMergable(
I, MergableCIs.
empty()))
1403 for (; It != End; ++It) {
1405 if (CIs.count(&SkipI)) {
1407 <<
" due to " <<
I <<
"\n");
1414 if (MergableCIs.
size() > 1) {
1415 MergableCIsVector.
push_back(MergableCIs);
1417 <<
" parallel regions in block " << BB->
getName()
1422 MergableCIs.
clear();
1425 if (!MergableCIsVector.
empty()) {
1428 for (
auto &MergableCIs : MergableCIsVector)
1429 Merge(MergableCIs, BB);
1430 MergableCIsVector.clear();
1437 OMPInfoCache.recollectUsesForFunction(OMPRTL___kmpc_fork_call);
1438 OMPInfoCache.recollectUsesForFunction(OMPRTL___kmpc_barrier);
1439 OMPInfoCache.recollectUsesForFunction(OMPRTL___kmpc_master);
1440 OMPInfoCache.recollectUsesForFunction(OMPRTL___kmpc_end_master);
1447 bool deleteParallelRegions() {
1448 const unsigned CallbackCalleeOperand = 2;
1450 OMPInformationCache::RuntimeFunctionInfo &RFI =
1451 OMPInfoCache.RFIs[OMPRTL___kmpc_fork_call];
1453 if (!RFI.Declaration)
1458 CallInst *CI = getCallIfRegularCall(U);
1465 if (!Fn->onlyReadsMemory())
1467 if (!Fn->hasFnAttribute(Attribute::WillReturn))
1473 auto Remark = [&](OptimizationRemark
OR) {
1474 return OR <<
"Removing parallel region with no side-effects.";
1480 ++NumOpenMPParallelRegionsDeleted;
1484 RFI.foreachUse(SCC, DeleteCallCB);
1490 bool deduplicateRuntimeCalls() {
1494 OMPRTL_omp_get_num_threads,
1495 OMPRTL_omp_in_parallel,
1496 OMPRTL_omp_get_cancellation,
1497 OMPRTL_omp_get_supported_active_levels,
1498 OMPRTL_omp_get_level,
1499 OMPRTL_omp_get_ancestor_thread_num,
1500 OMPRTL_omp_get_team_size,
1501 OMPRTL_omp_get_active_level,
1502 OMPRTL_omp_in_final,
1503 OMPRTL_omp_get_proc_bind,
1504 OMPRTL_omp_get_num_places,
1505 OMPRTL_omp_get_num_procs,
1506 OMPRTL_omp_get_place_num,
1507 OMPRTL_omp_get_partition_num_places,
1508 OMPRTL_omp_get_partition_place_nums};
1511 SmallSetVector<Value *, 16> GTIdArgs;
1512 collectGlobalThreadIdArguments(GTIdArgs);
1514 <<
" global thread ID arguments\n");
1516 for (Function *
F : SCC) {
1517 for (
auto DeduplicableRuntimeCallID : DeduplicableRuntimeCallIDs)
1518 Changed |= deduplicateRuntimeCalls(
1519 *
F, OMPInfoCache.RFIs[DeduplicableRuntimeCallID]);
1523 Value *GTIdArg =
nullptr;
1524 for (Argument &Arg :
F->args())
1525 if (GTIdArgs.
count(&Arg)) {
1529 Changed |= deduplicateRuntimeCalls(
1530 *
F, OMPInfoCache.RFIs[OMPRTL___kmpc_global_thread_num], GTIdArg);
1537 bool removeRuntimeSymbols() {
1542 if (GlobalVariable *GV = M.getNamedGlobal(
"__llvm_rpc_client")) {
1543 if (GV->hasNUsesOrMore(1))
1547 GV->eraseFromParent();
1559 bool hideMemTransfersLatency() {
1560 auto &RFI = OMPInfoCache.RFIs[OMPRTL___tgt_target_data_begin_mapper];
1563 auto *RTCall = getCallIfRegularCall(U, &RFI);
1567 OffloadArray OffloadArrays[3];
1568 if (!getValuesInOffloadArrays(*RTCall, OffloadArrays))
1571 LLVM_DEBUG(dumpValuesInOffloadArrays(OffloadArrays));
1574 bool WasSplit =
false;
1575 Instruction *WaitMovementPoint = canBeMovedDownwards(*RTCall);
1576 if (WaitMovementPoint)
1577 WasSplit = splitTargetDataBeginRTC(*RTCall, *WaitMovementPoint);
1582 if (OMPInfoCache.runtimeFnsAvailable(
1583 {OMPRTL___tgt_target_data_begin_mapper_issue,
1584 OMPRTL___tgt_target_data_begin_mapper_wait}))
1585 RFI.foreachUse(SCC, SplitMemTransfers);
1590 void analysisGlobalization() {
1591 auto &RFI = OMPInfoCache.RFIs[OMPRTL___kmpc_alloc_shared];
1593 auto CheckGlobalization = [&](
Use &
U,
Function &Decl) {
1594 if (CallInst *CI = getCallIfRegularCall(U, &RFI)) {
1595 auto Remark = [&](OptimizationRemarkMissed ORM) {
1597 <<
"Found thread data sharing on the GPU. "
1598 <<
"Expect degraded performance due to data globalization.";
1606 RFI.foreachUse(SCC, CheckGlobalization);
1611 bool getValuesInOffloadArrays(CallInst &RuntimeCall,
1613 assert(OAs.
size() == 3 &&
"Need space for three offload arrays!");
1623 Value *BasePtrsArg =
1635 if (!OAs[0].
initialize(*BasePtrsArray, RuntimeCall))
1643 if (!OAs[1].
initialize(*PtrsArray, RuntimeCall))
1655 if (!OAs[2].
initialize(*SizesArray, RuntimeCall))
1666 assert(OAs.
size() == 3 &&
"There are three offload arrays to debug!");
1669 std::string ValuesStr;
1670 raw_string_ostream
Printer(ValuesStr);
1671 std::string Separator =
" --- ";
1673 for (
auto *BP : OAs[0].StoredValues) {
1677 LLVM_DEBUG(
dbgs() <<
"\t\toffload_baseptrs: " << ValuesStr <<
"\n");
1680 for (
auto *
P : OAs[1].StoredValues) {
1687 for (
auto *S : OAs[2].StoredValues) {
1691 LLVM_DEBUG(
dbgs() <<
"\t\toffload_sizes: " << ValuesStr <<
"\n");
1696 Instruction *canBeMovedDownwards(CallInst &RuntimeCall) {
1701 bool IsWorthIt =
false;
1720 return RuntimeCall.
getParent()->getTerminator();
1724 bool splitTargetDataBeginRTC(CallInst &RuntimeCall,
1725 Instruction &WaitMovementPoint) {
1729 auto &
IRBuilder = OMPInfoCache.OMPBuilder;
1732 IRBuilder.Builder.SetInsertPoint(&Entry,
1733 Entry.getFirstNonPHIOrDbgOrAlloca());
1735 IRBuilder.AsyncInfo,
nullptr,
"handle");
1742 FunctionCallee IssueDecl =
IRBuilder.getOrCreateRuntimeFunction(
1743 M, OMPRTL___tgt_target_data_begin_mapper_issue);
1746 SmallVector<Value *, 16>
Args;
1747 for (
auto &Arg : RuntimeCall.
args())
1748 Args.push_back(Arg.get());
1749 Args.push_back(Handle);
1753 OMPInfoCache.setCallingConvention(IssueDecl, IssueCallsite);
1758 FunctionCallee WaitDecl =
IRBuilder.getOrCreateRuntimeFunction(
1759 M, OMPRTL___tgt_target_data_begin_mapper_wait);
1761 Value *WaitParams[2] = {
1763 OffloadArray::DeviceIDArgNum),
1767 WaitDecl, WaitParams,
"", WaitMovementPoint.
getIterator());
1768 OMPInfoCache.setCallingConvention(WaitDecl, WaitCallsite);
1773 static Value *combinedIdentStruct(
Value *CurrentIdent,
Value *NextIdent,
1774 bool GlobalOnly,
bool &SingleChoice) {
1775 if (CurrentIdent == NextIdent)
1776 return CurrentIdent;
1781 SingleChoice = !CurrentIdent;
1793 getCombinedIdentFromCallUsesIn(OMPInformationCache::RuntimeFunctionInfo &RFI,
1794 Function &
F,
bool GlobalOnly) {
1795 bool SingleChoice =
true;
1796 Value *Ident =
nullptr;
1798 CallInst *CI = getCallIfRegularCall(U, &RFI);
1799 if (!CI || &
F != &Caller)
1802 true, SingleChoice);
1805 RFI.foreachUse(SCC, CombineIdentStruct);
1807 if (!Ident || !SingleChoice) {
1810 if (!OMPInfoCache.OMPBuilder.getInsertionPoint().getBlock())
1812 &
F.getEntryBlock(),
F.getEntryBlock().begin()));
1815 uint32_t SrcLocStrSize;
1817 OMPInfoCache.OMPBuilder.getOrCreateDefaultSrcLocStr(SrcLocStrSize);
1818 Ident = OMPInfoCache.OMPBuilder.getOrCreateIdent(Loc, SrcLocStrSize);
1825 bool deduplicateRuntimeCalls(Function &
F,
1826 OMPInformationCache::RuntimeFunctionInfo &RFI,
1827 Value *ReplVal =
nullptr) {
1828 auto *UV = RFI.getUseVector(
F);
1829 if (!UV || UV->size() + (ReplVal !=
nullptr) < 2)
1833 dbgs() <<
TAG <<
"Deduplicate " << UV->size() <<
" uses of " << RFI.Name
1834 << (ReplVal ?
" with an existing value\n" :
"\n") <<
"\n");
1838 "Unexpected replacement value!");
1841 auto CanBeMoved = [
this](CallBase &CB) {
1842 unsigned NumArgs = CB.arg_size();
1845 if (CB.getArgOperand(0)->getType() != OMPInfoCache.OMPBuilder.IdentPtr)
1847 for (
unsigned U = 1;
U < NumArgs; ++
U)
1855 OMPInfoCache.getAnalysisResultForFunction<DominatorTreeAnalysis>(
F);
1859 for (Use *U : *UV) {
1860 if (CallInst *CI = getCallIfRegularCall(*U, &RFI)) {
1865 if (!CanBeMoved(*CI))
1873 assert(IP &&
"Expected insertion point!");
1883 Value *Ident = getCombinedIdentFromCallUsesIn(RFI,
F,
1891 CallInst *CI = getCallIfRegularCall(U, &RFI);
1892 if (!CI || CI == ReplVal || &
F != &Caller)
1896 auto Remark = [&](OptimizationRemark
OR) {
1897 return OR <<
"OpenMP runtime call "
1898 <<
ore::NV(
"OpenMPOptRuntime", RFI.Name) <<
" deduplicated.";
1907 ++NumOpenMPRuntimeCallsDeduplicated;
1911 RFI.foreachUse(SCC, ReplaceAndDeleteCB);
1917 void collectGlobalThreadIdArguments(SmallSetVector<Value *, 16> >IdArgs) {
1924 auto CallArgOpIsGTId = [&](
Function &
F,
unsigned ArgNo, CallInst &RefCI) {
1925 if (!
F.hasLocalLinkage())
1927 for (Use &U :
F.uses()) {
1928 if (CallInst *CI = getCallIfRegularCall(U)) {
1930 if (CI == &RefCI || GTIdArgs.
count(ArgOp) ||
1931 getCallIfRegularCall(
1932 *ArgOp, &OMPInfoCache.RFIs[OMPRTL___kmpc_global_thread_num]))
1941 auto AddUserArgs = [&](
Value >Id) {
1942 for (Use &U : GTId.uses())
1946 if (CallArgOpIsGTId(*Callee,
U.getOperandNo(), *CI))
1951 OMPInformationCache::RuntimeFunctionInfo &GlobThreadNumRFI =
1952 OMPInfoCache.RFIs[OMPRTL___kmpc_global_thread_num];
1954 GlobThreadNumRFI.foreachUse(SCC, [&](Use &U, Function &
F) {
1955 if (CallInst *CI = getCallIfRegularCall(U, &GlobThreadNumRFI))
1963 for (
unsigned U = 0;
U < GTIdArgs.
size(); ++
U)
1964 AddUserArgs(*GTIdArgs[U]);
/// Per-function cache used by getUniqueKernelFor(Function &): the optional is
/// looked up first and the computed kernel is stored back under `&F`.
1972 DenseMap<Function *, std::optional<Kernel>> UniqueKernelMap;
/// Declaration only; returns the Kernel associated with F. The
/// Instruction overload forwards here with the instruction's function.
1975 Kernel getUniqueKernelFor(Function &
F);
1978 Kernel getUniqueKernelFor(Instruction &
I) {
1979 return getUniqueKernelFor(*
I.getFunction());
/// Declaration only; defined out of line as
/// OpenMPOpt::rewriteDeviceCodeStateMachine. run() ORs the returned flag into
/// its Changed result.
1984 bool rewriteDeviceCodeStateMachine();
2000 template <
typename RemarkKind,
typename RemarkCallBack>
2001 void emitRemark(Instruction *
I, StringRef RemarkName,
2002 RemarkCallBack &&RemarkCB)
const {
2004 auto &ORE = OREGetter(
F);
2008 return RemarkCB(RemarkKind(
DEBUG_TYPE, RemarkName,
I))
2009 <<
" [" << RemarkName <<
"]";
2013 [&]() {
return RemarkCB(RemarkKind(
DEBUG_TYPE, RemarkName,
I)); });
2017 template <
typename RemarkKind,
typename RemarkCallBack>
2018 void emitRemark(Function *
F, StringRef RemarkName,
2019 RemarkCallBack &&RemarkCB)
const {
2020 auto &ORE = OREGetter(
F);
2024 return RemarkCB(RemarkKind(
DEBUG_TYPE, RemarkName,
F))
2025 <<
" [" << RemarkName <<
"]";
2029 [&]() {
return RemarkCB(RemarkKind(
DEBUG_TYPE, RemarkName,
F)); });
/// Handles supplied to the OpenMPOpt constructor and stored as-is.

/// The functions this instance operates on; the passes iterate over it.
2036 SmallVectorImpl<Function *> &SCC;
/// Call graph updater; notified when merging outlines a new function.
2040 CallGraphUpdater &CGUpdater;
/// Callback yielding the OptimizationRemarkEmitter for a function.
2043 OptimizationRemarkGetter OREGetter;
/// Shared OpenMP information cache (runtime function infos, ICVs, builder).
2046 OMPInformationCache &OMPInfoCache;
2052 bool runAttributor(
bool IsModulePass) {
2056 registerAAs(IsModulePass);
2061 <<
" functions, result: " <<
Changed <<
".\n");
2063 if (
Changed == ChangeStatus::CHANGED)
2064 OMPInfoCache.invalidateAnalyses();
2066 return Changed == ChangeStatus::CHANGED;
/// Declaration only; invoked by runAttributor() with its IsModulePass flag.
2073 void registerAAs(
bool IsModulePass);
/// Declaration only; static helper taking the Attributor and one function.
/// The body is not part of this excerpt.
2078 static void registerAAsForFunction(Attributor &A,
const Function &
F);
2082 if (OMPInfoCache.CGSCC && !OMPInfoCache.CGSCC->empty() &&
2083 !OMPInfoCache.CGSCC->contains(&
F))
2088 std::optional<Kernel> &CachedKernel = UniqueKernelMap[&
F];
2090 return *CachedKernel;
2097 return *CachedKernel;
2100 CachedKernel =
nullptr;
2101 if (!
F.hasLocalLinkage()) {
2104 auto Remark = [&](OptimizationRemarkAnalysis ORA) {
2105 return ORA <<
"Potentially unknown OpenMP target region caller.";
2113 auto GetUniqueKernelForUse = [&](
const Use &
U) ->
Kernel {
2116 if (
Cmp->isEquality())
2117 return getUniqueKernelFor(*Cmp);
2122 if (CB->isCallee(&U))
2123 return getUniqueKernelFor(*CB);
2125 OMPInformationCache::RuntimeFunctionInfo &KernelParallelRFI =
2126 OMPInfoCache.RFIs[OMPRTL___kmpc_parallel_60];
2128 if (OpenMPOpt::getCallIfRegularCall(*
U.getUser(), &KernelParallelRFI))
2129 return getUniqueKernelFor(*CB);
2137 SmallPtrSet<Kernel, 2> PotentialKernels;
2138 OMPInformationCache::foreachUse(
F, [&](
const Use &U) {
2139 PotentialKernels.
insert(GetUniqueKernelForUse(U));
2143 if (PotentialKernels.
size() == 1)
2144 K = *PotentialKernels.
begin();
2147 UniqueKernelMap[&
F] =
K;
2152bool OpenMPOpt::rewriteDeviceCodeStateMachine() {
2153 OMPInformationCache::RuntimeFunctionInfo &KernelParallelRFI =
2154 OMPInfoCache.RFIs[OMPRTL___kmpc_parallel_60];
2157 if (!KernelParallelRFI)
2164 for (Function *
F : SCC) {
2168 bool UnknownUse =
false;
2169 bool KernelParallelUse =
false;
2170 unsigned NumDirectCalls = 0;
2173 OMPInformationCache::foreachUse(*
F, [&](Use &U) {
2175 if (CB->isCallee(&U)) {
2181 ToBeReplacedStateMachineUses.
push_back(&U);
2187 OpenMPOpt::getCallIfRegularCall(*
U.getUser(), &KernelParallelRFI);
2188 const unsigned int WrapperFunctionArgNo = 6;
2189 if (!KernelParallelUse && CI &&
2191 KernelParallelUse =
true;
2192 ToBeReplacedStateMachineUses.
push_back(&U);
2200 if (!KernelParallelUse)
2206 if (UnknownUse || NumDirectCalls != 1 ||
2207 ToBeReplacedStateMachineUses.
size() > 2) {
2208 auto Remark = [&](OptimizationRemarkAnalysis ORA) {
2209 return ORA <<
"Parallel region is used in "
2210 << (UnknownUse ?
"unknown" :
"unexpected")
2211 <<
" ways. Will not attempt to rewrite the state machine.";
2221 auto Remark = [&](OptimizationRemarkAnalysis ORA) {
2222 return ORA <<
"Parallel region is not called from a unique kernel. "
2223 "Will not attempt to rewrite the state machine.";
2235 Type *Int8Ty = Type::getInt8Ty(
M.getContext());
2237 auto *
ID =
new GlobalVariable(
2241 for (Use *U : ToBeReplacedStateMachineUses)
2243 ID,
U->get()->getType()));
2245 ++NumOpenMPParallelRegionsReplacedInGPUStateMachine;
2254struct AAICVTracker :
public StateWrapper<BooleanState, AbstractAttribute> {
2255 using Base = StateWrapper<BooleanState, AbstractAttribute>;
2256 AAICVTracker(
const IRPosition &IRP, Attributor &
A) :
Base(IRP) {}
/// Returns the assumed tracking state, i.e. whatever getAssumed() of the
/// wrapped state reports at the time of the call.
2259 bool isAssumedTracked()
const {
return getAssumed(); }
2262 bool isKnownTracked()
const {
return getAssumed(); }
2265 static AAICVTracker &createForPosition(
const IRPosition &IRP, Attributor &
A);
2269 const Instruction *
I,
2270 Attributor &
A)
const {
2271 return std::nullopt;
2277 virtual std::optional<Value *>
2285 StringRef
getName()
const override {
return "AAICVTracker"; }
2288 const char *getIdAddr()
const override {
return &
ID; }
2291 static bool classof(
const AbstractAttribute *AA) {
2295 static const char ID;
2298struct AAICVTrackerFunction :
public AAICVTracker {
2299 AAICVTrackerFunction(
const IRPosition &IRP, Attributor &
A)
2300 : AAICVTracker(IRP,
A) {}
2303 const std::string getAsStr(Attributor *)
const override {
2304 return "ICVTrackerFunction";
2308 void trackStatistics()
const override {}
2312 return ChangeStatus::UNCHANGED;
2317 InternalControlVar::ICV___last>
2318 ICVReplacementValuesMap;
2325 auto &OMPInfoCache =
static_cast<OMPInformationCache &
>(
A.getInfoCache());
2328 auto &SetterRFI = OMPInfoCache.RFIs[OMPInfoCache.ICVs[ICV].Setter];
2330 auto &ValuesMap = ICVReplacementValuesMap[ICV];
2332 CallInst *CI = OpenMPOpt::getCallIfRegularCall(U);
2338 if (ValuesMap.insert(std::make_pair(CI, CI->
getArgOperand(0))).second)
2339 HasChanged = ChangeStatus::CHANGED;
2345 std::optional<Value *> ReplVal = getValueForCall(
A,
I, ICV);
2346 if (ReplVal && ValuesMap.insert(std::make_pair(&
I, *ReplVal)).second)
2347 HasChanged = ChangeStatus::CHANGED;
2353 SetterRFI.foreachUse(TrackValues,
F);
2355 bool UsedAssumedInformation =
false;
2356 A.checkForAllInstructions(CallCheck, *
this, {Instruction::Call},
2357 UsedAssumedInformation,
2363 if (HasChanged == ChangeStatus::CHANGED)
2364 ValuesMap.try_emplace(Entry);
2372 std::optional<Value *> getValueForCall(Attributor &
A,
const Instruction &
I,
2376 if (!CB || CB->hasFnAttr(
"no_openmp") ||
2377 CB->hasFnAttr(
"no_openmp_routines") ||
2378 CB->hasFnAttr(
"no_openmp_constructs"))
2379 return std::nullopt;
2381 auto &OMPInfoCache =
static_cast<OMPInformationCache &
>(
A.getInfoCache());
2382 auto &GetterRFI = OMPInfoCache.RFIs[OMPInfoCache.ICVs[ICV].Getter];
2383 auto &SetterRFI = OMPInfoCache.RFIs[OMPInfoCache.ICVs[ICV].Setter];
2384 Function *CalledFunction = CB->getCalledFunction();
2387 if (CalledFunction ==
nullptr)
2389 if (CalledFunction == GetterRFI.Declaration)
2390 return std::nullopt;
2391 if (CalledFunction == SetterRFI.Declaration) {
2392 if (ICVReplacementValuesMap[ICV].
count(&
I))
2393 return ICVReplacementValuesMap[ICV].lookup(&
I);
2402 const auto *ICVTrackingAA =
A.getAAFor<AAICVTracker>(
2405 if (ICVTrackingAA->isAssumedTracked()) {
2406 std::optional<Value *> URV =
2407 ICVTrackingAA->getUniqueReplacementValue(ICV);
2418 std::optional<Value *>
2420 return std::nullopt;
2425 const Instruction *
I,
2426 Attributor &
A)
const override {
2427 const auto &ValuesMap = ICVReplacementValuesMap[ICV];
2428 if (ValuesMap.count(
I))
2429 return ValuesMap.lookup(
I);
2432 SmallPtrSet<const Instruction *, 16> Visited;
2435 std::optional<Value *> ReplVal;
2437 while (!Worklist.
empty()) {
2439 if (!Visited.
insert(CurrInst).second)
2447 if (ValuesMap.count(CurrInst)) {
2448 std::optional<Value *> NewReplVal = ValuesMap.lookup(CurrInst);
2451 ReplVal = NewReplVal;
2457 if (ReplVal != NewReplVal)
2463 std::optional<Value *> NewReplVal = getValueForCall(
A, *CurrInst, ICV);
2469 ReplVal = NewReplVal;
2475 if (ReplVal != NewReplVal)
2480 if (CurrBB ==
I->getParent() && ReplVal)
2485 if (
const Instruction *Terminator = Pred->getTerminator())
2493struct AAICVTrackerFunctionReturned : AAICVTracker {
2494 AAICVTrackerFunctionReturned(
const IRPosition &IRP, Attributor &
A)
2495 : AAICVTracker(IRP,
A) {}
2498 const std::string getAsStr(Attributor *)
const override {
2499 return "ICVTrackerFunctionReturned";
2503 void trackStatistics()
const override {}
2507 return ChangeStatus::UNCHANGED;
2512 InternalControlVar::ICV___last>
2513 ICVReplacementValuesMap;
2516 std::optional<Value *>
2518 return ICVReplacementValuesMap[ICV];
2523 const auto *ICVTrackingAA =
A.getAAFor<AAICVTracker>(
2526 if (!ICVTrackingAA->isAssumedTracked())
2527 return indicatePessimisticFixpoint();
2530 std::optional<Value *> &ReplVal = ICVReplacementValuesMap[ICV];
2531 std::optional<Value *> UniqueICVValue;
2534 std::optional<Value *> NewReplVal =
2535 ICVTrackingAA->getReplacementValue(ICV, &
I,
A);
2538 if (UniqueICVValue && UniqueICVValue != NewReplVal)
2541 UniqueICVValue = NewReplVal;
2546 bool UsedAssumedInformation =
false;
2547 if (!
A.checkForAllInstructions(CheckReturnInst, *
this, {Instruction::Ret},
2548 UsedAssumedInformation,
2550 UniqueICVValue =
nullptr;
2552 if (UniqueICVValue == ReplVal)
2555 ReplVal = UniqueICVValue;
2556 Changed = ChangeStatus::CHANGED;
2563struct AAICVTrackerCallSite : AAICVTracker {
2564 AAICVTrackerCallSite(
const IRPosition &IRP, Attributor &
A)
2565 : AAICVTracker(IRP,
A) {}
2568 assert(getAnchorScope() &&
"Expected anchor function");
2572 auto &OMPInfoCache =
static_cast<OMPInformationCache &
>(
A.getInfoCache());
2574 auto ICVInfo = OMPInfoCache.ICVs[ICV];
2575 auto &Getter = OMPInfoCache.RFIs[ICVInfo.Getter];
2576 if (Getter.Declaration == getAssociatedFunction()) {
2577 AssociatedICV = ICVInfo.Kind;
2583 indicatePessimisticFixpoint();
2587 if (!ReplVal || !*ReplVal)
2588 return ChangeStatus::UNCHANGED;
2591 A.deleteAfterManifest(*getCtxI());
2593 return ChangeStatus::CHANGED;
2597 const std::string getAsStr(Attributor *)
const override {
2598 return "ICVTrackerCallSite";
2602 void trackStatistics()
const override {}
2605 std::optional<Value *> ReplVal;
2608 const auto *ICVTrackingAA =
A.getAAFor<AAICVTracker>(
2612 if (!ICVTrackingAA->isAssumedTracked())
2613 return indicatePessimisticFixpoint();
2615 std::optional<Value *> NewReplVal =
2616 ICVTrackingAA->getReplacementValue(AssociatedICV, getCtxI(),
A);
2618 if (ReplVal == NewReplVal)
2619 return ChangeStatus::UNCHANGED;
2621 ReplVal = NewReplVal;
2622 return ChangeStatus::CHANGED;
2627 std::optional<Value *>
2633struct AAICVTrackerCallSiteReturned : AAICVTracker {
2634 AAICVTrackerCallSiteReturned(
const IRPosition &IRP, Attributor &
A)
2635 : AAICVTracker(IRP,
A) {}
2638 const std::string getAsStr(Attributor *)
const override {
2639 return "ICVTrackerCallSiteReturned";
2643 void trackStatistics()
const override {}
2647 return ChangeStatus::UNCHANGED;
2652 InternalControlVar::ICV___last>
2653 ICVReplacementValuesMap;
2657 std::optional<Value *>
2659 return ICVReplacementValuesMap[ICV];
2664 const auto *ICVTrackingAA =
A.getAAFor<AAICVTracker>(
2666 DepClassTy::REQUIRED);
2669 if (!ICVTrackingAA->isAssumedTracked())
2670 return indicatePessimisticFixpoint();
2673 std::optional<Value *> &ReplVal = ICVReplacementValuesMap[ICV];
2674 std::optional<Value *> NewReplVal =
2675 ICVTrackingAA->getUniqueReplacementValue(ICV);
2677 if (ReplVal == NewReplVal)
2680 ReplVal = NewReplVal;
2681 Changed = ChangeStatus::CHANGED;
2689static bool hasFunctionEndAsUniqueSuccessor(
const BasicBlock *BB) {
2695 return hasFunctionEndAsUniqueSuccessor(
Successor);
2698struct AAExecutionDomainFunction :
public AAExecutionDomain {
2699 AAExecutionDomainFunction(
const IRPosition &IRP, Attributor &
A)
2700 : AAExecutionDomain(IRP,
A) {}
/// Destroys the traversal object RPOT points to. RPOT is a raw pointer member
/// that is filled with a `new ReversePostOrderTraversal` elsewhere in this
/// struct, so it has to be released by hand here.
2702 ~AAExecutionDomainFunction()
override {
delete RPOT; }
2706 assert(
F &&
"Expected anchor function");
2707 RPOT =
new ReversePostOrderTraversal<Function *>(
F);
2710 const std::string getAsStr(Attributor *)
const override {
2711 unsigned TotalBlocks = 0, InitialThreadBlocks = 0, AlignedBlocks = 0;
2712 for (
auto &It : BEDMap) {
2716 InitialThreadBlocks += It.getSecond().IsExecutedByInitialThreadOnly;
2717 AlignedBlocks += It.getSecond().IsReachedFromAlignedBarrierOnly &&
2718 It.getSecond().IsReachingAlignedBarrierOnly;
2720 return "[AAExecutionDomain] " + std::to_string(InitialThreadBlocks) +
"/" +
2721 std::to_string(AlignedBlocks) +
" of " +
2722 std::to_string(TotalBlocks) +
2723 " executed by initial thread / aligned";
2727 void trackStatistics()
const override {}
2731 for (
const BasicBlock &BB : *getAnchorScope()) {
2732 if (!isExecutedByInitialThreadOnly(BB))
2734 dbgs() <<
TAG <<
" Basic block @" << getAnchorScope()->getName() <<
" "
2735 << BB.
getName() <<
" is executed by a single thread.\n";
2744 SmallPtrSet<CallBase *, 16> DeletedBarriers;
2745 auto HandleAlignedBarrier = [&](CallBase *CB) {
2746 const ExecutionDomainTy &ED = CB ? CEDMap[{CB, PRE}] : BEDMap[
nullptr];
2747 if (!ED.IsReachedFromAlignedBarrierOnly ||
2748 ED.EncounteredNonLocalSideEffect)
2750 if (!ED.EncounteredAssumes.empty() && !
A.isModulePass())
2761 DeletedBarriers.
insert(CB);
2762 A.deleteAfterManifest(*CB);
2763 ++NumBarriersEliminated;
2764 Changed = ChangeStatus::CHANGED;
2765 }
else if (!ED.AlignedBarriers.empty()) {
2766 Changed = ChangeStatus::CHANGED;
2768 ED.AlignedBarriers.end());
2769 SmallSetVector<CallBase *, 16> Visited;
2770 while (!Worklist.
empty()) {
2772 if (!Visited.
insert(LastCB))
2776 if (!hasFunctionEndAsUniqueSuccessor(LastCB->
getParent()))
2778 if (!DeletedBarriers.
count(LastCB)) {
2779 ++NumBarriersEliminated;
2780 A.deleteAfterManifest(*LastCB);
2786 const ExecutionDomainTy &LastED = CEDMap[{LastCB, PRE}];
2787 Worklist.
append(LastED.AlignedBarriers.begin(),
2788 LastED.AlignedBarriers.end());
2794 if (!ED.EncounteredAssumes.empty() && (CB || !ED.AlignedBarriers.empty()))
2795 for (
auto *AssumeCB : ED.EncounteredAssumes)
2796 A.deleteAfterManifest(*AssumeCB);
2799 for (
auto *CB : AlignedBarriers)
2800 HandleAlignedBarrier(CB);
2804 HandleAlignedBarrier(
nullptr);
/// Reports \p FI as a no-op fence when this attribute's state is valid and
/// \p FI has not been recorded in NonNoOpFences. With an invalid state the
/// answer is always false.
2809 bool isNoOpFence(
const FenceInst &FI)
const override {
2810 return getState().isValidState() && !NonNoOpFences.count(&FI);
2816 mergeInPredecessorBarriersAndAssumptions(Attributor &
A, ExecutionDomainTy &ED,
2817 const ExecutionDomainTy &PredED);
2822 bool mergeInPredecessor(Attributor &
A, ExecutionDomainTy &ED,
2823 const ExecutionDomainTy &PredED,
2824 bool InitialEdgeOnly =
false);
2827 bool handleCallees(Attributor &
A, ExecutionDomainTy &EntryBBED);
2834 bool isExecutedByInitialThreadOnly(
const BasicBlock &BB)
const override {
2835 if (!isValidState())
2837 assert(BB.
getParent() == getAnchorScope() &&
"Block is out of scope!");
2838 return BEDMap.lookup(&BB).IsExecutedByInitialThreadOnly;
2841 bool isExecutedInAlignedRegion(Attributor &
A,
2842 const Instruction &
I)
const override {
2843 assert(
I.getFunction() == getAnchorScope() &&
2844 "Instruction is out of scope!");
2845 if (!isValidState())
2848 bool ForwardIsOk =
true;
2857 if (CB != &
I && AlignedBarriers.contains(
const_cast<CallBase *
>(CB)))
2859 const auto &It = CEDMap.find({CB, PRE});
2860 if (It == CEDMap.end())
2862 if (!It->getSecond().IsReachingAlignedBarrierOnly)
2863 ForwardIsOk =
false;
2867 if (!CurI && !BEDMap.lookup(
I.getParent()).IsReachingAlignedBarrierOnly)
2868 ForwardIsOk =
false;
2876 if (CB != &
I && AlignedBarriers.contains(
const_cast<CallBase *
>(CB)))
2878 const auto &It = CEDMap.find({CB, POST});
2879 if (It == CEDMap.end())
2881 if (It->getSecond().IsReachedFromAlignedBarrierOnly)
2894 return BEDMap.lookup(
nullptr).IsReachedFromAlignedBarrierOnly;
2896 return BEDMap.lookup(PredBB).IsReachedFromAlignedBarrierOnly;
2906 ExecutionDomainTy getExecutionDomain(
const BasicBlock &BB)
const override {
2908 "No request should be made against an invalid state!");
2909 return BEDMap.lookup(&BB);
2911 std::pair<ExecutionDomainTy, ExecutionDomainTy>
2912 getExecutionDomain(
const CallBase &CB)
const override {
2914 "No request should be made against an invalid state!");
2915 return {CEDMap.lookup({&CB, PRE}), CEDMap.lookup({&CB, POST})};
2917 ExecutionDomainTy getFunctionExecutionDomain()
const override {
2919 "No request should be made against an invalid state!");
2920 return InterProceduralED;
2926 static bool isInitialThreadOnlyEdge(Attributor &
A, CondBrInst *
Edge,
2927 BasicBlock &SuccessorBB) {
2930 if (
Edge->getSuccessor(0) != &SuccessorBB)
2934 if (!Cmp || !
Cmp->isTrueWhenEqual() || !
Cmp->isEquality())
2942 if (
C->isAllOnesValue()) {
2944 auto &OMPInfoCache =
static_cast<OMPInformationCache &
>(
A.getInfoCache());
2945 auto &RFI = OMPInfoCache.RFIs[OMPRTL___kmpc_target_init];
2946 CB = CB ? OpenMPOpt::getCallIfRegularCall(*CB, &RFI) : nullptr;
2949 ConstantStruct *KernelEnvC =
2951 ConstantInt *ExecModeC =
2952 KernelInfo::getExecModeFromKernelEnvironment(KernelEnvC);
2959 if (
II->getIntrinsicID() == Intrinsic::nvvm_read_ptx_sreg_tid_x)
2964 if (
II->getIntrinsicID() == Intrinsic::amdgcn_workitem_id_x)
2972 ExecutionDomainTy InterProceduralED;
2976 DenseMap<const BasicBlock *, ExecutionDomainTy> BEDMap;
2977 DenseMap<PointerIntPair<const CallBase *, 1, Direction>, ExecutionDomainTy>
2979 SmallSetVector<CallBase *, 16> AlignedBarriers;
2981 ReversePostOrderTraversal<Function *> *RPOT =
nullptr;
2984 static bool setAndRecord(
bool &R,
bool V) {
2992 SmallPtrSet<const FenceInst *, 8> NonNoOpFences;
/// Carries the bookkeeping of a predecessor over into \p ED.
///
/// Every entry of PredED.EncounteredAssumes is handed to ED.addAssumeInst and
/// every entry of PredED.AlignedBarriers to ED.addAlignedBarrier. \p PredED is
/// only read; \p A is passed through to both add* calls.
2995void AAExecutionDomainFunction::mergeInPredecessorBarriersAndAssumptions(
2996 Attributor &
A, ExecutionDomainTy &ED,
const ExecutionDomainTy &PredED) {
// Forward the assume instructions seen on the predecessor side.
2997 for (
auto *EA : PredED.EncounteredAssumes)
2998 ED.addAssumeInst(
A, *EA);
// Forward the aligned barriers seen on the predecessor side.
3000 for (
auto *AB : PredED.AlignedBarriers)
3001 ED.addAlignedBarrier(
A, *AB);
3004bool AAExecutionDomainFunction::mergeInPredecessor(
3005 Attributor &
A, ExecutionDomainTy &ED,
const ExecutionDomainTy &PredED,
3006 bool InitialEdgeOnly) {
3010 setAndRecord(ED.IsExecutedByInitialThreadOnly,
3011 InitialEdgeOnly || (PredED.IsExecutedByInitialThreadOnly &&
3012 ED.IsExecutedByInitialThreadOnly));
3014 Changed |= setAndRecord(ED.IsReachedFromAlignedBarrierOnly,
3015 ED.IsReachedFromAlignedBarrierOnly &&
3016 PredED.IsReachedFromAlignedBarrierOnly);
3017 Changed |= setAndRecord(ED.EncounteredNonLocalSideEffect,
3018 ED.EncounteredNonLocalSideEffect |
3019 PredED.EncounteredNonLocalSideEffect);
3021 if (ED.IsReachedFromAlignedBarrierOnly)
3022 mergeInPredecessorBarriersAndAssumptions(
A, ED, PredED);
3024 ED.clearAssumeInstAndAlignedBarriers();
3028bool AAExecutionDomainFunction::handleCallees(Attributor &
A,
3029 ExecutionDomainTy &EntryBBED) {
3031 auto PredForCallSite = [&](AbstractCallSite ACS) {
3032 const auto *EDAA =
A.getAAFor<AAExecutionDomain>(
3034 DepClassTy::OPTIONAL);
3035 if (!EDAA || !EDAA->getState().isValidState())
3038 EDAA->getExecutionDomain(*
cast<CallBase>(ACS.getInstruction())));
3042 ExecutionDomainTy ExitED;
3043 bool AllCallSitesKnown;
3044 if (
A.checkForAllCallSites(PredForCallSite, *
this,
3046 AllCallSitesKnown)) {
3047 for (
const auto &[CSInED, CSOutED] : CallSiteEDs) {
3048 mergeInPredecessor(
A, EntryBBED, CSInED);
3049 ExitED.IsReachingAlignedBarrierOnly &=
3050 CSOutED.IsReachingAlignedBarrierOnly;
3057 EntryBBED.IsExecutedByInitialThreadOnly =
false;
3058 EntryBBED.IsReachedFromAlignedBarrierOnly =
true;
3059 EntryBBED.EncounteredNonLocalSideEffect =
false;
3060 ExitED.IsReachingAlignedBarrierOnly =
false;
3062 EntryBBED.IsExecutedByInitialThreadOnly =
false;
3063 EntryBBED.IsReachedFromAlignedBarrierOnly =
false;
3064 EntryBBED.EncounteredNonLocalSideEffect =
true;
3065 ExitED.IsReachingAlignedBarrierOnly =
false;
3070 auto &FnED = BEDMap[
nullptr];
3071 Changed |= setAndRecord(FnED.IsReachedFromAlignedBarrierOnly,
3072 FnED.IsReachedFromAlignedBarrierOnly &
3073 EntryBBED.IsReachedFromAlignedBarrierOnly);
3074 Changed |= setAndRecord(FnED.IsReachingAlignedBarrierOnly,
3075 FnED.IsReachingAlignedBarrierOnly &
3076 ExitED.IsReachingAlignedBarrierOnly);
3077 Changed |= setAndRecord(FnED.IsExecutedByInitialThreadOnly,
3078 EntryBBED.IsExecutedByInitialThreadOnly);
3082ChangeStatus AAExecutionDomainFunction::updateImpl(Attributor &
A) {
3089 auto HandleAlignedBarrier = [&](CallBase &CB, ExecutionDomainTy &ED) {
3090 Changed |= AlignedBarriers.insert(&CB);
3092 auto &CallInED = CEDMap[{&CB, PRE}];
3093 Changed |= mergeInPredecessor(
A, CallInED, ED);
3094 CallInED.IsReachingAlignedBarrierOnly =
true;
3096 ED.EncounteredNonLocalSideEffect =
false;
3097 ED.IsReachedFromAlignedBarrierOnly =
true;
3099 ED.clearAssumeInstAndAlignedBarriers();
3100 ED.addAlignedBarrier(
A, CB);
3101 auto &CallOutED = CEDMap[{&CB, POST}];
3102 Changed |= mergeInPredecessor(
A, CallOutED, ED);
3106 A.getAAFor<AAIsDead>(*
this, getIRPosition(), DepClassTy::OPTIONAL);
3113 for (
auto &RIt : *RPOT) {
3116 bool IsEntryBB = &BB == &EntryBB;
3119 bool AlignedBarrierLastInBlock = IsEntryBB && IsKernel;
3120 bool IsExplicitlyAligned = IsEntryBB && IsKernel;
3121 ExecutionDomainTy ED;
3128 if (LivenessAA && LivenessAA->isAssumedDead(&BB))
3132 if (LivenessAA && LivenessAA->isEdgeDead(PredBB, &BB))
3134 bool InitialEdgeOnly = isInitialThreadOnlyEdge(
3136 mergeInPredecessor(
A, ED, BEDMap[PredBB], InitialEdgeOnly);
3142 for (Instruction &
I : BB) {
3143 bool UsedAssumedInformation;
3144 if (
A.isAssumedDead(
I, *
this, LivenessAA, UsedAssumedInformation,
3145 false, DepClassTy::OPTIONAL,
3153 ED.addAssumeInst(
A, *AI);
3157 if (
II->isAssumeLikeIntrinsic())
3162 if (!ED.EncounteredNonLocalSideEffect) {
3164 if (ED.IsReachedFromAlignedBarrierOnly)
3169 case AtomicOrdering::NotAtomic:
3171 case AtomicOrdering::Unordered:
3173 case AtomicOrdering::Monotonic:
3175 case AtomicOrdering::Acquire:
3177 case AtomicOrdering::Release:
3179 case AtomicOrdering::AcquireRelease:
3181 case AtomicOrdering::SequentiallyConsistent:
3185 NonNoOpFences.insert(FI);
3190 bool IsAlignedBarrier =
3194 AlignedBarrierLastInBlock &= IsNoSync;
3195 IsExplicitlyAligned &= IsNoSync;
3201 if (IsAlignedBarrier) {
3202 HandleAlignedBarrier(*CB, ED);
3203 AlignedBarrierLastInBlock =
true;
3204 IsExplicitlyAligned =
true;
3210 if (!ED.EncounteredNonLocalSideEffect &&
3212 ED.EncounteredNonLocalSideEffect =
true;
3214 ED.IsReachedFromAlignedBarrierOnly =
false;
3222 auto &CallInED = CEDMap[{CB, PRE}];
3223 Changed |= mergeInPredecessor(
A, CallInED, ED);
3229 if (!IsNoSync && Callee && !
Callee->isDeclaration()) {
3230 const auto *EDAA =
A.getAAFor<AAExecutionDomain>(
3232 if (EDAA && EDAA->getState().isValidState()) {
3233 const auto &CalleeED = EDAA->getFunctionExecutionDomain();
3234 ED.IsReachedFromAlignedBarrierOnly =
3235 CalleeED.IsReachedFromAlignedBarrierOnly;
3236 AlignedBarrierLastInBlock = ED.IsReachedFromAlignedBarrierOnly;
3237 if (IsNoSync || !CalleeED.IsReachedFromAlignedBarrierOnly)
3238 ED.EncounteredNonLocalSideEffect |=
3239 CalleeED.EncounteredNonLocalSideEffect;
3241 ED.EncounteredNonLocalSideEffect =
3242 CalleeED.EncounteredNonLocalSideEffect;
3243 if (!CalleeED.IsReachingAlignedBarrierOnly) {
3245 setAndRecord(CallInED.IsReachingAlignedBarrierOnly,
false);
3248 if (CalleeED.IsReachedFromAlignedBarrierOnly)
3249 mergeInPredecessorBarriersAndAssumptions(
A, ED, CalleeED);
3250 auto &CallOutED = CEDMap[{CB, POST}];
3251 Changed |= mergeInPredecessor(
A, CallOutED, ED);
3256 ED.IsReachedFromAlignedBarrierOnly =
false;
3257 Changed |= setAndRecord(CallInED.IsReachingAlignedBarrierOnly,
false);
3260 AlignedBarrierLastInBlock &= ED.IsReachedFromAlignedBarrierOnly;
3262 auto &CallOutED = CEDMap[{CB, POST}];
3263 Changed |= mergeInPredecessor(
A, CallOutED, ED);
3266 if (!
I.mayHaveSideEffects() && !
I.mayReadFromMemory())
3272 const auto *MemAA =
A.getAAFor<AAMemoryLocation>(
3280 if (MemAA && MemAA->getState().isValidState() &&
3281 MemAA->checkForAllAccessesToMemoryKind(
3286 auto &InfoCache =
A.getInfoCache();
3287 if (!
I.mayHaveSideEffects() && InfoCache.isOnlyUsedByAssume(
I))
3291 if (LI->hasMetadata(LLVMContext::MD_invariant_load))
3294 if (!ED.EncounteredNonLocalSideEffect &&
3296 ED.EncounteredNonLocalSideEffect =
true;
3299 bool IsEndAndNotReachingAlignedBarriersOnly =
false;
3301 !BB.getTerminator()->getNumSuccessors()) {
3303 Changed |= mergeInPredecessor(
A, InterProceduralED, ED);
3305 auto &FnED = BEDMap[
nullptr];
3306 if (IsKernel && !IsExplicitlyAligned)
3307 FnED.IsReachingAlignedBarrierOnly =
false;
3308 Changed |= mergeInPredecessor(
A, FnED, ED);
3310 if (!FnED.IsReachingAlignedBarrierOnly) {
3311 IsEndAndNotReachingAlignedBarriersOnly =
true;
3312 SyncInstWorklist.
push_back(BB.getTerminator());
3313 auto &BBED = BEDMap[&BB];
3314 Changed |= setAndRecord(BBED.IsReachingAlignedBarrierOnly,
false);
3318 ExecutionDomainTy &StoredED = BEDMap[&BB];
3319 ED.IsReachingAlignedBarrierOnly = StoredED.IsReachingAlignedBarrierOnly &
3320 !IsEndAndNotReachingAlignedBarriersOnly;
3326 if (ED.IsExecutedByInitialThreadOnly !=
3327 StoredED.IsExecutedByInitialThreadOnly ||
3328 ED.IsReachedFromAlignedBarrierOnly !=
3329 StoredED.IsReachedFromAlignedBarrierOnly ||
3330 ED.EncounteredNonLocalSideEffect !=
3331 StoredED.EncounteredNonLocalSideEffect)
3335 StoredED = std::move(ED);
3340 SmallSetVector<BasicBlock *, 16> Visited;
3341 while (!SyncInstWorklist.
empty()) {
3344 bool HitAlignedBarrierOrKnownEnd =
false;
3349 auto &CallOutED = CEDMap[{CB, POST}];
3350 Changed |= setAndRecord(CallOutED.IsReachingAlignedBarrierOnly,
false);
3351 auto &CallInED = CEDMap[{CB, PRE}];
3352 HitAlignedBarrierOrKnownEnd =
3353 AlignedBarriers.count(CB) || !CallInED.IsReachingAlignedBarrierOnly;
3354 if (HitAlignedBarrierOrKnownEnd)
3356 Changed |= setAndRecord(CallInED.IsReachingAlignedBarrierOnly,
false);
3358 if (HitAlignedBarrierOrKnownEnd)
3362 if (LivenessAA && LivenessAA->isEdgeDead(PredBB, SyncBB))
3364 if (!Visited.
insert(PredBB))
3366 auto &PredED = BEDMap[PredBB];
3367 if (setAndRecord(PredED.IsReachingAlignedBarrierOnly,
false)) {
3369 SyncInstWorklist.
push_back(PredBB->getTerminator());
3372 if (SyncBB != &EntryBB)
3375 setAndRecord(InterProceduralED.IsReachingAlignedBarrierOnly,
false);
3378 return Changed ? ChangeStatus::CHANGED : ChangeStatus::UNCHANGED;
3383struct AAHeapToShared :
public StateWrapper<BooleanState, AbstractAttribute> {
3384 using Base = StateWrapper<BooleanState, AbstractAttribute>;
3385 AAHeapToShared(
const IRPosition &IRP, Attributor &
A) :
Base(IRP) {}
3388 static AAHeapToShared &createForPosition(
const IRPosition &IRP,
3392 virtual bool isAssumedHeapToShared(CallBase &CB)
const = 0;
3396 virtual bool isAssumedHeapToSharedRemovedFree(CallBase &CB)
const = 0;
3399 StringRef
getName()
const override {
return "AAHeapToShared"; }
3402 const char *getIdAddr()
const override {
return &
ID; }
3406 static bool classof(
const AbstractAttribute *AA) {
3411 static const char ID;
3414struct AAHeapToSharedFunction :
public AAHeapToShared {
3415 AAHeapToSharedFunction(
const IRPosition &IRP, Attributor &
A)
3416 : AAHeapToShared(IRP,
A) {}
3418 const std::string getAsStr(Attributor *)
const override {
3419 return "[AAHeapToShared] " + std::to_string(MallocCalls.size()) +
3420 " malloc calls eligible.";
3424 void trackStatistics()
const override {}
3428 void findPotentialRemovedFreeCalls(Attributor &
A) {
3429 auto &OMPInfoCache =
static_cast<OMPInformationCache &
>(
A.getInfoCache());
3430 auto &FreeRFI = OMPInfoCache.RFIs[OMPRTL___kmpc_free_shared];
3432 PotentialRemovedFreeCalls.clear();
3434 for (CallBase *CB : MallocCalls) {
3436 for (
auto *U : CB->
users()) {
3438 if (
C &&
C->getCalledFunction() == FreeRFI.Declaration)
3442 if (FreeCalls.
size() != 1)
3445 PotentialRemovedFreeCalls.insert(FreeCalls.
front());
3451 indicatePessimisticFixpoint();
3455 auto &OMPInfoCache =
static_cast<OMPInformationCache &
>(
A.getInfoCache());
3456 auto &RFI = OMPInfoCache.RFIs[OMPRTL___kmpc_alloc_shared];
3457 if (!RFI.Declaration)
3461 [](
const IRPosition &,
const AbstractAttribute *,
3462 bool &) -> std::optional<Value *> {
return nullptr; };
3465 const OMPInformationCache::RuntimeFunctionInfo::UseVector *
Uses =
3466 RFI.getUseVector(*
F);
3470 for (Use *U : *
Uses)
3472 MallocCalls.insert(CB);
3477 findPotentialRemovedFreeCalls(
A);
/// True when this attribute's state is valid and \p CB is currently a member
/// of MallocCalls; false otherwise.
3480 bool isAssumedHeapToShared(CallBase &CB)
const override {
3481 return isValidState() && MallocCalls.count(&CB);
/// True when this attribute's state is valid and \p CB is currently a member
/// of PotentialRemovedFreeCalls; false otherwise.
3484 bool isAssumedHeapToSharedRemovedFree(CallBase &CB)
const override {
3485 return isValidState() && PotentialRemovedFreeCalls.count(&CB);
3489 if (MallocCalls.empty())
3490 return ChangeStatus::UNCHANGED;
3492 auto &OMPInfoCache =
static_cast<OMPInformationCache &
>(
A.getInfoCache());
3493 auto &FreeCall = OMPInfoCache.RFIs[OMPRTL___kmpc_free_shared];
3497 DepClassTy::OPTIONAL);
3500 for (CallBase *CB : MallocCalls) {
3502 if (HS &&
HS->isAssumedHeapToStack(*CB))
3507 for (
auto *U : CB->
users()) {
3509 if (
C &&
C->getCalledFunction() == FreeCall.Declaration)
3512 if (FreeCalls.
size() != 1)
3519 <<
" with shared memory."
3520 <<
" Shared memory usage is limited to "
3526 <<
" with " << AllocSize->getZExtValue()
3527 <<
" bytes of shared memory\n");
3532 Type *Int8Ty = Type::getInt8Ty(
M->getContext());
3533 Type *Int8ArrTy = ArrayType::get(Int8Ty, AllocSize->getZExtValue());
3534 auto *SharedMem =
new GlobalVariable(
3538 static_cast<unsigned>(AddressSpace::Shared));
3540 SharedMem, PointerType::getUnqual(
M->getContext()));
3542 auto Remark = [&](OptimizationRemark
OR) {
3543 return OR <<
"Replaced globalized variable with "
3544 <<
ore::NV(
"SharedMemory", AllocSize->getZExtValue())
3545 << (AllocSize->isOne() ?
" byte " :
" bytes ")
3546 <<
"of shared memory.";
3548 A.emitRemark<OptimizationRemark>(CB,
"OMP111",
Remark);
3550 MaybeAlign Alignment = CB->getRetAlign();
3552 "HeapToShared on allocation without alignment attribute");
3553 SharedMem->setAlignment(*Alignment);
3556 A.deleteAfterManifest(*CB);
3557 A.deleteAfterManifest(*FreeCalls.
front());
3559 SharedMemoryUsed += AllocSize->getZExtValue();
3560 NumBytesMovedToSharedMemory = SharedMemoryUsed;
3561 Changed = ChangeStatus::CHANGED;
3568 if (MallocCalls.empty())
3569 return indicatePessimisticFixpoint();
3570 auto &OMPInfoCache =
static_cast<OMPInformationCache &
>(
A.getInfoCache());
3571 auto &RFI = OMPInfoCache.RFIs[OMPRTL___kmpc_alloc_shared];
3572 if (!RFI.Declaration)
3573 return ChangeStatus::UNCHANGED;
3577 auto NumMallocCalls = MallocCalls.size();
3580 for (User *U : RFI.Declaration->
users()) {
3582 if (CB->getCaller() !=
F)
3584 if (!MallocCalls.count(CB))
3587 MallocCalls.remove(CB);
3590 const auto *ED =
A.getAAFor<AAExecutionDomain>(
3592 if (!ED || !ED->isExecutedByInitialThreadOnly(*CB))
3593 MallocCalls.remove(CB);
3597 findPotentialRemovedFreeCalls(
A);
3599 if (NumMallocCalls != MallocCalls.size())
3600 return ChangeStatus::CHANGED;
3602 return ChangeStatus::UNCHANGED;
3606 SmallSetVector<CallBase *, 4> MallocCalls;
3608 SmallPtrSet<CallBase *, 4> PotentialRemovedFreeCalls;
3610 unsigned SharedMemoryUsed = 0;
3613struct AAKernelInfo :
public StateWrapper<KernelInfoState, AbstractAttribute> {
3614 using Base = StateWrapper<KernelInfoState, AbstractAttribute>;
3615 AAKernelInfo(
const IRPosition &IRP, Attributor &
A) :
Base(IRP) {}
/// Always answers false.
/// NOTE(review): presumably this tells the Attributor that call-base positions
/// of this attribute do not need a known callee; confirm against the
/// AbstractAttribute interface.
3619 static bool requiresCalleeForCallBase() {
return false; }
3622 void trackStatistics()
const override {}
3625 const std::string getAsStr(Attributor *)
const override {
3626 if (!isValidState())
3628 return std::string(SPMDCompatibilityTracker.isAssumed() ?
"SPMD"
3630 std::string(SPMDCompatibilityTracker.isAtFixpoint() ?
" [FIX]"
3632 std::string(
" #PRs: ") +
3633 (ReachedKnownParallelRegions.isValidState()
3634 ? std::to_string(ReachedKnownParallelRegions.size())
3636 ", #Unknown PRs: " +
3637 (ReachedUnknownParallelRegions.isValidState()
3638 ? std::to_string(ReachedUnknownParallelRegions.size())
3640 ", #Reaching Kernels: " +
3641 (ReachingKernelEntries.isValidState()
3642 ? std::to_string(ReachingKernelEntries.size())
3645 (ParallelLevels.isValidState()
3646 ? std::to_string(ParallelLevels.size())
3648 ", NestedPar: " + (NestedParallelism ?
"yes" :
"no");
3652 static AAKernelInfo &createForPosition(
const IRPosition &IRP, Attributor &
A);
3655 StringRef
getName()
const override {
return "AAKernelInfo"; }
3658 const char *getIdAddr()
const override {
return &
ID; }
3661 static bool classof(
const AbstractAttribute *AA) {
3665 static const char ID;
/// AAKernelInfo specialization derived directly from AAKernelInfo.
/// NOTE(review): by its name this is the function-position variant -- confirm
/// where it is instantiated.
3670struct AAKernelInfoFunction : AAKernelInfo {
/// Passes both arguments straight through to the AAKernelInfo constructor;
/// no additional initialization happens here.
3671 AAKernelInfoFunction(
const IRPosition &IRP, Attributor &
A)
3672 : AAKernelInfo(IRP,
A) {}
/// Set of instructions that insertInstructionGuardsHelper has already placed
/// into a guarded region (it inserts into and queries this set via the
/// accessor below).
3674 SmallPtrSet<Instruction *, 4> GuardedInstructions;
/// Mutable access to GuardedInstructions, exposed through the size-erased
/// SmallPtrSetImpl interface.
3676 SmallPtrSetImpl<Instruction *> &getGuardedInstructions() {
3677 return GuardedInstructions;
/// Installs \p ConfigC as the configuration part of the kernel environment.
/// The visible call takes KernelEnvC, ConfigC and the index list
/// {KernelInfo::ConfigurationIdx}; its result, NewKernelEnvC, is asserted to
/// be non-null.
/// NOTE(review): which folding routine is called and where NewKernelEnvC is
/// stored afterwards are not shown here -- confirm.
3680 void setConfigurationOfKernelEnvironment(ConstantStruct *ConfigC) {
3682 KernelEnvC, ConfigC, {KernelInfo::ConfigurationIdx});
3683 assert(NewKernelEnvC &&
"Failed to create new kernel environment");
/// Generates a member `set<MEMBER>OfKernelEnvironment(ConstantInt *NewVal)`.
/// The generated setter reads the current configuration struct out of
/// KernelEnvC, folds \p NewVal into it at index KernelInfo::<MEMBER>Idx using
/// ConstantFoldInsertValueInstruction, asserts that the fold succeeded, and
/// hands the result to setConfigurationOfKernelEnvironment. The macro is
/// undefined again once it is no longer needed.
3687#define KERNEL_ENVIRONMENT_CONFIGURATION_SETTER(MEMBER) \
3688 void set##MEMBER##OfKernelEnvironment(ConstantInt *NewVal) { \
3689 ConstantStruct *ConfigC = \
3690 KernelInfo::getConfigurationFromKernelEnvironment(KernelEnvC); \
3691 Constant *NewConfigC = ConstantFoldInsertValueInstruction( \
3692 ConfigC, NewVal, {KernelInfo::MEMBER##Idx}); \
3693 assert(NewConfigC && "Failed to create new configuration environment"); \
3694 setConfigurationOfKernelEnvironment(cast<ConstantStruct>(NewConfigC)); \
3705#undef KERNEL_ENVIRONMENT_CONFIGURATION_SETTER
3712 auto &OMPInfoCache =
static_cast<OMPInformationCache &
>(
A.getInfoCache());
3716 OMPInformationCache::RuntimeFunctionInfo &InitRFI =
3717 OMPInfoCache.RFIs[OMPRTL___kmpc_target_init];
3718 OMPInformationCache::RuntimeFunctionInfo &DeinitRFI =
3719 OMPInfoCache.RFIs[OMPRTL___kmpc_target_deinit];
3723 auto StoreCallBase = [](
Use &U,
3724 OMPInformationCache::RuntimeFunctionInfo &RFI,
3726 CallBase *CB = OpenMPOpt::getCallIfRegularCall(U, &RFI);
3728 "Unexpected use of __kmpc_target_init or __kmpc_target_deinit!");
3730 "Multiple uses of __kmpc_target_init or __kmpc_target_deinit!");
3736 StoreCallBase(U, InitRFI, KernelInitCB);
3740 DeinitRFI.foreachUse(
3742 StoreCallBase(U, DeinitRFI, KernelDeinitCB);
3748 if (!KernelInitCB || !KernelDeinitCB)
3752 ReachingKernelEntries.insert(Fn);
3753 IsKernelEntry =
true;
3761 KernelConfigurationSimplifyCB =
3763 bool &UsedAssumedInformation) -> std::optional<Constant *> {
3764 if (!isAtFixpoint()) {
3767 UsedAssumedInformation =
true;
3773 A.registerGlobalVariableSimplificationCallback(
3774 *KernelEnvGV, KernelConfigurationSimplifyCB);
3777 bool CanChangeToSPMD = OMPInfoCache.runtimeFnsAvailable(
3778 {OMPRTL___kmpc_get_hardware_thread_id_in_block,
3779 OMPRTL___kmpc_barrier_simple_spmd});
3783 KernelInfo::getExecModeFromKernelEnvironment(KernelEnvC);
3788 SPMDCompatibilityTracker.indicateOptimisticFixpoint();
3792 SPMDCompatibilityTracker.indicatePessimisticFixpoint();
3794 setExecModeOfKernelEnvironment(AssumedExecModeC);
3801 setMinThreadsOfKernelEnvironment(ConstantInt::get(Int32Ty, MinThreads));
3803 setMaxThreadsOfKernelEnvironment(ConstantInt::get(Int32Ty,
MaxThreads));
3804 auto [MinTeams, MaxTeams] =
3807 setMinTeamsOfKernelEnvironment(ConstantInt::get(Int32Ty, MinTeams));
3809 setMaxTeamsOfKernelEnvironment(ConstantInt::get(Int32Ty, MaxTeams));
3812 KernelInfo::getMayUseNestedParallelismFromKernelEnvironment(KernelEnvC);
3813 ConstantInt *AssumedMayUseNestedParallelismC = ConstantInt::get(
3815 setMayUseNestedParallelismOfKernelEnvironment(
3816 AssumedMayUseNestedParallelismC);
3820 KernelInfo::getUseGenericStateMachineFromKernelEnvironment(
3823 ConstantInt::get(UseGenericStateMachineC->
getIntegerType(),
false);
3824 setUseGenericStateMachineOfKernelEnvironment(
3825 AssumedUseGenericStateMachineC);
3831 if (!OMPInfoCache.RFIs[RFKind].Declaration)
3833 A.registerVirtualUseCallback(*OMPInfoCache.RFIs[RFKind].Declaration, CB);
3837 auto AddDependence = [](
Attributor &
A,
const AAKernelInfo *KI,
3854 if (SPMDCompatibilityTracker.isValidState())
3855 return AddDependence(
A,
this, QueryingAA);
3857 if (!ReachedKnownParallelRegions.isValidState())
3858 return AddDependence(
A,
this, QueryingAA);
3864 RegisterVirtualUse(OMPRTL___kmpc_get_hardware_num_threads_in_block,
3865 CustomStateMachineUseCB);
3866 RegisterVirtualUse(OMPRTL___kmpc_get_warp_size, CustomStateMachineUseCB);
3867 RegisterVirtualUse(OMPRTL___kmpc_barrier_simple_generic,
3868 CustomStateMachineUseCB);
3869 RegisterVirtualUse(OMPRTL___kmpc_kernel_parallel,
3870 CustomStateMachineUseCB);
3871 RegisterVirtualUse(OMPRTL___kmpc_kernel_end_parallel,
3872 CustomStateMachineUseCB);
3876 if (SPMDCompatibilityTracker.isAtFixpoint())
3883 if (!SPMDCompatibilityTracker.isValidState())
3884 return AddDependence(
A,
this, QueryingAA);
3887 RegisterVirtualUse(OMPRTL___kmpc_get_hardware_thread_id_in_block,
3896 if (!SPMDCompatibilityTracker.isValidState())
3897 return AddDependence(
A,
this, QueryingAA);
3898 if (SPMDCompatibilityTracker.empty())
3899 return AddDependence(
A,
this, QueryingAA);
3900 if (!mayContainParallelRegion())
3901 return AddDependence(
A,
this, QueryingAA);
3904 RegisterVirtualUse(OMPRTL___kmpc_barrier_simple_spmd, SPMDBarrierUseCB);
/// Takes a string by value and returns a std::string. It is used by
/// insertInstructionGuardsHelper on the "<name>.guarded.output.alloc" name
/// given to the shared-memory global created there.
/// The visible predicate is true for every character that is NOT an ASCII
/// letter (a-z, A-Z), an ASCII digit (0-9) or an underscore.
/// NOTE(review): presumably the matching characters are substituted in S --
/// the call applying the predicate and the replacement character are not
/// shown here, confirm.
3908 static std::string sanitizeForGlobalName(std::string S) {
3912 return !((C >=
'a' && C <=
'z') || (C >=
'A' && C <=
'Z') ||
3913 (C >=
'0' && C <=
'9') || C ==
'_');
3924 if (!KernelInitCB || !KernelDeinitCB)
3925 return ChangeStatus::UNCHANGED;
3929 bool HasBuiltStateMachine =
true;
3930 if (!changeToSPMDMode(
A,
Changed)) {
3932 HasBuiltStateMachine = buildCustomStateMachine(
A,
Changed);
3934 HasBuiltStateMachine =
false;
3938 ConstantStruct *ExistingKernelEnvC =
3940 ConstantInt *OldUseGenericStateMachineVal =
3941 KernelInfo::getUseGenericStateMachineFromKernelEnvironment(
3942 ExistingKernelEnvC);
3943 if (!HasBuiltStateMachine)
3944 setUseGenericStateMachineOfKernelEnvironment(
3945 OldUseGenericStateMachineVal);
3948 GlobalVariable *KernelEnvGV =
3952 Changed = ChangeStatus::CHANGED;
/// Wraps the instructions recorded in SPMDCompatibilityTracker into guarded
/// regions. Visible steps: define the CreateGuardedRegion lambda, reorder
/// instructions inside the affected blocks, group tracked instructions into
/// (start, end) pairs, and finally invoke CreateGuardedRegion on each pair.
/// NOTE(review): several statements of this function are not shown here; the
/// comments below describe only what the remaining lines establish.
3958 void insertInstructionGuardsHelper(Attributor &
A) {
3959 auto &OMPInfoCache =
static_cast<OMPInformationCache &
>(
A.getInfoCache());
// Lambda that builds one guarded region beginning at RegionStartI.
3961 auto CreateGuardedRegion = [&](
Instruction *RegionStartI,
// No loop info, dominator tree or MemorySSA updater is supplied to the block
// splitting below; all three are null.
3963 LoopInfo *LI =
nullptr;
3964 DominatorTree *DT =
nullptr;
3965 MemorySSAUpdater *MSU =
nullptr;
// The parent block is carved into blocks named "region.guarded.end",
// "region.barrier", "region.exit", "region.guarded" and "region.check.tid".
3995 DT, LI, MSU,
"region.guarded.end");
3998 MSU,
"region.barrier");
4001 DT, LI, MSU,
"region.exit");
4003 SplitBlock(ParentBB, RegionStartI, DT, LI, MSU,
"region.guarded");
4006 "Expected a different CFG");
4009 ParentBB, ParentBB->
getTerminator(), DT, LI, MSU,
"region.check.tid");
// Every block produced above is reported to the Attributor.
4012 A.registerManifestAddedBasicBlock(*RegionEndBB);
4013 A.registerManifestAddedBasicBlock(*RegionBarrierBB);
4014 A.registerManifestAddedBasicBlock(*RegionExitBB);
4015 A.registerManifestAddedBasicBlock(*RegionStartBB);
4016 A.registerManifestAddedBasicBlock(*RegionCheckTidBB);
// For each instruction in the guarded block that has collected OutsideUses,
// the value is routed through a new global in AddressSpace::Shared: a store
// of the value, a load named "<name>.guarded.output.load", and a rewrite of
// the outside uses to that load via changeUseAfterManifest.
// NOTE(review): how OutsideUses is populated is not shown here -- confirm.
4018 bool HasBroadcastValues =
false;
4021 for (Instruction &
I : *RegionStartBB) {
4023 for (Use &U :
I.uses()) {
4029 if (OutsideUses.
empty())
4032 HasBroadcastValues =
true;
4036 auto *SharedMem =
new GlobalVariable(
4037 M,
I.getType(),
false,
4039 sanitizeForGlobalName(
4040 (
I.getName() +
".guarded.output.alloc").str()),
4042 static_cast<unsigned>(AddressSpace::Shared));
4045 new StoreInst(&
I, SharedMem,
4048 LoadInst *LoadI =
new LoadInst(
4049 I.getType(), SharedMem,
I.getName() +
".guarded.output.load",
4053 for (Use *U : OutsideUses)
4054 A.changeUseAfterManifest(*U, *LoadI);
4057 auto &OMPInfoCache =
static_cast<OMPInformationCache &
>(
A.getInfoCache());
4062 OpenMPIRBuilder::LocationDescription Loc(
4063 InsertPointTy(ParentBB, ParentBB->
end()),
DL);
4065 uint32_t SrcLocStrSize;
// Thread-id check: uses the runtime function
// __kmpc_get_hardware_thread_id_in_block and branches on TidCheck either into
// the guarded block or directly to the barrier block.
4074 OpenMPIRBuilder::LocationDescription LocRegionCheckTid(
4075 InsertPointTy(RegionCheckTidBB, RegionCheckTidBB->
end()),
DL);
4077 FunctionCallee HardwareTidFn =
4079 M, OMPRTL___kmpc_get_hardware_thread_id_in_block);
4083 OMPInfoCache.setCallingConvention(HardwareTidFn, Tid);
4085 OMPInfoCache.OMPBuilder.
Builder
4086 .
CreateCondBr(TidCheck, RegionStartBB, RegionBarrierBB)
// Barrier via __kmpc_barrier_simple_spmd; a further barrier call is set up
// when values were broadcast through shared memory.
4091 FunctionCallee BarrierFn =
4093 M, OMPRTL___kmpc_barrier_simple_spmd);
4099 OMPInfoCache.setCallingConvention(BarrierFn, Barrier);
4102 if (HasBroadcastValues) {
4107 OMPInfoCache.setCallingConvention(BarrierFn, Barrier);
// Reordering pass: walks each block containing a tracked instruction once
// (Visited), collects (instruction, position) pairs in Reorders and then
// moves each first element before the second.
// NOTE(review): the rule that decides what gets queued in Reorders is only
// partly shown here -- confirm.
4111 auto &AllocSharedRFI = OMPInfoCache.RFIs[OMPRTL___kmpc_alloc_shared];
4112 SmallPtrSet<BasicBlock *, 8> Visited;
4113 for (Instruction *GuardedI : SPMDCompatibilityTracker) {
4115 if (!Visited.
insert(BB).second)
4121 while (++IP != IPEnd) {
4122 if (!IP->mayHaveSideEffects() && !IP->mayReadFromMemory())
4125 if (OpenMPOpt::getCallIfRegularCall(*
I, &AllocSharedRFI))
4127 if (!
I->user_empty() || !SPMDCompatibilityTracker.contains(
I)) {
4128 LastEffect =
nullptr;
4135 for (
auto &Reorder : Reorders)
4136 Reorder.first->moveBefore(Reorder.second->getIterator());
// Grouping pass: for each tracked instruction not yet present in the looked-up
// attribute's guarded-instruction set, scan its block, mark tracked
// instructions as guarded, and record [GuardedRegionStart, GuardedRegionEnd]
// pairs.
4141 for (Instruction *GuardedI : SPMDCompatibilityTracker) {
4143 auto *CalleeAA =
A.lookupAAFor<AAKernelInfo>(
4146 assert(CalleeAA !=
nullptr &&
"Expected Callee AAKernelInfo");
4149 if (CalleeAAFunction.getGuardedInstructions().contains(GuardedI))
4152 Instruction *GuardedRegionStart =
nullptr, *GuardedRegionEnd =
nullptr;
4153 for (Instruction &
I : *BB) {
4156 if (SPMDCompatibilityTracker.contains(&
I)) {
4157 CalleeAAFunction.getGuardedInstructions().insert(&
I);
4158 if (GuardedRegionStart)
4159 GuardedRegionEnd = &
I;
4161 GuardedRegionStart = GuardedRegionEnd = &
I;
4168 if (GuardedRegionStart) {
4170 std::make_pair(GuardedRegionStart, GuardedRegionEnd));
4171 GuardedRegionStart =
nullptr;
4172 GuardedRegionEnd =
nullptr;
// Materialize every collected region.
4177 for (
auto &GR : GuardedRegions)
4178 CreateGuardedRegion(GR.first, GR.second);
/// Splits the kernel after KernelInitCB into a block named
/// "main.thread.user_code", registers InitBB, UserCodeBB and ReturnBB with the
/// Attributor, obtains the hardware thread id in the block through
/// __kmpc_get_hardware_thread_id_in_block, and builds a "thread.is_main"
/// value in InitBB that involves the constant 0 of the thread-id type.
/// NOTE(review): the branch consuming "thread.is_main" and the creation of
/// ReturnBB are not shown here -- confirm the resulting control flow.
4181 void forceSingleThreadPerWorkgroupHelper(Attributor &
A) {
4190 auto &Ctx = getAnchorValue().getContext();
4197 KernelInitCB->
getNextNode(),
"main.thread.user_code");
4202 A.registerManifestAddedBasicBlock(*InitBB);
4203 A.registerManifestAddedBasicBlock(*UserCodeBB);
4204 A.registerManifestAddedBasicBlock(*ReturnBB);
4213 auto &OMPInfoCache =
static_cast<OMPInformationCache &
>(
A.getInfoCache());
4214 FunctionCallee ThreadIdInBlockFn =
4216 M, OMPRTL___kmpc_get_hardware_thread_id_in_block);
4219 CallInst *ThreadIdInBlock =
// The call gets the calling convention recorded for the runtime function.
4221 OMPInfoCache.setCallingConvention(ThreadIdInBlockFn, ThreadIdInBlock);
4227 ConstantInt::get(ThreadIdInBlock->
getType(), 0),
4228 "thread.is_main", InitBB);
4234 auto &OMPInfoCache =
static_cast<OMPInformationCache &
>(
A.getInfoCache());
4236 if (!SPMDCompatibilityTracker.isAssumed()) {
4237 for (Instruction *NonCompatibleI : SPMDCompatibilityTracker) {
4238 if (!NonCompatibleI)
4243 if (OMPInfoCache.RTLFunctions.contains(CB->getCalledFunction()))
4246 auto Remark = [&](OptimizationRemarkAnalysis ORA) {
4247 ORA <<
"Value has potential side effects preventing SPMD-mode "
4250 ORA <<
". Add `[[omp::assume(\"ompx_spmd_amenable\")]]` to "
4251 "the called function to override";
4255 A.emitRemark<OptimizationRemarkAnalysis>(NonCompatibleI,
"OMP121",
4259 << *NonCompatibleI <<
"\n");
4271 Kernel = CB->getCaller();
4276 ConstantStruct *ExistingKernelEnvC =
4279 KernelInfo::getExecModeFromKernelEnvironment(ExistingKernelEnvC);
4285 Changed = ChangeStatus::CHANGED;
4289 if (mayContainParallelRegion())
4290 insertInstructionGuardsHelper(
A);
4292 forceSingleThreadPerWorkgroupHelper(
A);
4297 "Initially non-SPMD kernel has SPMD exec mode!");
4298 setExecModeOfKernelEnvironment(
4302 ++NumOpenMPTargetRegionKernelsSPMD;
4304 auto Remark = [&](OptimizationRemark
OR) {
4305 return OR <<
"Transformed generic-mode kernel to SPMD-mode.";
4307 A.emitRemark<OptimizationRemark>(KernelInitCB,
"OMP120",
Remark);
4317 if (!ReachedKnownParallelRegions.isValidState())
4320 auto &OMPInfoCache =
static_cast<OMPInformationCache &
>(
A.getInfoCache());
4321 if (!OMPInfoCache.runtimeFnsAvailable(
4322 {OMPRTL___kmpc_get_hardware_num_threads_in_block,
4323 OMPRTL___kmpc_get_warp_size, OMPRTL___kmpc_barrier_simple_generic,
4324 OMPRTL___kmpc_kernel_parallel, OMPRTL___kmpc_kernel_end_parallel}))
4327 ConstantStruct *ExistingKernelEnvC =
4334 ConstantInt *UseStateMachineC =
4335 KernelInfo::getUseGenericStateMachineFromKernelEnvironment(
4336 ExistingKernelEnvC);
4337 ConstantInt *ModeC =
4338 KernelInfo::getExecModeFromKernelEnvironment(ExistingKernelEnvC);
4343 if (UseStateMachineC->
isZero() ||
4347 Changed = ChangeStatus::CHANGED;
4350 setUseGenericStateMachineOfKernelEnvironment(
4357 if (!mayContainParallelRegion()) {
4358 ++NumOpenMPTargetRegionKernelsWithoutStateMachine;
4360 auto Remark = [&](OptimizationRemark
OR) {
4361 return OR <<
"Removing unused state machine from generic-mode kernel.";
4363 A.emitRemark<OptimizationRemark>(KernelInitCB,
"OMP130",
Remark);
4369 if (ReachedUnknownParallelRegions.empty()) {
4370 ++NumOpenMPTargetRegionKernelsCustomStateMachineWithoutFallback;
4372 auto Remark = [&](OptimizationRemark
OR) {
4373 return OR <<
"Rewriting generic-mode kernel with a customized state "
4376 A.emitRemark<OptimizationRemark>(KernelInitCB,
"OMP131",
Remark);
4378 ++NumOpenMPTargetRegionKernelsCustomStateMachineWithFallback;
4380 auto Remark = [&](OptimizationRemarkAnalysis
OR) {
4381 return OR <<
"Generic-mode kernel is executed with a customized state "
4382 "machine that requires a fallback.";
4384 A.emitRemark<OptimizationRemarkAnalysis>(KernelInitCB,
"OMP132",
Remark);
4387 for (CallBase *UnknownParallelRegionCB : ReachedUnknownParallelRegions) {
4388 if (!UnknownParallelRegionCB)
4390 auto Remark = [&](OptimizationRemarkAnalysis ORA) {
4391 return ORA <<
"Call may contain unknown parallel regions. Use "
4392 <<
"`[[omp::assume(\"omp_no_parallelism\")]]` to "
4395 A.emitRemark<OptimizationRemarkAnalysis>(UnknownParallelRegionCB,
4430 auto &Ctx = getAnchorValue().getContext();
4434 BasicBlock *InitBB = KernelInitCB->getParent();
4436 KernelInitCB->getNextNode(),
"thread.user_code.check");
4440 Ctx,
"worker_state_machine.begin",
Kernel, UserCodeEntryBB);
4442 Ctx,
"worker_state_machine.finished",
Kernel, UserCodeEntryBB);
4444 Ctx,
"worker_state_machine.is_active.check",
Kernel, UserCodeEntryBB);
4447 Kernel, UserCodeEntryBB);
4450 Kernel, UserCodeEntryBB);
4452 Ctx,
"worker_state_machine.done.barrier",
Kernel, UserCodeEntryBB);
4453 A.registerManifestAddedBasicBlock(*InitBB);
4454 A.registerManifestAddedBasicBlock(*UserCodeEntryBB);
4455 A.registerManifestAddedBasicBlock(*IsWorkerCheckBB);
4456 A.registerManifestAddedBasicBlock(*StateMachineBeginBB);
4457 A.registerManifestAddedBasicBlock(*StateMachineFinishedBB);
4458 A.registerManifestAddedBasicBlock(*StateMachineIsActiveCheckBB);
4459 A.registerManifestAddedBasicBlock(*StateMachineIfCascadeCurrentBB);
4460 A.registerManifestAddedBasicBlock(*StateMachineEndParallelBB);
4461 A.registerManifestAddedBasicBlock(*StateMachineDoneBarrierBB);
4463 const DebugLoc &DLoc = KernelInitCB->getDebugLoc();
4469 ConstantInt::getAllOnesValue(KernelInitCB->getType()),
4470 "thread.is_worker", InitBB);
4475 FunctionCallee BlockHwSizeFn =
4477 M, OMPRTL___kmpc_get_hardware_num_threads_in_block);
4478 FunctionCallee WarpSizeFn =
4480 M, OMPRTL___kmpc_get_warp_size);
4481 CallInst *BlockHwSize =
4483 OMPInfoCache.setCallingConvention(BlockHwSizeFn, BlockHwSize);
4485 CallInst *WarpSize =
4487 OMPInfoCache.setCallingConvention(WarpSizeFn, WarpSize);
4490 BlockHwSize, WarpSize,
"block.size", IsWorkerCheckBB);
4494 "thread.is_main_or_worker", IsWorkerCheckBB);
4497 StateMachineFinishedBB, IsWorkerCheckBB);
4500 const DataLayout &
DL =
M.getDataLayout();
4501 Type *VoidPtrTy = PointerType::getUnqual(Ctx);
4503 new AllocaInst(VoidPtrTy,
DL.getAllocaAddrSpace(),
nullptr,
4508 OpenMPIRBuilder::LocationDescription(
4509 IRBuilder<>::InsertPoint(StateMachineBeginBB,
4510 StateMachineBeginBB->
end()),
4513 Value *Ident = KernelInfo::getIdentFromKernelEnvironment(KernelEnvC);
4514 Value *GTid = KernelInitCB;
4516 FunctionCallee BarrierFn =
4518 M, OMPRTL___kmpc_barrier_simple_generic);
4521 OMPInfoCache.setCallingConvention(BarrierFn, Barrier);
4525 (
unsigned int)AddressSpace::Generic) {
4526 WorkFnAI =
new AddrSpaceCastInst(
4527 WorkFnAI, PointerType::get(Ctx, (
unsigned int)AddressSpace::Generic),
4528 WorkFnAI->
getName() +
".generic", StateMachineBeginBB);
4532 FunctionCallee KernelParallelFn =
4534 M, OMPRTL___kmpc_kernel_parallel);
4536 KernelParallelFn, {WorkFnAI},
"worker.is_active", StateMachineBeginBB);
4537 OMPInfoCache.setCallingConvention(KernelParallelFn, IsActiveWorker);
4539 Instruction *WorkFn =
new LoadInst(VoidPtrTy, WorkFnAI,
"worker.work_fn",
4540 StateMachineBeginBB);
4543 FunctionType *ParallelRegionFnTy = FunctionType::get(
4544 Type::getVoidTy(Ctx), {Type::getInt16Ty(Ctx), Type::getInt32Ty(Ctx)},
4550 StateMachineBeginBB);
4551 IsDone->setDebugLoc(DLoc);
4553 StateMachineIsActiveCheckBB, StateMachineBeginBB)
4554 ->setDebugLoc(DLoc);
4557 StateMachineDoneBarrierBB, StateMachineIsActiveCheckBB)
4558 ->setDebugLoc(DLoc);
4563 const unsigned int WrapperFunctionArgNo = 6;
4568 for (
int I = 0,
E = ReachedKnownParallelRegions.size();
I <
E; ++
I) {
4569 auto *CB = ReachedKnownParallelRegions[
I];
4571 CB->getArgOperand(WrapperFunctionArgNo)->stripPointerCasts());
4573 Ctx,
"worker_state_machine.parallel_region.execute",
Kernel,
4574 StateMachineEndParallelBB);
4576 ->setDebugLoc(DLoc);
4578 ->setDebugLoc(DLoc);
4582 Kernel, StateMachineEndParallelBB);
4583 A.registerManifestAddedBasicBlock(*PRExecuteBB);
4584 A.registerManifestAddedBasicBlock(*PRNextBB);
4589 if (
I + 1 <
E || !ReachedUnknownParallelRegions.empty()) {
4592 "worker.check_parallel_region", StateMachineIfCascadeCurrentBB);
4600 StateMachineIfCascadeCurrentBB)
4601 ->setDebugLoc(DLoc);
4602 StateMachineIfCascadeCurrentBB = PRNextBB;
4608 if (!ReachedUnknownParallelRegions.empty()) {
4609 StateMachineIfCascadeCurrentBB->
setName(
4610 "worker_state_machine.parallel_region.fallback.execute");
4612 StateMachineIfCascadeCurrentBB)
4613 ->setDebugLoc(DLoc);
4616 StateMachineIfCascadeCurrentBB)
4617 ->setDebugLoc(DLoc);
4619 FunctionCallee EndParallelFn =
4621 M, OMPRTL___kmpc_kernel_end_parallel);
4622 CallInst *EndParallel =
4624 OMPInfoCache.setCallingConvention(EndParallelFn, EndParallel);
4627 ->setDebugLoc(DLoc);
4630 ->setDebugLoc(DLoc);
4632 ->setDebugLoc(DLoc);
4640 KernelInfoState StateBefore = getState();
/// Local helper whose destructor brings the attribute's kernel environment
/// constant in line with the attribute state.
4646 struct UpdateKernelEnvCRAII {
// The attribute whose kernel environment is updated on destruction.
4647 AAKernelInfoFunction &AA;
4649 UpdateKernelEnvCRAII(AAKernelInfoFunction &AA) : AA(AA) {}
/// On destruction:
///  - if the attribute state is invalid, KernelEnvC is set to the existing
///    kernel environment constant;
///  - if ReachedKnownParallelRegions is invalid, the "use generic state
///    machine" field is set to the existing value;
///  - if SPMDCompatibilityTracker is invalid, the exec-mode field is set to
///    the existing value;
///  - the "may use nested parallelism" field is rebuilt from
///    AA.NestedParallelism.
/// NOTE(review): where ExistingKernelEnvC comes from, and whether the
/// invalid-state case returns early, is not shown here -- confirm.
4651 ~UpdateKernelEnvCRAII() {
4655 ConstantStruct *ExistingKernelEnvC =
4658 if (!AA.isValidState()) {
4659 AA.KernelEnvC = ExistingKernelEnvC;
4663 if (!AA.ReachedKnownParallelRegions.isValidState())
4664 AA.setUseGenericStateMachineOfKernelEnvironment(
4665 KernelInfo::getUseGenericStateMachineFromKernelEnvironment(
4666 ExistingKernelEnvC));
4668 if (!AA.SPMDCompatibilityTracker.isValidState())
4669 AA.setExecModeOfKernelEnvironment(
4670 KernelInfo::getExecModeFromKernelEnvironment(ExistingKernelEnvC));
4672 ConstantInt *MayUseNestedParallelismC =
4673 KernelInfo::getMayUseNestedParallelismFromKernelEnvironment(
// The new constant reuses the integer type of the value read above.
4675 ConstantInt *NewMayUseNestedParallelismC = ConstantInt::get(
4676 MayUseNestedParallelismC->
getIntegerType(), AA.NestedParallelism);
4677 AA.setMayUseNestedParallelismOfKernelEnvironment(
4678 NewMayUseNestedParallelismC);
4688 if (!
I.mayWriteToMemory())
4691 const auto *UnderlyingObjsAA =
A.getAAFor<AAUnderlyingObjects>(
4693 DepClassTy::OPTIONAL);
4694 auto *
HS =
A.getAAFor<AAHeapToStack>(
4696 DepClassTy::OPTIONAL);
4697 if (UnderlyingObjsAA &&
4698 UnderlyingObjsAA->forallUnderlyingObjects([&](
Value &Obj) {
4699 if (AA::isAssumedThreadLocalObject(A, Obj, *this))
4703 auto *CB = dyn_cast<CallBase>(&Obj);
4704 return CB && HS && HS->isAssumedHeapToStack(*CB);
4710 SPMDCompatibilityTracker.insert(&
I);
4714 bool UsedAssumedInformationInCheckRWInst =
false;
4715 if (!SPMDCompatibilityTracker.isAtFixpoint())
4716 if (!
A.checkForAllReadWriteInstructions(
4717 CheckRWInst, *
this, UsedAssumedInformationInCheckRWInst))
4718 SPMDCompatibilityTracker.indicatePessimisticFixpoint();
4720 bool UsedAssumedInformationFromReachingKernels =
false;
4721 if (!IsKernelEntry) {
4722 updateParallelLevels(
A);
4724 bool AllReachingKernelsKnown =
true;
4725 updateReachingKernelEntries(
A, AllReachingKernelsKnown);
4726 UsedAssumedInformationFromReachingKernels = !AllReachingKernelsKnown;
4728 if (!SPMDCompatibilityTracker.empty()) {
4729 if (!ParallelLevels.isValidState())
4730 SPMDCompatibilityTracker.indicatePessimisticFixpoint();
4731 else if (!ReachingKernelEntries.isValidState())
4732 SPMDCompatibilityTracker.indicatePessimisticFixpoint();
4738 for (
auto *
Kernel : ReachingKernelEntries) {
4739 auto *CBAA =
A.getAAFor<AAKernelInfo>(
4741 if (CBAA && CBAA->SPMDCompatibilityTracker.isValidState() &&
4742 CBAA->SPMDCompatibilityTracker.isAssumed())
4746 if (!CBAA || !CBAA->SPMDCompatibilityTracker.isAtFixpoint())
4747 UsedAssumedInformationFromReachingKernels =
true;
4749 if (SPMD != 0 &&
Generic != 0)
4750 SPMDCompatibilityTracker.indicatePessimisticFixpoint();
4756 bool AllParallelRegionStatesWereFixed =
true;
4757 bool AllSPMDStatesWereFixed =
true;
4760 auto *CBAA =
A.getAAFor<AAKernelInfo>(
4764 getState() ^= CBAA->getState();
4765 AllSPMDStatesWereFixed &= CBAA->SPMDCompatibilityTracker.isAtFixpoint();
4766 AllParallelRegionStatesWereFixed &=
4767 CBAA->ReachedKnownParallelRegions.isAtFixpoint();
4768 AllParallelRegionStatesWereFixed &=
4769 CBAA->ReachedUnknownParallelRegions.isAtFixpoint();
4773 bool UsedAssumedInformationInCheckCallInst =
false;
4774 if (!
A.checkForAllCallLikeInstructions(
4775 CheckCallInst, *
this, UsedAssumedInformationInCheckCallInst)) {
4777 <<
"Failed to visit all call-like instructions!\n";);
4778 return indicatePessimisticFixpoint();
4783 if (!UsedAssumedInformationInCheckCallInst &&
4784 AllParallelRegionStatesWereFixed) {
4785 ReachedKnownParallelRegions.indicateOptimisticFixpoint();
4786 ReachedUnknownParallelRegions.indicateOptimisticFixpoint();
4791 if (!UsedAssumedInformationInCheckRWInst &&
4792 !UsedAssumedInformationInCheckCallInst &&
4793 !UsedAssumedInformationFromReachingKernels && AllSPMDStatesWereFixed)
4794 SPMDCompatibilityTracker.indicateOptimisticFixpoint();
4796 return StateBefore == getState() ? ChangeStatus::UNCHANGED
4797 : ChangeStatus::CHANGED;
/// Refreshes ReachingKernelEntries from the call sites of this function.
/// \param A Attributor used to query the callers' AAKernelInfo and to visit
///          all call sites.
/// \param AllReachingKernelsKnown passed by reference into
///          checkForAllCallSites.
4802 void updateReachingKernelEntries(Attributor &
A,
4803 bool &AllReachingKernelsKnown) {
// Per-call-site predicate: when the caller's AAKernelInfo exists and its
// ReachingKernelEntries state is valid, that state is combined into ours with
// operator^=. A visible line also pushes ReachingKernelEntries to a
// pessimistic fixpoint.
// NOTE(review): presumably that is the fallback for the other case -- the
// surrounding branch structure is not shown here, confirm.
4804 auto PredCallSite = [&](AbstractCallSite ACS) {
4807 assert(Caller &&
"Caller is nullptr");
4809 auto *CAA =
A.getOrCreateAAFor<AAKernelInfo>(
4811 if (CAA && CAA->ReachingKernelEntries.isValidState()) {
4812 ReachingKernelEntries ^= CAA->ReachingKernelEntries;
4818 ReachingKernelEntries.indicatePessimisticFixpoint();
// If the call-site walk itself fails, give up on ReachingKernelEntries.
4823 if (!
A.checkForAllCallSites(PredCallSite, *
this,
4825 AllReachingKernelsKnown))
4826 ReachingKernelEntries.indicatePessimisticFixpoint();
/// Refreshes ParallelLevels from the call sites of this function.
4830 void updateParallelLevels(Attributor &
A) {
4831 auto &OMPInfoCache =
static_cast<OMPInformationCache &
>(
A.getInfoCache());
// Runtime-function record for __kmpc_parallel_60; its Declaration is compared
// against the caller below.
4832 OMPInformationCache::RuntimeFunctionInfo &Parallel60RFI =
4833 OMPInfoCache.RFIs[OMPRTL___kmpc_parallel_60];
// Per-call-site predicate: with a caller attribute whose ParallelLevels state
// is valid, a caller equal to the __kmpc_parallel_60 declaration drives
// ParallelLevels to a pessimistic fixpoint; a visible line also combines the
// caller's ParallelLevels into ours with operator^=. One more visible line
// gives up on ParallelLevels.
// NOTE(review): the branch structure joining these statements is not shown
// here -- confirm which path each one belongs to.
4835 auto PredCallSite = [&](AbstractCallSite ACS) {
4838 assert(Caller &&
"Caller is nullptr");
4842 if (CAA && CAA->ParallelLevels.isValidState()) {
4848 if (Caller == Parallel60RFI.Declaration) {
4849 ParallelLevels.indicatePessimisticFixpoint();
4853 ParallelLevels ^= CAA->ParallelLevels;
4860 ParallelLevels.indicatePessimisticFixpoint();
// If the call-site walk itself fails, give up on ParallelLevels.
4865 bool AllCallSitesKnown =
true;
4866 if (!
A.checkForAllCallSites(PredCallSite, *
this,
4869 ParallelLevels.indicatePessimisticFixpoint();
/// AAKernelInfo specialization derived directly from AAKernelInfo.
/// NOTE(review): by its name this is the call-site-position variant --
/// confirm where it is instantiated.
4876struct AAKernelInfoCallSite : AAKernelInfo {
/// Passes both arguments straight through to the AAKernelInfo constructor;
/// no additional initialization happens here.
4877 AAKernelInfoCallSite(
const IRPosition &IRP, Attributor &
A)
4878 : AAKernelInfo(IRP,
A) {}
4882 AAKernelInfo::initialize(
A);
4885 auto *AssumptionAA =
A.getAAFor<AAAssumptionInfo>(
4889 if (AssumptionAA && AssumptionAA->hasAssumption(
"ompx_spmd_amenable")) {
4890 indicateOptimisticFixpoint();
4898 indicateOptimisticFixpoint();
4907 auto &OMPInfoCache =
static_cast<OMPInformationCache &
>(
A.getInfoCache());
4908 const auto &It = OMPInfoCache.RuntimeFunctionIDMap.find(Callee);
4909 if (It == OMPInfoCache.RuntimeFunctionIDMap.end()) {
4911 if (!Callee || !
A.isFunctionIPOAmendable(*Callee)) {
4915 if (!AssumptionAA ||
4916 !(AssumptionAA->hasAssumption(
"omp_no_openmp") ||
4917 AssumptionAA->hasAssumption(
"omp_no_parallelism")))
4918 ReachedUnknownParallelRegions.insert(&CB);
4922 if (!SPMDCompatibilityTracker.isAtFixpoint()) {
4923 SPMDCompatibilityTracker.indicatePessimisticFixpoint();
4924 SPMDCompatibilityTracker.insert(&CB);
4929 indicateOptimisticFixpoint();
4935 if (NumCallees > 1) {
4936 indicatePessimisticFixpoint();
4943 case OMPRTL___kmpc_is_spmd_exec_mode:
4944 case OMPRTL___kmpc_distribute_static_fini:
4945 case OMPRTL___kmpc_for_static_fini:
4946 case OMPRTL___kmpc_global_thread_num:
4947 case OMPRTL___kmpc_get_hardware_num_threads_in_block:
4948 case OMPRTL___kmpc_get_hardware_num_blocks:
4949 case OMPRTL___kmpc_single:
4950 case OMPRTL___kmpc_end_single:
4951 case OMPRTL___kmpc_master:
4952 case OMPRTL___kmpc_end_master:
4953 case OMPRTL___kmpc_barrier:
4954 case OMPRTL___kmpc_nvptx_parallel_reduce_nowait_v2:
4955 case OMPRTL___kmpc_gpu_xteam_reduce_nowait:
4956 case OMPRTL___kmpc_error:
4957 case OMPRTL___kmpc_flush:
4958 case OMPRTL___kmpc_get_hardware_thread_id_in_block:
4959 case OMPRTL___kmpc_get_warp_size:
4960 case OMPRTL_omp_get_thread_num:
4961 case OMPRTL_omp_get_num_threads:
4962 case OMPRTL_omp_get_max_threads:
4963 case OMPRTL_omp_in_parallel:
4964 case OMPRTL_omp_get_dynamic:
4965 case OMPRTL_omp_get_cancellation:
4966 case OMPRTL_omp_get_nested:
4967 case OMPRTL_omp_get_schedule:
4968 case OMPRTL_omp_get_thread_limit:
4969 case OMPRTL_omp_get_supported_active_levels:
4970 case OMPRTL_omp_get_max_active_levels:
4971 case OMPRTL_omp_get_level:
4972 case OMPRTL_omp_get_ancestor_thread_num:
4973 case OMPRTL_omp_get_team_size:
4974 case OMPRTL_omp_get_active_level:
4975 case OMPRTL_omp_in_final:
4976 case OMPRTL_omp_get_proc_bind:
4977 case OMPRTL_omp_get_num_places:
4978 case OMPRTL_omp_get_num_procs:
4979 case OMPRTL_omp_get_place_proc_ids:
4980 case OMPRTL_omp_get_place_num:
4981 case OMPRTL_omp_get_partition_num_places:
4982 case OMPRTL_omp_get_partition_place_nums:
4983 case OMPRTL_omp_get_wtime:
4985 case OMPRTL___kmpc_distribute_static_init_4:
4986 case OMPRTL___kmpc_distribute_static_init_4u:
4987 case OMPRTL___kmpc_distribute_static_init_8:
4988 case OMPRTL___kmpc_distribute_static_init_8u:
4989 case OMPRTL___kmpc_for_static_init_4:
4990 case OMPRTL___kmpc_for_static_init_4u:
4991 case OMPRTL___kmpc_for_static_init_8:
4992 case OMPRTL___kmpc_for_static_init_8u: {
4994 unsigned ScheduleArgOpNo = 2;
4995 auto *ScheduleTypeCI =
4997 unsigned ScheduleTypeVal =
4998 ScheduleTypeCI ? ScheduleTypeCI->getZExtValue() : 0;
5000 case OMPScheduleType::UnorderedStatic:
5001 case OMPScheduleType::UnorderedStaticChunked:
5002 case OMPScheduleType::OrderedDistribute:
5003 case OMPScheduleType::OrderedDistributeChunked:
5006 SPMDCompatibilityTracker.indicatePessimisticFixpoint();
5007 SPMDCompatibilityTracker.insert(&CB);
5011 case OMPRTL___kmpc_target_init:
5014 case OMPRTL___kmpc_target_deinit:
5015 KernelDeinitCB = &CB;
5017 case OMPRTL___kmpc_parallel_60:
5018 if (!handleParallel60(
A, CB))
5019 indicatePessimisticFixpoint();
5021 case OMPRTL___kmpc_omp_task:
5023 SPMDCompatibilityTracker.indicatePessimisticFixpoint();
5024 SPMDCompatibilityTracker.insert(&CB);
5025 ReachedUnknownParallelRegions.insert(&CB);
5027 case OMPRTL___kmpc_alloc_shared:
5028 case OMPRTL___kmpc_free_shared:
5031 case OMPRTL___kmpc_distribute_static_loop_4:
5032 case OMPRTL___kmpc_distribute_static_loop_4u:
5033 case OMPRTL___kmpc_distribute_static_loop_8:
5034 case OMPRTL___kmpc_distribute_static_loop_8u:
5035 case OMPRTL___kmpc_distribute_for_static_loop_4:
5036 case OMPRTL___kmpc_distribute_for_static_loop_4u:
5037 case OMPRTL___kmpc_distribute_for_static_loop_8:
5038 case OMPRTL___kmpc_distribute_for_static_loop_8u:
5039 case OMPRTL___kmpc_for_static_loop_4:
5040 case OMPRTL___kmpc_for_static_loop_4u:
5041 case OMPRTL___kmpc_for_static_loop_8:
5042 case OMPRTL___kmpc_for_static_loop_8u:
5046 ReachedUnknownParallelRegions.insert(&CB);
5051 SPMDCompatibilityTracker.indicatePessimisticFixpoint();
5052 SPMDCompatibilityTracker.insert(&CB);
5057 SPMDCompatibilityTracker.indicatePessimisticFixpoint();
5058 SPMDCompatibilityTracker.insert(&CB);
5064 indicateOptimisticFixpoint();
5068 A.getAAFor<AACallEdges>(*
this, getIRPosition(), DepClassTy::OPTIONAL);
5069 if (!AACE || !AACE->getState().isValidState() || AACE->hasUnknownCallee()) {
5070 CheckCallee(getAssociatedFunction(), 1);
5073 const auto &OptimisticEdges = AACE->getOptimisticEdges();
5074 for (
auto *Callee : OptimisticEdges) {
5075 CheckCallee(Callee, OptimisticEdges.size());
5086 auto &OMPInfoCache =
static_cast<OMPInformationCache &
>(
A.getInfoCache());
5087 KernelInfoState StateBefore = getState();
5089 auto CheckCallee = [&](
Function *
F,
int NumCallees) {
5090 const auto &It = OMPInfoCache.RuntimeFunctionIDMap.find(
F);
5094 if (It == OMPInfoCache.RuntimeFunctionIDMap.end()) {
5097 A.getAAFor<AAKernelInfo>(*
this, FnPos, DepClassTy::REQUIRED);
5099 return indicatePessimisticFixpoint();
5100 if (getState() == FnAA->getState())
5101 return ChangeStatus::UNCHANGED;
5102 getState() = FnAA->getState();
5103 return ChangeStatus::CHANGED;
5106 return indicatePessimisticFixpoint();
5109 if (It->getSecond() == OMPRTL___kmpc_parallel_60) {
5110 if (!handleParallel60(
A, CB))
5111 return indicatePessimisticFixpoint();
5112 return StateBefore == getState() ? ChangeStatus::UNCHANGED
5113 : ChangeStatus::CHANGED;
5119 (It->getSecond() == OMPRTL___kmpc_alloc_shared ||
5120 It->getSecond() == OMPRTL___kmpc_free_shared) &&
5121 "Expected a __kmpc_alloc_shared or __kmpc_free_shared runtime call");
5123 auto *HeapToStackAA =
A.getAAFor<AAHeapToStack>(
5125 auto *HeapToSharedAA =
A.getAAFor<AAHeapToShared>(
5133 case OMPRTL___kmpc_alloc_shared:
5134 if ((!HeapToStackAA || !HeapToStackAA->isAssumedHeapToStack(CB)) &&
5135 (!HeapToSharedAA || !HeapToSharedAA->isAssumedHeapToShared(CB)))
5136 SPMDCompatibilityTracker.insert(&CB);
5138 case OMPRTL___kmpc_free_shared:
5139 if ((!HeapToStackAA ||
5140 !HeapToStackAA->isAssumedHeapToStackRemovedFree(CB)) &&
5142 !HeapToSharedAA->isAssumedHeapToSharedRemovedFree(CB)))
5143 SPMDCompatibilityTracker.insert(&CB);
5146 SPMDCompatibilityTracker.indicatePessimisticFixpoint();
5147 SPMDCompatibilityTracker.insert(&CB);
5149 return ChangeStatus::CHANGED;
5153 A.getAAFor<AACallEdges>(*
this, getIRPosition(), DepClassTy::OPTIONAL);
5154 if (!AACE || !AACE->getState().isValidState() || AACE->hasUnknownCallee()) {
5155 if (Function *
F = getAssociatedFunction())
5158 const auto &OptimisticEdges = AACE->getOptimisticEdges();
5159 for (
auto *Callee : OptimisticEdges) {
5160 CheckCallee(Callee, OptimisticEdges.size());
5166 return StateBefore == getState() ? ChangeStatus::UNCHANGED
5167 : ChangeStatus::CHANGED;
/// Processes the call \p CB (callers invoke this for __kmpc_parallel_60) and
/// reports success as a bool; callers fall back to a pessimistic fixpoint
/// when it returns false.
/// Visible effects: \p CB is inserted into ReachedKnownParallelRegions, and
/// NestedParallelism is set when the attribute fetched for the parallel
/// region is missing, invalid, or itself has known/unknown parallel regions
/// that are non-empty or in an invalid state.
/// NOTE(review): how ParallelRegion is derived from the selected operand and
/// the exact return values are not shown here -- confirm.
5172 bool handleParallel60(Attributor &
A, CallBase &CB) {
// Operand index 5 is used while SPMD mode is still assumed, index 6 (the
// wrapper function) otherwise.
5173 const unsigned int NonWrapperFunctionArgNo = 5;
5174 const unsigned int WrapperFunctionArgNo = 6;
5175 auto ParallelRegionOpArgNo = SPMDCompatibilityTracker.isAssumed()
5176 ? NonWrapperFunctionArgNo
5177 : WrapperFunctionArgNo;
5181 if (!ParallelRegion)
5184 ReachedKnownParallelRegions.insert(&CB);
5186 auto *FnAA =
A.getAAFor<AAKernelInfo>(
// Sticky flag: once set it is never cleared by this statement (|=).
5188 NestedParallelism |= !FnAA || !FnAA->getState().isValidState() ||
5189 !FnAA->ReachedKnownParallelRegions.empty() ||
5190 !FnAA->ReachedKnownParallelRegions.isValidState() ||
5191 !FnAA->ReachedUnknownParallelRegions.isValidState() ||
5192 !FnAA->ReachedUnknownParallelRegions.empty();
/// Abstract attribute with a BooleanState, wired up through
/// StateWrapper<BooleanState, AbstractAttribute>.
5197struct AAFoldRuntimeCall
5198 :
public StateWrapper<BooleanState, AbstractAttribute> {
// Shorthand for the StateWrapper base that the constructor forwards to.
5199 using Base = StateWrapper<BooleanState, AbstractAttribute>;
/// Forwards the IR position to the base. The Attributor argument is accepted
/// but not used by this constructor.
5201 AAFoldRuntimeCall(
const IRPosition &IRP, Attributor &
A) :
Base(IRP) {}
/// Intentionally empty override: nothing is recorded here.
5204 void trackStatistics()
const override {}
/// Factory yielding a reference to an AAFoldRuntimeCall for an IR position.
/// NOTE(review): the rest of the parameter list is not shown here -- confirm
/// the full signature.
5207 static AAFoldRuntimeCall &createForPosition(
const IRPosition &IRP,
/// Returns the fixed name "AAFoldRuntimeCall".
5211 StringRef
getName()
const override {
return "AAFoldRuntimeCall"; }
/// Returns the address of the static member ID.
5214 const char *getIdAddr()
const override {
return &
ID; }
/// Type test taking a generic AbstractAttribute pointer.
/// NOTE(review): presumably compares the attribute's id address with &ID --
/// the comparison itself is not shown here, confirm.
5218 static bool classof(
const AbstractAttribute *AA) {
/// Static member whose address getIdAddr() hands out.
5222 static const char ID;
/// AAFoldRuntimeCall specialization derived directly from AAFoldRuntimeCall.
/// NOTE(review): by its name this is the call-site-returned-position variant
/// -- confirm where it is instantiated.
5225struct AAFoldRuntimeCallCallSiteReturned : AAFoldRuntimeCall {
/// Passes both arguments straight through to the AAFoldRuntimeCall
/// constructor; no additional initialization happens here.
5226 AAFoldRuntimeCallCallSiteReturned(
const IRPosition &IRP, Attributor &
A)
5227 : AAFoldRuntimeCall(IRP,
A) {}
5230 const std::string getAsStr(Attributor *)
const override {
5231 if (!isValidState())
5234 std::string Str(
"simplified value: ");
5236 if (!SimplifiedValue)
5237 return Str + std::string(
"none");
5239 if (!*SimplifiedValue)
5240 return Str + std::string(
"nullptr");
5243 return Str + std::to_string(CI->getSExtValue());
5245 return Str + std::string(
"unknown");
5250 indicatePessimisticFixpoint();
5254 auto &OMPInfoCache =
static_cast<OMPInformationCache &
>(
A.getInfoCache());
5255 const auto &It = OMPInfoCache.RuntimeFunctionIDMap.find(Callee);
5256 assert(It != OMPInfoCache.RuntimeFunctionIDMap.end() &&
5257 "Expected a known OpenMP runtime function");
5259 RFKind = It->getSecond();
5262 A.registerSimplificationCallback(
5264 [&](
const IRPosition &IRP,
const AbstractAttribute *AA,
5265 bool &UsedAssumedInformation) -> std::optional<Value *> {
5266 assert((isValidState() || SimplifiedValue ==
nullptr) &&
5267 "Unexpected invalid state!");
5269 if (!isAtFixpoint()) {
5270 UsedAssumedInformation =
true;
5272 A.recordDependence(*
this, *AA, DepClassTy::OPTIONAL);
5274 return SimplifiedValue;
5281 case OMPRTL___kmpc_is_spmd_exec_mode:
5284 case OMPRTL___kmpc_parallel_level:
5287 case OMPRTL___kmpc_get_hardware_num_threads_in_block:
5288 Changed =
Changed | foldKernelFnAttribute(
A,
"omp_target_thread_limit");
5290 case OMPRTL___kmpc_get_hardware_num_blocks:
5303 if (SimplifiedValue && *SimplifiedValue) {
5306 A.deleteAfterManifest(
I);
5309 auto Remark = [&](OptimizationRemark
OR) {
5311 return OR <<
"Replacing OpenMP runtime call "
5313 <<
ore::NV(
"FoldedValue",
C->getZExtValue()) <<
".";
5314 return OR <<
"Replacing OpenMP runtime call "
5319 A.emitRemark<OptimizationRemark>(CB,
"OMP180",
Remark);
5322 << **SimplifiedValue <<
"\n");
5324 Changed = ChangeStatus::CHANGED;
5331 SimplifiedValue =
nullptr;
5332 return AAFoldRuntimeCall::indicatePessimisticFixpoint();
5338 std::optional<Value *> SimplifiedValueBefore = SimplifiedValue;
5340 unsigned AssumedSPMDCount = 0, KnownSPMDCount = 0;
5341 unsigned AssumedNonSPMDCount = 0, KnownNonSPMDCount = 0;
5342 auto *CallerKernelInfoAA =
A.getAAFor<AAKernelInfo>(
5345 if (!CallerKernelInfoAA ||
5346 !CallerKernelInfoAA->ReachingKernelEntries.isValidState())
5347 return indicatePessimisticFixpoint();
5349 for (
Kernel K : CallerKernelInfoAA->ReachingKernelEntries) {
5351 DepClassTy::REQUIRED);
5353 if (!AA || !AA->isValidState()) {
5354 SimplifiedValue =
nullptr;
5355 return indicatePessimisticFixpoint();
5358 if (AA->SPMDCompatibilityTracker.isAssumed()) {
5359 if (AA->SPMDCompatibilityTracker.isAtFixpoint())
5364 if (AA->SPMDCompatibilityTracker.isAtFixpoint())
5365 ++KnownNonSPMDCount;
5367 ++AssumedNonSPMDCount;
5371 if ((AssumedSPMDCount + KnownSPMDCount) &&
5372 (AssumedNonSPMDCount + KnownNonSPMDCount))
5373 return indicatePessimisticFixpoint();
5375 auto &Ctx = getAnchorValue().getContext();
5376 if (KnownSPMDCount || AssumedSPMDCount) {
5377 assert(KnownNonSPMDCount == 0 && AssumedNonSPMDCount == 0 &&
5378 "Expected only SPMD kernels!");
5381 SimplifiedValue = ConstantInt::get(Type::getInt8Ty(Ctx),
true);
5382 }
else if (KnownNonSPMDCount || AssumedNonSPMDCount) {
5383 assert(KnownSPMDCount == 0 && AssumedSPMDCount == 0 &&
5384 "Expected only non-SPMD kernels!");
5387 SimplifiedValue = ConstantInt::get(Type::getInt8Ty(Ctx),
false);
5392 assert(!SimplifiedValue &&
"SimplifiedValue should be none");
5395 return SimplifiedValue == SimplifiedValueBefore ? ChangeStatus::UNCHANGED
5396 : ChangeStatus::CHANGED;
5401 std::optional<Value *> SimplifiedValueBefore = SimplifiedValue;
5403 auto *CallerKernelInfoAA =
A.getAAFor<AAKernelInfo>(
5406 if (!CallerKernelInfoAA ||
5407 !CallerKernelInfoAA->ParallelLevels.isValidState())
5408 return indicatePessimisticFixpoint();
5410 if (!CallerKernelInfoAA->ReachingKernelEntries.isValidState())
5411 return indicatePessimisticFixpoint();
5413 if (CallerKernelInfoAA->ReachingKernelEntries.empty()) {
5414 assert(!SimplifiedValue &&
5415 "SimplifiedValue should keep none at this point");
5416 return ChangeStatus::UNCHANGED;
5419 unsigned AssumedSPMDCount = 0, KnownSPMDCount = 0;
5420 unsigned AssumedNonSPMDCount = 0, KnownNonSPMDCount = 0;
5421 for (
Kernel K : CallerKernelInfoAA->ReachingKernelEntries) {
5423 DepClassTy::REQUIRED);
5424 if (!AA || !AA->SPMDCompatibilityTracker.isValidState())
5425 return indicatePessimisticFixpoint();
5427 if (AA->SPMDCompatibilityTracker.isAssumed()) {
5428 if (AA->SPMDCompatibilityTracker.isAtFixpoint())
5433 if (AA->SPMDCompatibilityTracker.isAtFixpoint())
5434 ++KnownNonSPMDCount;
5436 ++AssumedNonSPMDCount;
5440 if ((AssumedSPMDCount + KnownSPMDCount) &&
5441 (AssumedNonSPMDCount + KnownNonSPMDCount))
5442 return indicatePessimisticFixpoint();
5444 auto &Ctx = getAnchorValue().getContext();
5448 if (AssumedSPMDCount || KnownSPMDCount) {
5449 assert(KnownNonSPMDCount == 0 && AssumedNonSPMDCount == 0 &&
5450 "Expected only SPMD kernels!");
5451 SimplifiedValue = ConstantInt::get(Type::getInt8Ty(Ctx), 1);
5453 assert(KnownSPMDCount == 0 && AssumedSPMDCount == 0 &&
5454 "Expected only non-SPMD kernels!");
5455 SimplifiedValue = ConstantInt::get(Type::getInt8Ty(Ctx), 0);
5457 return SimplifiedValue == SimplifiedValueBefore ? ChangeStatus::UNCHANGED
5458 : ChangeStatus::CHANGED;
5461 ChangeStatus foldKernelFnAttribute(Attributor &
A, llvm::StringRef Attr) {
5463 int32_t CurrentAttrValue = -1;
5464 std::optional<Value *> SimplifiedValueBefore = SimplifiedValue;
5466 auto *CallerKernelInfoAA =
A.getAAFor<AAKernelInfo>(
5469 if (!CallerKernelInfoAA ||
5470 !CallerKernelInfoAA->ReachingKernelEntries.isValidState())
5471 return indicatePessimisticFixpoint();
5474 for (
Kernel K : CallerKernelInfoAA->ReachingKernelEntries) {
5475 int32_t NextAttrVal =
K->getFnAttributeAsParsedInteger(Attr, -1);
5477 if (NextAttrVal == -1 ||
5478 (CurrentAttrValue != -1 && CurrentAttrValue != NextAttrVal))
5479 return indicatePessimisticFixpoint();
5480 CurrentAttrValue = NextAttrVal;
5483 if (CurrentAttrValue != -1) {
5484 auto &Ctx = getAnchorValue().getContext();
5486 ConstantInt::get(Type::getInt32Ty(Ctx), CurrentAttrValue);
5488 return SimplifiedValue == SimplifiedValueBefore ? ChangeStatus::UNCHANGED
5489 : ChangeStatus::CHANGED;
5495 std::optional<Value *> SimplifiedValue;
5505 auto &RFI = OMPInfoCache.RFIs[RF];
5506 RFI.foreachUse(SCC, [&](Use &U, Function &
F) {
5507 CallInst *CI = OpenMPOpt::getCallIfRegularCall(U, &RFI);
5510 A.getOrCreateAAFor<AAFoldRuntimeCall>(
5512 DepClassTy::NONE,
false,
5518void OpenMPOpt::registerAAs(
bool IsModulePass) {
5528 A.getOrCreateAAFor<AAKernelInfo>(
5530 DepClassTy::NONE,
false,
5534 OMPInformationCache::RuntimeFunctionInfo &InitRFI =
5535 OMPInfoCache.RFIs[OMPRTL___kmpc_target_init];
5536 InitRFI.foreachUse(SCC, CreateKernelInfoCB);
5538 registerFoldRuntimeCall(OMPRTL___kmpc_is_spmd_exec_mode);
5539 registerFoldRuntimeCall(OMPRTL___kmpc_parallel_level);
5540 registerFoldRuntimeCall(OMPRTL___kmpc_get_hardware_num_threads_in_block);
5541 registerFoldRuntimeCall(OMPRTL___kmpc_get_hardware_num_blocks);
5546 for (
int Idx = 0; Idx < OMPInfoCache.ICVs.size() - 1; ++Idx) {
5549 auto &GetterRFI = OMPInfoCache.RFIs[ICVInfo.Getter];
5552 CallInst *CI = OpenMPOpt::getCallIfRegularCall(U, &GetterRFI);
5559 A.getOrCreateAAFor<AAICVTracker>(CBPos);
5563 GetterRFI.foreachUse(SCC, CreateAA);
5572 for (
auto *
F : SCC) {
5573 if (
F->isDeclaration())
5579 if (
F->hasLocalLinkage()) {
5581 const auto *CB = dyn_cast<CallBase>(U.getUser());
5582 return CB && CB->isCallee(&U) &&
5583 A.isRunOn(const_cast<Function *>(CB->getCaller()));
5587 registerAAsForFunction(
A, *
F);
5591void OpenMPOpt::registerAAsForFunction(Attributor &
A,
const Function &
F) {
5592 auto &OMPInfoCache =
static_cast<OMPInformationCache &
>(
A.getInfoCache());
5595 A.getOrCreateAAFor<AAExecutionDomain>(FPos);
5596 if (
F.hasFnAttribute(Attribute::Convergent))
5597 A.getOrCreateAAFor<AANonConvergent>(FPos);
5599 bool FunctionUsesSharedAlloc =
false;
5601 const OMPInformationCache::RuntimeFunctionInfo::UseVector *SharedAllocUses =
5602 OMPInfoCache.RFIs[OMPRTL___kmpc_alloc_shared].getUseVector(
5604 FunctionUsesSharedAlloc = SharedAllocUses && !SharedAllocUses->
empty();
5606 bool HasHeapToStackCandidate =
false;
5607 const TargetLibraryInfo *TLI =
nullptr;
5611 bool UsedAssumedInformation =
false;
5614 A.getOrCreateAAFor<AAAddressSpace>(
5621 TLI =
A.getInfoCache().getTargetLibraryInfoForFunction(
F);
5622 HasHeapToStackCandidate =
5626 A.getOrCreateAAFor<AAIndirectCallInfo>(
5631 A.getOrCreateAAFor<AAAddressSpace>(
5640 if (
II->getIntrinsicID() == Intrinsic::assume) {
5641 A.getOrCreateAAFor<AAPotentialValues>(
5648 if (FunctionUsesSharedAlloc)
5649 A.getOrCreateAAFor<AAHeapToShared>(FPos);
5650 if (HasHeapToStackCandidate)
5651 A.getOrCreateAAFor<AAHeapToStack>(FPos);
5654const char AAICVTracker::ID = 0;
5655const char AAKernelInfo::ID = 0;
5657const char AAHeapToShared::ID = 0;
5658const char AAFoldRuntimeCall::ID = 0;
5660AAICVTracker &AAICVTracker::createForPosition(
const IRPosition &IRP,
5662 AAICVTracker *AA =
nullptr;
5670 AA =
new (
A.Allocator) AAICVTrackerFunctionReturned(IRP,
A);
5673 AA =
new (
A.Allocator) AAICVTrackerCallSiteReturned(IRP,
A);
5676 AA =
new (
A.Allocator) AAICVTrackerCallSite(IRP,
A);
5679 AA =
new (
A.Allocator) AAICVTrackerFunction(IRP,
A);
5688 AAExecutionDomainFunction *
AA =
nullptr;
5698 "AAExecutionDomain can only be created for function position!");
5700 AA =
new (
A.Allocator) AAExecutionDomainFunction(IRP,
A);
5707AAHeapToShared &AAHeapToShared::createForPosition(
const IRPosition &IRP,
5709 AAHeapToSharedFunction *
AA =
nullptr;
5719 "AAHeapToShared can only be created for function position!");
5721 AA =
new (
A.Allocator) AAHeapToSharedFunction(IRP,
A);
5728AAKernelInfo &AAKernelInfo::createForPosition(
const IRPosition &IRP,
5730 AAKernelInfo *AA =
nullptr;
5740 AA =
new (
A.Allocator) AAKernelInfoCallSite(IRP,
A);
5743 AA =
new (
A.Allocator) AAKernelInfoFunction(IRP,
A);
5750AAFoldRuntimeCall &AAFoldRuntimeCall::createForPosition(
const IRPosition &IRP,
5752 AAFoldRuntimeCall *AA =
nullptr;
5761 llvm_unreachable(
"KernelInfo can only be created for call site position!");
5763 AA =
new (
A.Allocator) AAFoldRuntimeCallCallSiteReturned(IRP,
A);
5784 if (Kernels.contains(&
F))
5786 return !
F.use_empty();
5793 return ORA <<
"Could not internalize function. "
5794 <<
"Some optimizations may not be possible. [OMP140]";
5806 if (!
F.isDeclaration() && !Kernels.contains(&
F) && IsCalled(
F) &&
5810 }
else if (!
F.hasLocalLinkage() && !
F.hasFnAttribute(Attribute::Cold)) {
5823 if (!
F.isDeclaration() && !InternalizedMap.
lookup(&
F)) {
5825 Functions.insert(&
F);
5843 OMPInformationCache InfoCache(M, AG, Allocator,
nullptr, PostLink);
5845 unsigned MaxFixpointIterations =
5857 return F.hasFnAttribute(
"kernel");
5862 OpenMPOpt OMPOpt(SCC, CGUpdater, OREGetter, InfoCache,
A);
5868 if (!
F.isDeclaration() && !Kernels.contains(&
F) &&
5869 !
F.hasFnAttribute(Attribute::NoInline))
5870 F.addFnAttr(Attribute::AlwaysInline);
5900 Module &M = *
C.begin()->getFunction().getParent();
5922 OMPInformationCache InfoCache(*(Functions.back()->getParent()), AG, Allocator,
5923 &Functions, PostLink);
5925 unsigned MaxFixpointIterations =
5939 OpenMPOpt OMPOpt(SCC, CGUpdater, OREGetter, InfoCache,
A);
5940 bool Changed = OMPOpt.run(
false);
5959 if (
F.hasKernelCallingConv()) {
5964 ++NumOpenMPTargetRegionKernels;
5967 ++NumNonOpenMPTargetRegionKernels;
5974 Metadata *MD = M.getModuleFlag(
"openmp");
5982 Metadata *MD = M.getModuleFlag(
"openmp-device");
assert(UImm &&(UImm !=~static_cast< T >(0)) &&"Invalid immediate!")
amdgpu next use AMDGPU Next Use Analysis Printer
MachineBasicBlock MachineBasicBlock::iterator DebugLoc DL
Expand Atomic instructions
static cl::opt< unsigned > SetFixpointIterations("attributor-max-iterations", cl::Hidden, cl::desc("Maximal number of fixpoint iterations."), cl::init(32))
static const Function * getParent(const Value *V)
static GCRegistry::Add< ErlangGC > A("erlang", "erlang-compatible garbage collector")
static GCRegistry::Add< CoreCLRGC > E("coreclr", "CoreCLR-compatible GC")
This file provides interfaces used to manipulate a call graph, regardless of whether it is an "old style" Call...
This file provides interfaces used to build and manipulate a call graph, which is a very useful tool ...
This file contains the declarations for the subclasses of Constant, which represent the different fla...
This file defines the DenseSet and SmallDenseSet classes.
This file defines an array type that can be indexed using scoped enum values.
static void emitRemark(const Function &F, OptimizationRemarkEmitter &ORE, bool Skip)
Loop::LoopBounds::Direction Direction
Machine Check Debug Module
This file provides utility analysis objects describing memory locations.
uint64_t IntrinsicInst * II
This file defines constants and helpers used when dealing with OpenMP.
This file defines constants that will be used by both host and device compilation.
static constexpr auto TAG
static cl::opt< bool > HideMemoryTransferLatency("openmp-hide-memory-transfer-latency", cl::desc("[WIP] Tries to hide the latency of host to device memory" " transfers"), cl::Hidden, cl::init(false))
static cl::opt< bool > DisableOpenMPOptStateMachineRewrite("openmp-opt-disable-state-machine-rewrite", cl::desc("Disable OpenMP optimizations that replace the state machine."), cl::Hidden, cl::init(false))
static cl::opt< bool > EnableParallelRegionMerging("openmp-opt-enable-merging", cl::desc("Enable the OpenMP region merging optimization."), cl::Hidden, cl::init(false))
static cl::opt< bool > PrintModuleAfterOptimizations("openmp-opt-print-module-after", cl::desc("Print the current module after OpenMP optimizations."), cl::Hidden, cl::init(false))
#define KERNEL_ENVIRONMENT_CONFIGURATION_GETTER(MEMBER)
#define KERNEL_ENVIRONMENT_CONFIGURATION_IDX(MEMBER, IDX)
#define KERNEL_ENVIRONMENT_CONFIGURATION_SETTER(MEMBER)
static cl::opt< bool > PrintOpenMPKernels("openmp-print-gpu-kernels", cl::init(false), cl::Hidden)
static cl::opt< bool > DisableOpenMPOptFolding("openmp-opt-disable-folding", cl::desc("Disable OpenMP optimizations involving folding."), cl::Hidden, cl::init(false))
static cl::opt< bool > PrintModuleBeforeOptimizations("openmp-opt-print-module-before", cl::desc("Print the current module before OpenMP optimizations."), cl::Hidden, cl::init(false))
static cl::opt< unsigned > SetFixpointIterations("openmp-opt-max-iterations", cl::Hidden, cl::desc("Maximal number of attributor iterations."), cl::init(256))
static cl::opt< bool > DisableInternalization("openmp-opt-disable-internalization", cl::desc("Disable function internalization."), cl::Hidden, cl::init(false))
static cl::opt< bool > PrintICVValues("openmp-print-icv-values", cl::init(false), cl::Hidden)
static cl::opt< bool > DisableOpenMPOptimizations("openmp-opt-disable", cl::desc("Disable OpenMP specific optimizations."), cl::Hidden, cl::init(false))
static cl::opt< unsigned > SharedMemoryLimit("openmp-opt-shared-limit", cl::Hidden, cl::desc("Maximum amount of shared memory to use."), cl::init(std::numeric_limits< unsigned >::max()))
static cl::opt< bool > EnableVerboseRemarks("openmp-opt-verbose-remarks", cl::desc("Enables more verbose remarks."), cl::Hidden, cl::init(false))
static cl::opt< bool > DisableOpenMPOptDeglobalization("openmp-opt-disable-deglobalization", cl::desc("Disable OpenMP optimizations involving deglobalization."), cl::Hidden, cl::init(false))
static cl::opt< bool > DisableOpenMPOptBarrierElimination("openmp-opt-disable-barrier-elimination", cl::desc("Disable OpenMP optimizations that eliminate barriers."), cl::Hidden, cl::init(false))
static cl::opt< bool > DeduceICVValues("openmp-deduce-icv-values", cl::init(false), cl::Hidden)
#define KERNEL_ENVIRONMENT_IDX(MEMBER, IDX)
#define KERNEL_ENVIRONMENT_GETTER(MEMBER, RETURNTYPE)
static cl::opt< bool > DisableOpenMPOptSPMDization("openmp-opt-disable-spmdization", cl::desc("Disable OpenMP optimizations involving SPMD-ization."), cl::Hidden, cl::init(false))
static cl::opt< bool > AlwaysInlineDeviceFunctions("openmp-opt-inline-device", cl::desc("Inline all applicable functions on the device."), cl::Hidden, cl::init(false))
FunctionAnalysisManager FAM
This file builds on the ADT/GraphTraits.h file to build a generic graph post order iterator.
static StringRef getName(Value *V)
Remove Loads Into Fake Uses
std::pair< BasicBlock *, BasicBlock * > Edge
static bool contains(SmallPtrSetImpl< ConstantExpr * > &Cache, ConstantExpr *Expr, Constant *C)
This file implements a set that has insertion order iteration characteristics.
This file defines the SmallPtrSet class.
This file defines the SmallVector class.
This file defines the 'Statistic' class, which is designed to be an easy way to expose various metric...
#define STATISTIC(VARNAME, DESC)
static const int BlockSize
static void initialize(TargetLibraryInfoImpl &TLI, const Triple &T, const llvm::StringTable &StandardNames, VectorLibrary VecLib)
Initialize the set of available library functions based on the specified target triple.
static cl::opt< unsigned > MaxThreads("xcore-max-threads", cl::Optional, cl::desc("Maximum number of threads (for emulation thread-local storage)"), cl::Hidden, cl::value_desc("number"), cl::init(8))
PassT::Result & getResult(IRUnitT &IR, ExtraArgTs... ExtraArgs)
Get the result of an analysis pass for a given IR unit.
size_t size() const
Get the array size.
iterator begin()
Instruction iterator methods.
LLVM_ABI const_iterator getFirstInsertionPt() const
Returns an iterator to the first instruction in this block that is suitable for inserting a non-PHI i...
LLVM_ABI BasicBlock * splitBasicBlock(iterator I, const Twine &BBName="")
Split the basic block into two basic blocks at the specified instruction.
const Function * getParent() const
Return the enclosing method, or null if none.
reverse_iterator rbegin()
static BasicBlock * Create(LLVMContext &Context, const Twine &Name="", Function *Parent=nullptr, BasicBlock *InsertBefore=nullptr)
Creates a new BasicBlock.
LLVM_ABI const BasicBlock * getUniqueSuccessor() const
Return the successor of this block if it has a unique successor.
InstListType::reverse_iterator reverse_iterator
const Instruction * getTerminator() const LLVM_READONLY
Returns the terminator instruction; assumes that the block is well-formed.
Base class for all callable instructions (InvokeInst and CallInst) Holds everything related to callin...
void setCallingConv(CallingConv::ID CC)
Function * getCalledFunction() const
Returns the function called, or null if this is an indirect function invocation or the function signa...
bool doesNotAccessMemory(unsigned OpNo) const
LLVM_ABI bool isIndirectCall() const
Return true if the callsite is an indirect call.
bool isCallee(Value::const_user_iterator UI) const
Determine whether the passed iterator points to the callee operand's Use.
Value * getArgOperand(unsigned i) const
void setArgOperand(unsigned i, Value *v)
iterator_range< User::op_iterator > args()
Iteration adapter for range-for loops.
unsigned getArgOperandNo(const Use *U) const
Given a use for an arg operand, get the arg operand number that corresponds to it.
unsigned arg_size() const
AttributeList getAttributes() const
Return the attributes for this call.
void addParamAttr(unsigned ArgNo, Attribute::AttrKind Kind)
Adds the attribute to the indicated argument.
bool isArgOperand(const Use *U) const
bool hasOperandBundles() const
Return true if this User has any operand bundles.
LLVM_ABI Function * getCaller()
Helper to get the caller (the parent function).
Wrapper to unify "old style" CallGraph and "new style" LazyCallGraph.
void initialize(LazyCallGraph &LCG, LazyCallGraph::SCC &SCC, CGSCCAnalysisManager &AM, CGSCCUpdateResult &UR)
Initializers for usage outside of a CGSCC pass, inside a CGSCC pass in the old and new pass manager (...
static CallInst * Create(FunctionType *Ty, Value *F, const Twine &NameStr="", InsertPosition InsertBefore=nullptr)
@ ICMP_SLT
signed less than
static CondBrInst * Create(Value *Cond, BasicBlock *IfTrue, BasicBlock *IfFalse, InsertPosition InsertBefore=nullptr)
static LLVM_ABI Constant * getPointerCast(Constant *C, Type *Ty)
Create a BitCast, AddrSpaceCast, or a PtrToInt cast constant expression.
static LLVM_ABI Constant * getPointerBitCastOrAddrSpaceCast(Constant *C, Type *Ty)
Create a BitCast or AddrSpaceCast for a pointer type depending on the address space.
This is the shared class of boolean and integer constants.
IntegerType * getIntegerType() const
Variant of the getType() method to always return an IntegerType, which reduces the amount of casting ...
static LLVM_ABI ConstantInt * getTrue(LLVMContext &Context)
bool isZero() const
This is just a convenience method to make client code smaller for a common code.
int64_t getSExtValue() const
Return the constant as a 64-bit integer value after it has been sign extended as appropriate for the ...
This is an important base class in LLVM.
static LLVM_ABI Constant * getNullValue(Type *Ty)
Constructor to create a '0' constant of arbitrary type.
ValueT lookup(const_arg_type_t< KeyT > Val) const
Return the entry for the specified key, or a default constructed value if no such entry exists.
std::pair< iterator, bool > insert(const std::pair< KeyT, ValueT > &KV)
LLVM_ABI Instruction * findNearestCommonDominator(Instruction *I1, Instruction *I2) const
Find the nearest instruction I that dominates both I1 and I2, in the sense that a result produced bef...
static ErrorSuccess success()
Create a success value.
AtomicOrdering getOrdering() const
Returns the ordering constraint of this fence instruction.
A proxy from a FunctionAnalysisManager to an SCC.
const BasicBlock & getEntryBlock() const
const BasicBlock & front() const
LLVMContext & getContext() const
getContext - Return a reference to the LLVMContext associated with this function.
Argument * getArg(unsigned i) const
bool hasFnAttribute(Attribute::AttrKind Kind) const
Return true if the function has the attribute.
LLVM_ABI bool isDeclaration() const
Return true if the primary definition of this global value is outside of the current translation unit...
bool hasLocalLinkage() const
Module * getParent()
Get the module that this global value is contained inside of...
@ PrivateLinkage
Like Internal, but omit from symbol table.
@ InternalLinkage
Rename collisions when linking (static functions).
const Constant * getInitializer() const
getInitializer - Return the initializer for this global variable.
LLVM_ABI void setInitializer(Constant *InitVal)
setInitializer - Sets the initializer for this global variable, removing any existing initializer if ...
BasicBlock * getBlock() const
CondBrInst * CreateCondBr(Value *Cond, BasicBlock *True, BasicBlock *False, MDNode *BranchWeights=nullptr, MDNode *Unpredictable=nullptr)
Create a conditional 'br Cond, TrueDest, FalseDest' instruction.
CallInst * CreateCall(FunctionType *FTy, Value *Callee, ArrayRef< Value * > Args={}, const Twine &Name="", MDNode *FPMathTag=nullptr)
Value * CreateIsNull(Value *Arg, const Twine &Name="")
Return a boolean value testing if Arg == 0.
LLVM_ABI bool isLifetimeStartOrEnd() const LLVM_READONLY
Return true if the instruction is a llvm.lifetime.start or llvm.lifetime.end marker.
LLVM_ABI bool mayWriteToMemory() const LLVM_READONLY
Return true if this instruction may modify memory.
const DebugLoc & getDebugLoc() const
Return the debug location for this node as a DebugLoc.
LLVM_ABI const Module * getModule() const
Return the module owning the function this instruction belongs to or nullptr if the function does not...
LLVM_ABI InstListType::iterator eraseFromParent()
This method unlinks 'this' from the containing basic block and deletes it.
LLVM_ABI const Function * getFunction() const
Return the function this instruction belongs to.
LLVM_ABI bool mayHaveSideEffects() const LLVM_READONLY
Return true if the instruction may have side effects.
LLVM_ABI bool mayReadFromMemory() const LLVM_READONLY
Return true if this instruction may read memory.
void setDebugLoc(DebugLoc Loc)
Set the debug location information for this instruction.
LLVM_ABI void setSuccessor(unsigned Idx, BasicBlock *BB)
Update the specified successor to point at the provided block.
A node in the call graph.
An SCC of the call graph.
A lazily constructed view of the call graph of a module.
LLVM_ABI void eraseFromParent()
This method unlinks 'this' from the containing function and deletes it.
LLVM_ABI StringRef getName() const
Return the name of the corresponding LLVM basic block, or an empty string.
A Module instance is used to store all the information related to an LLVM module.
const Triple & getTargetTriple() const
Get the target triple which is a string describing the target host.
LLVM_ABI Constant * getOrCreateIdent(Constant *SrcLocStr, uint32_t SrcLocStrSize, omp::IdentFlag Flags=omp::IdentFlag(0), unsigned Reserve2Flags=0)
Return an ident_t* encoding the source location SrcLocStr and Flags.
LLVM_ABI FunctionCallee getOrCreateRuntimeFunction(Module &M, omp::RuntimeFunction FnID)
Return the function declaration for the runtime function with FnID.
static LLVM_ABI std::pair< int32_t, int32_t > readThreadBoundsForKernel(const Triple &T, Function &Kernel)
}
LLVM_ABI Constant * getOrCreateSrcLocStr(StringRef LocStr, uint32_t &SrcLocStrSize)
Return the (LLVM-IR) string describing the source location LocStr.
IRBuilder<>::InsertPoint InsertPointTy
Type used throughout for insertion points.
IRBuilder Builder
The LLVM-IR Builder used to create IR.
static LLVM_ABI std::pair< int32_t, int32_t > readTeamBoundsForKernel(const Triple &T, Function &Kernel)
Read/write the bounds on teams for Kernel.
bool updateToLocation(const LocationDescription &Loc)
Update the internal location to Loc.
LLVM_ABI PreservedAnalyses run(LazyCallGraph::SCC &C, CGSCCAnalysisManager &AM, LazyCallGraph &CG, CGSCCUpdateResult &UR)
LLVM_ABI PreservedAnalyses run(Module &M, ModuleAnalysisManager &AM)
static LLVM_ABI PoisonValue * get(Type *T)
Static factory methods - Return a 'poison' object of the specified type.
A set of analyses that are preserved following a run of a transformation pass.
static PreservedAnalyses none()
Convenience factory function for the empty preserved set.
static PreservedAnalyses all()
Construct a special preserved set that preserves all passes.
static ReturnInst * Create(LLVMContext &C, Value *retVal=nullptr, InsertPosition InsertBefore=nullptr)
A vector that has set insertion semantics.
size_type size() const
Determine the number of elements in the SetVector.
size_type count(const_arg_type key) const
Count the number of elements of a given key in the SetVector.
bool insert(const value_type &X)
Insert a new element into the SetVector.
size_type count(ConstPtrType Ptr) const
count - Return 1 if the specified pointer is in the set, 0 otherwise.
std::pair< iterator, bool > insert(PtrType Ptr)
Inserts Ptr if and only if there is no element in the container equal to Ptr.
SmallPtrSet - This class implements a set which is optimized for holding SmallSize or fewer elements.
reference emplace_back(ArgTypes &&... Args)
void append(ItTy in_start, ItTy in_end)
Add the specified range to the end of the SmallVector.
void push_back(const T &Elt)
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small.
bool starts_with(StringRef Prefix) const
Check if this string starts with the given Prefix.
Triple - Helper class for working with autoconf configuration names.
static LLVM_ABI IntegerType * getInt32Ty(LLVMContext &C)
LLVM_ABI unsigned getPointerAddressSpace() const
Get the address space of this pointer or pointer vector type.
static UncondBrInst * Create(BasicBlock *Target, InsertPosition InsertBefore=nullptr)
static LLVM_ABI UndefValue * get(Type *T)
Static factory methods - Return an 'undef' object of the specified type.
A Use represents the edge between a Value definition and its users.
LLVM_ABI bool replaceUsesOfWith(Value *From, Value *To)
Replace uses of one Value with another.
Type * getType() const
All values are typed, get the type of this value.
LLVM_ABI void setName(const Twine &Name)
Change the name of the value.
bool hasOneUse() const
Return true if there is exactly one use of this value.
LLVM_ABI void replaceAllUsesWith(Value *V)
Change all uses of this to point to a new Value.
iterator_range< user_iterator > users()
LLVM_ABI const Value * stripPointerCasts() const
Strip off pointer casts, all-zero GEPs and address space casts.
LLVM_ABI StringRef getName() const
Return a constant reference to the value's name.
const ParentTy * getParent() const
self_iterator getIterator()
NodeTy * getNextNode()
Get the next node, or nullptr for the list tail.
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
GlobalVariable * getKernelEnvironementGVFromKernelInitCB(CallBase *KernelInitCB)
ConstantStruct * getKernelEnvironementFromKernelInitCB(CallBase *KernelInitCB)
Abstract Attribute helper functions.
LLVM_ABI bool isValidAtPosition(const ValueAndContext &VAC, InformationCache &InfoCache)
Return true if the value of VAC is valid at the position of VAC, that is a constant,...
LLVM_ABI bool isPotentiallyAffectedByBarrier(Attributor &A, const Instruction &I, const AbstractAttribute &QueryingAA)
Return true if I is potentially affected by a barrier.
LLVM_ABI bool isNoSyncInst(Attributor &A, const Instruction &I, const AbstractAttribute &QueryingAA)
Return true if I is a nosync instruction.
constexpr char Args[]
Key for Kernel::Metadata::mArgs.
E & operator^=(E &LHS, E RHS)
unsigned ID
LLVM IR allows to use arbitrary numbers as calling convention identifiers.
@ C
The default llvm calling convention, compatible with C.
@ BasicBlock
Various leaf nodes.
initializer< Ty > init(const Ty &Val)
DXILDebugInfoMap run(Module &M)
constexpr uint64_t PointerSize
aarch64 pointer size.
LLVM_ABI bool isOpenMPDevice(Module &M)
Helper to determine if M is an OpenMP target offloading device module.
LLVM_ABI bool containsOpenMP(Module &M)
Helper to determine if M contains OpenMP.
InternalControlVar
IDs for all Internal Control Variables (ICVs).
RuntimeFunction
IDs for all omp runtime library (RTL) functions.
LLVM_ABI KernelSet getDeviceKernels(Module &M)
Get OpenMP device kernels in M.
@ OMP_TGT_EXEC_MODE_GENERIC_SPMD
@ OMP_TGT_EXEC_MODE_GENERIC
SetVector< Kernel > KernelSet
Set of kernels in the module.
Function * Kernel
Summary of a kernel (=entry point for target offloading).
LLVM_ABI bool isOpenMPKernel(Function &Fn)
Return true iff Fn is an OpenMP GPU kernel; Fn has the "kernel" attribute.
DiagnosticInfoOptimizationBase::Argument NV
NodeAddr< UseNode * > Use
friend class Instruction
Iterator for Instructions in a `BasicBlock`.
LLVM_ABI iterator begin() const
This is an optimization pass for GlobalISel generic memory operations.
auto drop_begin(T &&RangeOrContainer, size_t N=1)
Return a range covering RangeOrContainer with the first N elements excluded.
bool all_of(R &&range, UnaryPredicate P)
Provide wrappers to std::all_of which take ranges instead of having to pass begin/end explicitly.
auto size(R &&Range, std::enable_if_t< std::is_base_of< std::random_access_iterator_tag, typename std::iterator_traits< decltype(Range.begin())>::iterator_category >::value, void > *=nullptr)
Get the size of a range.
bool succ_empty(const Instruction *I)
decltype(auto) dyn_cast(const From &Val)
dyn_cast<X> - Return the argument parameter cast to the specified type.
LLVM_ABI bool isRemovableAlloc(const CallBase *V, const TargetLibraryInfo *TLI)
Return true if this is a call to an allocation function that does not have side effects that we are r...
bool operator!=(uint64_t V1, const APInt &V2)
constexpr from_range_t from_range
Value * GetPointerBaseWithConstantOffset(Value *Ptr, int64_t &Offset, const DataLayout &DL, bool AllowNonInbounds=true)
Analyze the specified pointer to see if it can be expressed as a base pointer plus a constant offset.
InnerAnalysisManagerProxy< FunctionAnalysisManager, Module > FunctionAnalysisManagerModuleProxy
Provide the FunctionAnalysisManager to Module proxy.
bool operator==(const AddressRangeValuePair &LHS, const AddressRangeValuePair &RHS)
RelativeUniformCounterPtr ValuesPtrExpr VTableAddr Value
AnalysisManager< LazyCallGraph::SCC, LazyCallGraph & > CGSCCAnalysisManager
The CGSCC analysis manager.
@ ThinLTOPostLink
ThinLTO postlink (backend compile) phase.
@ FullLTOPostLink
Full LTO postlink (backend compile) phase.
@ ThinLTOPreLink
ThinLTO prelink (summary) phase.
auto dyn_cast_or_null(const Y &Val)
LLVM_ABI raw_ostream & dbgs()
dbgs() - This returns a reference to a raw_ostream for debugging messages.
class LLVM_GSL_OWNER SmallVector
Forward declaration of SmallVector so that calculateSmallVectorDefaultInlinedElements can reference s...
bool isa(const From &Val)
isa<X> - Return true if the parameter to the template is an instance of one of the template type argu...
MutableArrayRef(T &OneElt) -> MutableArrayRef< T >
void cantFail(Error Err, const char *Msg=nullptr)
Report a fatal error if Err is a failure value.
IRBuilder(LLVMContext &, FolderTy, InserterTy, MDNode *, ArrayRef< OperandBundleDef >) -> IRBuilder< FolderTy, InserterTy >
bool operator&=(SparseBitVector< ElementSize > *LHS, const SparseBitVector< ElementSize > &RHS)
LLVM_ABI BasicBlock * SplitBlock(BasicBlock *Old, BasicBlock::iterator SplitPt, DominatorTree *DT, LoopInfo *LI=nullptr, MemorySSAUpdater *MSSAU=nullptr, const Twine &BBName="")
Split the specified block at the specified instruction.
auto count(R &&Range, const E &Element)
Wrapper function around std::count to count the number of times an element Element occurs in the give...
ArrayRef(const T &OneElt) -> ArrayRef< T >
LLVM_ABI Value * getFreedOperand(const CallBase *CB, const TargetLibraryInfo *TLI)
If this is a call to a free function, return the freed operand.
std::string toString(const APInt &I, unsigned Radix, bool Signed, bool formatAsCLiteral=false, bool UpperCase=true, bool InsertSeparators=false)
decltype(auto) cast(const From &Val)
cast<X> - Return the argument parameter cast to the specified type.
auto predecessors(const MachineBasicBlock *BB)
LLVM_ABI Constant * ConstantFoldInsertValueInstruction(Constant *Agg, Constant *Val, ArrayRef< unsigned > Idxs)
Attempt to constant fold an insertvalue instruction with the specified operands and indices.
@ OPTIONAL
The target may be valid if the source is not.
AnalysisManager< Function > FunctionAnalysisManager
Convenience typedef for the Function analysis manager.
BumpPtrAllocatorImpl<> BumpPtrAllocator
The standard BumpPtrAllocator which just uses the default template parameters.
LLVM_ABI const Value * getUnderlyingObject(const Value *V, unsigned MaxLookup=MaxLookupSearchDepth)
This method strips off any GEP address adjustments, pointer casts or llvm.threadlocal....
AnalysisManager< Module > ModuleAnalysisManager
Convenience typedef for the Module analysis manager.
static LLVM_ABI AAExecutionDomain & createForPosition(const IRPosition &IRP, Attributor &A)
Create an abstract attribute view for the position IRP.
AAExecutionDomain(const IRPosition &IRP, Attributor &A)
static LLVM_ABI const char ID
Unique ID (due to the unique address)
AccessKind
Simple enum to distinguish read/write/read-write accesses.
StateType::base_t MemoryLocationsKind
static LLVM_ABI bool isAlignedBarrier(const CallBase &CB, bool ExecutedAligned)
Helper function to determine if CB is an aligned (GPU) barrier.
Base struct for all "concrete attribute" deductions.
virtual const char * getIdAddr() const =0
This function should return the address of the ID of the AbstractAttribute.
An interface to query the internal state of an abstract attribute.
Wrapper for FunctionAnalysisManager.
Configuration for the Attributor.
std::function< void(Attributor &A, const Function &F)> InitializationCallback
Callback function to be invoked on internal functions marked live.
std::optional< unsigned > MaxFixpointIterations
Maximum number of iterations to run until fixpoint.
bool RewriteSignatures
Flag to determine if we rewrite function signatures.
OptimizationRemarkGetter OREGetter
IPOAmendableCBTy IPOAmendableCB
bool IsModulePass
Is the user of the Attributor a module pass or not.
bool DefaultInitializeLiveInternals
Flag to determine if we want to initialize all default AAs for an internal function marked live.
The fixpoint analysis framework that orchestrates the attribute deduction.
static LLVM_ABI bool isInternalizable(Function &F)
Returns true if the function F can be internalized.
std::function< std::optional< Value * >( const IRPosition &, const AbstractAttribute *, bool &)> SimplifictionCallbackTy
Register CB as a simplification callback.
std::function< std::optional< Constant * >( const GlobalVariable &, const AbstractAttribute *, bool &)> GlobalVariableSimplifictionCallbackTy
Register CB as a simplification callback.
std::function< bool(Attributor &, const AbstractAttribute *)> VirtualUseCallbackTy
static LLVM_ABI bool internalizeFunctions(SmallPtrSetImpl< Function * > &FnSet, DenseMap< Function *, Function * > &FnMap)
Make copies of each function in the set FnSet such that the copied version has internal linkage after...
Simple wrapper for a single bit (boolean) state.
Support structure for SCC passes to communicate updates to the call graph back to the CGSCC pass manager...
Helper to describe and deal with positions in the LLVM-IR.
static const IRPosition callsite_returned(const CallBase &CB)
Create a position describing the returned value of CB.
static const IRPosition returned(const Function &F, const CallBaseContext *CBContext=nullptr)
Create a position describing the returned value of F.
static const IRPosition value(const Value &V, const CallBaseContext *CBContext=nullptr)
Create a position describing the value of V.
static const IRPosition inst(const Instruction &I, const CallBaseContext *CBContext=nullptr)
Create a position describing the instruction I.
@ IRP_ARGUMENT
An attribute for a function argument.
@ IRP_RETURNED
An attribute for the function return value.
@ IRP_CALL_SITE
An attribute for a call site (function scope).
@ IRP_CALL_SITE_RETURNED
An attribute for a call site return value.
@ IRP_FUNCTION
An attribute for a function (scope).
@ IRP_FLOAT
A position that is not associated with a spot suitable for attributes.
@ IRP_CALL_SITE_ARGUMENT
An attribute for a call site argument.
@ IRP_INVALID
An invalid position.
static const IRPosition function(const Function &F, const CallBaseContext *CBContext=nullptr)
Create a position describing the function scope of F.
Kind getPositionKind() const
Return the associated position kind.
static const IRPosition callsite_function(const CallBase &CB)
Create a position describing the function scope of CB.
Defines various target-specific GPU grid values that must be consistent between host RTL (plugin),...