50#include "llvm/IR/IntrinsicsAMDGPU.h"
51#include "llvm/IR/IntrinsicsNVPTX.h"
67#define DEBUG_TYPE "openmp-opt"
70 "openmp-opt-disable",
cl::desc(
"Disable OpenMP specific optimizations."),
74 "openmp-opt-enable-merging",
80 cl::desc(
"Disable function internalization."),
91 "openmp-hide-memory-transfer-latency",
92 cl::desc(
"[WIP] Tries to hide the latency of host to device memory"
97 "openmp-opt-disable-deglobalization",
98 cl::desc(
"Disable OpenMP optimizations involving deglobalization."),
102 "openmp-opt-disable-spmdization",
103 cl::desc(
"Disable OpenMP optimizations involving SPMD-ization."),
107 "openmp-opt-disable-folding",
112 "openmp-opt-disable-state-machine-rewrite",
113 cl::desc(
"Disable OpenMP optimizations that replace the state machine."),
117 "openmp-opt-disable-barrier-elimination",
118 cl::desc(
"Disable OpenMP optimizations that eliminate barriers."),
122 "openmp-opt-print-module-after",
123 cl::desc(
"Print the current module after OpenMP optimizations."),
127 "openmp-opt-print-module-before",
128 cl::desc(
"Print the current module before OpenMP optimizations."),
132 "openmp-opt-inline-device",
143 cl::desc(
"Maximal number of attributor iterations."),
148 cl::desc(
"Maximum amount of shared memory to use."),
149 cl::init(std::numeric_limits<unsigned>::max()));
152 "Number of OpenMP runtime calls deduplicated");
154 "Number of OpenMP parallel regions deleted");
156 "Number of OpenMP runtime functions identified");
158 "Number of OpenMP runtime function uses identified");
160 "Number of OpenMP target region entry points (=kernels) identified");
162 "Number of non-OpenMP target region kernels identified");
164 "Number of OpenMP target region entry points (=kernels) executed in "
165 "SPMD-mode instead of generic-mode");
166STATISTIC(NumOpenMPTargetRegionKernelsWithoutStateMachine,
167 "Number of OpenMP target region entry points (=kernels) executed in "
168 "generic-mode without a state machines");
169STATISTIC(NumOpenMPTargetRegionKernelsCustomStateMachineWithFallback,
170 "Number of OpenMP target region entry points (=kernels) executed in "
171 "generic-mode with customized state machines with fallback");
172STATISTIC(NumOpenMPTargetRegionKernelsCustomStateMachineWithoutFallback,
173 "Number of OpenMP target region entry points (=kernels) executed in "
174 "generic-mode with customized state machines without fallback");
176 NumOpenMPParallelRegionsReplacedInGPUStateMachine,
177 "Number of OpenMP parallel regions replaced with ID in GPU state machines");
179 "Number of OpenMP parallel regions merged");
181 "Amount of memory pushed to shared memory");
182STATISTIC(NumBarriersEliminated,
"Number of redundant barriers eliminated");
210#define KERNEL_ENVIRONMENT_IDX(MEMBER, IDX) \
211 constexpr unsigned MEMBER##Idx = IDX;
216#undef KERNEL_ENVIRONMENT_IDX
218#define KERNEL_ENVIRONMENT_CONFIGURATION_IDX(MEMBER, IDX) \
219 constexpr unsigned MEMBER##Idx = IDX;
229#undef KERNEL_ENVIRONMENT_CONFIGURATION_IDX
231#define KERNEL_ENVIRONMENT_GETTER(MEMBER, RETURNTYPE) \
232 RETURNTYPE *get##MEMBER##FromKernelEnvironment(ConstantStruct *KernelEnvC) { \
233 return cast<RETURNTYPE>(KernelEnvC->getAggregateElement(MEMBER##Idx)); \
239#undef KERNEL_ENVIRONMENT_GETTER
241#define KERNEL_ENVIRONMENT_CONFIGURATION_GETTER(MEMBER) \
242 ConstantInt *get##MEMBER##FromKernelEnvironment( \
243 ConstantStruct *KernelEnvC) { \
244 ConstantStruct *ConfigC = \
245 getConfigurationFromKernelEnvironment(KernelEnvC); \
246 return dyn_cast<ConstantInt>(ConfigC->getAggregateElement(MEMBER##Idx)); \
257#undef KERNEL_ENVIRONMENT_CONFIGURATION_GETTER
// NOTE(review): presumably the position of the kernel-environment argument in
// __kmpc_target_init calls — confirm against the runtime function signature.
 261 constexpr int InitKernelEnvironmentArgNo = 0;
276struct AAHeapToShared;
283 OMPInformationCache(
Module &M, AnalysisGetter &AG,
287 OpenMPPostLink(OpenMPPostLink) {
290 const Triple
T(OMPBuilder.M.getTargetTriple());
291 switch (
T.getArch()) {
295 assert(OMPBuilder.Config.IsTargetDevice &&
296 "OpenMP AMDGPU/NVPTX is only prepared to deal with device code.");
297 OMPBuilder.Config.IsGPU =
true;
300 OMPBuilder.Config.IsGPU =
false;
303 OMPBuilder.initialize();
304 initializeRuntimeFunctions(M);
305 initializeInternalControlVars();
309 struct InternalControlVarInfo {
317 StringRef EnvVarName;
323 ConstantInt *InitValue;
336 struct RuntimeFunctionInfo {
357 using UseVector = SmallVector<Use *, 16>;
360 void clearUsesMap() { UsesMap.clear(); }
363 operator bool()
const {
return Declaration; }
366 UseVector &getOrCreateUseVector(Function *
F) {
367 std::shared_ptr<UseVector> &UV = UsesMap[
F];
369 UV = std::make_shared<UseVector>();
375 const UseVector *getUseVector(Function &
F)
const {
376 auto I = UsesMap.find(&
F);
377 if (
I != UsesMap.end())
378 return I->second.get();
383 size_t getNumFunctionsWithUses()
const {
return UsesMap.size(); }
387 size_t getNumArgs()
const {
return ArgumentTypes.size(); }
392 void foreachUse(SmallVectorImpl<Function *> &SCC,
393 function_ref<
bool(Use &, Function &)> CB) {
394 for (Function *
F : SCC)
400 void foreachUse(function_ref<
bool(Use &, Function &)> CB, Function *
F) {
401 SmallVector<unsigned, 8> ToBeDeleted;
405 UseVector &UV = getOrCreateUseVector(
F);
415 while (!ToBeDeleted.
empty()) {
425 DenseMap<Function *, std::shared_ptr<UseVector>> UsesMap;
429 decltype(UsesMap)::iterator
begin() {
return UsesMap.begin(); }
430 decltype(UsesMap)::iterator
end() {
return UsesMap.end(); }
434 OpenMPIRBuilder OMPBuilder;
438 RuntimeFunction::OMPRTL___last>
442 DenseMap<Function *, RuntimeFunction> RuntimeFunctionIDMap;
446 InternalControlVar::ICV___last>
451 void initializeInternalControlVars() {
452#define ICV_RT_SET(_Name, RTL) \
454 auto &ICV = ICVs[_Name]; \
457#define ICV_RT_GET(Name, RTL) \
459 auto &ICV = ICVs[Name]; \
462#define ICV_DATA_ENV(Enum, _Name, _EnvVarName, Init) \
464 auto &ICV = ICVs[Enum]; \
467 ICV.InitKind = Init; \
468 ICV.EnvVarName = _EnvVarName; \
469 switch (ICV.InitKind) { \
470 case ICV_IMPLEMENTATION_DEFINED: \
471 ICV.InitValue = nullptr; \
474 ICV.InitValue = ConstantInt::get( \
475 Type::getInt32Ty(OMPBuilder.Int32->getContext()), 0); \
478 ICV.InitValue = ConstantInt::getFalse(OMPBuilder.Int1->getContext()); \
484#include "llvm/Frontend/OpenMP/OMPKinds.def"
490 static bool declMatchesRTFTypes(Function *
F,
Type *RTFRetType,
497 if (
F->getReturnType() != RTFRetType)
499 if (
F->arg_size() != RTFArgTypes.
size())
502 auto *RTFTyIt = RTFArgTypes.
begin();
503 for (Argument &Arg :
F->args()) {
504 if (Arg.getType() != *RTFTyIt)
514 unsigned collectUses(RuntimeFunctionInfo &RFI,
bool CollectStats =
true) {
515 unsigned NumUses = 0;
516 if (!RFI.Declaration)
518 OMPBuilder.addAttributes(RFI.Kind, *RFI.Declaration);
521 NumOpenMPRuntimeFunctionsIdentified += 1;
522 NumOpenMPRuntimeFunctionUsesIdentified += RFI.Declaration->getNumUses();
526 for (Use &U : RFI.Declaration->uses()) {
528 if (!
CGSCC ||
CGSCC->empty() ||
CGSCC->contains(UserI->getFunction())) {
529 RFI.getOrCreateUseVector(UserI->getFunction()).push_back(&U);
533 RFI.getOrCreateUseVector(
nullptr).push_back(&U);
542 auto &RFI = RFIs[RTF];
544 collectUses(RFI,
false);
548 void recollectUses() {
549 for (
int Idx = 0; Idx < RFIs.size(); ++Idx)
554 void setCallingConvention(FunctionCallee Callee, CallInst *CI) {
569 RuntimeFunctionInfo &RFI = RFIs[Fn];
571 if (!RFI.Declaration || RFI.Declaration->isDeclaration())
579 void initializeRuntimeFunctions(
Module &M) {
582#define OMP_TYPE(VarName, ...) \
583 Type *VarName = OMPBuilder.VarName; \
586#define OMP_ARRAY_TYPE(VarName, ...) \
587 ArrayType *VarName##Ty = OMPBuilder.VarName##Ty; \
589 PointerType *VarName##PtrTy = OMPBuilder.VarName##PtrTy; \
590 (void)VarName##PtrTy;
592#define OMP_FUNCTION_TYPE(VarName, ...) \
593 FunctionType *VarName = OMPBuilder.VarName; \
595 PointerType *VarName##Ptr = OMPBuilder.VarName##Ptr; \
598#define OMP_STRUCT_TYPE(VarName, ...) \
599 StructType *VarName = OMPBuilder.VarName; \
601 PointerType *VarName##Ptr = OMPBuilder.VarName##Ptr; \
604#define OMP_RTL(_Enum, _Name, _IsVarArg, _ReturnType, ...) \
606 SmallVector<Type *, 8> ArgsTypes({__VA_ARGS__}); \
607 Function *F = M.getFunction(_Name); \
608 RTLFunctions.insert(F); \
609 if (declMatchesRTFTypes(F, OMPBuilder._ReturnType, ArgsTypes)) { \
610 RuntimeFunctionIDMap[F] = _Enum; \
611 auto &RFI = RFIs[_Enum]; \
614 RFI.IsVarArg = _IsVarArg; \
615 RFI.ReturnType = OMPBuilder._ReturnType; \
616 RFI.ArgumentTypes = std::move(ArgsTypes); \
617 RFI.Declaration = F; \
618 unsigned NumUses = collectUses(RFI); \
621 dbgs() << TAG << RFI.Name << (RFI.Declaration ? "" : " not") \
623 if (RFI.Declaration) \
624 dbgs() << TAG << "-> got " << NumUses << " uses in " \
625 << RFI.getNumFunctionsWithUses() \
626 << " different functions.\n"; \
630#include "llvm/Frontend/OpenMP/OMPKinds.def"
635 for (Function &
F : M) {
636 for (StringRef Prefix : {
"__kmpc",
"_ZN4ompx",
"omp_"})
637 if (
F.hasFnAttribute(Attribute::NoInline) &&
638 F.getName().starts_with(Prefix) &&
639 !
F.hasFnAttribute(Attribute::OptimizeNone))
640 F.removeFnAttr(Attribute::NoInline);
648 DenseSet<const Function *> RTLFunctions;
651 bool OpenMPPostLink =
false;
654template <
typename Ty,
bool InsertInval
idates = true>
656 bool contains(
const Ty &Elem)
const {
return Set.contains(Elem); }
657 bool insert(
const Ty &Elem) {
658 if (InsertInvalidates)
659 BooleanState::indicatePessimisticFixpoint();
660 return Set.insert(Elem);
663 const Ty &operator[](
int Idx)
const {
return Set[Idx]; }
664 bool operator==(
const BooleanStateWithSetVector &
RHS)
const {
665 return BooleanState::operator==(
RHS) && Set ==
RHS.Set;
667 bool operator!=(
const BooleanStateWithSetVector &
RHS)
const {
668 return !(*
this ==
RHS);
671 bool empty()
const {
return Set.empty(); }
672 size_t size()
const {
return Set.size(); }
675 BooleanStateWithSetVector &
operator^=(
const BooleanStateWithSetVector &
RHS) {
676 BooleanState::operator^=(
RHS);
677 Set.insert_range(
RHS.Set);
686 typename decltype(Set)::iterator
begin() {
return Set.begin(); }
687 typename decltype(Set)::iterator
end() {
return Set.end(); }
688 typename decltype(Set)::const_iterator
begin()
const {
return Set.begin(); }
689 typename decltype(Set)::const_iterator
end()
const {
return Set.end(); }
692template <
typename Ty,
bool InsertInval
idates = true>
693using BooleanStateWithPtrSetVector =
694 BooleanStateWithSetVector<Ty *, InsertInvalidates>;
698 bool IsAtFixpoint =
false;
702 BooleanStateWithPtrSetVector<CallBase,
false>
703 ReachedKnownParallelRegions;
706 BooleanStateWithPtrSetVector<CallBase> ReachedUnknownParallelRegions;
711 BooleanStateWithPtrSetVector<Instruction, false> SPMDCompatibilityTracker;
715 CallBase *KernelInitCB =
nullptr;
719 ConstantStruct *KernelEnvC =
nullptr;
723 CallBase *KernelDeinitCB =
nullptr;
726 bool IsKernelEntry =
false;
729 BooleanStateWithPtrSetVector<Function, false> ReachingKernelEntries;
734 BooleanStateWithSetVector<uint8_t> ParallelLevels;
737 bool NestedParallelism =
false;
742 KernelInfoState() =
default;
743 KernelInfoState(
bool BestState) {
745 indicatePessimisticFixpoint();
749 bool isValidState()
const override {
return true; }
752 bool isAtFixpoint()
const override {
return IsAtFixpoint; }
757 ParallelLevels.indicatePessimisticFixpoint();
758 ReachingKernelEntries.indicatePessimisticFixpoint();
759 SPMDCompatibilityTracker.indicatePessimisticFixpoint();
760 ReachedKnownParallelRegions.indicatePessimisticFixpoint();
761 ReachedUnknownParallelRegions.indicatePessimisticFixpoint();
762 NestedParallelism =
true;
763 return ChangeStatus::CHANGED;
769 ParallelLevels.indicateOptimisticFixpoint();
770 ReachingKernelEntries.indicateOptimisticFixpoint();
771 SPMDCompatibilityTracker.indicateOptimisticFixpoint();
772 ReachedKnownParallelRegions.indicateOptimisticFixpoint();
773 ReachedUnknownParallelRegions.indicateOptimisticFixpoint();
774 return ChangeStatus::UNCHANGED;
778 KernelInfoState &getAssumed() {
return *
this; }
779 const KernelInfoState &getAssumed()
const {
return *
this; }
782 if (SPMDCompatibilityTracker !=
RHS.SPMDCompatibilityTracker)
784 if (ReachedKnownParallelRegions !=
RHS.ReachedKnownParallelRegions)
786 if (ReachedUnknownParallelRegions !=
RHS.ReachedUnknownParallelRegions)
788 if (ReachingKernelEntries !=
RHS.ReachingKernelEntries)
790 if (ParallelLevels !=
RHS.ParallelLevels)
792 if (NestedParallelism !=
RHS.NestedParallelism)
798 bool mayContainParallelRegion() {
799 return !ReachedKnownParallelRegions.empty() ||
800 !ReachedUnknownParallelRegions.empty();
804 static KernelInfoState getBestState() {
return KernelInfoState(
true); }
806 static KernelInfoState getBestState(KernelInfoState &KIS) {
807 return getBestState();
811 static KernelInfoState getWorstState() {
return KernelInfoState(
false); }
814 KernelInfoState
operator^=(
const KernelInfoState &KIS) {
816 if (KIS.KernelInitCB) {
817 if (KernelInitCB && KernelInitCB != KIS.KernelInitCB)
820 KernelInitCB = KIS.KernelInitCB;
822 if (KIS.KernelDeinitCB) {
823 if (KernelDeinitCB && KernelDeinitCB != KIS.KernelDeinitCB)
826 KernelDeinitCB = KIS.KernelDeinitCB;
828 if (KIS.KernelEnvC) {
829 if (KernelEnvC && KernelEnvC != KIS.KernelEnvC)
832 KernelEnvC = KIS.KernelEnvC;
834 SPMDCompatibilityTracker ^= KIS.SPMDCompatibilityTracker;
835 ReachedKnownParallelRegions ^= KIS.ReachedKnownParallelRegions;
836 ReachedUnknownParallelRegions ^= KIS.ReachedUnknownParallelRegions;
837 NestedParallelism |= KIS.NestedParallelism;
841 KernelInfoState
operator&=(
const KernelInfoState &KIS) {
842 return (*
this ^= KIS);
852 AllocaInst *Array =
nullptr;
854 SmallVector<Value *, 8> StoredValues;
856 SmallVector<StoreInst *, 8> LastAccesses;
858 OffloadArray() =
default;
864 bool initialize(AllocaInst &Array, Instruction &Before) {
865 if (!getValues(Array, Before))
868 this->Array = &Array;
 // Argument positions in calls to the __tgt_target_data_*_mapper runtime
 // functions (used e.g. when building the "wait" call's parameter list).
 // NOTE(review): positions assumed from the mapper call signature — confirm.
 872 static const unsigned DeviceIDArgNum = 1;
 873 static const unsigned BasePtrsArgNum = 3;
 874 static const unsigned PtrsArgNum = 4;
 875 static const unsigned SizesArgNum = 5;
881 bool getValues(AllocaInst &Array, Instruction &Before) {
883 const DataLayout &
DL = Array.getDataLayout();
884 std::optional<TypeSize> ArraySize = Array.getAllocationSize(
DL);
885 if (!ArraySize || !ArraySize->isFixed())
888 const uint64_t NumValues = ArraySize->getFixedValue() /
PointerSize;
889 StoredValues.assign(NumValues,
nullptr);
890 LastAccesses.assign(NumValues,
nullptr);
898 for (Instruction &
I : *BB) {
912 if ((uint64_t)Idx < NumValues) {
914 LastAccesses[Idx] = S;
925 const unsigned NumValues = StoredValues.size();
926 for (
unsigned I = 0;
I < NumValues; ++
I) {
927 if (!StoredValues[
I] || !LastAccesses[
I])
937 using OptimizationRemarkGetter =
938 function_ref<OptimizationRemarkEmitter &(
Function *)>;
940 OpenMPOpt(SmallVectorImpl<Function *> &SCC, CallGraphUpdater &CGUpdater,
941 OptimizationRemarkGetter OREGetter,
942 OMPInformationCache &OMPInfoCache, Attributor &A)
943 : M(*(*SCC.
begin())->
getParent()), SCC(SCC), CGUpdater(CGUpdater),
944 OREGetter(OREGetter), OMPInfoCache(OMPInfoCache), A(A) {}
947 bool remarksEnabled() {
948 auto &Ctx = M.getContext();
949 return Ctx.getDiagHandlerPtr()->isAnyRemarkEnabled(
DEBUG_TYPE);
953 bool run(
bool IsModulePass) {
963 Changed |= runAttributor(IsModulePass);
966 OMPInfoCache.recollectUses();
969 Changed |= rewriteDeviceCodeStateMachine();
971 if (remarksEnabled())
972 analysisGlobalization();
979 Changed |= runAttributor(IsModulePass);
982 OMPInfoCache.recollectUses();
984 Changed |= deleteParallelRegions();
987 Changed |= hideMemTransfersLatency();
988 Changed |= deduplicateRuntimeCalls();
990 if (mergeParallelRegions()) {
991 deduplicateRuntimeCalls();
997 if (OMPInfoCache.OpenMPPostLink)
998 Changed |= removeRuntimeSymbols();
1005 void printICVs()
const {
1009 for (Function *
F : SCC) {
1010 for (
auto ICV : ICVs) {
1011 auto ICVInfo = OMPInfoCache.ICVs[ICV];
1012 auto Remark = [&](OptimizationRemarkAnalysis ORA) {
1013 return ORA <<
"OpenMP ICV " <<
ore::NV(
"OpenMPICV", ICVInfo.Name)
1015 << (ICVInfo.InitValue
1016 ?
toString(ICVInfo.InitValue->getValue(), 10,
true)
1017 :
"IMPLEMENTATION_DEFINED");
1026 void printKernels()
const {
1027 for (Function *
F : SCC) {
1031 auto Remark = [&](OptimizationRemarkAnalysis ORA) {
1032 return ORA <<
"OpenMP GPU kernel "
1033 <<
ore::NV(
"OpenMPGPUKernel",
F->getName()) <<
"\n";
1042 static CallInst *getCallIfRegularCall(
1043 Use &U, OMPInformationCache::RuntimeFunctionInfo *RFI =
nullptr) {
1054 static CallInst *getCallIfRegularCall(
1055 Value &V, OMPInformationCache::RuntimeFunctionInfo *RFI =
nullptr) {
1066 bool mergeParallelRegions() {
1067 const unsigned CallbackCalleeOperand = 2;
1068 const unsigned CallbackFirstArgOperand = 3;
1072 OMPInformationCache::RuntimeFunctionInfo &RFI =
1073 OMPInfoCache.RFIs[OMPRTL___kmpc_fork_call];
1075 if (!RFI.Declaration)
1079 OMPInformationCache::RuntimeFunctionInfo UnmergableCallsInfo[] = {
1080 OMPInfoCache.RFIs[OMPRTL___kmpc_push_proc_bind],
1081 OMPInfoCache.RFIs[OMPRTL___kmpc_push_num_threads],
1085 LoopInfo *LI =
nullptr;
1086 DominatorTree *DT =
nullptr;
1088 SmallDenseMap<BasicBlock *, SmallPtrSet<Instruction *, 4>> BB2PRMap;
1090 BasicBlock *StartBB =
nullptr, *EndBB =
nullptr;
1091 auto BodyGenCB = [&](InsertPointTy AllocaIP, InsertPointTy CodeGenIP) {
1092 BasicBlock *CGStartBB = CodeGenIP.getBlock();
1094 SplitBlock(CGStartBB, &*CodeGenIP.getPoint(), DT, LI);
1095 assert(StartBB !=
nullptr &&
"StartBB should not be null");
1097 assert(EndBB !=
nullptr &&
"EndBB should not be null");
1098 EndBB->getTerminator()->setSuccessor(0, CGEndBB);
1102 auto PrivCB = [&](InsertPointTy AllocaIP, InsertPointTy CodeGenIP,
Value &,
1103 Value &Inner,
Value *&ReplacementValue) -> InsertPointTy {
1104 ReplacementValue = &Inner;
1108 auto FiniCB = [&](InsertPointTy CodeGenIP) {
return Error::success(); };
1112 auto CreateSequentialRegion = [&](
Function *OuterFn,
1118 BasicBlock *ParentBB = SeqStartI->getParent();
1120 SplitBlock(ParentBB, SeqEndI->getNextNode(), DT, LI);
1124 SplitBlock(ParentBB, SeqStartI, DT, LI,
nullptr,
"seq.par.merged");
1127 "Expected a different CFG");
1131 auto BodyGenCB = [&](InsertPointTy AllocaIP, InsertPointTy CodeGenIP) {
1132 BasicBlock *CGStartBB = CodeGenIP.getBlock();
1134 SplitBlock(CGStartBB, &*CodeGenIP.getPoint(), DT, LI);
1135 assert(SeqStartBB !=
nullptr &&
"SeqStartBB should not be null");
1137 assert(SeqEndBB !=
nullptr &&
"SeqEndBB should not be null");
1141 auto FiniCB = [&](InsertPointTy CodeGenIP) {
return Error::success(); };
1145 for (Instruction &
I : *SeqStartBB) {
1146 SmallPtrSet<Instruction *, 4> OutsideUsers;
1147 for (User *Usr :
I.users()) {
1155 OutsideUsers.
insert(&UsrI);
1158 if (OutsideUsers.
empty())
1163 const DataLayout &
DL = M.getDataLayout();
1164 AllocaInst *AllocaI =
new AllocaInst(
1165 I.getType(),
DL.getAllocaAddrSpace(),
nullptr,
1170 new StoreInst(&
I, AllocaI, SeqStartBB->getTerminator()->getIterator());
1174 for (Instruction *UsrI : OutsideUsers) {
1175 LoadInst *LoadI =
new LoadInst(
I.getType(), AllocaI,
1176 I.getName() +
".seq.output.load",
1182 OpenMPIRBuilder::LocationDescription Loc(
1183 InsertPointTy(ParentBB, ParentBB->
end()),
DL);
1185 OMPInfoCache.OMPBuilder.createMaster(Loc, BodyGenCB, FiniCB));
1187 OMPInfoCache.OMPBuilder.createBarrier(SeqAfterIP, OMPD_parallel));
1202 auto Merge = [&](
const SmallVectorImpl<CallInst *> &MergableCIs,
1206 assert(MergableCIs.
size() > 1 &&
"Assumed multiple mergable CIs");
1208 auto Remark = [&](OptimizationRemark
OR) {
1209 OR <<
"Parallel region merged with parallel region"
1210 << (MergableCIs.
size() > 2 ?
"s" :
"") <<
" at ";
1213 if (CI != MergableCIs.
back())
1221 Function *OriginalFn = BB->getParent();
1223 <<
" parallel regions in " << OriginalFn->
getName()
1227 EndBB =
SplitBlock(BB, MergableCIs.
back()->getNextNode(), DT, LI);
1229 SplitBlock(EndBB, &*EndBB->getFirstInsertionPt(), DT, LI);
1233 assert(BB->getUniqueSuccessor() == StartBB &&
"Expected a different CFG");
1234 const DebugLoc DL = BB->getTerminator()->getDebugLoc();
1239 for (
auto *It = MergableCIs.
begin(), *End = MergableCIs.
end() - 1;
1248 CreateSequentialRegion(OriginalFn, BB, ForkCI->
getNextNode(),
1252 OpenMPIRBuilder::LocationDescription Loc(InsertPointTy(BB, BB->end()),
1254 IRBuilder<>::InsertPoint AllocaIP(
1260 cantFail(OMPInfoCache.OMPBuilder.createParallel(
1261 Loc, AllocaIP, BodyGenCB, PrivCB, FiniCB,
nullptr,
nullptr,
1262 OMP_PROC_BIND_default,
false));
1266 OMPInfoCache.OMPBuilder.finalize(OriginalFn);
1272 SmallVector<Value *, 8>
Args;
1273 for (
auto *CI : MergableCIs) {
1275 FunctionType *FT = OMPInfoCache.OMPBuilder.ParallelTask;
1279 for (
unsigned U = CallbackFirstArgOperand,
E = CI->
arg_size(); U <
E;
1289 for (
unsigned U = CallbackFirstArgOperand,
E = CI->
arg_size(); U <
E;
1293 U - (CallbackFirstArgOperand - CallbackCalleeOperand), A);
1296 if (CI != MergableCIs.back()) {
1299 cantFail(OMPInfoCache.OMPBuilder.createBarrier(
1308 assert(OutlinedFn != OriginalFn &&
"Outlining failed");
1309 CGUpdater.registerOutlinedFunction(*OriginalFn, *OutlinedFn);
1310 CGUpdater.reanalyzeFunction(*OriginalFn);
1312 NumOpenMPParallelRegionsMerged += MergableCIs.size();
1320 CallInst *CI = getCallIfRegularCall(U, &RFI);
1327 RFI.foreachUse(SCC, DetectPRsCB);
1333 for (
auto &It : BB2PRMap) {
1334 auto &CIs = It.getSecond();
1349 auto IsMergable = [&](
Instruction &
I,
bool IsBeforeMergableRegion) {
1352 if (
I.isTerminator())
1359 if (IsBeforeMergableRegion) {
1361 if (!CalledFunction)
1368 for (
const auto &RFI : UnmergableCallsInfo) {
1369 if (CalledFunction == RFI.Declaration)
1384 for (
auto It = BB->
begin(), End = BB->
end(); It != End;) {
1388 if (CIs.count(&
I)) {
1394 if (IsMergable(
I, MergableCIs.
empty()))
1399 for (; It != End; ++It) {
1401 if (CIs.count(&SkipI)) {
1403 <<
" due to " <<
I <<
"\n");
1410 if (MergableCIs.
size() > 1) {
1411 MergableCIsVector.
push_back(MergableCIs);
1413 <<
" parallel regions in block " << BB->
getName()
1418 MergableCIs.
clear();
1421 if (!MergableCIsVector.
empty()) {
1424 for (
auto &MergableCIs : MergableCIsVector)
1425 Merge(MergableCIs, BB);
1426 MergableCIsVector.clear();
1433 OMPInfoCache.recollectUsesForFunction(OMPRTL___kmpc_fork_call);
1434 OMPInfoCache.recollectUsesForFunction(OMPRTL___kmpc_barrier);
1435 OMPInfoCache.recollectUsesForFunction(OMPRTL___kmpc_master);
1436 OMPInfoCache.recollectUsesForFunction(OMPRTL___kmpc_end_master);
1443 bool deleteParallelRegions() {
1444 const unsigned CallbackCalleeOperand = 2;
1446 OMPInformationCache::RuntimeFunctionInfo &RFI =
1447 OMPInfoCache.RFIs[OMPRTL___kmpc_fork_call];
1449 if (!RFI.Declaration)
1454 CallInst *CI = getCallIfRegularCall(U);
1461 if (!Fn->onlyReadsMemory())
1463 if (!Fn->hasFnAttribute(Attribute::WillReturn))
1469 auto Remark = [&](OptimizationRemark
OR) {
1470 return OR <<
"Removing parallel region with no side-effects.";
1476 ++NumOpenMPParallelRegionsDeleted;
1480 RFI.foreachUse(SCC, DeleteCallCB);
1486 bool deduplicateRuntimeCalls() {
1490 OMPRTL_omp_get_num_threads,
1491 OMPRTL_omp_in_parallel,
1492 OMPRTL_omp_get_cancellation,
1493 OMPRTL_omp_get_supported_active_levels,
1494 OMPRTL_omp_get_level,
1495 OMPRTL_omp_get_ancestor_thread_num,
1496 OMPRTL_omp_get_team_size,
1497 OMPRTL_omp_get_active_level,
1498 OMPRTL_omp_in_final,
1499 OMPRTL_omp_get_proc_bind,
1500 OMPRTL_omp_get_num_places,
1501 OMPRTL_omp_get_num_procs,
1502 OMPRTL_omp_get_place_num,
1503 OMPRTL_omp_get_partition_num_places,
1504 OMPRTL_omp_get_partition_place_nums};
1507 SmallSetVector<Value *, 16> GTIdArgs;
1508 collectGlobalThreadIdArguments(GTIdArgs);
1510 <<
" global thread ID arguments\n");
1512 for (Function *
F : SCC) {
1513 for (
auto DeduplicableRuntimeCallID : DeduplicableRuntimeCallIDs)
1514 Changed |= deduplicateRuntimeCalls(
1515 *
F, OMPInfoCache.RFIs[DeduplicableRuntimeCallID]);
1519 Value *GTIdArg =
nullptr;
1520 for (Argument &Arg :
F->args())
1521 if (GTIdArgs.
count(&Arg)) {
1525 Changed |= deduplicateRuntimeCalls(
1526 *
F, OMPInfoCache.RFIs[OMPRTL___kmpc_global_thread_num], GTIdArg);
1533 bool removeRuntimeSymbols() {
1538 if (GlobalVariable *GV = M.getNamedGlobal(
"__llvm_rpc_client")) {
1539 if (GV->hasNUsesOrMore(1))
1543 GV->eraseFromParent();
1555 bool hideMemTransfersLatency() {
1556 auto &RFI = OMPInfoCache.RFIs[OMPRTL___tgt_target_data_begin_mapper];
1559 auto *RTCall = getCallIfRegularCall(U, &RFI);
1563 OffloadArray OffloadArrays[3];
1564 if (!getValuesInOffloadArrays(*RTCall, OffloadArrays))
1567 LLVM_DEBUG(dumpValuesInOffloadArrays(OffloadArrays));
1570 bool WasSplit =
false;
1571 Instruction *WaitMovementPoint = canBeMovedDownwards(*RTCall);
1572 if (WaitMovementPoint)
1573 WasSplit = splitTargetDataBeginRTC(*RTCall, *WaitMovementPoint);
1578 if (OMPInfoCache.runtimeFnsAvailable(
1579 {OMPRTL___tgt_target_data_begin_mapper_issue,
1580 OMPRTL___tgt_target_data_begin_mapper_wait}))
1581 RFI.foreachUse(SCC, SplitMemTransfers);
1586 void analysisGlobalization() {
1587 auto &RFI = OMPInfoCache.RFIs[OMPRTL___kmpc_alloc_shared];
1589 auto CheckGlobalization = [&](
Use &
U,
Function &Decl) {
1590 if (CallInst *CI = getCallIfRegularCall(U, &RFI)) {
1591 auto Remark = [&](OptimizationRemarkMissed ORM) {
1593 <<
"Found thread data sharing on the GPU. "
1594 <<
"Expect degraded performance due to data globalization.";
1602 RFI.foreachUse(SCC, CheckGlobalization);
1607 bool getValuesInOffloadArrays(CallInst &RuntimeCall,
1609 assert(OAs.
size() == 3 &&
"Need space for three offload arrays!");
1619 Value *BasePtrsArg =
1631 if (!OAs[0].
initialize(*BasePtrsArray, RuntimeCall))
1639 if (!OAs[1].
initialize(*PtrsArray, RuntimeCall))
1651 if (!OAs[2].
initialize(*SizesArray, RuntimeCall))
1662 assert(OAs.
size() == 3 &&
"There are three offload arrays to debug!");
1665 std::string ValuesStr;
1666 raw_string_ostream
Printer(ValuesStr);
1667 std::string Separator =
" --- ";
1669 for (
auto *BP : OAs[0].StoredValues) {
1673 LLVM_DEBUG(
dbgs() <<
"\t\toffload_baseptrs: " << ValuesStr <<
"\n");
1676 for (
auto *
P : OAs[1].StoredValues) {
1683 for (
auto *S : OAs[2].StoredValues) {
1687 LLVM_DEBUG(
dbgs() <<
"\t\toffload_sizes: " << ValuesStr <<
"\n");
1692 Instruction *canBeMovedDownwards(CallInst &RuntimeCall) {
1697 bool IsWorthIt =
false;
1716 return RuntimeCall.
getParent()->getTerminator();
1720 bool splitTargetDataBeginRTC(CallInst &RuntimeCall,
1721 Instruction &WaitMovementPoint) {
1725 auto &
IRBuilder = OMPInfoCache.OMPBuilder;
1728 IRBuilder.Builder.SetInsertPoint(&Entry,
1729 Entry.getFirstNonPHIOrDbgOrAlloca());
1731 IRBuilder.AsyncInfo,
nullptr,
"handle");
1738 FunctionCallee IssueDecl =
IRBuilder.getOrCreateRuntimeFunction(
1739 M, OMPRTL___tgt_target_data_begin_mapper_issue);
1742 SmallVector<Value *, 16>
Args;
1743 for (
auto &Arg : RuntimeCall.
args())
1744 Args.push_back(Arg.get());
1745 Args.push_back(Handle);
1749 OMPInfoCache.setCallingConvention(IssueDecl, IssueCallsite);
1754 FunctionCallee WaitDecl =
IRBuilder.getOrCreateRuntimeFunction(
1755 M, OMPRTL___tgt_target_data_begin_mapper_wait);
1757 Value *WaitParams[2] = {
1759 OffloadArray::DeviceIDArgNum),
1763 WaitDecl, WaitParams,
"", WaitMovementPoint.
getIterator());
1764 OMPInfoCache.setCallingConvention(WaitDecl, WaitCallsite);
1769 static Value *combinedIdentStruct(
Value *CurrentIdent,
Value *NextIdent,
1770 bool GlobalOnly,
bool &SingleChoice) {
1771 if (CurrentIdent == NextIdent)
1772 return CurrentIdent;
1777 SingleChoice = !CurrentIdent;
1789 getCombinedIdentFromCallUsesIn(OMPInformationCache::RuntimeFunctionInfo &RFI,
1790 Function &
F,
bool GlobalOnly) {
1791 bool SingleChoice =
true;
1792 Value *Ident =
nullptr;
1794 CallInst *CI = getCallIfRegularCall(U, &RFI);
1795 if (!CI || &
F != &Caller)
1798 true, SingleChoice);
1801 RFI.foreachUse(SCC, CombineIdentStruct);
1803 if (!Ident || !SingleChoice) {
1806 if (!OMPInfoCache.OMPBuilder.getInsertionPoint().getBlock())
1808 &
F.getEntryBlock(),
F.getEntryBlock().begin()));
1811 uint32_t SrcLocStrSize;
1813 OMPInfoCache.OMPBuilder.getOrCreateDefaultSrcLocStr(SrcLocStrSize);
1814 Ident = OMPInfoCache.OMPBuilder.getOrCreateIdent(Loc, SrcLocStrSize);
1821 bool deduplicateRuntimeCalls(Function &
F,
1822 OMPInformationCache::RuntimeFunctionInfo &RFI,
1823 Value *ReplVal =
nullptr) {
1824 auto *UV = RFI.getUseVector(
F);
1825 if (!UV || UV->size() + (ReplVal !=
nullptr) < 2)
1829 dbgs() <<
TAG <<
"Deduplicate " << UV->size() <<
" uses of " << RFI.Name
1830 << (ReplVal ?
" with an existing value\n" :
"\n") <<
"\n");
1834 "Unexpected replacement value!");
1837 auto CanBeMoved = [
this](CallBase &CB) {
1838 unsigned NumArgs = CB.arg_size();
1841 if (CB.getArgOperand(0)->getType() != OMPInfoCache.OMPBuilder.IdentPtr)
1843 for (
unsigned U = 1;
U < NumArgs; ++
U)
1851 OMPInfoCache.getAnalysisResultForFunction<DominatorTreeAnalysis>(
F);
1855 for (Use *U : *UV) {
1856 if (CallInst *CI = getCallIfRegularCall(*U, &RFI)) {
1861 if (!CanBeMoved(*CI))
1869 assert(IP &&
"Expected insertion point!");
1879 Value *Ident = getCombinedIdentFromCallUsesIn(RFI,
F,
1887 CallInst *CI = getCallIfRegularCall(U, &RFI);
1888 if (!CI || CI == ReplVal || &
F != &Caller)
1892 auto Remark = [&](OptimizationRemark
OR) {
1893 return OR <<
"OpenMP runtime call "
1894 <<
ore::NV(
"OpenMPOptRuntime", RFI.Name) <<
" deduplicated.";
1903 ++NumOpenMPRuntimeCallsDeduplicated;
1907 RFI.foreachUse(SCC, ReplaceAndDeleteCB);
1913 void collectGlobalThreadIdArguments(SmallSetVector<Value *, 16> >IdArgs) {
1920 auto CallArgOpIsGTId = [&](
Function &
F,
unsigned ArgNo, CallInst &RefCI) {
1921 if (!
F.hasLocalLinkage())
1923 for (Use &U :
F.uses()) {
1924 if (CallInst *CI = getCallIfRegularCall(U)) {
1926 if (CI == &RefCI || GTIdArgs.
count(ArgOp) ||
1927 getCallIfRegularCall(
1928 *ArgOp, &OMPInfoCache.RFIs[OMPRTL___kmpc_global_thread_num]))
1937 auto AddUserArgs = [&](
Value >Id) {
1938 for (Use &U : GTId.uses())
1942 if (CallArgOpIsGTId(*Callee,
U.getOperandNo(), *CI))
1947 OMPInformationCache::RuntimeFunctionInfo &GlobThreadNumRFI =
1948 OMPInfoCache.RFIs[OMPRTL___kmpc_global_thread_num];
1950 GlobThreadNumRFI.foreachUse(SCC, [&](Use &U, Function &
F) {
1951 if (CallInst *CI = getCallIfRegularCall(U, &GlobThreadNumRFI))
1959 for (
unsigned U = 0;
U < GTIdArgs.
size(); ++
U)
1960 AddUserArgs(*GTIdArgs[U]);
1968 DenseMap<Function *, std::optional<Kernel>> UniqueKernelMap;
1971 Kernel getUniqueKernelFor(Function &
F);
1974 Kernel getUniqueKernelFor(Instruction &
I) {
1975 return getUniqueKernelFor(*
I.getFunction());
1980 bool rewriteDeviceCodeStateMachine();
1996 template <
typename RemarkKind,
typename RemarkCallBack>
1997 void emitRemark(Instruction *
I, StringRef RemarkName,
1998 RemarkCallBack &&RemarkCB)
const {
2000 auto &ORE = OREGetter(
F);
2004 return RemarkCB(RemarkKind(
DEBUG_TYPE, RemarkName,
I))
2005 <<
" [" << RemarkName <<
"]";
2009 [&]() {
return RemarkCB(RemarkKind(
DEBUG_TYPE, RemarkName,
I)); });
2013 template <
typename RemarkKind,
typename RemarkCallBack>
2014 void emitRemark(Function *
F, StringRef RemarkName,
2015 RemarkCallBack &&RemarkCB)
const {
2016 auto &ORE = OREGetter(
F);
2020 return RemarkCB(RemarkKind(
DEBUG_TYPE, RemarkName,
F))
2021 <<
" [" << RemarkName <<
"]";
2025 [&]() {
return RemarkCB(RemarkKind(
DEBUG_TYPE, RemarkName,
F)); });
2032 SmallVectorImpl<Function *> &SCC;
2036 CallGraphUpdater &CGUpdater;
2039 OptimizationRemarkGetter OREGetter;
2042 OMPInformationCache &OMPInfoCache;
2048 bool runAttributor(
bool IsModulePass) {
2052 registerAAs(IsModulePass);
2057 <<
" functions, result: " <<
Changed <<
".\n");
2059 if (
Changed == ChangeStatus::CHANGED)
2060 OMPInfoCache.invalidateAnalyses();
2062 return Changed == ChangeStatus::CHANGED;
2069 void registerAAs(
bool IsModulePass);
2074 static void registerAAsForFunction(Attributor &A,
const Function &
F);
2078 if (OMPInfoCache.CGSCC && !OMPInfoCache.CGSCC->empty() &&
2079 !OMPInfoCache.CGSCC->contains(&
F))
2084 std::optional<Kernel> &CachedKernel = UniqueKernelMap[&
F];
2086 return *CachedKernel;
2093 return *CachedKernel;
2096 CachedKernel =
nullptr;
2097 if (!
F.hasLocalLinkage()) {
2100 auto Remark = [&](OptimizationRemarkAnalysis ORA) {
2101 return ORA <<
"Potentially unknown OpenMP target region caller.";
2109 auto GetUniqueKernelForUse = [&](
const Use &
U) ->
Kernel {
2112 if (
Cmp->isEquality())
2113 return getUniqueKernelFor(*Cmp);
2118 if (CB->isCallee(&U))
2119 return getUniqueKernelFor(*CB);
2121 OMPInformationCache::RuntimeFunctionInfo &KernelParallelRFI =
2122 OMPInfoCache.RFIs[OMPRTL___kmpc_parallel_60];
2124 if (OpenMPOpt::getCallIfRegularCall(*
U.getUser(), &KernelParallelRFI))
2125 return getUniqueKernelFor(*CB);
2133 SmallPtrSet<Kernel, 2> PotentialKernels;
2134 OMPInformationCache::foreachUse(
F, [&](
const Use &U) {
2135 PotentialKernels.
insert(GetUniqueKernelForUse(U));
2139 if (PotentialKernels.
size() == 1)
2140 K = *PotentialKernels.
begin();
2143 UniqueKernelMap[&
F] =
K;
2148bool OpenMPOpt::rewriteDeviceCodeStateMachine() {
2149 OMPInformationCache::RuntimeFunctionInfo &KernelParallelRFI =
2150 OMPInfoCache.RFIs[OMPRTL___kmpc_parallel_60];
2153 if (!KernelParallelRFI)
2160 for (Function *
F : SCC) {
2164 bool UnknownUse =
false;
2165 bool KernelParallelUse =
false;
2166 unsigned NumDirectCalls = 0;
2169 OMPInformationCache::foreachUse(*
F, [&](Use &U) {
2171 if (CB->isCallee(&U)) {
2177 ToBeReplacedStateMachineUses.
push_back(&U);
2183 OpenMPOpt::getCallIfRegularCall(*
U.getUser(), &KernelParallelRFI);
2184 const unsigned int WrapperFunctionArgNo = 6;
2185 if (!KernelParallelUse && CI &&
2187 KernelParallelUse =
true;
2188 ToBeReplacedStateMachineUses.
push_back(&U);
2196 if (!KernelParallelUse)
2202 if (UnknownUse || NumDirectCalls != 1 ||
2203 ToBeReplacedStateMachineUses.
size() > 2) {
2204 auto Remark = [&](OptimizationRemarkAnalysis ORA) {
2205 return ORA <<
"Parallel region is used in "
2206 << (UnknownUse ?
"unknown" :
"unexpected")
2207 <<
" ways. Will not attempt to rewrite the state machine.";
2217 auto Remark = [&](OptimizationRemarkAnalysis ORA) {
2218 return ORA <<
"Parallel region is not called from a unique kernel. "
2219 "Will not attempt to rewrite the state machine.";
2231 Type *Int8Ty = Type::getInt8Ty(
M.getContext());
2233 auto *
ID =
new GlobalVariable(
2237 for (Use *U : ToBeReplacedStateMachineUses)
2239 ID,
U->get()->getType()));
2241 ++NumOpenMPParallelRegionsReplacedInGPUStateMachine;
2250struct AAICVTracker :
public StateWrapper<BooleanState, AbstractAttribute> {
2251 using Base = StateWrapper<BooleanState, AbstractAttribute>;
2252 AAICVTracker(
const IRPosition &IRP, Attributor &
A) :
Base(IRP) {}
2255 bool isAssumedTracked()
const {
return getAssumed(); }
2258 bool isKnownTracked()
const {
return getAssumed(); }
2261 static AAICVTracker &createForPosition(
const IRPosition &IRP, Attributor &
A);
2265 const Instruction *
I,
2266 Attributor &
A)
const {
2267 return std::nullopt;
2273 virtual std::optional<Value *>
2281 StringRef
getName()
const override {
return "AAICVTracker"; }
2284 const char *getIdAddr()
const override {
return &
ID; }
2287 static bool classof(
const AbstractAttribute *AA) {
2291 static const char ID;
2294struct AAICVTrackerFunction :
public AAICVTracker {
2295 AAICVTrackerFunction(
const IRPosition &IRP, Attributor &
A)
2296 : AAICVTracker(IRP,
A) {}
2299 const std::string getAsStr(Attributor *)
const override {
2300 return "ICVTrackerFunction";
2304 void trackStatistics()
const override {}
2308 return ChangeStatus::UNCHANGED;
2313 InternalControlVar::ICV___last>
2314 ICVReplacementValuesMap;
2321 auto &OMPInfoCache =
static_cast<OMPInformationCache &
>(
A.getInfoCache());
2324 auto &SetterRFI = OMPInfoCache.RFIs[OMPInfoCache.ICVs[ICV].Setter];
2326 auto &ValuesMap = ICVReplacementValuesMap[ICV];
2328 CallInst *CI = OpenMPOpt::getCallIfRegularCall(U);
2334 if (ValuesMap.insert(std::make_pair(CI, CI->
getArgOperand(0))).second)
2335 HasChanged = ChangeStatus::CHANGED;
2341 std::optional<Value *> ReplVal = getValueForCall(
A,
I, ICV);
2342 if (ReplVal && ValuesMap.insert(std::make_pair(&
I, *ReplVal)).second)
2343 HasChanged = ChangeStatus::CHANGED;
2349 SetterRFI.foreachUse(TrackValues,
F);
2351 bool UsedAssumedInformation =
false;
2352 A.checkForAllInstructions(CallCheck, *
this, {Instruction::Call},
2353 UsedAssumedInformation,
2359 if (HasChanged == ChangeStatus::CHANGED)
2360 ValuesMap.try_emplace(Entry);
2368 std::optional<Value *> getValueForCall(Attributor &
A,
const Instruction &
I,
2372 if (!CB || CB->hasFnAttr(
"no_openmp") ||
2373 CB->hasFnAttr(
"no_openmp_routines") ||
2374 CB->hasFnAttr(
"no_openmp_constructs"))
2375 return std::nullopt;
2377 auto &OMPInfoCache =
static_cast<OMPInformationCache &
>(
A.getInfoCache());
2378 auto &GetterRFI = OMPInfoCache.RFIs[OMPInfoCache.ICVs[ICV].Getter];
2379 auto &SetterRFI = OMPInfoCache.RFIs[OMPInfoCache.ICVs[ICV].Setter];
2380 Function *CalledFunction = CB->getCalledFunction();
2383 if (CalledFunction ==
nullptr)
2385 if (CalledFunction == GetterRFI.Declaration)
2386 return std::nullopt;
2387 if (CalledFunction == SetterRFI.Declaration) {
2388 if (ICVReplacementValuesMap[ICV].
count(&
I))
2389 return ICVReplacementValuesMap[ICV].lookup(&
I);
2398 const auto *ICVTrackingAA =
A.getAAFor<AAICVTracker>(
2401 if (ICVTrackingAA->isAssumedTracked()) {
2402 std::optional<Value *> URV =
2403 ICVTrackingAA->getUniqueReplacementValue(ICV);
2414 std::optional<Value *>
2416 return std::nullopt;
2421 const Instruction *
I,
2422 Attributor &
A)
const override {
2423 const auto &ValuesMap = ICVReplacementValuesMap[ICV];
2424 if (ValuesMap.count(
I))
2425 return ValuesMap.lookup(
I);
2428 SmallPtrSet<const Instruction *, 16> Visited;
2431 std::optional<Value *> ReplVal;
2433 while (!Worklist.
empty()) {
2435 if (!Visited.
insert(CurrInst).second)
2443 if (ValuesMap.count(CurrInst)) {
2444 std::optional<Value *> NewReplVal = ValuesMap.lookup(CurrInst);
2447 ReplVal = NewReplVal;
2453 if (ReplVal != NewReplVal)
2459 std::optional<Value *> NewReplVal = getValueForCall(
A, *CurrInst, ICV);
2465 ReplVal = NewReplVal;
2471 if (ReplVal != NewReplVal)
2476 if (CurrBB ==
I->getParent() && ReplVal)
2481 if (
const Instruction *Terminator = Pred->getTerminator())
2489struct AAICVTrackerFunctionReturned : AAICVTracker {
2490 AAICVTrackerFunctionReturned(
const IRPosition &IRP, Attributor &
A)
2491 : AAICVTracker(IRP,
A) {}
2494 const std::string getAsStr(Attributor *)
const override {
2495 return "ICVTrackerFunctionReturned";
2499 void trackStatistics()
const override {}
2503 return ChangeStatus::UNCHANGED;
2508 InternalControlVar::ICV___last>
2509 ICVReplacementValuesMap;
2512 std::optional<Value *>
2514 return ICVReplacementValuesMap[ICV];
2519 const auto *ICVTrackingAA =
A.getAAFor<AAICVTracker>(
2522 if (!ICVTrackingAA->isAssumedTracked())
2523 return indicatePessimisticFixpoint();
2526 std::optional<Value *> &ReplVal = ICVReplacementValuesMap[ICV];
2527 std::optional<Value *> UniqueICVValue;
2530 std::optional<Value *> NewReplVal =
2531 ICVTrackingAA->getReplacementValue(ICV, &
I,
A);
2534 if (UniqueICVValue && UniqueICVValue != NewReplVal)
2537 UniqueICVValue = NewReplVal;
2542 bool UsedAssumedInformation =
false;
2543 if (!
A.checkForAllInstructions(CheckReturnInst, *
this, {Instruction::Ret},
2544 UsedAssumedInformation,
2546 UniqueICVValue =
nullptr;
2548 if (UniqueICVValue == ReplVal)
2551 ReplVal = UniqueICVValue;
2552 Changed = ChangeStatus::CHANGED;
2559struct AAICVTrackerCallSite : AAICVTracker {
2560 AAICVTrackerCallSite(
const IRPosition &IRP, Attributor &
A)
2561 : AAICVTracker(IRP,
A) {}
2564 assert(getAnchorScope() &&
"Expected anchor function");
2568 auto &OMPInfoCache =
static_cast<OMPInformationCache &
>(
A.getInfoCache());
2570 auto ICVInfo = OMPInfoCache.ICVs[ICV];
2571 auto &Getter = OMPInfoCache.RFIs[ICVInfo.Getter];
2572 if (Getter.Declaration == getAssociatedFunction()) {
2573 AssociatedICV = ICVInfo.Kind;
2579 indicatePessimisticFixpoint();
2583 if (!ReplVal || !*ReplVal)
2584 return ChangeStatus::UNCHANGED;
2587 A.deleteAfterManifest(*getCtxI());
2589 return ChangeStatus::CHANGED;
2593 const std::string getAsStr(Attributor *)
const override {
2594 return "ICVTrackerCallSite";
2598 void trackStatistics()
const override {}
2601 std::optional<Value *> ReplVal;
2604 const auto *ICVTrackingAA =
A.getAAFor<AAICVTracker>(
2608 if (!ICVTrackingAA->isAssumedTracked())
2609 return indicatePessimisticFixpoint();
2611 std::optional<Value *> NewReplVal =
2612 ICVTrackingAA->getReplacementValue(AssociatedICV, getCtxI(),
A);
2614 if (ReplVal == NewReplVal)
2615 return ChangeStatus::UNCHANGED;
2617 ReplVal = NewReplVal;
2618 return ChangeStatus::CHANGED;
2623 std::optional<Value *>
2629struct AAICVTrackerCallSiteReturned : AAICVTracker {
2630 AAICVTrackerCallSiteReturned(
const IRPosition &IRP, Attributor &
A)
2631 : AAICVTracker(IRP,
A) {}
2634 const std::string getAsStr(Attributor *)
const override {
2635 return "ICVTrackerCallSiteReturned";
2639 void trackStatistics()
const override {}
2643 return ChangeStatus::UNCHANGED;
2648 InternalControlVar::ICV___last>
2649 ICVReplacementValuesMap;
2653 std::optional<Value *>
2655 return ICVReplacementValuesMap[ICV];
2660 const auto *ICVTrackingAA =
A.getAAFor<AAICVTracker>(
2662 DepClassTy::REQUIRED);
2665 if (!ICVTrackingAA->isAssumedTracked())
2666 return indicatePessimisticFixpoint();
2669 std::optional<Value *> &ReplVal = ICVReplacementValuesMap[ICV];
2670 std::optional<Value *> NewReplVal =
2671 ICVTrackingAA->getUniqueReplacementValue(ICV);
2673 if (ReplVal == NewReplVal)
2676 ReplVal = NewReplVal;
2677 Changed = ChangeStatus::CHANGED;
2685static bool hasFunctionEndAsUniqueSuccessor(
const BasicBlock *BB) {
2691 return hasFunctionEndAsUniqueSuccessor(
Successor);
2694struct AAExecutionDomainFunction :
public AAExecutionDomain {
2695 AAExecutionDomainFunction(
const IRPosition &IRP, Attributor &
A)
2696 : AAExecutionDomain(IRP,
A) {}
2698 ~AAExecutionDomainFunction()
override {
delete RPOT; }
2702 assert(
F &&
"Expected anchor function");
2703 RPOT =
new ReversePostOrderTraversal<Function *>(
F);
2706 const std::string getAsStr(Attributor *)
const override {
2707 unsigned TotalBlocks = 0, InitialThreadBlocks = 0, AlignedBlocks = 0;
2708 for (
auto &It : BEDMap) {
2712 InitialThreadBlocks += It.getSecond().IsExecutedByInitialThreadOnly;
2713 AlignedBlocks += It.getSecond().IsReachedFromAlignedBarrierOnly &&
2714 It.getSecond().IsReachingAlignedBarrierOnly;
2716 return "[AAExecutionDomain] " + std::to_string(InitialThreadBlocks) +
"/" +
2717 std::to_string(AlignedBlocks) +
" of " +
2718 std::to_string(TotalBlocks) +
2719 " executed by initial thread / aligned";
2723 void trackStatistics()
const override {}
2727 for (
const BasicBlock &BB : *getAnchorScope()) {
2728 if (!isExecutedByInitialThreadOnly(BB))
2730 dbgs() <<
TAG <<
" Basic block @" << getAnchorScope()->getName() <<
" "
2731 << BB.
getName() <<
" is executed by a single thread.\n";
2740 SmallPtrSet<CallBase *, 16> DeletedBarriers;
2741 auto HandleAlignedBarrier = [&](CallBase *CB) {
2742 const ExecutionDomainTy &ED = CB ? CEDMap[{CB, PRE}] : BEDMap[
nullptr];
2743 if (!ED.IsReachedFromAlignedBarrierOnly ||
2744 ED.EncounteredNonLocalSideEffect)
2746 if (!ED.EncounteredAssumes.empty() && !
A.isModulePass())
2757 DeletedBarriers.
insert(CB);
2758 A.deleteAfterManifest(*CB);
2759 ++NumBarriersEliminated;
2760 Changed = ChangeStatus::CHANGED;
2761 }
else if (!ED.AlignedBarriers.empty()) {
2762 Changed = ChangeStatus::CHANGED;
2764 ED.AlignedBarriers.end());
2765 SmallSetVector<CallBase *, 16> Visited;
2766 while (!Worklist.
empty()) {
2768 if (!Visited.
insert(LastCB))
2772 if (!hasFunctionEndAsUniqueSuccessor(LastCB->
getParent()))
2774 if (!DeletedBarriers.
count(LastCB)) {
2775 ++NumBarriersEliminated;
2776 A.deleteAfterManifest(*LastCB);
2782 const ExecutionDomainTy &LastED = CEDMap[{LastCB, PRE}];
2783 Worklist.
append(LastED.AlignedBarriers.begin(),
2784 LastED.AlignedBarriers.end());
2790 if (!ED.EncounteredAssumes.empty() && (CB || !ED.AlignedBarriers.empty()))
2791 for (
auto *AssumeCB : ED.EncounteredAssumes)
2792 A.deleteAfterManifest(*AssumeCB);
2795 for (
auto *CB : AlignedBarriers)
2796 HandleAlignedBarrier(CB);
2800 HandleAlignedBarrier(
nullptr);
2805 bool isNoOpFence(
const FenceInst &FI)
const override {
2806 return getState().isValidState() && !NonNoOpFences.count(&FI);
2812 mergeInPredecessorBarriersAndAssumptions(Attributor &
A, ExecutionDomainTy &ED,
2813 const ExecutionDomainTy &PredED);
2818 bool mergeInPredecessor(Attributor &
A, ExecutionDomainTy &ED,
2819 const ExecutionDomainTy &PredED,
2820 bool InitialEdgeOnly =
false);
2823 bool handleCallees(Attributor &
A, ExecutionDomainTy &EntryBBED);
2830 bool isExecutedByInitialThreadOnly(
const BasicBlock &BB)
const override {
2831 if (!isValidState())
2833 assert(BB.
getParent() == getAnchorScope() &&
"Block is out of scope!");
2834 return BEDMap.lookup(&BB).IsExecutedByInitialThreadOnly;
2837 bool isExecutedInAlignedRegion(Attributor &
A,
2838 const Instruction &
I)
const override {
2839 assert(
I.getFunction() == getAnchorScope() &&
2840 "Instruction is out of scope!");
2841 if (!isValidState())
2844 bool ForwardIsOk =
true;
2853 if (CB != &
I && AlignedBarriers.contains(
const_cast<CallBase *
>(CB)))
2855 const auto &It = CEDMap.find({CB, PRE});
2856 if (It == CEDMap.end())
2858 if (!It->getSecond().IsReachingAlignedBarrierOnly)
2859 ForwardIsOk =
false;
2863 if (!CurI && !BEDMap.lookup(
I.getParent()).IsReachingAlignedBarrierOnly)
2864 ForwardIsOk =
false;
2872 if (CB != &
I && AlignedBarriers.contains(
const_cast<CallBase *
>(CB)))
2874 const auto &It = CEDMap.find({CB, POST});
2875 if (It == CEDMap.end())
2877 if (It->getSecond().IsReachedFromAlignedBarrierOnly)
2890 return BEDMap.lookup(
nullptr).IsReachedFromAlignedBarrierOnly;
2892 return BEDMap.lookup(PredBB).IsReachedFromAlignedBarrierOnly;
2902 ExecutionDomainTy getExecutionDomain(
const BasicBlock &BB)
const override {
2904 "No request should be made against an invalid state!");
2905 return BEDMap.lookup(&BB);
2907 std::pair<ExecutionDomainTy, ExecutionDomainTy>
2908 getExecutionDomain(
const CallBase &CB)
const override {
2910 "No request should be made against an invalid state!");
2911 return {CEDMap.lookup({&CB, PRE}), CEDMap.lookup({&CB, POST})};
2913 ExecutionDomainTy getFunctionExecutionDomain()
const override {
2915 "No request should be made against an invalid state!");
2916 return InterProceduralED;
2922 static bool isInitialThreadOnlyEdge(Attributor &
A, BranchInst *
Edge,
2923 BasicBlock &SuccessorBB) {
2924 if (!
Edge || !
Edge->isConditional())
2926 if (
Edge->getSuccessor(0) != &SuccessorBB)
2930 if (!Cmp || !
Cmp->isTrueWhenEqual() || !
Cmp->isEquality())
2938 if (
C->isAllOnesValue()) {
2940 auto &OMPInfoCache =
static_cast<OMPInformationCache &
>(
A.getInfoCache());
2941 auto &RFI = OMPInfoCache.RFIs[OMPRTL___kmpc_target_init];
2942 CB = CB ? OpenMPOpt::getCallIfRegularCall(*CB, &RFI) : nullptr;
2945 ConstantStruct *KernelEnvC =
2947 ConstantInt *ExecModeC =
2948 KernelInfo::getExecModeFromKernelEnvironment(KernelEnvC);
2955 if (
II->getIntrinsicID() == Intrinsic::nvvm_read_ptx_sreg_tid_x)
2960 if (
II->getIntrinsicID() == Intrinsic::amdgcn_workitem_id_x)
2968 ExecutionDomainTy InterProceduralED;
2972 DenseMap<const BasicBlock *, ExecutionDomainTy> BEDMap;
2973 DenseMap<PointerIntPair<const CallBase *, 1, Direction>, ExecutionDomainTy>
2975 SmallSetVector<CallBase *, 16> AlignedBarriers;
2977 ReversePostOrderTraversal<Function *> *RPOT =
nullptr;
2980 static bool setAndRecord(
bool &R,
bool V) {
2988 SmallPtrSet<const FenceInst *, 8> NonNoOpFences;
2991void AAExecutionDomainFunction::mergeInPredecessorBarriersAndAssumptions(
2992 Attributor &
A, ExecutionDomainTy &ED,
const ExecutionDomainTy &PredED) {
2993 for (
auto *EA : PredED.EncounteredAssumes)
2994 ED.addAssumeInst(
A, *EA);
2996 for (
auto *AB : PredED.AlignedBarriers)
2997 ED.addAlignedBarrier(
A, *AB);
3000bool AAExecutionDomainFunction::mergeInPredecessor(
3001 Attributor &
A, ExecutionDomainTy &ED,
const ExecutionDomainTy &PredED,
3002 bool InitialEdgeOnly) {
3006 setAndRecord(ED.IsExecutedByInitialThreadOnly,
3007 InitialEdgeOnly || (PredED.IsExecutedByInitialThreadOnly &&
3008 ED.IsExecutedByInitialThreadOnly));
3010 Changed |= setAndRecord(ED.IsReachedFromAlignedBarrierOnly,
3011 ED.IsReachedFromAlignedBarrierOnly &&
3012 PredED.IsReachedFromAlignedBarrierOnly);
3013 Changed |= setAndRecord(ED.EncounteredNonLocalSideEffect,
3014 ED.EncounteredNonLocalSideEffect |
3015 PredED.EncounteredNonLocalSideEffect);
3017 if (ED.IsReachedFromAlignedBarrierOnly)
3018 mergeInPredecessorBarriersAndAssumptions(
A, ED, PredED);
3020 ED.clearAssumeInstAndAlignedBarriers();
3024bool AAExecutionDomainFunction::handleCallees(Attributor &
A,
3025 ExecutionDomainTy &EntryBBED) {
3027 auto PredForCallSite = [&](AbstractCallSite ACS) {
3028 const auto *EDAA =
A.getAAFor<AAExecutionDomain>(
3030 DepClassTy::OPTIONAL);
3031 if (!EDAA || !EDAA->getState().isValidState())
3034 EDAA->getExecutionDomain(*
cast<CallBase>(ACS.getInstruction())));
3038 ExecutionDomainTy ExitED;
3039 bool AllCallSitesKnown;
3040 if (
A.checkForAllCallSites(PredForCallSite, *
this,
3042 AllCallSitesKnown)) {
3043 for (
const auto &[CSInED, CSOutED] : CallSiteEDs) {
3044 mergeInPredecessor(
A, EntryBBED, CSInED);
3045 ExitED.IsReachingAlignedBarrierOnly &=
3046 CSOutED.IsReachingAlignedBarrierOnly;
3053 EntryBBED.IsExecutedByInitialThreadOnly =
false;
3054 EntryBBED.IsReachedFromAlignedBarrierOnly =
true;
3055 EntryBBED.EncounteredNonLocalSideEffect =
false;
3056 ExitED.IsReachingAlignedBarrierOnly =
false;
3058 EntryBBED.IsExecutedByInitialThreadOnly =
false;
3059 EntryBBED.IsReachedFromAlignedBarrierOnly =
false;
3060 EntryBBED.EncounteredNonLocalSideEffect =
true;
3061 ExitED.IsReachingAlignedBarrierOnly =
false;
3066 auto &FnED = BEDMap[
nullptr];
3067 Changed |= setAndRecord(FnED.IsReachedFromAlignedBarrierOnly,
3068 FnED.IsReachedFromAlignedBarrierOnly &
3069 EntryBBED.IsReachedFromAlignedBarrierOnly);
3070 Changed |= setAndRecord(FnED.IsReachingAlignedBarrierOnly,
3071 FnED.IsReachingAlignedBarrierOnly &
3072 ExitED.IsReachingAlignedBarrierOnly);
3073 Changed |= setAndRecord(FnED.IsExecutedByInitialThreadOnly,
3074 EntryBBED.IsExecutedByInitialThreadOnly);
3078ChangeStatus AAExecutionDomainFunction::updateImpl(Attributor &
A) {
3085 auto HandleAlignedBarrier = [&](CallBase &CB, ExecutionDomainTy &ED) {
3086 Changed |= AlignedBarriers.insert(&CB);
3088 auto &CallInED = CEDMap[{&CB, PRE}];
3089 Changed |= mergeInPredecessor(
A, CallInED, ED);
3090 CallInED.IsReachingAlignedBarrierOnly =
true;
3092 ED.EncounteredNonLocalSideEffect =
false;
3093 ED.IsReachedFromAlignedBarrierOnly =
true;
3095 ED.clearAssumeInstAndAlignedBarriers();
3096 ED.addAlignedBarrier(
A, CB);
3097 auto &CallOutED = CEDMap[{&CB, POST}];
3098 Changed |= mergeInPredecessor(
A, CallOutED, ED);
3102 A.getAAFor<AAIsDead>(*
this, getIRPosition(), DepClassTy::OPTIONAL);
3109 for (
auto &RIt : *RPOT) {
3112 bool IsEntryBB = &BB == &EntryBB;
3115 bool AlignedBarrierLastInBlock = IsEntryBB && IsKernel;
3116 bool IsExplicitlyAligned = IsEntryBB && IsKernel;
3117 ExecutionDomainTy ED;
3124 if (LivenessAA && LivenessAA->isAssumedDead(&BB))
3128 if (LivenessAA && LivenessAA->isEdgeDead(PredBB, &BB))
3130 bool InitialEdgeOnly = isInitialThreadOnlyEdge(
3132 mergeInPredecessor(
A, ED, BEDMap[PredBB], InitialEdgeOnly);
3138 for (Instruction &
I : BB) {
3139 bool UsedAssumedInformation;
3140 if (
A.isAssumedDead(
I, *
this, LivenessAA, UsedAssumedInformation,
3141 false, DepClassTy::OPTIONAL,
3149 ED.addAssumeInst(
A, *AI);
3153 if (
II->isAssumeLikeIntrinsic())
3158 if (!ED.EncounteredNonLocalSideEffect) {
3160 if (ED.IsReachedFromAlignedBarrierOnly)
3165 case AtomicOrdering::NotAtomic:
3167 case AtomicOrdering::Unordered:
3169 case AtomicOrdering::Monotonic:
3171 case AtomicOrdering::Acquire:
3173 case AtomicOrdering::Release:
3175 case AtomicOrdering::AcquireRelease:
3177 case AtomicOrdering::SequentiallyConsistent:
3181 NonNoOpFences.insert(FI);
3186 bool IsAlignedBarrier =
3190 AlignedBarrierLastInBlock &= IsNoSync;
3191 IsExplicitlyAligned &= IsNoSync;
3197 if (IsAlignedBarrier) {
3198 HandleAlignedBarrier(*CB, ED);
3199 AlignedBarrierLastInBlock =
true;
3200 IsExplicitlyAligned =
true;
3206 if (!ED.EncounteredNonLocalSideEffect &&
3208 ED.EncounteredNonLocalSideEffect =
true;
3210 ED.IsReachedFromAlignedBarrierOnly =
false;
3218 auto &CallInED = CEDMap[{CB, PRE}];
3219 Changed |= mergeInPredecessor(
A, CallInED, ED);
3225 if (!IsNoSync && Callee && !
Callee->isDeclaration()) {
3226 const auto *EDAA =
A.getAAFor<AAExecutionDomain>(
3228 if (EDAA && EDAA->getState().isValidState()) {
3229 const auto &CalleeED = EDAA->getFunctionExecutionDomain();
3230 ED.IsReachedFromAlignedBarrierOnly =
3231 CalleeED.IsReachedFromAlignedBarrierOnly;
3232 AlignedBarrierLastInBlock = ED.IsReachedFromAlignedBarrierOnly;
3233 if (IsNoSync || !CalleeED.IsReachedFromAlignedBarrierOnly)
3234 ED.EncounteredNonLocalSideEffect |=
3235 CalleeED.EncounteredNonLocalSideEffect;
3237 ED.EncounteredNonLocalSideEffect =
3238 CalleeED.EncounteredNonLocalSideEffect;
3239 if (!CalleeED.IsReachingAlignedBarrierOnly) {
3241 setAndRecord(CallInED.IsReachingAlignedBarrierOnly,
false);
3244 if (CalleeED.IsReachedFromAlignedBarrierOnly)
3245 mergeInPredecessorBarriersAndAssumptions(
A, ED, CalleeED);
3246 auto &CallOutED = CEDMap[{CB, POST}];
3247 Changed |= mergeInPredecessor(
A, CallOutED, ED);
3252 ED.IsReachedFromAlignedBarrierOnly =
false;
3253 Changed |= setAndRecord(CallInED.IsReachingAlignedBarrierOnly,
false);
3256 AlignedBarrierLastInBlock &= ED.IsReachedFromAlignedBarrierOnly;
3258 auto &CallOutED = CEDMap[{CB, POST}];
3259 Changed |= mergeInPredecessor(
A, CallOutED, ED);
3262 if (!
I.mayHaveSideEffects() && !
I.mayReadFromMemory())
3268 const auto *MemAA =
A.getAAFor<AAMemoryLocation>(
3276 if (MemAA && MemAA->getState().isValidState() &&
3277 MemAA->checkForAllAccessesToMemoryKind(
3282 auto &InfoCache =
A.getInfoCache();
3283 if (!
I.mayHaveSideEffects() && InfoCache.isOnlyUsedByAssume(
I))
3287 if (LI->hasMetadata(LLVMContext::MD_invariant_load))
3290 if (!ED.EncounteredNonLocalSideEffect &&
3292 ED.EncounteredNonLocalSideEffect =
true;
3295 bool IsEndAndNotReachingAlignedBarriersOnly =
false;
3297 !BB.getTerminator()->getNumSuccessors()) {
3299 Changed |= mergeInPredecessor(
A, InterProceduralED, ED);
3301 auto &FnED = BEDMap[
nullptr];
3302 if (IsKernel && !IsExplicitlyAligned)
3303 FnED.IsReachingAlignedBarrierOnly =
false;
3304 Changed |= mergeInPredecessor(
A, FnED, ED);
3306 if (!FnED.IsReachingAlignedBarrierOnly) {
3307 IsEndAndNotReachingAlignedBarriersOnly =
true;
3308 SyncInstWorklist.
push_back(BB.getTerminator());
3309 auto &BBED = BEDMap[&BB];
3310 Changed |= setAndRecord(BBED.IsReachingAlignedBarrierOnly,
false);
3314 ExecutionDomainTy &StoredED = BEDMap[&BB];
3315 ED.IsReachingAlignedBarrierOnly = StoredED.IsReachingAlignedBarrierOnly &
3316 !IsEndAndNotReachingAlignedBarriersOnly;
3322 if (ED.IsExecutedByInitialThreadOnly !=
3323 StoredED.IsExecutedByInitialThreadOnly ||
3324 ED.IsReachedFromAlignedBarrierOnly !=
3325 StoredED.IsReachedFromAlignedBarrierOnly ||
3326 ED.EncounteredNonLocalSideEffect !=
3327 StoredED.EncounteredNonLocalSideEffect)
3331 StoredED = std::move(ED);
3336 SmallSetVector<BasicBlock *, 16> Visited;
3337 while (!SyncInstWorklist.
empty()) {
3340 bool HitAlignedBarrierOrKnownEnd =
false;
3345 auto &CallOutED = CEDMap[{CB, POST}];
3346 Changed |= setAndRecord(CallOutED.IsReachingAlignedBarrierOnly,
false);
3347 auto &CallInED = CEDMap[{CB, PRE}];
3348 HitAlignedBarrierOrKnownEnd =
3349 AlignedBarriers.count(CB) || !CallInED.IsReachingAlignedBarrierOnly;
3350 if (HitAlignedBarrierOrKnownEnd)
3352 Changed |= setAndRecord(CallInED.IsReachingAlignedBarrierOnly,
false);
3354 if (HitAlignedBarrierOrKnownEnd)
3358 if (LivenessAA && LivenessAA->isEdgeDead(PredBB, SyncBB))
3360 if (!Visited.
insert(PredBB))
3362 auto &PredED = BEDMap[PredBB];
3363 if (setAndRecord(PredED.IsReachingAlignedBarrierOnly,
false)) {
3365 SyncInstWorklist.
push_back(PredBB->getTerminator());
3368 if (SyncBB != &EntryBB)
3371 setAndRecord(InterProceduralED.IsReachingAlignedBarrierOnly,
false);
3374 return Changed ? ChangeStatus::CHANGED : ChangeStatus::UNCHANGED;
3379struct AAHeapToShared :
public StateWrapper<BooleanState, AbstractAttribute> {
3380 using Base = StateWrapper<BooleanState, AbstractAttribute>;
3381 AAHeapToShared(
const IRPosition &IRP, Attributor &
A) :
Base(IRP) {}
3384 static AAHeapToShared &createForPosition(
const IRPosition &IRP,
3388 virtual bool isAssumedHeapToShared(CallBase &CB)
const = 0;
3392 virtual bool isAssumedHeapToSharedRemovedFree(CallBase &CB)
const = 0;
3395 StringRef
getName()
const override {
return "AAHeapToShared"; }
3398 const char *getIdAddr()
const override {
return &
ID; }
3402 static bool classof(
const AbstractAttribute *AA) {
3407 static const char ID;
3410struct AAHeapToSharedFunction :
public AAHeapToShared {
3411 AAHeapToSharedFunction(
const IRPosition &IRP, Attributor &
A)
3412 : AAHeapToShared(IRP,
A) {}
3414 const std::string getAsStr(Attributor *)
const override {
3415 return "[AAHeapToShared] " + std::to_string(MallocCalls.size()) +
3416 " malloc calls eligible.";
3420 void trackStatistics()
const override {}
3424 void findPotentialRemovedFreeCalls(Attributor &
A) {
3425 auto &OMPInfoCache =
static_cast<OMPInformationCache &
>(
A.getInfoCache());
3426 auto &FreeRFI = OMPInfoCache.RFIs[OMPRTL___kmpc_free_shared];
3428 PotentialRemovedFreeCalls.clear();
3430 for (CallBase *CB : MallocCalls) {
3432 for (
auto *U : CB->
users()) {
3434 if (
C &&
C->getCalledFunction() == FreeRFI.Declaration)
3438 if (FreeCalls.
size() != 1)
3441 PotentialRemovedFreeCalls.insert(FreeCalls.
front());
3447 indicatePessimisticFixpoint();
3451 auto &OMPInfoCache =
static_cast<OMPInformationCache &
>(
A.getInfoCache());
3452 auto &RFI = OMPInfoCache.RFIs[OMPRTL___kmpc_alloc_shared];
3453 if (!RFI.Declaration)
3457 [](
const IRPosition &,
const AbstractAttribute *,
3458 bool &) -> std::optional<Value *> {
return nullptr; };
3461 for (User *U : RFI.Declaration->
users())
3465 MallocCalls.insert(CB);
3470 findPotentialRemovedFreeCalls(
A);
3473 bool isAssumedHeapToShared(CallBase &CB)
const override {
3474 return isValidState() && MallocCalls.count(&CB);
3477 bool isAssumedHeapToSharedRemovedFree(CallBase &CB)
const override {
3478 return isValidState() && PotentialRemovedFreeCalls.count(&CB);
3482 if (MallocCalls.empty())
3483 return ChangeStatus::UNCHANGED;
3485 auto &OMPInfoCache =
static_cast<OMPInformationCache &
>(
A.getInfoCache());
3486 auto &FreeCall = OMPInfoCache.RFIs[OMPRTL___kmpc_free_shared];
3490 DepClassTy::OPTIONAL);
3493 for (CallBase *CB : MallocCalls) {
3495 if (HS &&
HS->isAssumedHeapToStack(*CB))
3500 for (
auto *U : CB->
users()) {
3502 if (
C &&
C->getCalledFunction() == FreeCall.Declaration)
3505 if (FreeCalls.
size() != 1)
3512 <<
" with shared memory."
3513 <<
" Shared memory usage is limited to "
3519 <<
" with " << AllocSize->getZExtValue()
3520 <<
" bytes of shared memory\n");
3525 Type *Int8Ty = Type::getInt8Ty(
M->getContext());
3526 Type *Int8ArrTy = ArrayType::get(Int8Ty, AllocSize->getZExtValue());
3527 auto *SharedMem =
new GlobalVariable(
3531 static_cast<unsigned>(AddressSpace::Shared));
3533 SharedMem, PointerType::getUnqual(
M->getContext()));
3535 auto Remark = [&](OptimizationRemark
OR) {
3536 return OR <<
"Replaced globalized variable with "
3537 <<
ore::NV(
"SharedMemory", AllocSize->getZExtValue())
3538 << (AllocSize->isOne() ?
" byte " :
" bytes ")
3539 <<
"of shared memory.";
3541 A.emitRemark<OptimizationRemark>(CB,
"OMP111",
Remark);
3543 MaybeAlign Alignment = CB->getRetAlign();
3545 "HeapToShared on allocation without alignment attribute");
3546 SharedMem->setAlignment(*Alignment);
3549 A.deleteAfterManifest(*CB);
3550 A.deleteAfterManifest(*FreeCalls.
front());
3552 SharedMemoryUsed += AllocSize->getZExtValue();
3553 NumBytesMovedToSharedMemory = SharedMemoryUsed;
3554 Changed = ChangeStatus::CHANGED;
3561 if (MallocCalls.empty())
3562 return indicatePessimisticFixpoint();
3563 auto &OMPInfoCache =
static_cast<OMPInformationCache &
>(
A.getInfoCache());
3564 auto &RFI = OMPInfoCache.RFIs[OMPRTL___kmpc_alloc_shared];
3565 if (!RFI.Declaration)
3566 return ChangeStatus::UNCHANGED;
3570 auto NumMallocCalls = MallocCalls.size();
3573 for (User *U : RFI.Declaration->
users()) {
3575 if (CB->getCaller() !=
F)
3577 if (!MallocCalls.count(CB))
3580 MallocCalls.remove(CB);
3583 const auto *ED =
A.getAAFor<AAExecutionDomain>(
3585 if (!ED || !ED->isExecutedByInitialThreadOnly(*CB))
3586 MallocCalls.remove(CB);
3590 findPotentialRemovedFreeCalls(
A);
3592 if (NumMallocCalls != MallocCalls.size())
3593 return ChangeStatus::CHANGED;
3595 return ChangeStatus::UNCHANGED;
3599 SmallSetVector<CallBase *, 4> MallocCalls;
3601 SmallPtrSet<CallBase *, 4> PotentialRemovedFreeCalls;
3603 unsigned SharedMemoryUsed = 0;
3606struct AAKernelInfo :
public StateWrapper<KernelInfoState, AbstractAttribute> {
3607 using Base = StateWrapper<KernelInfoState, AbstractAttribute>;
3608 AAKernelInfo(
const IRPosition &IRP, Attributor &
A) :
Base(IRP) {}
3612 static bool requiresCalleeForCallBase() {
return false; }
3615 void trackStatistics()
const override {}
3618 const std::string getAsStr(Attributor *)
const override {
3619 if (!isValidState())
3621 return std::string(SPMDCompatibilityTracker.isAssumed() ?
"SPMD"
3623 std::string(SPMDCompatibilityTracker.isAtFixpoint() ?
" [FIX]"
3625 std::string(
" #PRs: ") +
3626 (ReachedKnownParallelRegions.isValidState()
3627 ? std::to_string(ReachedKnownParallelRegions.size())
3629 ", #Unknown PRs: " +
3630 (ReachedUnknownParallelRegions.isValidState()
3631 ? std::to_string(ReachedUnknownParallelRegions.size())
3633 ", #Reaching Kernels: " +
3634 (ReachingKernelEntries.isValidState()
3635 ? std::to_string(ReachingKernelEntries.size())
3638 (ParallelLevels.isValidState()
3639 ? std::to_string(ParallelLevels.size())
3641 ", NestedPar: " + (NestedParallelism ?
"yes" :
"no");
3645 static AAKernelInfo &createForPosition(
const IRPosition &IRP, Attributor &
A);
3648 StringRef
getName()
const override {
return "AAKernelInfo"; }
3651 const char *getIdAddr()
const override {
return &
ID; }
3654 static bool classof(
const AbstractAttribute *AA) {
3658 static const char ID;
3663struct AAKernelInfoFunction : AAKernelInfo {
3664 AAKernelInfoFunction(
const IRPosition &IRP, Attributor &
A)
3665 : AAKernelInfo(IRP,
A) {}
3667 SmallPtrSet<Instruction *, 4> GuardedInstructions;
3669 SmallPtrSetImpl<Instruction *> &getGuardedInstructions() {
3670 return GuardedInstructions;
3673 void setConfigurationOfKernelEnvironment(ConstantStruct *ConfigC) {
3675 KernelEnvC, ConfigC, {KernelInfo::ConfigurationIdx});
3676 assert(NewKernelEnvC &&
"Failed to create new kernel environment");
3680#define KERNEL_ENVIRONMENT_CONFIGURATION_SETTER(MEMBER) \
3681 void set##MEMBER##OfKernelEnvironment(ConstantInt *NewVal) { \
3682 ConstantStruct *ConfigC = \
3683 KernelInfo::getConfigurationFromKernelEnvironment(KernelEnvC); \
3684 Constant *NewConfigC = ConstantFoldInsertValueInstruction( \
3685 ConfigC, NewVal, {KernelInfo::MEMBER##Idx}); \
3686 assert(NewConfigC && "Failed to create new configuration environment"); \
3687 setConfigurationOfKernelEnvironment(cast<ConstantStruct>(NewConfigC)); \
3698#undef KERNEL_ENVIRONMENT_CONFIGURATION_SETTER
3705 auto &OMPInfoCache =
static_cast<OMPInformationCache &
>(
A.getInfoCache());
3709 OMPInformationCache::RuntimeFunctionInfo &InitRFI =
3710 OMPInfoCache.RFIs[OMPRTL___kmpc_target_init];
3711 OMPInformationCache::RuntimeFunctionInfo &DeinitRFI =
3712 OMPInfoCache.RFIs[OMPRTL___kmpc_target_deinit];
3716 auto StoreCallBase = [](
Use &U,
3717 OMPInformationCache::RuntimeFunctionInfo &RFI,
3719 CallBase *CB = OpenMPOpt::getCallIfRegularCall(U, &RFI);
3721 "Unexpected use of __kmpc_target_init or __kmpc_target_deinit!");
3723 "Multiple uses of __kmpc_target_init or __kmpc_target_deinit!");
3729 StoreCallBase(U, InitRFI, KernelInitCB);
3733 DeinitRFI.foreachUse(
3735 StoreCallBase(U, DeinitRFI, KernelDeinitCB);
3741 if (!KernelInitCB || !KernelDeinitCB)
3745 ReachingKernelEntries.insert(Fn);
3746 IsKernelEntry =
true;
3754 KernelConfigurationSimplifyCB =
3756 bool &UsedAssumedInformation) -> std::optional<Constant *> {
3757 if (!isAtFixpoint()) {
3760 UsedAssumedInformation =
true;
3766 A.registerGlobalVariableSimplificationCallback(
3767 *KernelEnvGV, KernelConfigurationSimplifyCB);
3770 bool CanChangeToSPMD = OMPInfoCache.runtimeFnsAvailable(
3771 {OMPRTL___kmpc_get_hardware_thread_id_in_block,
3772 OMPRTL___kmpc_barrier_simple_spmd});
3776 KernelInfo::getExecModeFromKernelEnvironment(KernelEnvC);
3781 SPMDCompatibilityTracker.indicateOptimisticFixpoint();
3785 SPMDCompatibilityTracker.indicatePessimisticFixpoint();
3787 setExecModeOfKernelEnvironment(AssumedExecModeC);
3794 setMinThreadsOfKernelEnvironment(ConstantInt::get(
Int32Ty, MinThreads));
3797 auto [MinTeams, MaxTeams] =
3800 setMinTeamsOfKernelEnvironment(ConstantInt::get(
Int32Ty, MinTeams));
3802 setMaxTeamsOfKernelEnvironment(ConstantInt::get(
Int32Ty, MaxTeams));
3805 KernelInfo::getMayUseNestedParallelismFromKernelEnvironment(KernelEnvC);
3806 ConstantInt *AssumedMayUseNestedParallelismC = ConstantInt::get(
3808 setMayUseNestedParallelismOfKernelEnvironment(
3809 AssumedMayUseNestedParallelismC);
3813 KernelInfo::getUseGenericStateMachineFromKernelEnvironment(
3816 ConstantInt::get(UseGenericStateMachineC->
getIntegerType(),
false);
3817 setUseGenericStateMachineOfKernelEnvironment(
3818 AssumedUseGenericStateMachineC);
3824 if (!OMPInfoCache.RFIs[RFKind].Declaration)
3826 A.registerVirtualUseCallback(*OMPInfoCache.RFIs[RFKind].Declaration, CB);
3830 auto AddDependence = [](
Attributor &
A,
const AAKernelInfo *KI,
3847 if (SPMDCompatibilityTracker.isValidState())
3848 return AddDependence(
A,
this, QueryingAA);
3850 if (!ReachedKnownParallelRegions.isValidState())
3851 return AddDependence(
A,
this, QueryingAA);
3857 RegisterVirtualUse(OMPRTL___kmpc_get_hardware_num_threads_in_block,
3858 CustomStateMachineUseCB);
3859 RegisterVirtualUse(OMPRTL___kmpc_get_warp_size, CustomStateMachineUseCB);
3860 RegisterVirtualUse(OMPRTL___kmpc_barrier_simple_generic,
3861 CustomStateMachineUseCB);
3862 RegisterVirtualUse(OMPRTL___kmpc_kernel_parallel,
3863 CustomStateMachineUseCB);
3864 RegisterVirtualUse(OMPRTL___kmpc_kernel_end_parallel,
3865 CustomStateMachineUseCB);
3869 if (SPMDCompatibilityTracker.isAtFixpoint())
3876 if (!SPMDCompatibilityTracker.isValidState())
3877 return AddDependence(
A,
this, QueryingAA);
3880 RegisterVirtualUse(OMPRTL___kmpc_get_hardware_thread_id_in_block,
3889 if (!SPMDCompatibilityTracker.isValidState())
3890 return AddDependence(
A,
this, QueryingAA);
3891 if (SPMDCompatibilityTracker.empty())
3892 return AddDependence(
A,
this, QueryingAA);
3893 if (!mayContainParallelRegion())
3894 return AddDependence(
A,
this, QueryingAA);
3897 RegisterVirtualUse(OMPRTL___kmpc_barrier_simple_spmd, SPMDBarrierUseCB);
3901 static std::string sanitizeForGlobalName(std::string S) {
3905 return !((C >=
'a' && C <=
'z') || (C >=
'A' && C <=
'Z') ||
3906 (C >=
'0' && C <=
'9') || C ==
'_');
3917 if (!KernelInitCB || !KernelDeinitCB)
3918 return ChangeStatus::UNCHANGED;
3922 bool HasBuiltStateMachine =
true;
3923 if (!changeToSPMDMode(
A,
Changed)) {
3925 HasBuiltStateMachine = buildCustomStateMachine(
A,
Changed);
3927 HasBuiltStateMachine =
false;
3931 ConstantStruct *ExistingKernelEnvC =
3933 ConstantInt *OldUseGenericStateMachineVal =
3934 KernelInfo::getUseGenericStateMachineFromKernelEnvironment(
3935 ExistingKernelEnvC);
3936 if (!HasBuiltStateMachine)
3937 setUseGenericStateMachineOfKernelEnvironment(
3938 OldUseGenericStateMachineVal);
3941 GlobalVariable *KernelEnvGV =
3945 Changed = ChangeStatus::CHANGED;
3951 void insertInstructionGuardsHelper(Attributor &
A) {
3952 auto &OMPInfoCache =
static_cast<OMPInformationCache &
>(
A.getInfoCache());
3954 auto CreateGuardedRegion = [&](
Instruction *RegionStartI,
3956 LoopInfo *LI =
nullptr;
3957 DominatorTree *DT =
nullptr;
3958 MemorySSAUpdater *MSU =
nullptr;
3988 DT, LI, MSU,
"region.guarded.end");
3991 MSU,
"region.barrier");
3994 DT, LI, MSU,
"region.exit");
3996 SplitBlock(ParentBB, RegionStartI, DT, LI, MSU,
"region.guarded");
3999 "Expected a different CFG");
4002 ParentBB, ParentBB->
getTerminator(), DT, LI, MSU,
"region.check.tid");
4005 A.registerManifestAddedBasicBlock(*RegionEndBB);
4006 A.registerManifestAddedBasicBlock(*RegionBarrierBB);
4007 A.registerManifestAddedBasicBlock(*RegionExitBB);
4008 A.registerManifestAddedBasicBlock(*RegionStartBB);
4009 A.registerManifestAddedBasicBlock(*RegionCheckTidBB);
4011 bool HasBroadcastValues =
false;
4014 for (Instruction &
I : *RegionStartBB) {
4016 for (Use &U :
I.uses()) {
4022 if (OutsideUses.
empty())
4025 HasBroadcastValues =
true;
4029 auto *SharedMem =
new GlobalVariable(
4030 M,
I.getType(),
false,
4032 sanitizeForGlobalName(
4033 (
I.getName() +
".guarded.output.alloc").str()),
4035 static_cast<unsigned>(AddressSpace::Shared));
4038 new StoreInst(&
I, SharedMem,
4041 LoadInst *LoadI =
new LoadInst(
4042 I.getType(), SharedMem,
I.getName() +
".guarded.output.load",
4046 for (Use *U : OutsideUses)
4047 A.changeUseAfterManifest(*U, *LoadI);
4050 auto &OMPInfoCache =
static_cast<OMPInformationCache &
>(
A.getInfoCache());
4055 OpenMPIRBuilder::LocationDescription Loc(
4056 InsertPointTy(ParentBB, ParentBB->
end()),
DL);
4058 uint32_t SrcLocStrSize;
4067 OpenMPIRBuilder::LocationDescription LocRegionCheckTid(
4068 InsertPointTy(RegionCheckTidBB, RegionCheckTidBB->
end()),
DL);
4070 FunctionCallee HardwareTidFn =
4072 M, OMPRTL___kmpc_get_hardware_thread_id_in_block);
4076 OMPInfoCache.setCallingConvention(HardwareTidFn, Tid);
4078 OMPInfoCache.OMPBuilder.
Builder
4079 .
CreateCondBr(TidCheck, RegionStartBB, RegionBarrierBB)
4084 FunctionCallee BarrierFn =
4086 M, OMPRTL___kmpc_barrier_simple_spmd);
4092 OMPInfoCache.setCallingConvention(BarrierFn, Barrier);
4095 if (HasBroadcastValues) {
4100 OMPInfoCache.setCallingConvention(BarrierFn, Barrier);
4104 auto &AllocSharedRFI = OMPInfoCache.RFIs[OMPRTL___kmpc_alloc_shared];
4105 SmallPtrSet<BasicBlock *, 8> Visited;
4106 for (Instruction *GuardedI : SPMDCompatibilityTracker) {
4108 if (!Visited.
insert(BB).second)
4114 while (++IP != IPEnd) {
4115 if (!IP->mayHaveSideEffects() && !IP->mayReadFromMemory())
4118 if (OpenMPOpt::getCallIfRegularCall(*
I, &AllocSharedRFI))
4120 if (!
I->user_empty() || !SPMDCompatibilityTracker.contains(
I)) {
4121 LastEffect =
nullptr;
4128 for (
auto &Reorder : Reorders)
4129 Reorder.first->moveBefore(Reorder.second->getIterator());
4134 for (Instruction *GuardedI : SPMDCompatibilityTracker) {
4136 auto *CalleeAA =
A.lookupAAFor<AAKernelInfo>(
4139 assert(CalleeAA !=
nullptr &&
"Expected Callee AAKernelInfo");
4142 if (CalleeAAFunction.getGuardedInstructions().contains(GuardedI))
4145 Instruction *GuardedRegionStart =
nullptr, *GuardedRegionEnd =
nullptr;
4146 for (Instruction &
I : *BB) {
4149 if (SPMDCompatibilityTracker.contains(&
I)) {
4150 CalleeAAFunction.getGuardedInstructions().insert(&
I);
4151 if (GuardedRegionStart)
4152 GuardedRegionEnd = &
I;
4154 GuardedRegionStart = GuardedRegionEnd = &
I;
4161 if (GuardedRegionStart) {
4163 std::make_pair(GuardedRegionStart, GuardedRegionEnd));
4164 GuardedRegionStart =
nullptr;
4165 GuardedRegionEnd =
nullptr;
4170 for (
auto &GR : GuardedRegions)
4171 CreateGuardedRegion(GR.first, GR.second);
4174 void forceSingleThreadPerWorkgroupHelper(Attributor &
A) {
4183 auto &Ctx = getAnchorValue().getContext();
4190 KernelInitCB->
getNextNode(),
"main.thread.user_code");
4195 A.registerManifestAddedBasicBlock(*InitBB);
4196 A.registerManifestAddedBasicBlock(*UserCodeBB);
4197 A.registerManifestAddedBasicBlock(*ReturnBB);
4206 auto &OMPInfoCache =
static_cast<OMPInformationCache &
>(
A.getInfoCache());
4207 FunctionCallee ThreadIdInBlockFn =
4209 M, OMPRTL___kmpc_get_hardware_thread_id_in_block);
4212 CallInst *ThreadIdInBlock =
4214 OMPInfoCache.setCallingConvention(ThreadIdInBlockFn, ThreadIdInBlock);
4220 ConstantInt::get(ThreadIdInBlock->
getType(), 0),
4221 "thread.is_main", InitBB);
4227 auto &OMPInfoCache =
static_cast<OMPInformationCache &
>(
A.getInfoCache());
4229 if (!SPMDCompatibilityTracker.isAssumed()) {
4230 for (Instruction *NonCompatibleI : SPMDCompatibilityTracker) {
4231 if (!NonCompatibleI)
4236 if (OMPInfoCache.RTLFunctions.contains(CB->getCalledFunction()))
4239 auto Remark = [&](OptimizationRemarkAnalysis ORA) {
4240 ORA <<
"Value has potential side effects preventing SPMD-mode "
4243 ORA <<
". Add `[[omp::assume(\"ompx_spmd_amenable\")]]` to "
4244 "the called function to override";
4248 A.emitRemark<OptimizationRemarkAnalysis>(NonCompatibleI,
"OMP121",
4252 << *NonCompatibleI <<
"\n");
4264 Kernel = CB->getCaller();
4269 ConstantStruct *ExistingKernelEnvC =
4272 KernelInfo::getExecModeFromKernelEnvironment(ExistingKernelEnvC);
4278 Changed = ChangeStatus::CHANGED;
4282 if (mayContainParallelRegion())
4283 insertInstructionGuardsHelper(
A);
4285 forceSingleThreadPerWorkgroupHelper(
A);
4290 "Initially non-SPMD kernel has SPMD exec mode!");
4291 setExecModeOfKernelEnvironment(
4295 ++NumOpenMPTargetRegionKernelsSPMD;
4297 auto Remark = [&](OptimizationRemark
OR) {
4298 return OR <<
"Transformed generic-mode kernel to SPMD-mode.";
4300 A.emitRemark<OptimizationRemark>(KernelInitCB,
"OMP120",
Remark);
4310 if (!ReachedKnownParallelRegions.isValidState())
4313 auto &OMPInfoCache =
static_cast<OMPInformationCache &
>(
A.getInfoCache());
4314 if (!OMPInfoCache.runtimeFnsAvailable(
4315 {OMPRTL___kmpc_get_hardware_num_threads_in_block,
4316 OMPRTL___kmpc_get_warp_size, OMPRTL___kmpc_barrier_simple_generic,
4317 OMPRTL___kmpc_kernel_parallel, OMPRTL___kmpc_kernel_end_parallel}))
4320 ConstantStruct *ExistingKernelEnvC =
4327 ConstantInt *UseStateMachineC =
4328 KernelInfo::getUseGenericStateMachineFromKernelEnvironment(
4329 ExistingKernelEnvC);
4330 ConstantInt *ModeC =
4331 KernelInfo::getExecModeFromKernelEnvironment(ExistingKernelEnvC);
4336 if (UseStateMachineC->
isZero() ||
4340 Changed = ChangeStatus::CHANGED;
4343 setUseGenericStateMachineOfKernelEnvironment(
4350 if (!mayContainParallelRegion()) {
4351 ++NumOpenMPTargetRegionKernelsWithoutStateMachine;
4353 auto Remark = [&](OptimizationRemark
OR) {
4354 return OR <<
"Removing unused state machine from generic-mode kernel.";
4356 A.emitRemark<OptimizationRemark>(KernelInitCB,
"OMP130",
Remark);
4362 if (ReachedUnknownParallelRegions.empty()) {
4363 ++NumOpenMPTargetRegionKernelsCustomStateMachineWithoutFallback;
4365 auto Remark = [&](OptimizationRemark
OR) {
4366 return OR <<
"Rewriting generic-mode kernel with a customized state "
4369 A.emitRemark<OptimizationRemark>(KernelInitCB,
"OMP131",
Remark);
4371 ++NumOpenMPTargetRegionKernelsCustomStateMachineWithFallback;
4373 auto Remark = [&](OptimizationRemarkAnalysis
OR) {
4374 return OR <<
"Generic-mode kernel is executed with a customized state "
4375 "machine that requires a fallback.";
4377 A.emitRemark<OptimizationRemarkAnalysis>(KernelInitCB,
"OMP132",
Remark);
4380 for (CallBase *UnknownParallelRegionCB : ReachedUnknownParallelRegions) {
4381 if (!UnknownParallelRegionCB)
4383 auto Remark = [&](OptimizationRemarkAnalysis ORA) {
4384 return ORA <<
"Call may contain unknown parallel regions. Use "
4385 <<
"`[[omp::assume(\"omp_no_parallelism\")]]` to "
4388 A.emitRemark<OptimizationRemarkAnalysis>(UnknownParallelRegionCB,
4423 auto &Ctx = getAnchorValue().getContext();
4427 BasicBlock *InitBB = KernelInitCB->getParent();
4429 KernelInitCB->getNextNode(),
"thread.user_code.check");
4433 Ctx,
"worker_state_machine.begin",
Kernel, UserCodeEntryBB);
4435 Ctx,
"worker_state_machine.finished",
Kernel, UserCodeEntryBB);
4437 Ctx,
"worker_state_machine.is_active.check",
Kernel, UserCodeEntryBB);
4440 Kernel, UserCodeEntryBB);
4443 Kernel, UserCodeEntryBB);
4445 Ctx,
"worker_state_machine.done.barrier",
Kernel, UserCodeEntryBB);
4446 A.registerManifestAddedBasicBlock(*InitBB);
4447 A.registerManifestAddedBasicBlock(*UserCodeEntryBB);
4448 A.registerManifestAddedBasicBlock(*IsWorkerCheckBB);
4449 A.registerManifestAddedBasicBlock(*StateMachineBeginBB);
4450 A.registerManifestAddedBasicBlock(*StateMachineFinishedBB);
4451 A.registerManifestAddedBasicBlock(*StateMachineIsActiveCheckBB);
4452 A.registerManifestAddedBasicBlock(*StateMachineIfCascadeCurrentBB);
4453 A.registerManifestAddedBasicBlock(*StateMachineEndParallelBB);
4454 A.registerManifestAddedBasicBlock(*StateMachineDoneBarrierBB);
4456 const DebugLoc &DLoc = KernelInitCB->getDebugLoc();
4462 ConstantInt::getAllOnesValue(KernelInitCB->getType()),
4463 "thread.is_worker", InitBB);
4468 FunctionCallee BlockHwSizeFn =
4470 M, OMPRTL___kmpc_get_hardware_num_threads_in_block);
4471 FunctionCallee WarpSizeFn =
4473 M, OMPRTL___kmpc_get_warp_size);
4474 CallInst *BlockHwSize =
4476 OMPInfoCache.setCallingConvention(BlockHwSizeFn, BlockHwSize);
4478 CallInst *WarpSize =
4480 OMPInfoCache.setCallingConvention(WarpSizeFn, WarpSize);
4483 BlockHwSize, WarpSize,
"block.size", IsWorkerCheckBB);
4487 "thread.is_main_or_worker", IsWorkerCheckBB);
4490 IsMainOrWorker, IsWorkerCheckBB);
4493 const DataLayout &
DL =
M.getDataLayout();
4494 Type *VoidPtrTy = PointerType::getUnqual(Ctx);
4496 new AllocaInst(VoidPtrTy,
DL.getAllocaAddrSpace(),
nullptr,
4501 OpenMPIRBuilder::LocationDescription(
4502 IRBuilder<>::InsertPoint(StateMachineBeginBB,
4503 StateMachineBeginBB->
end()),
4506 Value *Ident = KernelInfo::getIdentFromKernelEnvironment(KernelEnvC);
4507 Value *GTid = KernelInitCB;
4509 FunctionCallee BarrierFn =
4511 M, OMPRTL___kmpc_barrier_simple_generic);
4514 OMPInfoCache.setCallingConvention(BarrierFn, Barrier);
4518 (
unsigned int)AddressSpace::Generic) {
4519 WorkFnAI =
new AddrSpaceCastInst(
4520 WorkFnAI, PointerType::get(Ctx, (
unsigned int)AddressSpace::Generic),
4521 WorkFnAI->
getName() +
".generic", StateMachineBeginBB);
4525 FunctionCallee KernelParallelFn =
4527 M, OMPRTL___kmpc_kernel_parallel);
4529 KernelParallelFn, {WorkFnAI},
"worker.is_active", StateMachineBeginBB);
4530 OMPInfoCache.setCallingConvention(KernelParallelFn, IsActiveWorker);
4532 Instruction *WorkFn =
new LoadInst(VoidPtrTy, WorkFnAI,
"worker.work_fn",
4533 StateMachineBeginBB);
4536 FunctionType *ParallelRegionFnTy = FunctionType::get(
4537 Type::getVoidTy(Ctx), {Type::getInt16Ty(Ctx), Type::getInt32Ty(Ctx)},
4543 StateMachineBeginBB);
4544 IsDone->setDebugLoc(DLoc);
4546 IsDone, StateMachineBeginBB)
4550 StateMachineDoneBarrierBB, IsActiveWorker,
4551 StateMachineIsActiveCheckBB)
4557 const unsigned int WrapperFunctionArgNo = 6;
4562 for (
int I = 0,
E = ReachedKnownParallelRegions.size();
I <
E; ++
I) {
4563 auto *CB = ReachedKnownParallelRegions[
I];
4565 CB->getArgOperand(WrapperFunctionArgNo)->stripPointerCasts());
4567 Ctx,
"worker_state_machine.parallel_region.execute",
Kernel,
4568 StateMachineEndParallelBB);
4570 ->setDebugLoc(DLoc);
4576 Kernel, StateMachineEndParallelBB);
4577 A.registerManifestAddedBasicBlock(*PRExecuteBB);
4578 A.registerManifestAddedBasicBlock(*PRNextBB);
4583 if (
I + 1 <
E || !ReachedUnknownParallelRegions.empty()) {
4586 "worker.check_parallel_region", StateMachineIfCascadeCurrentBB);
4594 StateMachineIfCascadeCurrentBB)
4596 StateMachineIfCascadeCurrentBB = PRNextBB;
4602 if (!ReachedUnknownParallelRegions.empty()) {
4603 StateMachineIfCascadeCurrentBB->
setName(
4604 "worker_state_machine.parallel_region.fallback.execute");
4606 StateMachineIfCascadeCurrentBB)
4607 ->setDebugLoc(DLoc);
4610 StateMachineIfCascadeCurrentBB)
4613 FunctionCallee EndParallelFn =
4615 M, OMPRTL___kmpc_kernel_end_parallel);
4616 CallInst *EndParallel =
4618 OMPInfoCache.setCallingConvention(EndParallelFn, EndParallel);
4624 ->setDebugLoc(DLoc);
4634 KernelInfoState StateBefore = getState();
4640 struct UpdateKernelEnvCRAII {
4641 AAKernelInfoFunction &AA;
4643 UpdateKernelEnvCRAII(AAKernelInfoFunction &AA) : AA(AA) {}
4645 ~UpdateKernelEnvCRAII() {
4649 ConstantStruct *ExistingKernelEnvC =
4652 if (!AA.isValidState()) {
4653 AA.KernelEnvC = ExistingKernelEnvC;
4657 if (!AA.ReachedKnownParallelRegions.isValidState())
4658 AA.setUseGenericStateMachineOfKernelEnvironment(
4659 KernelInfo::getUseGenericStateMachineFromKernelEnvironment(
4660 ExistingKernelEnvC));
4662 if (!AA.SPMDCompatibilityTracker.isValidState())
4663 AA.setExecModeOfKernelEnvironment(
4664 KernelInfo::getExecModeFromKernelEnvironment(ExistingKernelEnvC));
4666 ConstantInt *MayUseNestedParallelismC =
4667 KernelInfo::getMayUseNestedParallelismFromKernelEnvironment(
4669 ConstantInt *NewMayUseNestedParallelismC = ConstantInt::get(
4670 MayUseNestedParallelismC->
getIntegerType(), AA.NestedParallelism);
4671 AA.setMayUseNestedParallelismOfKernelEnvironment(
4672 NewMayUseNestedParallelismC);
4682 if (!
I.mayWriteToMemory())
4685 const auto *UnderlyingObjsAA =
A.getAAFor<AAUnderlyingObjects>(
4687 DepClassTy::OPTIONAL);
4688 auto *
HS =
A.getAAFor<AAHeapToStack>(
4690 DepClassTy::OPTIONAL);
4691 if (UnderlyingObjsAA &&
4692 UnderlyingObjsAA->forallUnderlyingObjects([&](
Value &Obj) {
4693 if (AA::isAssumedThreadLocalObject(A, Obj, *this))
4697 auto *CB = dyn_cast<CallBase>(&Obj);
4698 return CB && HS && HS->isAssumedHeapToStack(*CB);
4704 SPMDCompatibilityTracker.insert(&
I);
4708 bool UsedAssumedInformationInCheckRWInst =
false;
4709 if (!SPMDCompatibilityTracker.isAtFixpoint())
4710 if (!
A.checkForAllReadWriteInstructions(
4711 CheckRWInst, *
this, UsedAssumedInformationInCheckRWInst))
4712 SPMDCompatibilityTracker.indicatePessimisticFixpoint();
4714 bool UsedAssumedInformationFromReachingKernels =
false;
4715 if (!IsKernelEntry) {
4716 updateParallelLevels(
A);
4718 bool AllReachingKernelsKnown =
true;
4719 updateReachingKernelEntries(
A, AllReachingKernelsKnown);
4720 UsedAssumedInformationFromReachingKernels = !AllReachingKernelsKnown;
4722 if (!SPMDCompatibilityTracker.empty()) {
4723 if (!ParallelLevels.isValidState())
4724 SPMDCompatibilityTracker.indicatePessimisticFixpoint();
4725 else if (!ReachingKernelEntries.isValidState())
4726 SPMDCompatibilityTracker.indicatePessimisticFixpoint();
4732 for (
auto *
Kernel : ReachingKernelEntries) {
4733 auto *CBAA =
A.getAAFor<AAKernelInfo>(
4735 if (CBAA && CBAA->SPMDCompatibilityTracker.isValidState() &&
4736 CBAA->SPMDCompatibilityTracker.isAssumed())
4740 if (!CBAA || !CBAA->SPMDCompatibilityTracker.isAtFixpoint())
4741 UsedAssumedInformationFromReachingKernels =
true;
4743 if (SPMD != 0 &&
Generic != 0)
4744 SPMDCompatibilityTracker.indicatePessimisticFixpoint();
4750 bool AllParallelRegionStatesWereFixed =
true;
4751 bool AllSPMDStatesWereFixed =
true;
4754 auto *CBAA =
A.getAAFor<AAKernelInfo>(
4758 getState() ^= CBAA->getState();
4759 AllSPMDStatesWereFixed &= CBAA->SPMDCompatibilityTracker.isAtFixpoint();
4760 AllParallelRegionStatesWereFixed &=
4761 CBAA->ReachedKnownParallelRegions.isAtFixpoint();
4762 AllParallelRegionStatesWereFixed &=
4763 CBAA->ReachedUnknownParallelRegions.isAtFixpoint();
4767 bool UsedAssumedInformationInCheckCallInst =
false;
4768 if (!
A.checkForAllCallLikeInstructions(
4769 CheckCallInst, *
this, UsedAssumedInformationInCheckCallInst)) {
4771 <<
"Failed to visit all call-like instructions!\n";);
4772 return indicatePessimisticFixpoint();
4777 if (!UsedAssumedInformationInCheckCallInst &&
4778 AllParallelRegionStatesWereFixed) {
4779 ReachedKnownParallelRegions.indicateOptimisticFixpoint();
4780 ReachedUnknownParallelRegions.indicateOptimisticFixpoint();
4785 if (!UsedAssumedInformationInCheckRWInst &&
4786 !UsedAssumedInformationInCheckCallInst &&
4787 !UsedAssumedInformationFromReachingKernels && AllSPMDStatesWereFixed)
4788 SPMDCompatibilityTracker.indicateOptimisticFixpoint();
4790 return StateBefore == getState() ? ChangeStatus::UNCHANGED
4791 : ChangeStatus::CHANGED;
4796 void updateReachingKernelEntries(Attributor &
A,
4797 bool &AllReachingKernelsKnown) {
4798 auto PredCallSite = [&](AbstractCallSite ACS) {
4801 assert(Caller &&
"Caller is nullptr");
4803 auto *CAA =
A.getOrCreateAAFor<AAKernelInfo>(
4805 if (CAA && CAA->ReachingKernelEntries.isValidState()) {
4806 ReachingKernelEntries ^= CAA->ReachingKernelEntries;
4812 ReachingKernelEntries.indicatePessimisticFixpoint();
4817 if (!
A.checkForAllCallSites(PredCallSite, *
this,
4819 AllReachingKernelsKnown))
4820 ReachingKernelEntries.indicatePessimisticFixpoint();
4824 void updateParallelLevels(Attributor &
A) {
4825 auto &OMPInfoCache =
static_cast<OMPInformationCache &
>(
A.getInfoCache());
4826 OMPInformationCache::RuntimeFunctionInfo &Parallel60RFI =
4827 OMPInfoCache.RFIs[OMPRTL___kmpc_parallel_60];
4829 auto PredCallSite = [&](AbstractCallSite ACS) {
4832 assert(Caller &&
"Caller is nullptr");
4836 if (CAA && CAA->ParallelLevels.isValidState()) {
4842 if (Caller == Parallel60RFI.Declaration) {
4843 ParallelLevels.indicatePessimisticFixpoint();
4847 ParallelLevels ^= CAA->ParallelLevels;
4854 ParallelLevels.indicatePessimisticFixpoint();
4859 bool AllCallSitesKnown =
true;
4860 if (!
A.checkForAllCallSites(PredCallSite, *
this,
4863 ParallelLevels.indicatePessimisticFixpoint();
4870struct AAKernelInfoCallSite : AAKernelInfo {
4871 AAKernelInfoCallSite(
const IRPosition &IRP, Attributor &
A)
4872 : AAKernelInfo(IRP,
A) {}
4876 AAKernelInfo::initialize(
A);
4879 auto *AssumptionAA =
A.getAAFor<AAAssumptionInfo>(
4883 if (AssumptionAA && AssumptionAA->hasAssumption(
"ompx_spmd_amenable")) {
4884 indicateOptimisticFixpoint();
4892 indicateOptimisticFixpoint();
4901 auto &OMPInfoCache =
static_cast<OMPInformationCache &
>(
A.getInfoCache());
4902 const auto &It = OMPInfoCache.RuntimeFunctionIDMap.find(Callee);
4903 if (It == OMPInfoCache.RuntimeFunctionIDMap.end()) {
4905 if (!Callee || !
A.isFunctionIPOAmendable(*Callee)) {
4909 if (!AssumptionAA ||
4910 !(AssumptionAA->hasAssumption(
"omp_no_openmp") ||
4911 AssumptionAA->hasAssumption(
"omp_no_parallelism")))
4912 ReachedUnknownParallelRegions.insert(&CB);
4916 if (!SPMDCompatibilityTracker.isAtFixpoint()) {
4917 SPMDCompatibilityTracker.indicatePessimisticFixpoint();
4918 SPMDCompatibilityTracker.insert(&CB);
4923 indicateOptimisticFixpoint();
4929 if (NumCallees > 1) {
4930 indicatePessimisticFixpoint();
4937 case OMPRTL___kmpc_is_spmd_exec_mode:
4938 case OMPRTL___kmpc_distribute_static_fini:
4939 case OMPRTL___kmpc_for_static_fini:
4940 case OMPRTL___kmpc_global_thread_num:
4941 case OMPRTL___kmpc_get_hardware_num_threads_in_block:
4942 case OMPRTL___kmpc_get_hardware_num_blocks:
4943 case OMPRTL___kmpc_single:
4944 case OMPRTL___kmpc_end_single:
4945 case OMPRTL___kmpc_master:
4946 case OMPRTL___kmpc_end_master:
4947 case OMPRTL___kmpc_barrier:
4948 case OMPRTL___kmpc_nvptx_parallel_reduce_nowait_v2:
4949 case OMPRTL___kmpc_nvptx_teams_reduce_nowait_v2:
4950 case OMPRTL___kmpc_error:
4951 case OMPRTL___kmpc_flush:
4952 case OMPRTL___kmpc_get_hardware_thread_id_in_block:
4953 case OMPRTL___kmpc_get_warp_size:
4954 case OMPRTL_omp_get_thread_num:
4955 case OMPRTL_omp_get_num_threads:
4956 case OMPRTL_omp_get_max_threads:
4957 case OMPRTL_omp_in_parallel:
4958 case OMPRTL_omp_get_dynamic:
4959 case OMPRTL_omp_get_cancellation:
4960 case OMPRTL_omp_get_nested:
4961 case OMPRTL_omp_get_schedule:
4962 case OMPRTL_omp_get_thread_limit:
4963 case OMPRTL_omp_get_supported_active_levels:
4964 case OMPRTL_omp_get_max_active_levels:
4965 case OMPRTL_omp_get_level:
4966 case OMPRTL_omp_get_ancestor_thread_num:
4967 case OMPRTL_omp_get_team_size:
4968 case OMPRTL_omp_get_active_level:
4969 case OMPRTL_omp_in_final:
4970 case OMPRTL_omp_get_proc_bind:
4971 case OMPRTL_omp_get_num_places:
4972 case OMPRTL_omp_get_num_procs:
4973 case OMPRTL_omp_get_place_proc_ids:
4974 case OMPRTL_omp_get_place_num:
4975 case OMPRTL_omp_get_partition_num_places:
4976 case OMPRTL_omp_get_partition_place_nums:
4977 case OMPRTL_omp_get_wtime:
4979 case OMPRTL___kmpc_distribute_static_init_4:
4980 case OMPRTL___kmpc_distribute_static_init_4u:
4981 case OMPRTL___kmpc_distribute_static_init_8:
4982 case OMPRTL___kmpc_distribute_static_init_8u:
4983 case OMPRTL___kmpc_for_static_init_4:
4984 case OMPRTL___kmpc_for_static_init_4u:
4985 case OMPRTL___kmpc_for_static_init_8:
4986 case OMPRTL___kmpc_for_static_init_8u: {
4988 unsigned ScheduleArgOpNo = 2;
4989 auto *ScheduleTypeCI =
4991 unsigned ScheduleTypeVal =
4992 ScheduleTypeCI ? ScheduleTypeCI->getZExtValue() : 0;
4994 case OMPScheduleType::UnorderedStatic:
4995 case OMPScheduleType::UnorderedStaticChunked:
4996 case OMPScheduleType::OrderedDistribute:
4997 case OMPScheduleType::OrderedDistributeChunked:
5000 SPMDCompatibilityTracker.indicatePessimisticFixpoint();
5001 SPMDCompatibilityTracker.insert(&CB);
5005 case OMPRTL___kmpc_target_init:
5008 case OMPRTL___kmpc_target_deinit:
5009 KernelDeinitCB = &CB;
5011 case OMPRTL___kmpc_parallel_60:
5012 if (!handleParallel60(
A, CB))
5013 indicatePessimisticFixpoint();
5015 case OMPRTL___kmpc_omp_task:
5017 SPMDCompatibilityTracker.indicatePessimisticFixpoint();
5018 SPMDCompatibilityTracker.insert(&CB);
5019 ReachedUnknownParallelRegions.insert(&CB);
5021 case OMPRTL___kmpc_alloc_shared:
5022 case OMPRTL___kmpc_free_shared:
5028 SPMDCompatibilityTracker.indicatePessimisticFixpoint();
5029 SPMDCompatibilityTracker.insert(&CB);
5035 indicateOptimisticFixpoint();
5039 A.getAAFor<AACallEdges>(*
this, getIRPosition(), DepClassTy::OPTIONAL);
5040 if (!AACE || !AACE->getState().isValidState() || AACE->hasUnknownCallee()) {
5041 CheckCallee(getAssociatedFunction(), 1);
5044 const auto &OptimisticEdges = AACE->getOptimisticEdges();
5045 for (
auto *Callee : OptimisticEdges) {
5046 CheckCallee(Callee, OptimisticEdges.size());
5057 auto &OMPInfoCache =
static_cast<OMPInformationCache &
>(
A.getInfoCache());
5058 KernelInfoState StateBefore = getState();
5060 auto CheckCallee = [&](
Function *
F,
int NumCallees) {
5061 const auto &It = OMPInfoCache.RuntimeFunctionIDMap.find(
F);
5065 if (It == OMPInfoCache.RuntimeFunctionIDMap.end()) {
5068 A.getAAFor<AAKernelInfo>(*
this, FnPos, DepClassTy::REQUIRED);
5070 return indicatePessimisticFixpoint();
5071 if (getState() == FnAA->getState())
5072 return ChangeStatus::UNCHANGED;
5073 getState() = FnAA->getState();
5074 return ChangeStatus::CHANGED;
5077 return indicatePessimisticFixpoint();
5080 if (It->getSecond() == OMPRTL___kmpc_parallel_60) {
5081 if (!handleParallel60(
A, CB))
5082 return indicatePessimisticFixpoint();
5083 return StateBefore == getState() ? ChangeStatus::UNCHANGED
5084 : ChangeStatus::CHANGED;
5090 (It->getSecond() == OMPRTL___kmpc_alloc_shared ||
5091 It->getSecond() == OMPRTL___kmpc_free_shared) &&
5092 "Expected a __kmpc_alloc_shared or __kmpc_free_shared runtime call");
5094 auto *HeapToStackAA =
A.getAAFor<AAHeapToStack>(
5096 auto *HeapToSharedAA =
A.getAAFor<AAHeapToShared>(
5104 case OMPRTL___kmpc_alloc_shared:
5105 if ((!HeapToStackAA || !HeapToStackAA->isAssumedHeapToStack(CB)) &&
5106 (!HeapToSharedAA || !HeapToSharedAA->isAssumedHeapToShared(CB)))
5107 SPMDCompatibilityTracker.insert(&CB);
5109 case OMPRTL___kmpc_free_shared:
5110 if ((!HeapToStackAA ||
5111 !HeapToStackAA->isAssumedHeapToStackRemovedFree(CB)) &&
5113 !HeapToSharedAA->isAssumedHeapToSharedRemovedFree(CB)))
5114 SPMDCompatibilityTracker.insert(&CB);
5117 SPMDCompatibilityTracker.indicatePessimisticFixpoint();
5118 SPMDCompatibilityTracker.insert(&CB);
5120 return ChangeStatus::CHANGED;
5124 A.getAAFor<AACallEdges>(*
this, getIRPosition(), DepClassTy::OPTIONAL);
5125 if (!AACE || !AACE->getState().isValidState() || AACE->hasUnknownCallee()) {
5126 if (Function *
F = getAssociatedFunction())
5129 const auto &OptimisticEdges = AACE->getOptimisticEdges();
5130 for (
auto *Callee : OptimisticEdges) {
5131 CheckCallee(Callee, OptimisticEdges.size());
5137 return StateBefore == getState() ? ChangeStatus::UNCHANGED
5138 : ChangeStatus::CHANGED;
5143 bool handleParallel60(Attributor &
A, CallBase &CB) {
5144 const unsigned int NonWrapperFunctionArgNo = 5;
5145 const unsigned int WrapperFunctionArgNo = 6;
5146 auto ParallelRegionOpArgNo = SPMDCompatibilityTracker.isAssumed()
5147 ? NonWrapperFunctionArgNo
5148 : WrapperFunctionArgNo;
5152 if (!ParallelRegion)
5155 ReachedKnownParallelRegions.insert(&CB);
5157 auto *FnAA =
A.getAAFor<AAKernelInfo>(
5159 NestedParallelism |= !FnAA || !FnAA->getState().isValidState() ||
5160 !FnAA->ReachedKnownParallelRegions.empty() ||
5161 !FnAA->ReachedKnownParallelRegions.isValidState() ||
5162 !FnAA->ReachedUnknownParallelRegions.isValidState() ||
5163 !FnAA->ReachedUnknownParallelRegions.empty();
5168struct AAFoldRuntimeCall
5169 :
public StateWrapper<BooleanState, AbstractAttribute> {
5170 using Base = StateWrapper<BooleanState, AbstractAttribute>;
5172 AAFoldRuntimeCall(
const IRPosition &IRP, Attributor &
A) :
Base(IRP) {}
5175 void trackStatistics()
const override {}
5178 static AAFoldRuntimeCall &createForPosition(
const IRPosition &IRP,
5182 StringRef
getName()
const override {
return "AAFoldRuntimeCall"; }
5185 const char *getIdAddr()
const override {
return &
ID; }
5189 static bool classof(
const AbstractAttribute *AA) {
5193 static const char ID;
5196struct AAFoldRuntimeCallCallSiteReturned : AAFoldRuntimeCall {
5197 AAFoldRuntimeCallCallSiteReturned(
const IRPosition &IRP, Attributor &
A)
5198 : AAFoldRuntimeCall(IRP,
A) {}
5201 const std::string getAsStr(Attributor *)
const override {
5202 if (!isValidState())
5205 std::string Str(
"simplified value: ");
5207 if (!SimplifiedValue)
5208 return Str + std::string(
"none");
5210 if (!*SimplifiedValue)
5211 return Str + std::string(
"nullptr");
5214 return Str + std::to_string(CI->getSExtValue());
5216 return Str + std::string(
"unknown");
5221 indicatePessimisticFixpoint();
5225 auto &OMPInfoCache =
static_cast<OMPInformationCache &
>(
A.getInfoCache());
5226 const auto &It = OMPInfoCache.RuntimeFunctionIDMap.find(Callee);
5227 assert(It != OMPInfoCache.RuntimeFunctionIDMap.end() &&
5228 "Expected a known OpenMP runtime function");
5230 RFKind = It->getSecond();
5233 A.registerSimplificationCallback(
5235 [&](
const IRPosition &IRP,
const AbstractAttribute *AA,
5236 bool &UsedAssumedInformation) -> std::optional<Value *> {
5237 assert((isValidState() || SimplifiedValue ==
nullptr) &&
5238 "Unexpected invalid state!");
5240 if (!isAtFixpoint()) {
5241 UsedAssumedInformation =
true;
5243 A.recordDependence(*
this, *AA, DepClassTy::OPTIONAL);
5245 return SimplifiedValue;
5252 case OMPRTL___kmpc_is_spmd_exec_mode:
5255 case OMPRTL___kmpc_parallel_level:
5258 case OMPRTL___kmpc_get_hardware_num_threads_in_block:
5259 Changed =
Changed | foldKernelFnAttribute(
A,
"omp_target_thread_limit");
5261 case OMPRTL___kmpc_get_hardware_num_blocks:
5274 if (SimplifiedValue && *SimplifiedValue) {
5277 A.deleteAfterManifest(
I);
5280 auto Remark = [&](OptimizationRemark
OR) {
5282 return OR <<
"Replacing OpenMP runtime call "
5284 <<
ore::NV(
"FoldedValue",
C->getZExtValue()) <<
".";
5285 return OR <<
"Replacing OpenMP runtime call "
5290 A.emitRemark<OptimizationRemark>(CB,
"OMP180",
Remark);
5293 << **SimplifiedValue <<
"\n");
5295 Changed = ChangeStatus::CHANGED;
5302 SimplifiedValue =
nullptr;
5303 return AAFoldRuntimeCall::indicatePessimisticFixpoint();
5309 std::optional<Value *> SimplifiedValueBefore = SimplifiedValue;
5311 unsigned AssumedSPMDCount = 0, KnownSPMDCount = 0;
5312 unsigned AssumedNonSPMDCount = 0, KnownNonSPMDCount = 0;
5313 auto *CallerKernelInfoAA =
A.getAAFor<AAKernelInfo>(
5316 if (!CallerKernelInfoAA ||
5317 !CallerKernelInfoAA->ReachingKernelEntries.isValidState())
5318 return indicatePessimisticFixpoint();
5320 for (
Kernel K : CallerKernelInfoAA->ReachingKernelEntries) {
5322 DepClassTy::REQUIRED);
5324 if (!AA || !AA->isValidState()) {
5325 SimplifiedValue =
nullptr;
5326 return indicatePessimisticFixpoint();
5329 if (AA->SPMDCompatibilityTracker.isAssumed()) {
5330 if (AA->SPMDCompatibilityTracker.isAtFixpoint())
5335 if (AA->SPMDCompatibilityTracker.isAtFixpoint())
5336 ++KnownNonSPMDCount;
5338 ++AssumedNonSPMDCount;
5342 if ((AssumedSPMDCount + KnownSPMDCount) &&
5343 (AssumedNonSPMDCount + KnownNonSPMDCount))
5344 return indicatePessimisticFixpoint();
5346 auto &Ctx = getAnchorValue().getContext();
5347 if (KnownSPMDCount || AssumedSPMDCount) {
5348 assert(KnownNonSPMDCount == 0 && AssumedNonSPMDCount == 0 &&
5349 "Expected only SPMD kernels!");
5352 SimplifiedValue = ConstantInt::get(Type::getInt8Ty(Ctx),
true);
5353 }
else if (KnownNonSPMDCount || AssumedNonSPMDCount) {
5354 assert(KnownSPMDCount == 0 && AssumedSPMDCount == 0 &&
5355 "Expected only non-SPMD kernels!");
5358 SimplifiedValue = ConstantInt::get(Type::getInt8Ty(Ctx),
false);
5363 assert(!SimplifiedValue &&
"SimplifiedValue should be none");
5366 return SimplifiedValue == SimplifiedValueBefore ? ChangeStatus::UNCHANGED
5367 : ChangeStatus::CHANGED;
5372 std::optional<Value *> SimplifiedValueBefore = SimplifiedValue;
5374 auto *CallerKernelInfoAA =
A.getAAFor<AAKernelInfo>(
5377 if (!CallerKernelInfoAA ||
5378 !CallerKernelInfoAA->ParallelLevels.isValidState())
5379 return indicatePessimisticFixpoint();
5381 if (!CallerKernelInfoAA->ReachingKernelEntries.isValidState())
5382 return indicatePessimisticFixpoint();
5384 if (CallerKernelInfoAA->ReachingKernelEntries.empty()) {
5385 assert(!SimplifiedValue &&
5386 "SimplifiedValue should keep none at this point");
5387 return ChangeStatus::UNCHANGED;
5390 unsigned AssumedSPMDCount = 0, KnownSPMDCount = 0;
5391 unsigned AssumedNonSPMDCount = 0, KnownNonSPMDCount = 0;
5392 for (
Kernel K : CallerKernelInfoAA->ReachingKernelEntries) {
5394 DepClassTy::REQUIRED);
5395 if (!AA || !AA->SPMDCompatibilityTracker.isValidState())
5396 return indicatePessimisticFixpoint();
5398 if (AA->SPMDCompatibilityTracker.isAssumed()) {
5399 if (AA->SPMDCompatibilityTracker.isAtFixpoint())
5404 if (AA->SPMDCompatibilityTracker.isAtFixpoint())
5405 ++KnownNonSPMDCount;
5407 ++AssumedNonSPMDCount;
5411 if ((AssumedSPMDCount + KnownSPMDCount) &&
5412 (AssumedNonSPMDCount + KnownNonSPMDCount))
5413 return indicatePessimisticFixpoint();
5415 auto &Ctx = getAnchorValue().getContext();
5419 if (AssumedSPMDCount || KnownSPMDCount) {
5420 assert(KnownNonSPMDCount == 0 && AssumedNonSPMDCount == 0 &&
5421 "Expected only SPMD kernels!");
5422 SimplifiedValue = ConstantInt::get(Type::getInt8Ty(Ctx), 1);
5424 assert(KnownSPMDCount == 0 && AssumedSPMDCount == 0 &&
5425 "Expected only non-SPMD kernels!");
5426 SimplifiedValue = ConstantInt::get(Type::getInt8Ty(Ctx), 0);
5428 return SimplifiedValue == SimplifiedValueBefore ? ChangeStatus::UNCHANGED
5429 : ChangeStatus::CHANGED;
5432 ChangeStatus foldKernelFnAttribute(Attributor &
A, llvm::StringRef Attr) {
5434 int32_t CurrentAttrValue = -1;
5435 std::optional<Value *> SimplifiedValueBefore = SimplifiedValue;
5437 auto *CallerKernelInfoAA =
A.getAAFor<AAKernelInfo>(
5440 if (!CallerKernelInfoAA ||
5441 !CallerKernelInfoAA->ReachingKernelEntries.isValidState())
5442 return indicatePessimisticFixpoint();
5445 for (
Kernel K : CallerKernelInfoAA->ReachingKernelEntries) {
5446 int32_t NextAttrVal =
K->getFnAttributeAsParsedInteger(Attr, -1);
5448 if (NextAttrVal == -1 ||
5449 (CurrentAttrValue != -1 && CurrentAttrValue != NextAttrVal))
5450 return indicatePessimisticFixpoint();
5451 CurrentAttrValue = NextAttrVal;
5454 if (CurrentAttrValue != -1) {
5455 auto &Ctx = getAnchorValue().getContext();
5457 ConstantInt::get(Type::getInt32Ty(Ctx), CurrentAttrValue);
5459 return SimplifiedValue == SimplifiedValueBefore ? ChangeStatus::UNCHANGED
5460 : ChangeStatus::CHANGED;
5466 std::optional<Value *> SimplifiedValue;
5476 auto &RFI = OMPInfoCache.RFIs[RF];
5477 RFI.foreachUse(SCC, [&](Use &U, Function &
F) {
5478 CallInst *CI = OpenMPOpt::getCallIfRegularCall(U, &RFI);
5481 A.getOrCreateAAFor<AAFoldRuntimeCall>(
5483 DepClassTy::NONE,
false,
5489void OpenMPOpt::registerAAs(
bool IsModulePass) {
5499 A.getOrCreateAAFor<AAKernelInfo>(
5501 DepClassTy::NONE,
false,
5505 OMPInformationCache::RuntimeFunctionInfo &InitRFI =
5506 OMPInfoCache.RFIs[OMPRTL___kmpc_target_init];
5507 InitRFI.foreachUse(SCC, CreateKernelInfoCB);
5509 registerFoldRuntimeCall(OMPRTL___kmpc_is_spmd_exec_mode);
5510 registerFoldRuntimeCall(OMPRTL___kmpc_parallel_level);
5511 registerFoldRuntimeCall(OMPRTL___kmpc_get_hardware_num_threads_in_block);
5512 registerFoldRuntimeCall(OMPRTL___kmpc_get_hardware_num_blocks);
5517 for (
int Idx = 0; Idx < OMPInfoCache.ICVs.size() - 1; ++Idx) {
5520 auto &GetterRFI = OMPInfoCache.RFIs[ICVInfo.Getter];
5523 CallInst *CI = OpenMPOpt::getCallIfRegularCall(U, &GetterRFI);
5530 A.getOrCreateAAFor<AAICVTracker>(CBPos);
5534 GetterRFI.foreachUse(SCC, CreateAA);
5543 for (
auto *
F : SCC) {
5544 if (
F->isDeclaration())
5550 if (
F->hasLocalLinkage()) {
5552 const auto *CB = dyn_cast<CallBase>(U.getUser());
5553 return CB && CB->isCallee(&U) &&
5554 A.isRunOn(const_cast<Function *>(CB->getCaller()));
5558 registerAAsForFunction(
A, *
F);
5562void OpenMPOpt::registerAAsForFunction(Attributor &
A,
const Function &
F) {
5568 if (
F.hasFnAttribute(Attribute::Convergent))
5573 bool UsedAssumedInformation =
false;
5576 A.getOrCreateAAFor<AAAddressSpace>(
5582 A.getOrCreateAAFor<AAIndirectCallInfo>(
5587 A.getOrCreateAAFor<AAAddressSpace>(
5596 if (
II->getIntrinsicID() == Intrinsic::assume) {
5597 A.getOrCreateAAFor<AAPotentialValues>(
5605const char AAICVTracker::ID = 0;
5606const char AAKernelInfo::ID = 0;
5608const char AAHeapToShared::ID = 0;
5609const char AAFoldRuntimeCall::ID = 0;
5611AAICVTracker &AAICVTracker::createForPosition(
const IRPosition &IRP,
5613 AAICVTracker *AA =
nullptr;
5621 AA =
new (
A.Allocator) AAICVTrackerFunctionReturned(IRP,
A);
5624 AA =
new (
A.Allocator) AAICVTrackerCallSiteReturned(IRP,
A);
5627 AA =
new (
A.Allocator) AAICVTrackerCallSite(IRP,
A);
5630 AA =
new (
A.Allocator) AAICVTrackerFunction(IRP,
A);
5639 AAExecutionDomainFunction *
AA =
nullptr;
5649 "AAExecutionDomain can only be created for function position!");
5651 AA =
new (
A.Allocator) AAExecutionDomainFunction(IRP,
A);
5658AAHeapToShared &AAHeapToShared::createForPosition(
const IRPosition &IRP,
5660 AAHeapToSharedFunction *
AA =
nullptr;
5670 "AAHeapToShared can only be created for function position!");
5672 AA =
new (
A.Allocator) AAHeapToSharedFunction(IRP,
A);
5679AAKernelInfo &AAKernelInfo::createForPosition(
const IRPosition &IRP,
5681 AAKernelInfo *AA =
nullptr;
5691 AA =
new (
A.Allocator) AAKernelInfoCallSite(IRP,
A);
5694 AA =
new (
A.Allocator) AAKernelInfoFunction(IRP,
A);
5701AAFoldRuntimeCall &AAFoldRuntimeCall::createForPosition(
const IRPosition &IRP,
5703 AAFoldRuntimeCall *AA =
nullptr;
5712 llvm_unreachable(
"KernelInfo can only be created for call site position!");
5714 AA =
new (
A.Allocator) AAFoldRuntimeCallCallSiteReturned(IRP,
A);
5735 if (Kernels.contains(&
F))
5737 return !
F.use_empty();
5744 return ORA <<
"Could not internalize function. "
5745 <<
"Some optimizations may not be possible. [OMP140]";
5757 if (!
F.isDeclaration() && !Kernels.contains(&
F) && IsCalled(
F) &&
5761 }
else if (!
F.hasLocalLinkage() && !
F.hasFnAttribute(Attribute::Cold)) {
5774 if (!
F.isDeclaration() && !InternalizedMap.
lookup(&
F)) {
5776 Functions.insert(&
F);
5794 OMPInformationCache InfoCache(M, AG, Allocator,
nullptr, PostLink);
5796 unsigned MaxFixpointIterations =
5808 return F.hasFnAttribute(
"kernel");
5813 OpenMPOpt OMPOpt(SCC, CGUpdater, OREGetter, InfoCache,
A);
5819 if (!
F.isDeclaration() && !Kernels.contains(&
F) &&
5820 !
F.hasFnAttribute(Attribute::NoInline))
5821 F.addFnAttr(Attribute::AlwaysInline);
5851 Module &M = *
C.begin()->getFunction().getParent();
5873 OMPInformationCache InfoCache(*(Functions.back()->getParent()), AG, Allocator,
5874 &Functions, PostLink);
5876 unsigned MaxFixpointIterations =
5890 OpenMPOpt OMPOpt(SCC, CGUpdater, OREGetter, InfoCache,
A);
5891 bool Changed = OMPOpt.run(
false);
5910 if (
F.hasKernelCallingConv()) {
5915 ++NumOpenMPTargetRegionKernels;
5918 ++NumNonOpenMPTargetRegionKernels;
5925 Metadata *MD = M.getModuleFlag(
"openmp");
5933 Metadata *MD = M.getModuleFlag(
"openmp-device");
assert(UImm &&(UImm !=~static_cast< T >(0)) &&"Invalid immediate!")
MachineBasicBlock MachineBasicBlock::iterator DebugLoc DL
Expand Atomic instructions
static cl::opt< unsigned > SetFixpointIterations("attributor-max-iterations", cl::Hidden, cl::desc("Maximal number of fixpoint iterations."), cl::init(32))
static const Function * getParent(const Value *V)
static GCRegistry::Add< ErlangGC > A("erlang", "erlang-compatible garbage collector")
static GCRegistry::Add< CoreCLRGC > E("coreclr", "CoreCLR-compatible GC")
This file provides interfaces used to manipulate a call graph, regardless if it is a "old style" Call...
This file provides interfaces used to build and manipulate a call graph, which is a very useful tool ...
This file contains the declarations for the subclasses of Constant, which represent the different fla...
dxil pretty DXIL Metadata Pretty Printer
This file defines the DenseSet and SmallDenseSet classes.
This file defines an array type that can be indexed using scoped enum values.
static void emitRemark(const Function &F, OptimizationRemarkEmitter &ORE, bool Skip)
Loop::LoopBounds::Direction Direction
Machine Check Debug Module
This file provides utility analysis objects describing memory locations.
uint64_t IntrinsicInst * II
This file defines constants and helpers used when dealing with OpenMP.
This file defines constants that will be used by both host and device compilation.
static constexpr auto TAG
static cl::opt< bool > HideMemoryTransferLatency("openmp-hide-memory-transfer-latency", cl::desc("[WIP] Tries to hide the latency of host to device memory" " transfers"), cl::Hidden, cl::init(false))
static cl::opt< bool > DisableOpenMPOptStateMachineRewrite("openmp-opt-disable-state-machine-rewrite", cl::desc("Disable OpenMP optimizations that replace the state machine."), cl::Hidden, cl::init(false))
static cl::opt< bool > EnableParallelRegionMerging("openmp-opt-enable-merging", cl::desc("Enable the OpenMP region merging optimization."), cl::Hidden, cl::init(false))
static cl::opt< bool > PrintModuleAfterOptimizations("openmp-opt-print-module-after", cl::desc("Print the current module after OpenMP optimizations."), cl::Hidden, cl::init(false))
#define KERNEL_ENVIRONMENT_CONFIGURATION_GETTER(MEMBER)
#define KERNEL_ENVIRONMENT_CONFIGURATION_IDX(MEMBER, IDX)
#define KERNEL_ENVIRONMENT_CONFIGURATION_SETTER(MEMBER)
static cl::opt< bool > PrintOpenMPKernels("openmp-print-gpu-kernels", cl::init(false), cl::Hidden)
static cl::opt< bool > DisableOpenMPOptFolding("openmp-opt-disable-folding", cl::desc("Disable OpenMP optimizations involving folding."), cl::Hidden, cl::init(false))
static cl::opt< bool > PrintModuleBeforeOptimizations("openmp-opt-print-module-before", cl::desc("Print the current module before OpenMP optimizations."), cl::Hidden, cl::init(false))
static cl::opt< unsigned > SetFixpointIterations("openmp-opt-max-iterations", cl::Hidden, cl::desc("Maximal number of attributor iterations."), cl::init(256))
static cl::opt< bool > DisableInternalization("openmp-opt-disable-internalization", cl::desc("Disable function internalization."), cl::Hidden, cl::init(false))
static cl::opt< bool > PrintICVValues("openmp-print-icv-values", cl::init(false), cl::Hidden)
static cl::opt< bool > DisableOpenMPOptimizations("openmp-opt-disable", cl::desc("Disable OpenMP specific optimizations."), cl::Hidden, cl::init(false))
static cl::opt< unsigned > SharedMemoryLimit("openmp-opt-shared-limit", cl::Hidden, cl::desc("Maximum amount of shared memory to use."), cl::init(std::numeric_limits< unsigned >::max()))
static cl::opt< bool > EnableVerboseRemarks("openmp-opt-verbose-remarks", cl::desc("Enables more verbose remarks."), cl::Hidden, cl::init(false))
static cl::opt< bool > DisableOpenMPOptDeglobalization("openmp-opt-disable-deglobalization", cl::desc("Disable OpenMP optimizations involving deglobalization."), cl::Hidden, cl::init(false))
static cl::opt< bool > DisableOpenMPOptBarrierElimination("openmp-opt-disable-barrier-elimination", cl::desc("Disable OpenMP optimizations that eliminate barriers."), cl::Hidden, cl::init(false))
static cl::opt< bool > DeduceICVValues("openmp-deduce-icv-values", cl::init(false), cl::Hidden)
#define KERNEL_ENVIRONMENT_IDX(MEMBER, IDX)
#define KERNEL_ENVIRONMENT_GETTER(MEMBER, RETURNTYPE)
static cl::opt< bool > DisableOpenMPOptSPMDization("openmp-opt-disable-spmdization", cl::desc("Disable OpenMP optimizations involving SPMD-ization."), cl::Hidden, cl::init(false))
static cl::opt< bool > AlwaysInlineDeviceFunctions("openmp-opt-inline-device", cl::desc("Inline all applicable functions on the device."), cl::Hidden, cl::init(false))
FunctionAnalysisManager FAM
This file builds on the ADT/GraphTraits.h file to build a generic graph post order iterator.
static StringRef getName(Value *V)
std::pair< BasicBlock *, BasicBlock * > Edge
static bool contains(SmallPtrSetImpl< ConstantExpr * > &Cache, ConstantExpr *Expr, Constant *C)
This file implements a set that has insertion order iteration characteristics.
This file defines the SmallPtrSet class.
This file defines the SmallVector class.
This file defines the 'Statistic' class, which is designed to be an easy way to expose various metric...
#define STATISTIC(VARNAME, DESC)
static const int BlockSize
static void initialize(TargetLibraryInfoImpl &TLI, const Triple &T, const llvm::StringTable &StandardNames, VectorLibrary VecLib)
Initialize the set of available library functions based on the specified target triple.
static cl::opt< unsigned > MaxThreads("xcore-max-threads", cl::Optional, cl::desc("Maximum number of threads (for emulation thread-local storage)"), cl::Hidden, cl::value_desc("number"), cl::init(8))
PassT::Result & getResult(IRUnitT &IR, ExtraArgTs... ExtraArgs)
Get the result of an analysis pass for a given IR unit.
size_t size() const
size - Get the array size.
iterator begin()
Instruction iterator methods.
LLVM_ABI const_iterator getFirstInsertionPt() const
Returns an iterator to the first instruction in this block that is suitable for inserting a non-PHI i...
LLVM_ABI BasicBlock * splitBasicBlock(iterator I, const Twine &BBName="")
Split the basic block into two basic blocks at the specified instruction.
const Function * getParent() const
Return the enclosing method, or null if none.
reverse_iterator rbegin()
static BasicBlock * Create(LLVMContext &Context, const Twine &Name="", Function *Parent=nullptr, BasicBlock *InsertBefore=nullptr)
Creates a new BasicBlock.
LLVM_ABI const BasicBlock * getUniqueSuccessor() const
Return the successor of this block if it has a unique successor.
InstListType::reverse_iterator reverse_iterator
const Instruction * getTerminator() const LLVM_READONLY
Returns the terminator instruction if the block is well formed or null if the block is not well forme...
static BranchInst * Create(BasicBlock *IfTrue, InsertPosition InsertBefore=nullptr)
Base class for all callable instructions (InvokeInst and CallInst) Holds everything related to callin...
void setCallingConv(CallingConv::ID CC)
Function * getCalledFunction() const
Returns the function called, or null if this is an indirect function invocation or the function signa...
bool doesNotAccessMemory(unsigned OpNo) const
LLVM_ABI bool isIndirectCall() const
Return true if the callsite is an indirect call.
bool isCallee(Value::const_user_iterator UI) const
Determine whether the passed iterator points to the callee operand's Use.
Value * getArgOperand(unsigned i) const
void setArgOperand(unsigned i, Value *v)
iterator_range< User::op_iterator > args()
Iteration adapter for range-for loops.
unsigned getArgOperandNo(const Use *U) const
Given a use for a arg operand, get the arg operand number that corresponds to it.
unsigned arg_size() const
AttributeList getAttributes() const
Return the attributes for this call.
void addParamAttr(unsigned ArgNo, Attribute::AttrKind Kind)
Adds the attribute to the indicated argument.
bool isArgOperand(const Use *U) const
bool hasOperandBundles() const
Return true if this User has any operand bundles.
LLVM_ABI Function * getCaller()
Helper to get the caller (the parent function).
Wrapper to unify "old style" CallGraph and "new style" LazyCallGraph.
void initialize(LazyCallGraph &LCG, LazyCallGraph::SCC &SCC, CGSCCAnalysisManager &AM, CGSCCUpdateResult &UR)
Initializers for usage outside of a CGSCC pass, inside a CGSCC pass in the old and new pass manager (...
static CallInst * Create(FunctionType *Ty, Value *F, const Twine &NameStr="", InsertPosition InsertBefore=nullptr)
@ ICMP_SLT
signed less than
static LLVM_ABI Constant * getPointerCast(Constant *C, Type *Ty)
Create a BitCast, AddrSpaceCast, or a PtrToInt cast constant expression.
static LLVM_ABI Constant * getPointerBitCastOrAddrSpaceCast(Constant *C, Type *Ty)
Create a BitCast or AddrSpaceCast for a pointer type depending on the address space.
This is the shared class of boolean and integer constants.
IntegerType * getIntegerType() const
Variant of the getType() method to always return an IntegerType, which reduces the amount of casting ...
static LLVM_ABI ConstantInt * getTrue(LLVMContext &Context)
bool isZero() const
This is just a convenience method to make client code smaller for a common code.
int64_t getSExtValue() const
Return the constant as a 64-bit integer value after it has been sign extended as appropriate for the ...
This is an important base class in LLVM.
static LLVM_ABI Constant * getNullValue(Type *Ty)
Constructor to create a '0' constant of arbitrary type.
ValueT lookup(const_arg_type_t< KeyT > Val) const
lookup - Return the entry for the specified key, or a default constructed value if no such entry exis...
std::pair< iterator, bool > insert(const std::pair< KeyT, ValueT > &KV)
LLVM_ABI Instruction * findNearestCommonDominator(Instruction *I1, Instruction *I2) const
Find the nearest instruction I that dominates both I1 and I2, in the sense that a result produced bef...
static ErrorSuccess success()
Create a success value.
AtomicOrdering getOrdering() const
Returns the ordering constraint of this fence instruction.
A proxy from a FunctionAnalysisManager to an SCC.
const BasicBlock & getEntryBlock() const
const BasicBlock & front() const
LLVMContext & getContext() const
getContext - Return a reference to the LLVMContext associated with this function.
Argument * getArg(unsigned i) const
bool hasFnAttribute(Attribute::AttrKind Kind) const
Return true if the function has the attribute.
LLVM_ABI bool isDeclaration() const
Return true if the primary definition of this global value is outside of the current translation unit...
bool hasLocalLinkage() const
Module * getParent()
Get the module that this global value is contained inside of...
@ PrivateLinkage
Like Internal, but omit from symbol table.
@ InternalLinkage
Rename collisions when linking (static functions).
const Constant * getInitializer() const
getInitializer - Return the initializer for this global variable.
LLVM_ABI void setInitializer(Constant *InitVal)
setInitializer - Sets the initializer for this global variable, removing any existing initializer if ...
BasicBlock * getBlock() const
BranchInst * CreateCondBr(Value *Cond, BasicBlock *True, BasicBlock *False, MDNode *BranchWeights=nullptr, MDNode *Unpredictable=nullptr)
Create a conditional 'br Cond, TrueDest, FalseDest' instruction.
CallInst * CreateCall(FunctionType *FTy, Value *Callee, ArrayRef< Value * > Args={}, const Twine &Name="", MDNode *FPMathTag=nullptr)
Value * CreateIsNull(Value *Arg, const Twine &Name="")
Return a boolean value testing if Arg == 0.
LLVM_ABI bool isLifetimeStartOrEnd() const LLVM_READONLY
Return true if the instruction is a llvm.lifetime.start or llvm.lifetime.end marker.
LLVM_ABI bool mayWriteToMemory() const LLVM_READONLY
Return true if this instruction may modify memory.
const DebugLoc & getDebugLoc() const
Return the debug location for this node as a DebugLoc.
LLVM_ABI const Module * getModule() const
Return the module owning the function this instruction belongs to or nullptr it the function does not...
LLVM_ABI InstListType::iterator eraseFromParent()
This method unlinks 'this' from the containing basic block and deletes it.
LLVM_ABI const Function * getFunction() const
Return the function this instruction belongs to.
LLVM_ABI bool mayHaveSideEffects() const LLVM_READONLY
Return true if the instruction may have side effects.
LLVM_ABI bool mayReadFromMemory() const LLVM_READONLY
Return true if this instruction may read memory.
void setDebugLoc(DebugLoc Loc)
Set the debug location information for this instruction.
LLVM_ABI void setSuccessor(unsigned Idx, BasicBlock *BB)
Update the specified successor to point at the provided block.
A node in the call graph.
An SCC of the call graph.
A lazily constructed view of the call graph of a module.
LLVM_ABI void eraseFromParent()
This method unlinks 'this' from the containing function and deletes it.
LLVM_ABI StringRef getName() const
Return the name of the corresponding LLVM basic block, or an empty string.
A Module instance is used to store all the information related to an LLVM module.
const Triple & getTargetTriple() const
Get the target triple which is a string describing the target host.
LLVM_ABI Constant * getOrCreateIdent(Constant *SrcLocStr, uint32_t SrcLocStrSize, omp::IdentFlag Flags=omp::IdentFlag(0), unsigned Reserve2Flags=0)
Return an ident_t* encoding the source location SrcLocStr and Flags.
LLVM_ABI FunctionCallee getOrCreateRuntimeFunction(Module &M, omp::RuntimeFunction FnID)
Return the function declaration for the runtime function with FnID.
static LLVM_ABI std::pair< int32_t, int32_t > readThreadBoundsForKernel(const Triple &T, Function &Kernel)
}
LLVM_ABI Constant * getOrCreateSrcLocStr(StringRef LocStr, uint32_t &SrcLocStrSize)
Return the (LLVM-IR) string describing the source location LocStr.
IRBuilder<>::InsertPoint InsertPointTy
Type used throughout for insertion points.
IRBuilder Builder
The LLVM-IR Builder used to create IR.
static LLVM_ABI std::pair< int32_t, int32_t > readTeamBoundsForKernel(const Triple &T, Function &Kernel)
Read/write a bounds on teams for Kernel.
bool updateToLocation(const LocationDescription &Loc)
Update the internal location to Loc.
PreservedAnalyses run(LazyCallGraph::SCC &C, CGSCCAnalysisManager &AM, LazyCallGraph &CG, CGSCCUpdateResult &UR)
PreservedAnalyses run(Module &M, ModuleAnalysisManager &AM)
static LLVM_ABI PoisonValue * get(Type *T)
Static factory methods - Return an 'poison' object of the specified type.
A set of analyses that are preserved following a run of a transformation pass.
static PreservedAnalyses none()
Convenience factory function for the empty preserved set.
static PreservedAnalyses all()
Construct a special preserved set that preserves all passes.
static ReturnInst * Create(LLVMContext &C, Value *retVal=nullptr, InsertPosition InsertBefore=nullptr)
A vector that has set insertion semantics.
size_type size() const
Determine the number of elements in the SetVector.
size_type count(const_arg_type key) const
Count the number of elements of a given key in the SetVector.
bool insert(const value_type &X)
Insert a new element into the SetVector.
size_type count(ConstPtrType Ptr) const
count - Return 1 if the specified pointer is in the set, 0 otherwise.
std::pair< iterator, bool > insert(PtrType Ptr)
Inserts Ptr if and only if there is no element in the container equal to Ptr.
SmallPtrSet - This class implements a set which is optimized for holding SmallSize or less elements.
reference emplace_back(ArgTypes &&... Args)
void append(ItTy in_start, ItTy in_end)
Add the specified range to the end of the SmallVector.
void push_back(const T &Elt)
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small.
bool starts_with(StringRef Prefix) const
Check if this string starts with the given Prefix.
Triple - Helper class for working with autoconf configuration names.
static LLVM_ABI IntegerType * getInt32Ty(LLVMContext &C)
LLVM_ABI unsigned getPointerAddressSpace() const
Get the address space of this pointer or pointer vector type.
static LLVM_ABI UndefValue * get(Type *T)
Static factory methods - Return an 'undef' object of the specified type.
A Use represents the edge between a Value definition and its users.
LLVM_ABI bool replaceUsesOfWith(Value *From, Value *To)
Replace uses of one Value with another.
Type * getType() const
All values are typed, get the type of this value.
LLVM_ABI void setName(const Twine &Name)
Change the name of the value.
bool hasOneUse() const
Return true if there is exactly one use of this value.
LLVM_ABI void replaceAllUsesWith(Value *V)
Change all uses of this to point to a new Value.
iterator_range< user_iterator > users()
LLVM_ABI const Value * stripPointerCasts() const
Strip off pointer casts, all-zero GEPs and address space casts.
LLVM_ABI StringRef getName() const
Return a constant reference to the value's name.
const ParentTy * getParent() const
self_iterator getIterator()
NodeTy * getNextNode()
Get the next node, or nullptr for the list tail.
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
GlobalVariable * getKernelEnvironementGVFromKernelInitCB(CallBase *KernelInitCB)
ConstantStruct * getKernelEnvironementFromKernelInitCB(CallBase *KernelInitCB)
Abstract Attribute helper functions.
LLVM_ABI bool isValidAtPosition(const ValueAndContext &VAC, InformationCache &InfoCache)
Return true if the value of VAC is a valid at the position of VAC, that is a constant,...
LLVM_ABI bool isPotentiallyAffectedByBarrier(Attributor &A, const Instruction &I, const AbstractAttribute &QueryingAA)
Return true if I is potentially affected by a barrier.
LLVM_ABI bool isNoSyncInst(Attributor &A, const Instruction &I, const AbstractAttribute &QueryingAA)
Return true if I is a nosync instruction.
constexpr char Args[]
Key for Kernel::Metadata::mArgs.
E & operator^=(E &LHS, E RHS)
unsigned ID
LLVM IR allows to use arbitrary numbers as calling convention identifiers.
@ C
The default llvm calling convention, compatible with C.
@ BasicBlock
Various leaf nodes.
initializer< Ty > init(const Ty &Val)
PointerTypeMap run(const Module &M)
Compute the PointerTypeMap for the module M.
constexpr uint64_t PointerSize
aarch64 pointer size.
bool isOpenMPDevice(Module &M)
Helper to determine if M is a OpenMP target offloading device module.
bool containsOpenMP(Module &M)
Helper to determine if M contains OpenMP.
InternalControlVar
IDs for all Internal Control Variables (ICVs).
RuntimeFunction
IDs for all omp runtime library (RTL) functions.
KernelSet getDeviceKernels(Module &M)
Get OpenMP device kernels in M.
@ OMP_TGT_EXEC_MODE_GENERIC_SPMD
@ OMP_TGT_EXEC_MODE_GENERIC
SetVector< Kernel > KernelSet
Set of kernels in the module.
Function * Kernel
Summary of a kernel (=entry point for target offloading).
bool isOpenMPKernel(Function &Fn)
Return true iff Fn is an OpenMP GPU kernel; Fn has the "kernel" attribute.
DiagnosticInfoOptimizationBase::Argument NV
NodeAddr< UseNode * > Use
friend class Instruction
Iterator for Instructions in a `BasicBlock`.
LLVM_ABI iterator begin() const
This is an optimization pass for GlobalISel generic memory operations.
auto drop_begin(T &&RangeOrContainer, size_t N=1)
Return a range covering RangeOrContainer with the first N elements excluded.
FunctionAddr VTableAddr Value
bool all_of(R &&range, UnaryPredicate P)
Provide wrappers to std::all_of which take ranges instead of having to pass begin/end explicitly.
auto size(R &&Range, std::enable_if_t< std::is_base_of< std::random_access_iterator_tag, typename std::iterator_traits< decltype(Range.begin())>::iterator_category >::value, void > *=nullptr)
Get the size of a range.
bool succ_empty(const Instruction *I)
decltype(auto) dyn_cast(const From &Val)
dyn_cast<X> - Return the argument parameter cast to the specified type.
FunctionAddr VTableAddr uintptr_t uintptr_t Int32Ty
bool operator!=(uint64_t V1, const APInt &V2)
constexpr from_range_t from_range
Value * GetPointerBaseWithConstantOffset(Value *Ptr, int64_t &Offset, const DataLayout &DL, bool AllowNonInbounds=true)
Analyze the specified pointer to see if it can be expressed as a base pointer plus a constant offset.
InnerAnalysisManagerProxy< FunctionAnalysisManager, Module > FunctionAnalysisManagerModuleProxy
Provide the FunctionAnalysisManager to Module proxy.
bool operator==(const AddressRangeValuePair &LHS, const AddressRangeValuePair &RHS)
AnalysisManager< LazyCallGraph::SCC, LazyCallGraph & > CGSCCAnalysisManager
The CGSCC analysis manager.
@ ThinLTOPostLink
ThinLTO postlink (backend compile) phase.
@ FullLTOPostLink
Full LTO postlink (backend compile) phase.
@ ThinLTOPreLink
ThinLTO prelink (summary) phase.
auto dyn_cast_or_null(const Y &Val)
LLVM_ABI raw_ostream & dbgs()
dbgs() - This returns a reference to a raw_ostream for debugging messages.
class LLVM_GSL_OWNER SmallVector
Forward declaration of SmallVector so that calculateSmallVectorDefaultInlinedElements can reference s...
bool isa(const From &Val)
isa<X> - Return true if the parameter to the template is an instance of one of the template type argu...
MutableArrayRef(T &OneElt) -> MutableArrayRef< T >
void cantFail(Error Err, const char *Msg=nullptr)
Report a fatal error if Err is a failure value.
IRBuilder(LLVMContext &, FolderTy, InserterTy, MDNode *, ArrayRef< OperandBundleDef >) -> IRBuilder< FolderTy, InserterTy >
bool operator&=(SparseBitVector< ElementSize > *LHS, const SparseBitVector< ElementSize > &RHS)
LLVM_ABI BasicBlock * SplitBlock(BasicBlock *Old, BasicBlock::iterator SplitPt, DominatorTree *DT, LoopInfo *LI=nullptr, MemorySSAUpdater *MSSAU=nullptr, const Twine &BBName="")
Split the specified block at the specified instruction.
auto count(R &&Range, const E &Element)
Wrapper function around std::count to count the number of times an element Element occurs in the give...
ArrayRef(const T &OneElt) -> ArrayRef< T >
std::string toString(const APInt &I, unsigned Radix, bool Signed, bool formatAsCLiteral=false, bool UpperCase=true, bool InsertSeparators=false)
decltype(auto) cast(const From &Val)
cast<X> - Return the argument parameter cast to the specified type.
auto predecessors(const MachineBasicBlock *BB)
LLVM_ABI Constant * ConstantFoldInsertValueInstruction(Constant *Agg, Constant *Val, ArrayRef< unsigned > Idxs)
Attempt to constant fold an insertvalue instruction with the specified operands and indices.
@ OPTIONAL
The target may be valid if the source is not.
AnalysisManager< Function > FunctionAnalysisManager
Convenience typedef for the Function analysis manager.
BumpPtrAllocatorImpl<> BumpPtrAllocator
The standard BumpPtrAllocator which just uses the default template parameters.
LLVM_ABI const Value * getUnderlyingObject(const Value *V, unsigned MaxLookup=MaxLookupSearchDepth)
This method strips off any GEP address adjustments, pointer casts or llvm.threadlocal....
AnalysisManager< Module > ModuleAnalysisManager
Convenience typedef for the Module analysis manager.
static LLVM_ABI AAExecutionDomain & createForPosition(const IRPosition &IRP, Attributor &A)
Create an abstract attribute view for the position IRP.
AAExecutionDomain(const IRPosition &IRP, Attributor &A)
static LLVM_ABI const char ID
Unique ID (due to the unique address)
AccessKind
Simple enum to distinguish read/write/read-write accesses.
StateType::base_t MemoryLocationsKind
static LLVM_ABI bool isAlignedBarrier(const CallBase &CB, bool ExecutedAligned)
Helper function to determine if CB is an aligned (GPU) barrier.
Base struct for all "concrete attribute" deductions.
virtual const char * getIdAddr() const =0
This function should return the address of the ID of the AbstractAttribute.
An interface to query the internal state of an abstract attribute.
Wrapper for FunctionAnalysisManager.
Configuration for the Attributor.
std::function< void(Attributor &A, const Function &F)> InitializationCallback
Callback function to be invoked on internal functions marked live.
std::optional< unsigned > MaxFixpointIterations
Maximum number of iterations to run until fixpoint.
bool RewriteSignatures
Flag to determine if we rewrite function signatures.
OptimizationRemarkGetter OREGetter
IPOAmendableCBTy IPOAmendableCB
bool IsModulePass
Is the user of the Attributor a module pass or not.
bool DefaultInitializeLiveInternals
Flag to determine if we want to initialize all default AAs for an internal function marked live.
The fixpoint analysis framework that orchestrates the attribute deduction.
static LLVM_ABI bool isInternalizable(Function &F)
Returns true if the function F can be internalized.
std::function< std::optional< Value * >( const IRPosition &, const AbstractAttribute *, bool &)> SimplifictionCallbackTy
Register CB as a simplification callback.
std::function< std::optional< Constant * >( const GlobalVariable &, const AbstractAttribute *, bool &)> GlobalVariableSimplifictionCallbackTy
Register CB as a simplification callback.
std::function< bool(Attributor &, const AbstractAttribute *)> VirtualUseCallbackTy
static LLVM_ABI bool internalizeFunctions(SmallPtrSetImpl< Function * > &FnSet, DenseMap< Function *, Function * > &FnMap)
Make copies of each function in the set FnSet such that the copied version has internal linkage after...
Simple wrapper for a single bit (boolean) state.
Support structure for SCC passes to communicate updates the call graph back to the CGSCC pass manager...
Helper to describe and deal with positions in the LLVM-IR.
static const IRPosition callsite_returned(const CallBase &CB)
Create a position describing the returned value of CB.
static const IRPosition returned(const Function &F, const CallBaseContext *CBContext=nullptr)
Create a position describing the returned value of F.
static const IRPosition value(const Value &V, const CallBaseContext *CBContext=nullptr)
Create a position describing the value of V.
static const IRPosition inst(const Instruction &I, const CallBaseContext *CBContext=nullptr)
Create a position describing the instruction I.
@ IRP_ARGUMENT
An attribute for a function argument.
@ IRP_RETURNED
An attribute for the function return value.
@ IRP_CALL_SITE
An attribute for a call site (function scope).
@ IRP_CALL_SITE_RETURNED
An attribute for a call site return value.
@ IRP_FUNCTION
An attribute for a function (scope).
@ IRP_FLOAT
A position that is not associated with a spot suitable for attributes.
@ IRP_CALL_SITE_ARGUMENT
An attribute for a call site argument.
@ IRP_INVALID
An invalid position.
static const IRPosition function(const Function &F, const CallBaseContext *CBContext=nullptr)
Create a position describing the function scope of F.
Kind getPositionKind() const
Return the associated position kind.
static const IRPosition callsite_function(const CallBase &CB)
Create a position describing the function scope of CB.
Defines various target-specific GPU grid values that must be consistent between host RTL (plugin),...