50#include "llvm/IR/IntrinsicsAMDGPU.h"
51#include "llvm/IR/IntrinsicsNVPTX.h"
67#define DEBUG_TYPE "openmp-opt"
70 "openmp-opt-disable",
cl::desc(
"Disable OpenMP specific optimizations."),
74 "openmp-opt-enable-merging",
80 cl::desc(
"Disable function internalization."),
91 "openmp-hide-memory-transfer-latency",
92 cl::desc(
"[WIP] Tries to hide the latency of host to device memory"
97 "openmp-opt-disable-deglobalization",
98 cl::desc(
"Disable OpenMP optimizations involving deglobalization."),
102 "openmp-opt-disable-spmdization",
103 cl::desc(
"Disable OpenMP optimizations involving SPMD-ization."),
107 "openmp-opt-disable-folding",
112 "openmp-opt-disable-state-machine-rewrite",
113 cl::desc(
"Disable OpenMP optimizations that replace the state machine."),
117 "openmp-opt-disable-barrier-elimination",
118 cl::desc(
"Disable OpenMP optimizations that eliminate barriers."),
122 "openmp-opt-print-module-after",
123 cl::desc(
"Print the current module after OpenMP optimizations."),
127 "openmp-opt-print-module-before",
128 cl::desc(
"Print the current module before OpenMP optimizations."),
132 "openmp-opt-inline-device",
143 cl::desc(
"Maximal number of attributor iterations."),
148 cl::desc(
"Maximum amount of shared memory to use."),
149 cl::init(std::numeric_limits<unsigned>::max()));
152 "Number of OpenMP runtime calls deduplicated");
154 "Number of OpenMP parallel regions deleted");
156 "Number of OpenMP runtime functions identified");
158 "Number of OpenMP runtime function uses identified");
160 "Number of OpenMP target region entry points (=kernels) identified");
162 "Number of non-OpenMP target region kernels identified");
164 "Number of OpenMP target region entry points (=kernels) executed in "
165 "SPMD-mode instead of generic-mode");
166STATISTIC(NumOpenMPTargetRegionKernelsWithoutStateMachine,
167 "Number of OpenMP target region entry points (=kernels) executed in "
168 "generic-mode without a state machines");
169STATISTIC(NumOpenMPTargetRegionKernelsCustomStateMachineWithFallback,
170 "Number of OpenMP target region entry points (=kernels) executed in "
171 "generic-mode with customized state machines with fallback");
172STATISTIC(NumOpenMPTargetRegionKernelsCustomStateMachineWithoutFallback,
173 "Number of OpenMP target region entry points (=kernels) executed in "
174 "generic-mode with customized state machines without fallback");
176 NumOpenMPParallelRegionsReplacedInGPUStateMachine,
177 "Number of OpenMP parallel regions replaced with ID in GPU state machines");
179 "Number of OpenMP parallel regions merged");
181 "Amount of memory pushed to shared memory");
182STATISTIC(NumBarriersEliminated,
"Number of redundant barriers eliminated");
210#define KERNEL_ENVIRONMENT_IDX(MEMBER, IDX) \
211 constexpr unsigned MEMBER##Idx = IDX;
216#undef KERNEL_ENVIRONMENT_IDX
218#define KERNEL_ENVIRONMENT_CONFIGURATION_IDX(MEMBER, IDX) \
219 constexpr unsigned MEMBER##Idx = IDX;
229#undef KERNEL_ENVIRONMENT_CONFIGURATION_IDX
231#define KERNEL_ENVIRONMENT_GETTER(MEMBER, RETURNTYPE) \
232 RETURNTYPE *get##MEMBER##FromKernelEnvironment(ConstantStruct *KernelEnvC) { \
233 return cast<RETURNTYPE>(KernelEnvC->getAggregateElement(MEMBER##Idx)); \
239#undef KERNEL_ENVIRONMENT_GETTER
241#define KERNEL_ENVIRONMENT_CONFIGURATION_GETTER(MEMBER) \
242 ConstantInt *get##MEMBER##FromKernelEnvironment( \
243 ConstantStruct *KernelEnvC) { \
244 ConstantStruct *ConfigC = \
245 getConfigurationFromKernelEnvironment(KernelEnvC); \
246 return dyn_cast<ConstantInt>(ConfigC->getAggregateElement(MEMBER##Idx)); \
257#undef KERNEL_ENVIRONMENT_CONFIGURATION_GETTER
261 constexpr int InitKernelEnvironmentArgNo = 0;
276struct AAHeapToShared;
283 OMPInformationCache(
Module &M, AnalysisGetter &AG,
287 OpenMPPostLink(OpenMPPostLink) {
290 const Triple
T(OMPBuilder.M.getTargetTriple());
291 switch (
T.getArch()) {
295 assert(OMPBuilder.Config.IsTargetDevice &&
296 "OpenMP AMDGPU/NVPTX is only prepared to deal with device code.");
297 OMPBuilder.Config.IsGPU =
true;
300 OMPBuilder.Config.IsGPU =
false;
303 OMPBuilder.initialize();
304 initializeRuntimeFunctions(M);
305 initializeInternalControlVars();
309 struct InternalControlVarInfo {
317 StringRef EnvVarName;
323 ConstantInt *InitValue;
336 struct RuntimeFunctionInfo {
357 using UseVector = SmallVector<Use *, 16>;
360 void clearUsesMap() { UsesMap.clear(); }
363 operator bool()
const {
return Declaration; }
366 UseVector &getOrCreateUseVector(Function *
F) {
367 std::shared_ptr<UseVector> &UV = UsesMap[
F];
369 UV = std::make_shared<UseVector>();
375 const UseVector *getUseVector(Function &
F)
const {
376 auto I = UsesMap.find(&
F);
377 if (
I != UsesMap.end())
378 return I->second.get();
383 size_t getNumFunctionsWithUses()
const {
return UsesMap.size(); }
387 size_t getNumArgs()
const {
return ArgumentTypes.size(); }
392 void foreachUse(SmallVectorImpl<Function *> &SCC,
393 function_ref<
bool(Use &, Function &)> CB) {
394 for (Function *
F : SCC)
400 void foreachUse(function_ref<
bool(Use &, Function &)> CB, Function *
F) {
401 SmallVector<unsigned, 8> ToBeDeleted;
405 UseVector &UV = getOrCreateUseVector(
F);
415 while (!ToBeDeleted.
empty()) {
425 DenseMap<Function *, std::shared_ptr<UseVector>> UsesMap;
429 decltype(UsesMap)::iterator
begin() {
return UsesMap.begin(); }
430 decltype(UsesMap)::iterator
end() {
return UsesMap.end(); }
434 OpenMPIRBuilder OMPBuilder;
438 RuntimeFunction::OMPRTL___last>
442 DenseMap<Function *, RuntimeFunction> RuntimeFunctionIDMap;
446 InternalControlVar::ICV___last>
451 void initializeInternalControlVars() {
452#define ICV_RT_SET(_Name, RTL) \
454 auto &ICV = ICVs[_Name]; \
457#define ICV_RT_GET(Name, RTL) \
459 auto &ICV = ICVs[Name]; \
462#define ICV_DATA_ENV(Enum, _Name, _EnvVarName, Init) \
464 auto &ICV = ICVs[Enum]; \
467 ICV.InitKind = Init; \
468 ICV.EnvVarName = _EnvVarName; \
469 switch (ICV.InitKind) { \
470 case ICV_IMPLEMENTATION_DEFINED: \
471 ICV.InitValue = nullptr; \
474 ICV.InitValue = ConstantInt::get( \
475 Type::getInt32Ty(OMPBuilder.Int32->getContext()), 0); \
478 ICV.InitValue = ConstantInt::getFalse(OMPBuilder.Int1->getContext()); \
484#include "llvm/Frontend/OpenMP/OMPKinds.def"
490 static bool declMatchesRTFTypes(Function *
F,
Type *RTFRetType,
497 if (
F->getReturnType() != RTFRetType)
499 if (
F->arg_size() != RTFArgTypes.
size())
502 auto *RTFTyIt = RTFArgTypes.
begin();
503 for (Argument &Arg :
F->args()) {
504 if (Arg.getType() != *RTFTyIt)
514 unsigned collectUses(RuntimeFunctionInfo &RFI,
bool CollectStats =
true) {
515 unsigned NumUses = 0;
516 if (!RFI.Declaration)
518 OMPBuilder.addAttributes(RFI.Kind, *RFI.Declaration);
521 NumOpenMPRuntimeFunctionsIdentified += 1;
522 NumOpenMPRuntimeFunctionUsesIdentified += RFI.Declaration->getNumUses();
526 for (Use &U : RFI.Declaration->uses()) {
528 if (!
CGSCC ||
CGSCC->empty() ||
CGSCC->contains(UserI->getFunction())) {
529 RFI.getOrCreateUseVector(UserI->getFunction()).push_back(&U);
533 RFI.getOrCreateUseVector(
nullptr).push_back(&U);
542 auto &RFI = RFIs[RTF];
544 collectUses(RFI,
false);
548 void recollectUses() {
549 for (
int Idx = 0; Idx < RFIs.size(); ++Idx)
554 void setCallingConvention(FunctionCallee Callee, CallInst *CI) {
569 RuntimeFunctionInfo &RFI = RFIs[Fn];
571 if (!RFI.Declaration || RFI.Declaration->isDeclaration())
579 void initializeRuntimeFunctions(
Module &M) {
582#define OMP_TYPE(VarName, ...) \
583 Type *VarName = OMPBuilder.VarName; \
586#define OMP_ARRAY_TYPE(VarName, ...) \
587 ArrayType *VarName##Ty = OMPBuilder.VarName##Ty; \
589 PointerType *VarName##PtrTy = OMPBuilder.VarName##PtrTy; \
590 (void)VarName##PtrTy;
592#define OMP_FUNCTION_TYPE(VarName, ...) \
593 FunctionType *VarName = OMPBuilder.VarName; \
595 PointerType *VarName##Ptr = OMPBuilder.VarName##Ptr; \
598#define OMP_STRUCT_TYPE(VarName, ...) \
599 StructType *VarName = OMPBuilder.VarName; \
601 PointerType *VarName##Ptr = OMPBuilder.VarName##Ptr; \
604#define OMP_RTL(_Enum, _Name, _IsVarArg, _ReturnType, ...) \
606 SmallVector<Type *, 8> ArgsTypes({__VA_ARGS__}); \
607 Function *F = M.getFunction(_Name); \
608 RTLFunctions.insert(F); \
609 if (declMatchesRTFTypes(F, OMPBuilder._ReturnType, ArgsTypes)) { \
610 RuntimeFunctionIDMap[F] = _Enum; \
611 auto &RFI = RFIs[_Enum]; \
614 RFI.IsVarArg = _IsVarArg; \
615 RFI.ReturnType = OMPBuilder._ReturnType; \
616 RFI.ArgumentTypes = std::move(ArgsTypes); \
617 RFI.Declaration = F; \
618 unsigned NumUses = collectUses(RFI); \
621 dbgs() << TAG << RFI.Name << (RFI.Declaration ? "" : " not") \
623 if (RFI.Declaration) \
624 dbgs() << TAG << "-> got " << NumUses << " uses in " \
625 << RFI.getNumFunctionsWithUses() \
626 << " different functions.\n"; \
630#include "llvm/Frontend/OpenMP/OMPKinds.def"
635 for (Function &
F : M) {
636 for (StringRef Prefix : {
"__kmpc",
"_ZN4ompx",
"omp_"})
637 if (
F.hasFnAttribute(Attribute::NoInline) &&
638 F.getName().starts_with(Prefix) &&
639 !
F.hasFnAttribute(Attribute::OptimizeNone))
640 F.removeFnAttr(Attribute::NoInline);
648 DenseSet<const Function *> RTLFunctions;
651 bool OpenMPPostLink =
false;
654template <
typename Ty,
bool InsertInval
idates = true>
656 bool contains(
const Ty &Elem)
const {
return Set.contains(Elem); }
657 bool insert(
const Ty &Elem) {
658 if (InsertInvalidates)
659 BooleanState::indicatePessimisticFixpoint();
660 return Set.insert(Elem);
663 const Ty &operator[](
int Idx)
const {
return Set[Idx]; }
664 bool operator==(
const BooleanStateWithSetVector &
RHS)
const {
665 return BooleanState::operator==(
RHS) && Set ==
RHS.Set;
667 bool operator!=(
const BooleanStateWithSetVector &
RHS)
const {
668 return !(*
this ==
RHS);
671 bool empty()
const {
return Set.empty(); }
672 size_t size()
const {
return Set.size(); }
675 BooleanStateWithSetVector &
operator^=(
const BooleanStateWithSetVector &
RHS) {
676 BooleanState::operator^=(
RHS);
677 Set.insert_range(
RHS.Set);
686 typename decltype(Set)::iterator
begin() {
return Set.begin(); }
687 typename decltype(Set)::iterator
end() {
return Set.end(); }
688 typename decltype(Set)::const_iterator
begin()
const {
return Set.begin(); }
689 typename decltype(Set)::const_iterator
end()
const {
return Set.end(); }
692template <
typename Ty,
bool InsertInval
idates = true>
693using BooleanStateWithPtrSetVector =
694 BooleanStateWithSetVector<Ty *, InsertInvalidates>;
698 bool IsAtFixpoint =
false;
702 BooleanStateWithPtrSetVector<CallBase,
false>
703 ReachedKnownParallelRegions;
706 BooleanStateWithPtrSetVector<CallBase> ReachedUnknownParallelRegions;
711 BooleanStateWithPtrSetVector<Instruction, false> SPMDCompatibilityTracker;
715 CallBase *KernelInitCB =
nullptr;
719 ConstantStruct *KernelEnvC =
nullptr;
723 CallBase *KernelDeinitCB =
nullptr;
726 bool IsKernelEntry =
false;
729 BooleanStateWithPtrSetVector<Function, false> ReachingKernelEntries;
734 BooleanStateWithSetVector<uint8_t> ParallelLevels;
737 bool NestedParallelism =
false;
742 KernelInfoState() =
default;
743 KernelInfoState(
bool BestState) {
745 indicatePessimisticFixpoint();
749 bool isValidState()
const override {
return true; }
752 bool isAtFixpoint()
const override {
return IsAtFixpoint; }
757 ParallelLevels.indicatePessimisticFixpoint();
758 ReachingKernelEntries.indicatePessimisticFixpoint();
759 SPMDCompatibilityTracker.indicatePessimisticFixpoint();
760 ReachedKnownParallelRegions.indicatePessimisticFixpoint();
761 ReachedUnknownParallelRegions.indicatePessimisticFixpoint();
762 NestedParallelism =
true;
763 return ChangeStatus::CHANGED;
769 ParallelLevels.indicateOptimisticFixpoint();
770 ReachingKernelEntries.indicateOptimisticFixpoint();
771 SPMDCompatibilityTracker.indicateOptimisticFixpoint();
772 ReachedKnownParallelRegions.indicateOptimisticFixpoint();
773 ReachedUnknownParallelRegions.indicateOptimisticFixpoint();
774 return ChangeStatus::UNCHANGED;
778 KernelInfoState &getAssumed() {
return *
this; }
779 const KernelInfoState &getAssumed()
const {
return *
this; }
782 if (SPMDCompatibilityTracker !=
RHS.SPMDCompatibilityTracker)
784 if (ReachedKnownParallelRegions !=
RHS.ReachedKnownParallelRegions)
786 if (ReachedUnknownParallelRegions !=
RHS.ReachedUnknownParallelRegions)
788 if (ReachingKernelEntries !=
RHS.ReachingKernelEntries)
790 if (ParallelLevels !=
RHS.ParallelLevels)
792 if (NestedParallelism !=
RHS.NestedParallelism)
798 bool mayContainParallelRegion() {
799 return !ReachedKnownParallelRegions.empty() ||
800 !ReachedUnknownParallelRegions.empty();
804 static KernelInfoState getBestState() {
return KernelInfoState(
true); }
806 static KernelInfoState getBestState(KernelInfoState &KIS) {
807 return getBestState();
811 static KernelInfoState getWorstState() {
return KernelInfoState(
false); }
814 KernelInfoState
operator^=(
const KernelInfoState &KIS) {
816 if (KIS.KernelInitCB) {
817 if (KernelInitCB && KernelInitCB != KIS.KernelInitCB)
820 KernelInitCB = KIS.KernelInitCB;
822 if (KIS.KernelDeinitCB) {
823 if (KernelDeinitCB && KernelDeinitCB != KIS.KernelDeinitCB)
826 KernelDeinitCB = KIS.KernelDeinitCB;
828 if (KIS.KernelEnvC) {
829 if (KernelEnvC && KernelEnvC != KIS.KernelEnvC)
832 KernelEnvC = KIS.KernelEnvC;
834 SPMDCompatibilityTracker ^= KIS.SPMDCompatibilityTracker;
835 ReachedKnownParallelRegions ^= KIS.ReachedKnownParallelRegions;
836 ReachedUnknownParallelRegions ^= KIS.ReachedUnknownParallelRegions;
837 NestedParallelism |= KIS.NestedParallelism;
841 KernelInfoState
operator&=(
const KernelInfoState &KIS) {
842 return (*
this ^= KIS);
852 AllocaInst *Array =
nullptr;
854 SmallVector<Value *, 8> StoredValues;
856 SmallVector<StoreInst *, 8> LastAccesses;
858 OffloadArray() =
default;
864 bool initialize(AllocaInst &Array, Instruction &Before) {
865 if (!getValues(Array, Before))
868 this->Array = &Array;
872 static const unsigned DeviceIDArgNum = 1;
873 static const unsigned BasePtrsArgNum = 3;
874 static const unsigned PtrsArgNum = 4;
875 static const unsigned SizesArgNum = 5;
881 bool getValues(AllocaInst &Array, Instruction &Before) {
883 const DataLayout &
DL = Array.getDataLayout();
884 std::optional<TypeSize> ArraySize = Array.getAllocationSize(
DL);
885 if (!ArraySize || !ArraySize->isFixed())
888 const uint64_t NumValues = ArraySize->getFixedValue() /
PointerSize;
889 StoredValues.assign(NumValues,
nullptr);
890 LastAccesses.assign(NumValues,
nullptr);
898 for (Instruction &
I : *BB) {
912 if ((uint64_t)Idx < NumValues) {
914 LastAccesses[Idx] = S;
925 const unsigned NumValues = StoredValues.size();
926 for (
unsigned I = 0;
I < NumValues; ++
I) {
927 if (!StoredValues[
I] || !LastAccesses[
I])
937 using OptimizationRemarkGetter =
938 function_ref<OptimizationRemarkEmitter &(
Function *)>;
940 OpenMPOpt(SmallVectorImpl<Function *> &SCC, CallGraphUpdater &CGUpdater,
941 OptimizationRemarkGetter OREGetter,
942 OMPInformationCache &OMPInfoCache, Attributor &A)
943 : M(*(*SCC.
begin())->
getParent()), SCC(SCC), CGUpdater(CGUpdater),
944 OREGetter(OREGetter), OMPInfoCache(OMPInfoCache), A(A) {}
947 bool remarksEnabled() {
948 auto &Ctx = M.getContext();
949 return Ctx.getDiagHandlerPtr()->isAnyRemarkEnabled(
DEBUG_TYPE);
953 bool run(
bool IsModulePass) {
963 Changed |= runAttributor(IsModulePass);
966 OMPInfoCache.recollectUses();
969 Changed |= rewriteDeviceCodeStateMachine();
971 if (remarksEnabled())
972 analysisGlobalization();
979 Changed |= runAttributor(IsModulePass);
982 OMPInfoCache.recollectUses();
984 Changed |= deleteParallelRegions();
987 Changed |= hideMemTransfersLatency();
988 Changed |= deduplicateRuntimeCalls();
990 if (mergeParallelRegions()) {
991 deduplicateRuntimeCalls();
997 if (OMPInfoCache.OpenMPPostLink)
998 Changed |= removeRuntimeSymbols();
1005 void printICVs()
const {
1009 for (Function *
F : SCC) {
1010 for (
auto ICV : ICVs) {
1011 auto ICVInfo = OMPInfoCache.ICVs[ICV];
1012 auto Remark = [&](OptimizationRemarkAnalysis ORA) {
1013 return ORA <<
"OpenMP ICV " <<
ore::NV(
"OpenMPICV", ICVInfo.Name)
1015 << (ICVInfo.InitValue
1016 ?
toString(ICVInfo.InitValue->getValue(), 10,
true)
1017 :
"IMPLEMENTATION_DEFINED");
1026 void printKernels()
const {
1027 for (Function *
F : SCC) {
1031 auto Remark = [&](OptimizationRemarkAnalysis ORA) {
1032 return ORA <<
"OpenMP GPU kernel "
1033 <<
ore::NV(
"OpenMPGPUKernel",
F->getName()) <<
"\n";
1042 static CallInst *getCallIfRegularCall(
1043 Use &U, OMPInformationCache::RuntimeFunctionInfo *RFI =
nullptr) {
1054 static CallInst *getCallIfRegularCall(
1055 Value &V, OMPInformationCache::RuntimeFunctionInfo *RFI =
nullptr) {
1066 bool mergeParallelRegions() {
1067 const unsigned CallbackCalleeOperand = 2;
1068 const unsigned CallbackFirstArgOperand = 3;
1072 OMPInformationCache::RuntimeFunctionInfo &RFI =
1073 OMPInfoCache.RFIs[OMPRTL___kmpc_fork_call];
1075 if (!RFI.Declaration)
1079 OMPInformationCache::RuntimeFunctionInfo UnmergableCallsInfo[] = {
1080 OMPInfoCache.RFIs[OMPRTL___kmpc_push_proc_bind],
1081 OMPInfoCache.RFIs[OMPRTL___kmpc_push_num_threads],
1085 LoopInfo *LI =
nullptr;
1086 DominatorTree *DT =
nullptr;
1088 SmallDenseMap<BasicBlock *, SmallPtrSet<Instruction *, 4>> BB2PRMap;
1090 BasicBlock *StartBB =
nullptr, *EndBB =
nullptr;
1091 auto BodyGenCB = [&](InsertPointTy AllocaIP, InsertPointTy CodeGenIP,
1093 BasicBlock *CGStartBB = CodeGenIP.getBlock();
1095 SplitBlock(CGStartBB, &*CodeGenIP.getPoint(), DT, LI);
1096 assert(StartBB !=
nullptr &&
"StartBB should not be null");
1098 assert(EndBB !=
nullptr &&
"EndBB should not be null");
1099 EndBB->getTerminator()->setSuccessor(0, CGEndBB);
1103 auto PrivCB = [&](InsertPointTy AllocaIP, InsertPointTy CodeGenIP,
Value &,
1104 Value &Inner,
Value *&ReplacementValue) -> InsertPointTy {
1105 ReplacementValue = &Inner;
1109 auto FiniCB = [&](InsertPointTy CodeGenIP) {
return Error::success(); };
1113 auto CreateSequentialRegion = [&](
Function *OuterFn,
1119 BasicBlock *ParentBB = SeqStartI->getParent();
1121 SplitBlock(ParentBB, SeqEndI->getNextNode(), DT, LI);
1125 SplitBlock(ParentBB, SeqStartI, DT, LI,
nullptr,
"seq.par.merged");
1128 "Expected a different CFG");
1132 auto BodyGenCB = [&](InsertPointTy AllocaIP, InsertPointTy CodeGenIP,
1134 BasicBlock *CGStartBB = CodeGenIP.getBlock();
1136 SplitBlock(CGStartBB, &*CodeGenIP.getPoint(), DT, LI);
1137 assert(SeqStartBB !=
nullptr &&
"SeqStartBB should not be null");
1139 assert(SeqEndBB !=
nullptr &&
"SeqEndBB should not be null");
1143 auto FiniCB = [&](InsertPointTy CodeGenIP) {
return Error::success(); };
1147 for (Instruction &
I : *SeqStartBB) {
1148 SmallPtrSet<Instruction *, 4> OutsideUsers;
1149 for (User *Usr :
I.users()) {
1157 OutsideUsers.
insert(&UsrI);
1160 if (OutsideUsers.
empty())
1165 const DataLayout &
DL = M.getDataLayout();
1166 AllocaInst *AllocaI =
new AllocaInst(
1167 I.getType(),
DL.getAllocaAddrSpace(),
nullptr,
1172 new StoreInst(&
I, AllocaI, SeqStartBB->getTerminator()->getIterator());
1176 for (Instruction *UsrI : OutsideUsers) {
1177 LoadInst *LoadI =
new LoadInst(
I.getType(), AllocaI,
1178 I.getName() +
".seq.output.load",
1184 OpenMPIRBuilder::LocationDescription Loc(
1185 InsertPointTy(ParentBB, ParentBB->
end()),
DL);
1187 OMPInfoCache.OMPBuilder.createMaster(Loc, BodyGenCB, FiniCB));
1189 OMPInfoCache.OMPBuilder.createBarrier(SeqAfterIP, OMPD_parallel));
1204 auto Merge = [&](
const SmallVectorImpl<CallInst *> &MergableCIs,
1208 assert(MergableCIs.
size() > 1 &&
"Assumed multiple mergable CIs");
1210 auto Remark = [&](OptimizationRemark
OR) {
1211 OR <<
"Parallel region merged with parallel region"
1212 << (MergableCIs.
size() > 2 ?
"s" :
"") <<
" at ";
1215 if (CI != MergableCIs.
back())
1223 Function *OriginalFn = BB->getParent();
1225 <<
" parallel regions in " << OriginalFn->
getName()
1229 EndBB =
SplitBlock(BB, MergableCIs.
back()->getNextNode(), DT, LI);
1231 SplitBlock(EndBB, &*EndBB->getFirstInsertionPt(), DT, LI);
1235 assert(BB->getUniqueSuccessor() == StartBB &&
"Expected a different CFG");
1236 const DebugLoc DL = BB->getTerminator()->getDebugLoc();
1241 for (
auto *It = MergableCIs.
begin(), *End = MergableCIs.
end() - 1;
1250 CreateSequentialRegion(OriginalFn, BB, ForkCI->
getNextNode(),
1254 OpenMPIRBuilder::LocationDescription Loc(InsertPointTy(BB, BB->end()),
1256 IRBuilder<>::InsertPoint AllocaIP(
1262 cantFail(OMPInfoCache.OMPBuilder.createParallel(
1263 Loc, AllocaIP, {}, BodyGenCB, PrivCB, FiniCB,
1264 nullptr,
nullptr, OMP_PROC_BIND_default,
1269 OMPInfoCache.OMPBuilder.finalize(OriginalFn);
1275 SmallVector<Value *, 8>
Args;
1276 for (
auto *CI : MergableCIs) {
1278 FunctionType *FT = OMPInfoCache.OMPBuilder.ParallelTask;
1282 for (
unsigned U = CallbackFirstArgOperand,
E = CI->
arg_size(); U <
E;
1292 for (
unsigned U = CallbackFirstArgOperand,
E = CI->
arg_size(); U <
E;
1296 U - (CallbackFirstArgOperand - CallbackCalleeOperand), A);
1299 if (CI != MergableCIs.back()) {
1302 cantFail(OMPInfoCache.OMPBuilder.createBarrier(
1311 assert(OutlinedFn != OriginalFn &&
"Outlining failed");
1312 CGUpdater.registerOutlinedFunction(*OriginalFn, *OutlinedFn);
1313 CGUpdater.reanalyzeFunction(*OriginalFn);
1315 NumOpenMPParallelRegionsMerged += MergableCIs.size();
1323 CallInst *CI = getCallIfRegularCall(U, &RFI);
1330 RFI.foreachUse(SCC, DetectPRsCB);
1336 for (
auto &It : BB2PRMap) {
1337 auto &CIs = It.getSecond();
1352 auto IsMergable = [&](
Instruction &
I,
bool IsBeforeMergableRegion) {
1355 if (
I.isTerminator())
1362 if (IsBeforeMergableRegion) {
1364 if (!CalledFunction)
1371 for (
const auto &RFI : UnmergableCallsInfo) {
1372 if (CalledFunction == RFI.Declaration)
1387 for (
auto It = BB->
begin(), End = BB->
end(); It != End;) {
1391 if (CIs.count(&
I)) {
1397 if (IsMergable(
I, MergableCIs.
empty()))
1402 for (; It != End; ++It) {
1404 if (CIs.count(&SkipI)) {
1406 <<
" due to " <<
I <<
"\n");
1413 if (MergableCIs.
size() > 1) {
1414 MergableCIsVector.
push_back(MergableCIs);
1416 <<
" parallel regions in block " << BB->
getName()
1421 MergableCIs.
clear();
1424 if (!MergableCIsVector.
empty()) {
1427 for (
auto &MergableCIs : MergableCIsVector)
1428 Merge(MergableCIs, BB);
1429 MergableCIsVector.clear();
1436 OMPInfoCache.recollectUsesForFunction(OMPRTL___kmpc_fork_call);
1437 OMPInfoCache.recollectUsesForFunction(OMPRTL___kmpc_barrier);
1438 OMPInfoCache.recollectUsesForFunction(OMPRTL___kmpc_master);
1439 OMPInfoCache.recollectUsesForFunction(OMPRTL___kmpc_end_master);
1446 bool deleteParallelRegions() {
1447 const unsigned CallbackCalleeOperand = 2;
1449 OMPInformationCache::RuntimeFunctionInfo &RFI =
1450 OMPInfoCache.RFIs[OMPRTL___kmpc_fork_call];
1452 if (!RFI.Declaration)
1457 CallInst *CI = getCallIfRegularCall(U);
1464 if (!Fn->onlyReadsMemory())
1466 if (!Fn->hasFnAttribute(Attribute::WillReturn))
1472 auto Remark = [&](OptimizationRemark
OR) {
1473 return OR <<
"Removing parallel region with no side-effects.";
1479 ++NumOpenMPParallelRegionsDeleted;
1483 RFI.foreachUse(SCC, DeleteCallCB);
1489 bool deduplicateRuntimeCalls() {
1493 OMPRTL_omp_get_num_threads,
1494 OMPRTL_omp_in_parallel,
1495 OMPRTL_omp_get_cancellation,
1496 OMPRTL_omp_get_supported_active_levels,
1497 OMPRTL_omp_get_level,
1498 OMPRTL_omp_get_ancestor_thread_num,
1499 OMPRTL_omp_get_team_size,
1500 OMPRTL_omp_get_active_level,
1501 OMPRTL_omp_in_final,
1502 OMPRTL_omp_get_proc_bind,
1503 OMPRTL_omp_get_num_places,
1504 OMPRTL_omp_get_num_procs,
1505 OMPRTL_omp_get_place_num,
1506 OMPRTL_omp_get_partition_num_places,
1507 OMPRTL_omp_get_partition_place_nums};
1510 SmallSetVector<Value *, 16> GTIdArgs;
1511 collectGlobalThreadIdArguments(GTIdArgs);
1513 <<
" global thread ID arguments\n");
1515 for (Function *
F : SCC) {
1516 for (
auto DeduplicableRuntimeCallID : DeduplicableRuntimeCallIDs)
1517 Changed |= deduplicateRuntimeCalls(
1518 *
F, OMPInfoCache.RFIs[DeduplicableRuntimeCallID]);
1522 Value *GTIdArg =
nullptr;
1523 for (Argument &Arg :
F->args())
1524 if (GTIdArgs.
count(&Arg)) {
1528 Changed |= deduplicateRuntimeCalls(
1529 *
F, OMPInfoCache.RFIs[OMPRTL___kmpc_global_thread_num], GTIdArg);
1536 bool removeRuntimeSymbols() {
1541 if (GlobalVariable *GV = M.getNamedGlobal(
"__llvm_rpc_client")) {
1542 if (GV->hasNUsesOrMore(1))
1546 GV->eraseFromParent();
1558 bool hideMemTransfersLatency() {
1559 auto &RFI = OMPInfoCache.RFIs[OMPRTL___tgt_target_data_begin_mapper];
1562 auto *RTCall = getCallIfRegularCall(U, &RFI);
1566 OffloadArray OffloadArrays[3];
1567 if (!getValuesInOffloadArrays(*RTCall, OffloadArrays))
1570 LLVM_DEBUG(dumpValuesInOffloadArrays(OffloadArrays));
1573 bool WasSplit =
false;
1574 Instruction *WaitMovementPoint = canBeMovedDownwards(*RTCall);
1575 if (WaitMovementPoint)
1576 WasSplit = splitTargetDataBeginRTC(*RTCall, *WaitMovementPoint);
1581 if (OMPInfoCache.runtimeFnsAvailable(
1582 {OMPRTL___tgt_target_data_begin_mapper_issue,
1583 OMPRTL___tgt_target_data_begin_mapper_wait}))
1584 RFI.foreachUse(SCC, SplitMemTransfers);
1589 void analysisGlobalization() {
1590 auto &RFI = OMPInfoCache.RFIs[OMPRTL___kmpc_alloc_shared];
1592 auto CheckGlobalization = [&](
Use &
U,
Function &Decl) {
1593 if (CallInst *CI = getCallIfRegularCall(U, &RFI)) {
1594 auto Remark = [&](OptimizationRemarkMissed ORM) {
1596 <<
"Found thread data sharing on the GPU. "
1597 <<
"Expect degraded performance due to data globalization.";
1605 RFI.foreachUse(SCC, CheckGlobalization);
1610 bool getValuesInOffloadArrays(CallInst &RuntimeCall,
1612 assert(OAs.
size() == 3 &&
"Need space for three offload arrays!");
1622 Value *BasePtrsArg =
1634 if (!OAs[0].
initialize(*BasePtrsArray, RuntimeCall))
1642 if (!OAs[1].
initialize(*PtrsArray, RuntimeCall))
1654 if (!OAs[2].
initialize(*SizesArray, RuntimeCall))
1665 assert(OAs.
size() == 3 &&
"There are three offload arrays to debug!");
1668 std::string ValuesStr;
1669 raw_string_ostream
Printer(ValuesStr);
1670 std::string Separator =
" --- ";
1672 for (
auto *BP : OAs[0].StoredValues) {
1676 LLVM_DEBUG(
dbgs() <<
"\t\toffload_baseptrs: " << ValuesStr <<
"\n");
1679 for (
auto *
P : OAs[1].StoredValues) {
1686 for (
auto *S : OAs[2].StoredValues) {
1690 LLVM_DEBUG(
dbgs() <<
"\t\toffload_sizes: " << ValuesStr <<
"\n");
1695 Instruction *canBeMovedDownwards(CallInst &RuntimeCall) {
1700 bool IsWorthIt =
false;
1719 return RuntimeCall.
getParent()->getTerminator();
1723 bool splitTargetDataBeginRTC(CallInst &RuntimeCall,
1724 Instruction &WaitMovementPoint) {
1728 auto &
IRBuilder = OMPInfoCache.OMPBuilder;
1731 IRBuilder.Builder.SetInsertPoint(&Entry,
1732 Entry.getFirstNonPHIOrDbgOrAlloca());
1734 IRBuilder.AsyncInfo,
nullptr,
"handle");
1741 FunctionCallee IssueDecl =
IRBuilder.getOrCreateRuntimeFunction(
1742 M, OMPRTL___tgt_target_data_begin_mapper_issue);
1745 SmallVector<Value *, 16>
Args;
1746 for (
auto &Arg : RuntimeCall.
args())
1747 Args.push_back(Arg.get());
1748 Args.push_back(Handle);
1752 OMPInfoCache.setCallingConvention(IssueDecl, IssueCallsite);
1757 FunctionCallee WaitDecl =
IRBuilder.getOrCreateRuntimeFunction(
1758 M, OMPRTL___tgt_target_data_begin_mapper_wait);
1760 Value *WaitParams[2] = {
1762 OffloadArray::DeviceIDArgNum),
1766 WaitDecl, WaitParams,
"", WaitMovementPoint.
getIterator());
1767 OMPInfoCache.setCallingConvention(WaitDecl, WaitCallsite);
1772 static Value *combinedIdentStruct(
Value *CurrentIdent,
Value *NextIdent,
1773 bool GlobalOnly,
bool &SingleChoice) {
1774 if (CurrentIdent == NextIdent)
1775 return CurrentIdent;
1780 SingleChoice = !CurrentIdent;
1792 getCombinedIdentFromCallUsesIn(OMPInformationCache::RuntimeFunctionInfo &RFI,
1793 Function &
F,
bool GlobalOnly) {
1794 bool SingleChoice =
true;
1795 Value *Ident =
nullptr;
1797 CallInst *CI = getCallIfRegularCall(U, &RFI);
1798 if (!CI || &
F != &Caller)
1801 true, SingleChoice);
1804 RFI.foreachUse(SCC, CombineIdentStruct);
1806 if (!Ident || !SingleChoice) {
1809 if (!OMPInfoCache.OMPBuilder.getInsertionPoint().getBlock())
1811 &
F.getEntryBlock(),
F.getEntryBlock().begin()));
1814 uint32_t SrcLocStrSize;
1816 OMPInfoCache.OMPBuilder.getOrCreateDefaultSrcLocStr(SrcLocStrSize);
1817 Ident = OMPInfoCache.OMPBuilder.getOrCreateIdent(Loc, SrcLocStrSize);
1824 bool deduplicateRuntimeCalls(Function &
F,
1825 OMPInformationCache::RuntimeFunctionInfo &RFI,
1826 Value *ReplVal =
nullptr) {
1827 auto *UV = RFI.getUseVector(
F);
1828 if (!UV || UV->size() + (ReplVal !=
nullptr) < 2)
1832 dbgs() <<
TAG <<
"Deduplicate " << UV->size() <<
" uses of " << RFI.Name
1833 << (ReplVal ?
" with an existing value\n" :
"\n") <<
"\n");
1837 "Unexpected replacement value!");
1840 auto CanBeMoved = [
this](CallBase &CB) {
1841 unsigned NumArgs = CB.arg_size();
1844 if (CB.getArgOperand(0)->getType() != OMPInfoCache.OMPBuilder.IdentPtr)
1846 for (
unsigned U = 1;
U < NumArgs; ++
U)
1854 OMPInfoCache.getAnalysisResultForFunction<DominatorTreeAnalysis>(
F);
1858 for (Use *U : *UV) {
1859 if (CallInst *CI = getCallIfRegularCall(*U, &RFI)) {
1864 if (!CanBeMoved(*CI))
1872 assert(IP &&
"Expected insertion point!");
1882 Value *Ident = getCombinedIdentFromCallUsesIn(RFI,
F,
1890 CallInst *CI = getCallIfRegularCall(U, &RFI);
1891 if (!CI || CI == ReplVal || &
F != &Caller)
1895 auto Remark = [&](OptimizationRemark
OR) {
1896 return OR <<
"OpenMP runtime call "
1897 <<
ore::NV(
"OpenMPOptRuntime", RFI.Name) <<
" deduplicated.";
1906 ++NumOpenMPRuntimeCallsDeduplicated;
1910 RFI.foreachUse(SCC, ReplaceAndDeleteCB);
1916 void collectGlobalThreadIdArguments(SmallSetVector<Value *, 16> >IdArgs) {
1923 auto CallArgOpIsGTId = [&](
Function &
F,
unsigned ArgNo, CallInst &RefCI) {
1924 if (!
F.hasLocalLinkage())
1926 for (Use &U :
F.uses()) {
1927 if (CallInst *CI = getCallIfRegularCall(U)) {
1929 if (CI == &RefCI || GTIdArgs.
count(ArgOp) ||
1930 getCallIfRegularCall(
1931 *ArgOp, &OMPInfoCache.RFIs[OMPRTL___kmpc_global_thread_num]))
1940 auto AddUserArgs = [&](
Value >Id) {
1941 for (Use &U : GTId.uses())
1945 if (CallArgOpIsGTId(*Callee,
U.getOperandNo(), *CI))
1950 OMPInformationCache::RuntimeFunctionInfo &GlobThreadNumRFI =
1951 OMPInfoCache.RFIs[OMPRTL___kmpc_global_thread_num];
1953 GlobThreadNumRFI.foreachUse(SCC, [&](Use &U, Function &
F) {
1954 if (CallInst *CI = getCallIfRegularCall(U, &GlobThreadNumRFI))
1962 for (
unsigned U = 0;
U < GTIdArgs.
size(); ++
U)
1963 AddUserArgs(*GTIdArgs[U]);
1971 DenseMap<Function *, std::optional<Kernel>> UniqueKernelMap;
1974 Kernel getUniqueKernelFor(Function &
F);
1977 Kernel getUniqueKernelFor(Instruction &
I) {
1978 return getUniqueKernelFor(*
I.getFunction());
1983 bool rewriteDeviceCodeStateMachine();
1999 template <
typename RemarkKind,
typename RemarkCallBack>
2000 void emitRemark(Instruction *
I, StringRef RemarkName,
2001 RemarkCallBack &&RemarkCB)
const {
2003 auto &ORE = OREGetter(
F);
2007 return RemarkCB(RemarkKind(
DEBUG_TYPE, RemarkName,
I))
2008 <<
" [" << RemarkName <<
"]";
2012 [&]() {
return RemarkCB(RemarkKind(
DEBUG_TYPE, RemarkName,
I)); });
2016 template <
typename RemarkKind,
typename RemarkCallBack>
2017 void emitRemark(Function *
F, StringRef RemarkName,
2018 RemarkCallBack &&RemarkCB)
const {
2019 auto &ORE = OREGetter(
F);
2023 return RemarkCB(RemarkKind(
DEBUG_TYPE, RemarkName,
F))
2024 <<
" [" << RemarkName <<
"]";
2028 [&]() {
return RemarkCB(RemarkKind(
DEBUG_TYPE, RemarkName,
F)); });
2035 SmallVectorImpl<Function *> &SCC;
2039 CallGraphUpdater &CGUpdater;
2042 OptimizationRemarkGetter OREGetter;
2045 OMPInformationCache &OMPInfoCache;
2051 bool runAttributor(
bool IsModulePass) {
2055 registerAAs(IsModulePass);
2060 <<
" functions, result: " <<
Changed <<
".\n");
2062 if (
Changed == ChangeStatus::CHANGED)
2063 OMPInfoCache.invalidateAnalyses();
2065 return Changed == ChangeStatus::CHANGED;
2072 void registerAAs(
bool IsModulePass);
2077 static void registerAAsForFunction(Attributor &A,
const Function &
F);
2081 if (OMPInfoCache.CGSCC && !OMPInfoCache.CGSCC->empty() &&
2082 !OMPInfoCache.CGSCC->contains(&
F))
2087 std::optional<Kernel> &CachedKernel = UniqueKernelMap[&
F];
2089 return *CachedKernel;
2096 return *CachedKernel;
2099 CachedKernel =
nullptr;
2100 if (!
F.hasLocalLinkage()) {
2103 auto Remark = [&](OptimizationRemarkAnalysis ORA) {
2104 return ORA <<
"Potentially unknown OpenMP target region caller.";
2112 auto GetUniqueKernelForUse = [&](
const Use &
U) ->
Kernel {
2115 if (
Cmp->isEquality())
2116 return getUniqueKernelFor(*Cmp);
2121 if (CB->isCallee(&U))
2122 return getUniqueKernelFor(*CB);
2124 OMPInformationCache::RuntimeFunctionInfo &KernelParallelRFI =
2125 OMPInfoCache.RFIs[OMPRTL___kmpc_parallel_60];
2127 if (OpenMPOpt::getCallIfRegularCall(*
U.getUser(), &KernelParallelRFI))
2128 return getUniqueKernelFor(*CB);
2136 SmallPtrSet<Kernel, 2> PotentialKernels;
2137 OMPInformationCache::foreachUse(
F, [&](
const Use &U) {
2138 PotentialKernels.
insert(GetUniqueKernelForUse(U));
2142 if (PotentialKernels.
size() == 1)
2143 K = *PotentialKernels.
begin();
2146 UniqueKernelMap[&
F] =
K;
2151bool OpenMPOpt::rewriteDeviceCodeStateMachine() {
2152 OMPInformationCache::RuntimeFunctionInfo &KernelParallelRFI =
2153 OMPInfoCache.RFIs[OMPRTL___kmpc_parallel_60];
2156 if (!KernelParallelRFI)
2163 for (Function *
F : SCC) {
2167 bool UnknownUse =
false;
2168 bool KernelParallelUse =
false;
2169 unsigned NumDirectCalls = 0;
2172 OMPInformationCache::foreachUse(*
F, [&](Use &U) {
2174 if (CB->isCallee(&U)) {
2180 ToBeReplacedStateMachineUses.
push_back(&U);
2186 OpenMPOpt::getCallIfRegularCall(*
U.getUser(), &KernelParallelRFI);
2187 const unsigned int WrapperFunctionArgNo = 6;
2188 if (!KernelParallelUse && CI &&
2190 KernelParallelUse =
true;
2191 ToBeReplacedStateMachineUses.
push_back(&U);
2199 if (!KernelParallelUse)
2205 if (UnknownUse || NumDirectCalls != 1 ||
2206 ToBeReplacedStateMachineUses.
size() > 2) {
2207 auto Remark = [&](OptimizationRemarkAnalysis ORA) {
2208 return ORA <<
"Parallel region is used in "
2209 << (UnknownUse ?
"unknown" :
"unexpected")
2210 <<
" ways. Will not attempt to rewrite the state machine.";
2220 auto Remark = [&](OptimizationRemarkAnalysis ORA) {
2221 return ORA <<
"Parallel region is not called from a unique kernel. "
2222 "Will not attempt to rewrite the state machine.";
2234 Type *Int8Ty = Type::getInt8Ty(
M.getContext());
2236 auto *
ID =
new GlobalVariable(
2240 for (Use *U : ToBeReplacedStateMachineUses)
2242 ID,
U->get()->getType()));
2244 ++NumOpenMPParallelRegionsReplacedInGPUStateMachine;
// Abstract Attribute (AA) interface for tracking the runtime value of OpenMP
// Internal Control Variables (ICVs) at program points so that ICV getter
// calls can be folded to a known value. Wraps a BooleanState: the boolean
// encodes whether tracking information is available at this position.
2253struct AAICVTracker :
public StateWrapper<BooleanState, AbstractAttribute> {
 2254  using Base = StateWrapper<BooleanState, AbstractAttribute>;
 2255  AAICVTracker(
const IRPosition &IRP, Attributor &
A) :
Base(IRP) {}
// Returns true if the ICV is assumed to be tracked at this position.
 2258  bool isAssumedTracked()
const {
return getAssumed(); }
// NOTE(review): returns getAssumed() rather than getKnown() — mirrors
// isAssumedTracked() exactly; confirm against upstream whether "known"
// should be stronger here.
 2261  bool isKnownTracked()
const {
return getAssumed(); }
// Factory used by the Attributor to create the right concrete subclass
// (function / function-returned / call-site / call-site-returned).
 2264  static AAICVTracker &createForPosition(
const IRPosition &IRP, Attributor &
A);
// Base implementation: no replacement value known for the ICV at \p I.
 2268  const Instruction *
I,
 2269                                                    Attributor &
A)
const {
 2270    return std::nullopt;
// Subclasses provide a unique replacement value valid for the whole scope.
 2276  virtual std::optional<Value *>
// See AbstractAttribute::getName().
 2284  StringRef
getName()
const override {
return "AAICVTracker"; }
// Unique address-based identifier used by classof() for AA RTTI.
 2287  const char *getIdAddr()
const override {
return &
ID; }
// AA-style isa/cast support: compares getIdAddr() against &ID.
 2290  static bool classof(
const AbstractAttribute *AA) {
// Unique ID (address of this member is the class identifier).
 2294  static const char ID;
2297struct AAICVTrackerFunction :
public AAICVTracker {
2298 AAICVTrackerFunction(
const IRPosition &IRP, Attributor &
A)
2299 : AAICVTracker(IRP,
A) {}
2302 const std::string getAsStr(Attributor *)
const override {
2303 return "ICVTrackerFunction";
2307 void trackStatistics()
const override {}
2311 return ChangeStatus::UNCHANGED;
2316 InternalControlVar::ICV___last>
2317 ICVReplacementValuesMap;
2324 auto &OMPInfoCache =
static_cast<OMPInformationCache &
>(
A.getInfoCache());
2327 auto &SetterRFI = OMPInfoCache.RFIs[OMPInfoCache.ICVs[ICV].Setter];
2329 auto &ValuesMap = ICVReplacementValuesMap[ICV];
2331 CallInst *CI = OpenMPOpt::getCallIfRegularCall(U);
2337 if (ValuesMap.insert(std::make_pair(CI, CI->
getArgOperand(0))).second)
2338 HasChanged = ChangeStatus::CHANGED;
2344 std::optional<Value *> ReplVal = getValueForCall(
A,
I, ICV);
2345 if (ReplVal && ValuesMap.insert(std::make_pair(&
I, *ReplVal)).second)
2346 HasChanged = ChangeStatus::CHANGED;
2352 SetterRFI.foreachUse(TrackValues,
F);
2354 bool UsedAssumedInformation =
false;
2355 A.checkForAllInstructions(CallCheck, *
this, {Instruction::Call},
2356 UsedAssumedInformation,
2362 if (HasChanged == ChangeStatus::CHANGED)
2363 ValuesMap.try_emplace(Entry);
2371 std::optional<Value *> getValueForCall(Attributor &
A,
const Instruction &
I,
2375 if (!CB || CB->hasFnAttr(
"no_openmp") ||
2376 CB->hasFnAttr(
"no_openmp_routines") ||
2377 CB->hasFnAttr(
"no_openmp_constructs"))
2378 return std::nullopt;
2380 auto &OMPInfoCache =
static_cast<OMPInformationCache &
>(
A.getInfoCache());
2381 auto &GetterRFI = OMPInfoCache.RFIs[OMPInfoCache.ICVs[ICV].Getter];
2382 auto &SetterRFI = OMPInfoCache.RFIs[OMPInfoCache.ICVs[ICV].Setter];
2383 Function *CalledFunction = CB->getCalledFunction();
2386 if (CalledFunction ==
nullptr)
2388 if (CalledFunction == GetterRFI.Declaration)
2389 return std::nullopt;
2390 if (CalledFunction == SetterRFI.Declaration) {
2391 if (ICVReplacementValuesMap[ICV].
count(&
I))
2392 return ICVReplacementValuesMap[ICV].lookup(&
I);
2401 const auto *ICVTrackingAA =
A.getAAFor<AAICVTracker>(
2404 if (ICVTrackingAA->isAssumedTracked()) {
2405 std::optional<Value *> URV =
2406 ICVTrackingAA->getUniqueReplacementValue(ICV);
2417 std::optional<Value *>
2419 return std::nullopt;
2424 const Instruction *
I,
2425 Attributor &
A)
const override {
2426 const auto &ValuesMap = ICVReplacementValuesMap[ICV];
2427 if (ValuesMap.count(
I))
2428 return ValuesMap.lookup(
I);
2431 SmallPtrSet<const Instruction *, 16> Visited;
2434 std::optional<Value *> ReplVal;
2436 while (!Worklist.
empty()) {
2438 if (!Visited.
insert(CurrInst).second)
2446 if (ValuesMap.count(CurrInst)) {
2447 std::optional<Value *> NewReplVal = ValuesMap.lookup(CurrInst);
2450 ReplVal = NewReplVal;
2456 if (ReplVal != NewReplVal)
2462 std::optional<Value *> NewReplVal = getValueForCall(
A, *CurrInst, ICV);
2468 ReplVal = NewReplVal;
2474 if (ReplVal != NewReplVal)
2479 if (CurrBB ==
I->getParent() && ReplVal)
2484 if (
const Instruction *Terminator = Pred->getTerminator())
// ICV tracker for the "function returned" position: determines, per ICV,
// whether a single unique value reaches every return of the function.
2492struct AAICVTrackerFunctionReturned : AAICVTracker {
 2493  AAICVTrackerFunctionReturned(
const IRPosition &IRP, Attributor &
A)
 2494      : AAICVTracker(IRP,
A) {}
// See AbstractAttribute::getAsStr().
 2497  const std::string getAsStr(Attributor *)
const override {
 2498    return "ICVTrackerFunctionReturned";
// No statistics tracked for this AA.
 2502  void trackStatistics()
const override {}
 2506    return ChangeStatus::UNCHANGED;
// Per-ICV unique replacement value at the returns (indexed by ICV kind,
// one slot per InternalControlVar up to ICV___last).
 2511                  InternalControlVar::ICV___last>
 2512      ICVReplacementValuesMap;
// Return the cached unique value for \p ICV at the function's returns.
 2515  std::optional<Value *>
 2517    return ICVReplacementValuesMap[ICV];
// Query the function-scope tracker; without it we cannot say anything.
 2522    const auto *ICVTrackingAA =
A.getAAFor<AAICVTracker>(
 2525    if (!ICVTrackingAA->isAssumedTracked())
 2526      return indicatePessimisticFixpoint();
 2529      std::optional<Value *> &ReplVal = ICVReplacementValuesMap[ICV];
 2530      std::optional<Value *> UniqueICVValue;
// For each return instruction, ask the function tracker for the ICV value
// reaching it; all returns must agree for a unique value to exist.
 2533        std::optional<Value *> NewReplVal =
 2534            ICVTrackingAA->getReplacementValue(ICV, &
I,
A);
// Conflicting values across returns: no unique replacement.
 2537        if (UniqueICVValue && UniqueICVValue != NewReplVal)
 2540        UniqueICVValue = NewReplVal;
 2545      bool UsedAssumedInformation =
false;
// If we could not visit all returns, conservatively drop to "no value".
 2546      if (!
A.checkForAllInstructions(CheckReturnInst, *
this, {Instruction::Ret},
 2547                                     UsedAssumedInformation,
 2549        UniqueICVValue =
nullptr;
 2551      if (UniqueICVValue == ReplVal)
 2554      ReplVal = UniqueICVValue;
 2555      Changed = ChangeStatus::CHANGED;
// ICV tracker for a call-site position: recognizes calls to an ICV getter
// and, when a unique tracked value reaches the call, replaces the getter
// call with that value (deleting the call at manifest time).
2562struct AAICVTrackerCallSite : AAICVTracker {
 2563  AAICVTrackerCallSite(
const IRPosition &IRP, Attributor &
A)
 2564      : AAICVTracker(IRP,
A) {}
 2567    assert(getAnchorScope() &&
"Expected anchor function");
// Match the called function against each ICV's getter runtime function to
// find which ICV this call site reads.
 2571    auto &OMPInfoCache =
static_cast<OMPInformationCache &
>(
A.getInfoCache());
 2573      auto ICVInfo = OMPInfoCache.ICVs[ICV];
 2574      auto &Getter = OMPInfoCache.RFIs[ICVInfo.Getter];
 2575      if (Getter.Declaration == getAssociatedFunction()) {
 2576        AssociatedICV = ICVInfo.Kind;
// Not an ICV getter call: nothing to track here.
 2582    indicatePessimisticFixpoint();
// No (non-null) replacement value: leave the getter call in place.
 2586    if (!ReplVal || !*ReplVal)
 2587      return ChangeStatus::UNCHANGED;
// Replace the getter call's uses with the tracked value and delete it.
 2590    A.deleteAfterManifest(*getCtxI());
 2592    return ChangeStatus::CHANGED;
// See AbstractAttribute::getAsStr().
 2596  const std::string getAsStr(Attributor *)
const override {
 2597    return "ICVTrackerCallSite";
// No statistics tracked for this AA.
 2601  void trackStatistics()
const override {}
// Cached replacement value for the associated ICV at this call site.
 2604  std::optional<Value *> ReplVal;
// Re-query the enclosing function's tracker each update round.
 2607    const auto *ICVTrackingAA =
A.getAAFor<AAICVTracker>(
 2611    if (!ICVTrackingAA->isAssumedTracked())
 2612      return indicatePessimisticFixpoint();
 2614    std::optional<Value *> NewReplVal =
 2615        ICVTrackingAA->getReplacementValue(AssociatedICV, getCtxI(),
A);
 2617    if (ReplVal == NewReplVal)
 2618      return ChangeStatus::UNCHANGED;
 2620    ReplVal = NewReplVal;
 2621    return ChangeStatus::CHANGED;
 2626  std::optional<Value *>
// ICV tracker for the "call-site returned" position: caches, per ICV, the
// unique value the callee guarantees at its returns, as seen from this call.
2632struct AAICVTrackerCallSiteReturned : AAICVTracker {
 2633  AAICVTrackerCallSiteReturned(
const IRPosition &IRP, Attributor &
A)
 2634      : AAICVTracker(IRP,
A) {}
// See AbstractAttribute::getAsStr().
 2637  const std::string getAsStr(Attributor *)
const override {
 2638    return "ICVTrackerCallSiteReturned";
// No statistics tracked for this AA.
 2642  void trackStatistics()
const override {}
 2646    return ChangeStatus::UNCHANGED;
// Per-ICV cached unique value after the call (indexed by ICV kind).
 2651                  InternalControlVar::ICV___last>
 2652      ICVReplacementValuesMap;
// Return the cached unique value for \p ICV at this call-site return.
 2656  std::optional<Value *>
 2658    return ICVReplacementValuesMap[ICV];
// Query the callee's "function returned" tracker; REQUIRED dependence so we
// are re-run when it changes.
 2663    const auto *ICVTrackingAA =
A.getAAFor<AAICVTracker>(
 2665        DepClassTy::REQUIRED);
 2668    if (!ICVTrackingAA->isAssumedTracked())
 2669      return indicatePessimisticFixpoint();
 2672      std::optional<Value *> &ReplVal = ICVReplacementValuesMap[ICV];
 2673      std::optional<Value *> NewReplVal =
 2674          ICVTrackingAA->getUniqueReplacementValue(ICV);
 2676      if (ReplVal == NewReplVal)
 2679      ReplVal = NewReplVal;
 2680      Changed = ChangeStatus::CHANGED;
// Returns true if, following unique successors from \p BB, control flow can
// only reach the function end. Used to decide whether a trailing aligned
// barrier is removable.
// NOTE(review): interior lines of the body (terminator/successor checks) are
// elided in this view; the visible tail recurses on the unique `Successor`.
2688static bool hasFunctionEndAsUniqueSuccessor(
const BasicBlock *BB) {
 2694  return hasFunctionEndAsUniqueSuccessor(
Successor);
2697struct AAExecutionDomainFunction :
public AAExecutionDomain {
2698 AAExecutionDomainFunction(
const IRPosition &IRP, Attributor &
A)
2699 : AAExecutionDomain(IRP,
A) {}
2701 ~AAExecutionDomainFunction()
override {
delete RPOT; }
2705 assert(
F &&
"Expected anchor function");
2706 RPOT =
new ReversePostOrderTraversal<Function *>(
F);
2709 const std::string getAsStr(Attributor *)
const override {
2710 unsigned TotalBlocks = 0, InitialThreadBlocks = 0, AlignedBlocks = 0;
2711 for (
auto &It : BEDMap) {
2715 InitialThreadBlocks += It.getSecond().IsExecutedByInitialThreadOnly;
2716 AlignedBlocks += It.getSecond().IsReachedFromAlignedBarrierOnly &&
2717 It.getSecond().IsReachingAlignedBarrierOnly;
2719 return "[AAExecutionDomain] " + std::to_string(InitialThreadBlocks) +
"/" +
2720 std::to_string(AlignedBlocks) +
" of " +
2721 std::to_string(TotalBlocks) +
2722 " executed by initial thread / aligned";
2726 void trackStatistics()
const override {}
2730 for (
const BasicBlock &BB : *getAnchorScope()) {
2731 if (!isExecutedByInitialThreadOnly(BB))
2733 dbgs() <<
TAG <<
" Basic block @" << getAnchorScope()->getName() <<
" "
2734 << BB.
getName() <<
" is executed by a single thread.\n";
2743 SmallPtrSet<CallBase *, 16> DeletedBarriers;
2744 auto HandleAlignedBarrier = [&](CallBase *CB) {
2745 const ExecutionDomainTy &ED = CB ? CEDMap[{CB, PRE}] : BEDMap[
nullptr];
2746 if (!ED.IsReachedFromAlignedBarrierOnly ||
2747 ED.EncounteredNonLocalSideEffect)
2749 if (!ED.EncounteredAssumes.empty() && !
A.isModulePass())
2760 DeletedBarriers.
insert(CB);
2761 A.deleteAfterManifest(*CB);
2762 ++NumBarriersEliminated;
2763 Changed = ChangeStatus::CHANGED;
2764 }
else if (!ED.AlignedBarriers.empty()) {
2765 Changed = ChangeStatus::CHANGED;
2767 ED.AlignedBarriers.end());
2768 SmallSetVector<CallBase *, 16> Visited;
2769 while (!Worklist.
empty()) {
2771 if (!Visited.
insert(LastCB))
2775 if (!hasFunctionEndAsUniqueSuccessor(LastCB->
getParent()))
2777 if (!DeletedBarriers.
count(LastCB)) {
2778 ++NumBarriersEliminated;
2779 A.deleteAfterManifest(*LastCB);
2785 const ExecutionDomainTy &LastED = CEDMap[{LastCB, PRE}];
2786 Worklist.
append(LastED.AlignedBarriers.begin(),
2787 LastED.AlignedBarriers.end());
2793 if (!ED.EncounteredAssumes.empty() && (CB || !ED.AlignedBarriers.empty()))
2794 for (
auto *AssumeCB : ED.EncounteredAssumes)
2795 A.deleteAfterManifest(*AssumeCB);
2798 for (
auto *CB : AlignedBarriers)
2799 HandleAlignedBarrier(CB);
2803 HandleAlignedBarrier(
nullptr);
2808 bool isNoOpFence(
const FenceInst &FI)
const override {
2809 return getState().isValidState() && !NonNoOpFences.count(&FI);
2815 mergeInPredecessorBarriersAndAssumptions(Attributor &
A, ExecutionDomainTy &ED,
2816 const ExecutionDomainTy &PredED);
2821 bool mergeInPredecessor(Attributor &
A, ExecutionDomainTy &ED,
2822 const ExecutionDomainTy &PredED,
2823 bool InitialEdgeOnly =
false);
2826 bool handleCallees(Attributor &
A, ExecutionDomainTy &EntryBBED);
2833 bool isExecutedByInitialThreadOnly(
const BasicBlock &BB)
const override {
2834 if (!isValidState())
2836 assert(BB.
getParent() == getAnchorScope() &&
"Block is out of scope!");
2837 return BEDMap.lookup(&BB).IsExecutedByInitialThreadOnly;
2840 bool isExecutedInAlignedRegion(Attributor &
A,
2841 const Instruction &
I)
const override {
2842 assert(
I.getFunction() == getAnchorScope() &&
2843 "Instruction is out of scope!");
2844 if (!isValidState())
2847 bool ForwardIsOk =
true;
2856 if (CB != &
I && AlignedBarriers.contains(
const_cast<CallBase *
>(CB)))
2858 const auto &It = CEDMap.find({CB, PRE});
2859 if (It == CEDMap.end())
2861 if (!It->getSecond().IsReachingAlignedBarrierOnly)
2862 ForwardIsOk =
false;
2866 if (!CurI && !BEDMap.lookup(
I.getParent()).IsReachingAlignedBarrierOnly)
2867 ForwardIsOk =
false;
2875 if (CB != &
I && AlignedBarriers.contains(
const_cast<CallBase *
>(CB)))
2877 const auto &It = CEDMap.find({CB, POST});
2878 if (It == CEDMap.end())
2880 if (It->getSecond().IsReachedFromAlignedBarrierOnly)
2893 return BEDMap.lookup(
nullptr).IsReachedFromAlignedBarrierOnly;
2895 return BEDMap.lookup(PredBB).IsReachedFromAlignedBarrierOnly;
2905 ExecutionDomainTy getExecutionDomain(
const BasicBlock &BB)
const override {
2907 "No request should be made against an invalid state!");
2908 return BEDMap.lookup(&BB);
2910 std::pair<ExecutionDomainTy, ExecutionDomainTy>
2911 getExecutionDomain(
const CallBase &CB)
const override {
2913 "No request should be made against an invalid state!");
2914 return {CEDMap.lookup({&CB, PRE}), CEDMap.lookup({&CB, POST})};
2916 ExecutionDomainTy getFunctionExecutionDomain()
const override {
2918 "No request should be made against an invalid state!");
2919 return InterProceduralED;
2925 static bool isInitialThreadOnlyEdge(Attributor &
A, CondBrInst *
Edge,
2926 BasicBlock &SuccessorBB) {
2929 if (
Edge->getSuccessor(0) != &SuccessorBB)
2933 if (!Cmp || !
Cmp->isTrueWhenEqual() || !
Cmp->isEquality())
2941 if (
C->isAllOnesValue()) {
2943 auto &OMPInfoCache =
static_cast<OMPInformationCache &
>(
A.getInfoCache());
2944 auto &RFI = OMPInfoCache.RFIs[OMPRTL___kmpc_target_init];
2945 CB = CB ? OpenMPOpt::getCallIfRegularCall(*CB, &RFI) : nullptr;
2948 ConstantStruct *KernelEnvC =
2950 ConstantInt *ExecModeC =
2951 KernelInfo::getExecModeFromKernelEnvironment(KernelEnvC);
2958 if (
II->getIntrinsicID() == Intrinsic::nvvm_read_ptx_sreg_tid_x)
2963 if (
II->getIntrinsicID() == Intrinsic::amdgcn_workitem_id_x)
2971 ExecutionDomainTy InterProceduralED;
2975 DenseMap<const BasicBlock *, ExecutionDomainTy> BEDMap;
2976 DenseMap<PointerIntPair<const CallBase *, 1, Direction>, ExecutionDomainTy>
2978 SmallSetVector<CallBase *, 16> AlignedBarriers;
2980 ReversePostOrderTraversal<Function *> *RPOT =
nullptr;
2983 static bool setAndRecord(
bool &R,
bool V) {
2991 SmallPtrSet<const FenceInst *, 8> NonNoOpFences;
// Merge the assume instructions and aligned barriers encountered in a
// predecessor's execution domain \p PredED into \p ED.
2994void AAExecutionDomainFunction::mergeInPredecessorBarriersAndAssumptions(
 2995    Attributor &
A, ExecutionDomainTy &ED,
const ExecutionDomainTy &PredED) {
// Carry over every assumption seen on the predecessor path.
 2996  for (
auto *EA : PredED.EncounteredAssumes)
 2997    ED.addAssumeInst(
A, *EA);
// Carry over every aligned barrier seen on the predecessor path.
 2999  for (
auto *AB : PredED.AlignedBarriers)
 3000    ED.addAlignedBarrier(
A, *AB);
// Merge the predecessor execution domain \p PredED into \p ED, returning (via
// the elided `Changed` accumulator) whether anything changed. \p
// InitialEdgeOnly indicates the edge is only taken by the initial thread.
3003bool AAExecutionDomainFunction::mergeInPredecessor(
 3004    Attributor &
A, ExecutionDomainTy &ED,
const ExecutionDomainTy &PredED,
 3005    bool InitialEdgeOnly) {
// Executed-by-initial-thread-only holds if the edge itself guarantees it, or
// both this domain and the predecessor already guarantee it.
 3009      setAndRecord(ED.IsExecutedByInitialThreadOnly,
 3010                   InitialEdgeOnly || (PredED.IsExecutedByInitialThreadOnly &&
 3011                                       ED.IsExecutedByInitialThreadOnly));
// Reached-from-aligned-barrier-only is an AND: all paths must qualify.
 3013  Changed |= setAndRecord(ED.IsReachedFromAlignedBarrierOnly,
 3014                          ED.IsReachedFromAlignedBarrierOnly &&
 3015                              PredED.IsReachedFromAlignedBarrierOnly);
// Non-local side effects are an OR: any path contributing one taints ED.
 3016  Changed |= setAndRecord(ED.EncounteredNonLocalSideEffect,
 3017                          ED.EncounteredNonLocalSideEffect |
 3018                              PredED.EncounteredNonLocalSideEffect);
// Only propagate the predecessor's barriers/assumptions if we are still
// reached exclusively from aligned barriers; otherwise drop them.
 3020  if (ED.IsReachedFromAlignedBarrierOnly)
 3021    mergeInPredecessorBarriersAndAssumptions(
A, ED, PredED);
 3023    ED.clearAssumeInstAndAlignedBarriers();
3027bool AAExecutionDomainFunction::handleCallees(Attributor &
A,
3028 ExecutionDomainTy &EntryBBED) {
3030 auto PredForCallSite = [&](AbstractCallSite ACS) {
3031 const auto *EDAA =
A.getAAFor<AAExecutionDomain>(
3033 DepClassTy::OPTIONAL);
3034 if (!EDAA || !EDAA->getState().isValidState())
3037 EDAA->getExecutionDomain(*
cast<CallBase>(ACS.getInstruction())));
3041 ExecutionDomainTy ExitED;
3042 bool AllCallSitesKnown;
3043 if (
A.checkForAllCallSites(PredForCallSite, *
this,
3045 AllCallSitesKnown)) {
3046 for (
const auto &[CSInED, CSOutED] : CallSiteEDs) {
3047 mergeInPredecessor(
A, EntryBBED, CSInED);
3048 ExitED.IsReachingAlignedBarrierOnly &=
3049 CSOutED.IsReachingAlignedBarrierOnly;
3056 EntryBBED.IsExecutedByInitialThreadOnly =
false;
3057 EntryBBED.IsReachedFromAlignedBarrierOnly =
true;
3058 EntryBBED.EncounteredNonLocalSideEffect =
false;
3059 ExitED.IsReachingAlignedBarrierOnly =
false;
3061 EntryBBED.IsExecutedByInitialThreadOnly =
false;
3062 EntryBBED.IsReachedFromAlignedBarrierOnly =
false;
3063 EntryBBED.EncounteredNonLocalSideEffect =
true;
3064 ExitED.IsReachingAlignedBarrierOnly =
false;
3069 auto &FnED = BEDMap[
nullptr];
3070 Changed |= setAndRecord(FnED.IsReachedFromAlignedBarrierOnly,
3071 FnED.IsReachedFromAlignedBarrierOnly &
3072 EntryBBED.IsReachedFromAlignedBarrierOnly);
3073 Changed |= setAndRecord(FnED.IsReachingAlignedBarrierOnly,
3074 FnED.IsReachingAlignedBarrierOnly &
3075 ExitED.IsReachingAlignedBarrierOnly);
3076 Changed |= setAndRecord(FnED.IsExecutedByInitialThreadOnly,
3077 EntryBBED.IsExecutedByInitialThreadOnly);
3081ChangeStatus AAExecutionDomainFunction::updateImpl(Attributor &
A) {
3088 auto HandleAlignedBarrier = [&](CallBase &CB, ExecutionDomainTy &ED) {
3089 Changed |= AlignedBarriers.insert(&CB);
3091 auto &CallInED = CEDMap[{&CB, PRE}];
3092 Changed |= mergeInPredecessor(
A, CallInED, ED);
3093 CallInED.IsReachingAlignedBarrierOnly =
true;
3095 ED.EncounteredNonLocalSideEffect =
false;
3096 ED.IsReachedFromAlignedBarrierOnly =
true;
3098 ED.clearAssumeInstAndAlignedBarriers();
3099 ED.addAlignedBarrier(
A, CB);
3100 auto &CallOutED = CEDMap[{&CB, POST}];
3101 Changed |= mergeInPredecessor(
A, CallOutED, ED);
3105 A.getAAFor<AAIsDead>(*
this, getIRPosition(), DepClassTy::OPTIONAL);
3112 for (
auto &RIt : *RPOT) {
3115 bool IsEntryBB = &BB == &EntryBB;
3118 bool AlignedBarrierLastInBlock = IsEntryBB && IsKernel;
3119 bool IsExplicitlyAligned = IsEntryBB && IsKernel;
3120 ExecutionDomainTy ED;
3127 if (LivenessAA && LivenessAA->isAssumedDead(&BB))
3131 if (LivenessAA && LivenessAA->isEdgeDead(PredBB, &BB))
3133 bool InitialEdgeOnly = isInitialThreadOnlyEdge(
3135 mergeInPredecessor(
A, ED, BEDMap[PredBB], InitialEdgeOnly);
3141 for (Instruction &
I : BB) {
3142 bool UsedAssumedInformation;
3143 if (
A.isAssumedDead(
I, *
this, LivenessAA, UsedAssumedInformation,
3144 false, DepClassTy::OPTIONAL,
3152 ED.addAssumeInst(
A, *AI);
3156 if (
II->isAssumeLikeIntrinsic())
3161 if (!ED.EncounteredNonLocalSideEffect) {
3163 if (ED.IsReachedFromAlignedBarrierOnly)
3168 case AtomicOrdering::NotAtomic:
3170 case AtomicOrdering::Unordered:
3172 case AtomicOrdering::Monotonic:
3174 case AtomicOrdering::Acquire:
3176 case AtomicOrdering::Release:
3178 case AtomicOrdering::AcquireRelease:
3180 case AtomicOrdering::SequentiallyConsistent:
3184 NonNoOpFences.insert(FI);
3189 bool IsAlignedBarrier =
3193 AlignedBarrierLastInBlock &= IsNoSync;
3194 IsExplicitlyAligned &= IsNoSync;
3200 if (IsAlignedBarrier) {
3201 HandleAlignedBarrier(*CB, ED);
3202 AlignedBarrierLastInBlock =
true;
3203 IsExplicitlyAligned =
true;
3209 if (!ED.EncounteredNonLocalSideEffect &&
3211 ED.EncounteredNonLocalSideEffect =
true;
3213 ED.IsReachedFromAlignedBarrierOnly =
false;
3221 auto &CallInED = CEDMap[{CB, PRE}];
3222 Changed |= mergeInPredecessor(
A, CallInED, ED);
3228 if (!IsNoSync && Callee && !
Callee->isDeclaration()) {
3229 const auto *EDAA =
A.getAAFor<AAExecutionDomain>(
3231 if (EDAA && EDAA->getState().isValidState()) {
3232 const auto &CalleeED = EDAA->getFunctionExecutionDomain();
3233 ED.IsReachedFromAlignedBarrierOnly =
3234 CalleeED.IsReachedFromAlignedBarrierOnly;
3235 AlignedBarrierLastInBlock = ED.IsReachedFromAlignedBarrierOnly;
3236 if (IsNoSync || !CalleeED.IsReachedFromAlignedBarrierOnly)
3237 ED.EncounteredNonLocalSideEffect |=
3238 CalleeED.EncounteredNonLocalSideEffect;
3240 ED.EncounteredNonLocalSideEffect =
3241 CalleeED.EncounteredNonLocalSideEffect;
3242 if (!CalleeED.IsReachingAlignedBarrierOnly) {
3244 setAndRecord(CallInED.IsReachingAlignedBarrierOnly,
false);
3247 if (CalleeED.IsReachedFromAlignedBarrierOnly)
3248 mergeInPredecessorBarriersAndAssumptions(
A, ED, CalleeED);
3249 auto &CallOutED = CEDMap[{CB, POST}];
3250 Changed |= mergeInPredecessor(
A, CallOutED, ED);
3255 ED.IsReachedFromAlignedBarrierOnly =
false;
3256 Changed |= setAndRecord(CallInED.IsReachingAlignedBarrierOnly,
false);
3259 AlignedBarrierLastInBlock &= ED.IsReachedFromAlignedBarrierOnly;
3261 auto &CallOutED = CEDMap[{CB, POST}];
3262 Changed |= mergeInPredecessor(
A, CallOutED, ED);
3265 if (!
I.mayHaveSideEffects() && !
I.mayReadFromMemory())
3271 const auto *MemAA =
A.getAAFor<AAMemoryLocation>(
3279 if (MemAA && MemAA->getState().isValidState() &&
3280 MemAA->checkForAllAccessesToMemoryKind(
3285 auto &InfoCache =
A.getInfoCache();
3286 if (!
I.mayHaveSideEffects() && InfoCache.isOnlyUsedByAssume(
I))
3290 if (LI->hasMetadata(LLVMContext::MD_invariant_load))
3293 if (!ED.EncounteredNonLocalSideEffect &&
3295 ED.EncounteredNonLocalSideEffect =
true;
3298 bool IsEndAndNotReachingAlignedBarriersOnly =
false;
3300 !BB.getTerminator()->getNumSuccessors()) {
3302 Changed |= mergeInPredecessor(
A, InterProceduralED, ED);
3304 auto &FnED = BEDMap[
nullptr];
3305 if (IsKernel && !IsExplicitlyAligned)
3306 FnED.IsReachingAlignedBarrierOnly =
false;
3307 Changed |= mergeInPredecessor(
A, FnED, ED);
3309 if (!FnED.IsReachingAlignedBarrierOnly) {
3310 IsEndAndNotReachingAlignedBarriersOnly =
true;
3311 SyncInstWorklist.
push_back(BB.getTerminator());
3312 auto &BBED = BEDMap[&BB];
3313 Changed |= setAndRecord(BBED.IsReachingAlignedBarrierOnly,
false);
3317 ExecutionDomainTy &StoredED = BEDMap[&BB];
3318 ED.IsReachingAlignedBarrierOnly = StoredED.IsReachingAlignedBarrierOnly &
3319 !IsEndAndNotReachingAlignedBarriersOnly;
3325 if (ED.IsExecutedByInitialThreadOnly !=
3326 StoredED.IsExecutedByInitialThreadOnly ||
3327 ED.IsReachedFromAlignedBarrierOnly !=
3328 StoredED.IsReachedFromAlignedBarrierOnly ||
3329 ED.EncounteredNonLocalSideEffect !=
3330 StoredED.EncounteredNonLocalSideEffect)
3334 StoredED = std::move(ED);
3339 SmallSetVector<BasicBlock *, 16> Visited;
3340 while (!SyncInstWorklist.
empty()) {
3343 bool HitAlignedBarrierOrKnownEnd =
false;
3348 auto &CallOutED = CEDMap[{CB, POST}];
3349 Changed |= setAndRecord(CallOutED.IsReachingAlignedBarrierOnly,
false);
3350 auto &CallInED = CEDMap[{CB, PRE}];
3351 HitAlignedBarrierOrKnownEnd =
3352 AlignedBarriers.count(CB) || !CallInED.IsReachingAlignedBarrierOnly;
3353 if (HitAlignedBarrierOrKnownEnd)
3355 Changed |= setAndRecord(CallInED.IsReachingAlignedBarrierOnly,
false);
3357 if (HitAlignedBarrierOrKnownEnd)
3361 if (LivenessAA && LivenessAA->isEdgeDead(PredBB, SyncBB))
3363 if (!Visited.
insert(PredBB))
3365 auto &PredED = BEDMap[PredBB];
3366 if (setAndRecord(PredED.IsReachingAlignedBarrierOnly,
false)) {
3368 SyncInstWorklist.
push_back(PredBB->getTerminator());
3371 if (SyncBB != &EntryBB)
3374 setAndRecord(InterProceduralED.IsReachingAlignedBarrierOnly,
false);
3377 return Changed ? ChangeStatus::CHANGED : ChangeStatus::UNCHANGED;
// Abstract Attribute interface deciding which __kmpc_alloc_shared heap
// allocations in a device kernel can be lowered to static shared memory
// (and which matching __kmpc_free_shared calls become removable).
3382struct AAHeapToShared :
public StateWrapper<BooleanState, AbstractAttribute> {
 3383  using Base = StateWrapper<BooleanState, AbstractAttribute>;
 3384  AAHeapToShared(
const IRPosition &IRP, Attributor &
A) :
Base(IRP) {}
// Factory used by the Attributor to create the concrete implementation.
 3387  static AAHeapToShared &createForPosition(
const IRPosition &IRP,
// Returns true if the shared-memory allocation call \p CB is assumed to be
// movable to shared memory.
 3391  virtual bool isAssumedHeapToShared(CallBase &CB)
const = 0;
// Returns true if the free call \p CB is assumed removable because its
// allocation was moved to shared memory.
 3395  virtual bool isAssumedHeapToSharedRemovedFree(CallBase &CB)
const = 0;
// See AbstractAttribute::getName().
 3398  StringRef
getName()
const override {
return "AAHeapToShared"; }
// Unique address-based identifier used by classof() for AA RTTI.
 3401  const char *getIdAddr()
const override {
return &
ID; }
// AA-style isa/cast support: compares getIdAddr() against &ID.
 3405  static bool classof(
const AbstractAttribute *AA) {
// Unique ID (address of this member is the class identifier).
 3410  static const char ID;
3413struct AAHeapToSharedFunction :
public AAHeapToShared {
3414 AAHeapToSharedFunction(
const IRPosition &IRP, Attributor &
A)
3415 : AAHeapToShared(IRP,
A) {}
3417 const std::string getAsStr(Attributor *)
const override {
3418 return "[AAHeapToShared] " + std::to_string(MallocCalls.size()) +
3419 " malloc calls eligible.";
3423 void trackStatistics()
const override {}
3427 void findPotentialRemovedFreeCalls(Attributor &
A) {
3428 auto &OMPInfoCache =
static_cast<OMPInformationCache &
>(
A.getInfoCache());
3429 auto &FreeRFI = OMPInfoCache.RFIs[OMPRTL___kmpc_free_shared];
3431 PotentialRemovedFreeCalls.clear();
3433 for (CallBase *CB : MallocCalls) {
3435 for (
auto *U : CB->
users()) {
3437 if (
C &&
C->getCalledFunction() == FreeRFI.Declaration)
3441 if (FreeCalls.
size() != 1)
3444 PotentialRemovedFreeCalls.insert(FreeCalls.
front());
3450 indicatePessimisticFixpoint();
3454 auto &OMPInfoCache =
static_cast<OMPInformationCache &
>(
A.getInfoCache());
3455 auto &RFI = OMPInfoCache.RFIs[OMPRTL___kmpc_alloc_shared];
3456 if (!RFI.Declaration)
3460 [](
const IRPosition &,
const AbstractAttribute *,
3461 bool &) -> std::optional<Value *> {
return nullptr; };
3464 for (User *U : RFI.Declaration->
users())
3468 MallocCalls.insert(CB);
3473 findPotentialRemovedFreeCalls(
A);
3476 bool isAssumedHeapToShared(CallBase &CB)
const override {
3477 return isValidState() && MallocCalls.count(&CB);
3480 bool isAssumedHeapToSharedRemovedFree(CallBase &CB)
const override {
3481 return isValidState() && PotentialRemovedFreeCalls.count(&CB);
3485 if (MallocCalls.empty())
3486 return ChangeStatus::UNCHANGED;
3488 auto &OMPInfoCache =
static_cast<OMPInformationCache &
>(
A.getInfoCache());
3489 auto &FreeCall = OMPInfoCache.RFIs[OMPRTL___kmpc_free_shared];
3493 DepClassTy::OPTIONAL);
3496 for (CallBase *CB : MallocCalls) {
3498 if (HS &&
HS->isAssumedHeapToStack(*CB))
3503 for (
auto *U : CB->
users()) {
3505 if (
C &&
C->getCalledFunction() == FreeCall.Declaration)
3508 if (FreeCalls.
size() != 1)
3515 <<
" with shared memory."
3516 <<
" Shared memory usage is limited to "
3522 <<
" with " << AllocSize->getZExtValue()
3523 <<
" bytes of shared memory\n");
3528 Type *Int8Ty = Type::getInt8Ty(
M->getContext());
3529 Type *Int8ArrTy = ArrayType::get(Int8Ty, AllocSize->getZExtValue());
3530 auto *SharedMem =
new GlobalVariable(
3534 static_cast<unsigned>(AddressSpace::Shared));
3536 SharedMem, PointerType::getUnqual(
M->getContext()));
3538 auto Remark = [&](OptimizationRemark
OR) {
3539 return OR <<
"Replaced globalized variable with "
3540 <<
ore::NV(
"SharedMemory", AllocSize->getZExtValue())
3541 << (AllocSize->isOne() ?
" byte " :
" bytes ")
3542 <<
"of shared memory.";
3544 A.emitRemark<OptimizationRemark>(CB,
"OMP111",
Remark);
3546 MaybeAlign Alignment = CB->getRetAlign();
3548 "HeapToShared on allocation without alignment attribute");
3549 SharedMem->setAlignment(*Alignment);
3552 A.deleteAfterManifest(*CB);
3553 A.deleteAfterManifest(*FreeCalls.
front());
3555 SharedMemoryUsed += AllocSize->getZExtValue();
3556 NumBytesMovedToSharedMemory = SharedMemoryUsed;
3557 Changed = ChangeStatus::CHANGED;
3564 if (MallocCalls.empty())
3565 return indicatePessimisticFixpoint();
3566 auto &OMPInfoCache =
static_cast<OMPInformationCache &
>(
A.getInfoCache());
3567 auto &RFI = OMPInfoCache.RFIs[OMPRTL___kmpc_alloc_shared];
3568 if (!RFI.Declaration)
3569 return ChangeStatus::UNCHANGED;
3573 auto NumMallocCalls = MallocCalls.size();
3576 for (User *U : RFI.Declaration->
users()) {
3578 if (CB->getCaller() !=
F)
3580 if (!MallocCalls.count(CB))
3583 MallocCalls.remove(CB);
3586 const auto *ED =
A.getAAFor<AAExecutionDomain>(
3588 if (!ED || !ED->isExecutedByInitialThreadOnly(*CB))
3589 MallocCalls.remove(CB);
3593 findPotentialRemovedFreeCalls(
A);
3595 if (NumMallocCalls != MallocCalls.size())
3596 return ChangeStatus::CHANGED;
3598 return ChangeStatus::UNCHANGED;
3602 SmallSetVector<CallBase *, 4> MallocCalls;
3604 SmallPtrSet<CallBase *, 4> PotentialRemovedFreeCalls;
3606 unsigned SharedMemoryUsed = 0;
// Abstract Attribute interface collecting information about OpenMP target
// kernels: SPMD compatibility, reached (known/unknown) parallel regions,
// kernels reaching a function, parallel nesting levels. Wraps a
// KernelInfoState.
3609struct AAKernelInfo :
public StateWrapper<KernelInfoState, AbstractAttribute> {
 3610  using Base = StateWrapper<KernelInfoState, AbstractAttribute>;
 3611  AAKernelInfo(
const IRPosition &IRP, Attributor &
A) :
Base(IRP) {}
// Call-site AAs of this kind can be created even without a known callee.
 3615  static bool requiresCalleeForCallBase() {
return false; }
// No statistics tracked for this AA.
 3618  void trackStatistics()
const override {}
// Human-readable summary of the tracked kernel state: SPMD vs generic mode,
// fixpoint status, and the sizes of the parallel-region / kernel sets.
 3621  const std::string getAsStr(Attributor *)
const override {
 3622    if (!isValidState())
 3624    return std::string(SPMDCompatibilityTracker.isAssumed() ?
"SPMD"
 3626           std::string(SPMDCompatibilityTracker.isAtFixpoint() ?
" [FIX]"
 3628           std::string(
" #PRs: ") +
 3629           (ReachedKnownParallelRegions.isValidState()
 3630                ? std::to_string(ReachedKnownParallelRegions.size())
 3632           ", #Unknown PRs: " +
 3633           (ReachedUnknownParallelRegions.isValidState()
 3634                ? std::to_string(ReachedUnknownParallelRegions.size())
 3636           ", #Reaching Kernels: " +
 3637           (ReachingKernelEntries.isValidState()
 3638                ? std::to_string(ReachingKernelEntries.size())
 3641           (ParallelLevels.isValidState()
 3642                ? std::to_string(ParallelLevels.size())
 3644           ", NestedPar: " + (NestedParallelism ?
"yes" :
"no");
// Factory used by the Attributor to create the concrete implementation
// (function or call-site variant).
 3648  static AAKernelInfo &createForPosition(
const IRPosition &IRP, Attributor &
A);
// See AbstractAttribute::getName().
 3651  StringRef
getName()
const override {
return "AAKernelInfo"; }
// Unique address-based identifier used by classof() for AA RTTI.
 3654  const char *getIdAddr()
const override {
return &
ID; }
// AA-style isa/cast support: compares getIdAddr() against &ID.
 3657  static bool classof(
const AbstractAttribute *AA) {
// Unique ID (address of this member is the class identifier).
 3661  static const char ID;
3666struct AAKernelInfoFunction : AAKernelInfo {
3667 AAKernelInfoFunction(
const IRPosition &IRP, Attributor &
A)
3668 : AAKernelInfo(IRP,
A) {}
3670 SmallPtrSet<Instruction *, 4> GuardedInstructions;
3672 SmallPtrSetImpl<Instruction *> &getGuardedInstructions() {
3673 return GuardedInstructions;
3676 void setConfigurationOfKernelEnvironment(ConstantStruct *ConfigC) {
3678 KernelEnvC, ConfigC, {KernelInfo::ConfigurationIdx});
3679 assert(NewKernelEnvC &&
"Failed to create new kernel environment");
3683#define KERNEL_ENVIRONMENT_CONFIGURATION_SETTER(MEMBER) \
3684 void set##MEMBER##OfKernelEnvironment(ConstantInt *NewVal) { \
3685 ConstantStruct *ConfigC = \
3686 KernelInfo::getConfigurationFromKernelEnvironment(KernelEnvC); \
3687 Constant *NewConfigC = ConstantFoldInsertValueInstruction( \
3688 ConfigC, NewVal, {KernelInfo::MEMBER##Idx}); \
3689 assert(NewConfigC && "Failed to create new configuration environment"); \
3690 setConfigurationOfKernelEnvironment(cast<ConstantStruct>(NewConfigC)); \
3701#undef KERNEL_ENVIRONMENT_CONFIGURATION_SETTER
3708 auto &OMPInfoCache =
static_cast<OMPInformationCache &
>(
A.getInfoCache());
3712 OMPInformationCache::RuntimeFunctionInfo &InitRFI =
3713 OMPInfoCache.RFIs[OMPRTL___kmpc_target_init];
3714 OMPInformationCache::RuntimeFunctionInfo &DeinitRFI =
3715 OMPInfoCache.RFIs[OMPRTL___kmpc_target_deinit];
3719 auto StoreCallBase = [](
Use &U,
3720 OMPInformationCache::RuntimeFunctionInfo &RFI,
3722 CallBase *CB = OpenMPOpt::getCallIfRegularCall(U, &RFI);
3724 "Unexpected use of __kmpc_target_init or __kmpc_target_deinit!");
3726 "Multiple uses of __kmpc_target_init or __kmpc_target_deinit!");
3732 StoreCallBase(U, InitRFI, KernelInitCB);
3736 DeinitRFI.foreachUse(
3738 StoreCallBase(U, DeinitRFI, KernelDeinitCB);
3744 if (!KernelInitCB || !KernelDeinitCB)
3748 ReachingKernelEntries.insert(Fn);
3749 IsKernelEntry =
true;
3757 KernelConfigurationSimplifyCB =
3759 bool &UsedAssumedInformation) -> std::optional<Constant *> {
3760 if (!isAtFixpoint()) {
3763 UsedAssumedInformation =
true;
3769 A.registerGlobalVariableSimplificationCallback(
3770 *KernelEnvGV, KernelConfigurationSimplifyCB);
3773 bool CanChangeToSPMD = OMPInfoCache.runtimeFnsAvailable(
3774 {OMPRTL___kmpc_get_hardware_thread_id_in_block,
3775 OMPRTL___kmpc_barrier_simple_spmd});
3779 KernelInfo::getExecModeFromKernelEnvironment(KernelEnvC);
3784 SPMDCompatibilityTracker.indicateOptimisticFixpoint();
3788 SPMDCompatibilityTracker.indicatePessimisticFixpoint();
3790 setExecModeOfKernelEnvironment(AssumedExecModeC);
3797 setMinThreadsOfKernelEnvironment(ConstantInt::get(
Int32Ty, MinThreads));
3800 auto [MinTeams, MaxTeams] =
3803 setMinTeamsOfKernelEnvironment(ConstantInt::get(
Int32Ty, MinTeams));
3805 setMaxTeamsOfKernelEnvironment(ConstantInt::get(
Int32Ty, MaxTeams));
3808 KernelInfo::getMayUseNestedParallelismFromKernelEnvironment(KernelEnvC);
3809 ConstantInt *AssumedMayUseNestedParallelismC = ConstantInt::get(
3811 setMayUseNestedParallelismOfKernelEnvironment(
3812 AssumedMayUseNestedParallelismC);
3816 KernelInfo::getUseGenericStateMachineFromKernelEnvironment(
3819 ConstantInt::get(UseGenericStateMachineC->
getIntegerType(),
false);
3820 setUseGenericStateMachineOfKernelEnvironment(
3821 AssumedUseGenericStateMachineC);
3827 if (!OMPInfoCache.RFIs[RFKind].Declaration)
3829 A.registerVirtualUseCallback(*OMPInfoCache.RFIs[RFKind].Declaration, CB);
3833 auto AddDependence = [](
Attributor &
A,
const AAKernelInfo *KI,
3850 if (SPMDCompatibilityTracker.isValidState())
3851 return AddDependence(
A,
this, QueryingAA);
3853 if (!ReachedKnownParallelRegions.isValidState())
3854 return AddDependence(
A,
this, QueryingAA);
3860 RegisterVirtualUse(OMPRTL___kmpc_get_hardware_num_threads_in_block,
3861 CustomStateMachineUseCB);
3862 RegisterVirtualUse(OMPRTL___kmpc_get_warp_size, CustomStateMachineUseCB);
3863 RegisterVirtualUse(OMPRTL___kmpc_barrier_simple_generic,
3864 CustomStateMachineUseCB);
3865 RegisterVirtualUse(OMPRTL___kmpc_kernel_parallel,
3866 CustomStateMachineUseCB);
3867 RegisterVirtualUse(OMPRTL___kmpc_kernel_end_parallel,
3868 CustomStateMachineUseCB);
3872 if (SPMDCompatibilityTracker.isAtFixpoint())
3879 if (!SPMDCompatibilityTracker.isValidState())
3880 return AddDependence(
A,
this, QueryingAA);
3883 RegisterVirtualUse(OMPRTL___kmpc_get_hardware_thread_id_in_block,
3892 if (!SPMDCompatibilityTracker.isValidState())
3893 return AddDependence(
A,
this, QueryingAA);
3894 if (SPMDCompatibilityTracker.empty())
3895 return AddDependence(
A,
this, QueryingAA);
3896 if (!mayContainParallelRegion())
3897 return AddDependence(
A,
this, QueryingAA);
3900 RegisterVirtualUse(OMPRTL___kmpc_barrier_simple_spmd, SPMDBarrierUseCB);
3904 static std::string sanitizeForGlobalName(std::string S) {
3908 return !((C >=
'a' && C <=
'z') || (C >=
'A' && C <=
'Z') ||
3909 (C >=
'0' && C <=
'9') || C ==
'_');
3920 if (!KernelInitCB || !KernelDeinitCB)
3921 return ChangeStatus::UNCHANGED;
3925 bool HasBuiltStateMachine =
true;
3926 if (!changeToSPMDMode(
A,
Changed)) {
3928 HasBuiltStateMachine = buildCustomStateMachine(
A,
Changed);
3930 HasBuiltStateMachine =
false;
3934 ConstantStruct *ExistingKernelEnvC =
3936 ConstantInt *OldUseGenericStateMachineVal =
3937 KernelInfo::getUseGenericStateMachineFromKernelEnvironment(
3938 ExistingKernelEnvC);
3939 if (!HasBuiltStateMachine)
3940 setUseGenericStateMachineOfKernelEnvironment(
3941 OldUseGenericStateMachineVal);
3944 GlobalVariable *KernelEnvGV =
3948 Changed = ChangeStatus::CHANGED;
3954 void insertInstructionGuardsHelper(Attributor &
A) {
3955 auto &OMPInfoCache =
static_cast<OMPInformationCache &
>(
A.getInfoCache());
3957 auto CreateGuardedRegion = [&](
Instruction *RegionStartI,
3959 LoopInfo *LI =
nullptr;
3960 DominatorTree *DT =
nullptr;
3961 MemorySSAUpdater *MSU =
nullptr;
3991 DT, LI, MSU,
"region.guarded.end");
3994 MSU,
"region.barrier");
3997 DT, LI, MSU,
"region.exit");
3999 SplitBlock(ParentBB, RegionStartI, DT, LI, MSU,
"region.guarded");
4002 "Expected a different CFG");
4005 ParentBB, ParentBB->
getTerminator(), DT, LI, MSU,
"region.check.tid");
4008 A.registerManifestAddedBasicBlock(*RegionEndBB);
4009 A.registerManifestAddedBasicBlock(*RegionBarrierBB);
4010 A.registerManifestAddedBasicBlock(*RegionExitBB);
4011 A.registerManifestAddedBasicBlock(*RegionStartBB);
4012 A.registerManifestAddedBasicBlock(*RegionCheckTidBB);
4014 bool HasBroadcastValues =
false;
4017 for (Instruction &
I : *RegionStartBB) {
4019 for (Use &U :
I.uses()) {
4025 if (OutsideUses.
empty())
4028 HasBroadcastValues =
true;
4032 auto *SharedMem =
new GlobalVariable(
4033 M,
I.getType(),
false,
4035 sanitizeForGlobalName(
4036 (
I.getName() +
".guarded.output.alloc").str()),
4038 static_cast<unsigned>(AddressSpace::Shared));
4041 new StoreInst(&
I, SharedMem,
4044 LoadInst *LoadI =
new LoadInst(
4045 I.getType(), SharedMem,
I.getName() +
".guarded.output.load",
4049 for (Use *U : OutsideUses)
4050 A.changeUseAfterManifest(*U, *LoadI);
4053 auto &OMPInfoCache =
static_cast<OMPInformationCache &
>(
A.getInfoCache());
4058 OpenMPIRBuilder::LocationDescription Loc(
4059 InsertPointTy(ParentBB, ParentBB->
end()),
DL);
4061 uint32_t SrcLocStrSize;
4070 OpenMPIRBuilder::LocationDescription LocRegionCheckTid(
4071 InsertPointTy(RegionCheckTidBB, RegionCheckTidBB->
end()),
DL);
4073 FunctionCallee HardwareTidFn =
4075 M, OMPRTL___kmpc_get_hardware_thread_id_in_block);
4079 OMPInfoCache.setCallingConvention(HardwareTidFn, Tid);
4081 OMPInfoCache.OMPBuilder.
Builder
4082 .
CreateCondBr(TidCheck, RegionStartBB, RegionBarrierBB)
4087 FunctionCallee BarrierFn =
4089 M, OMPRTL___kmpc_barrier_simple_spmd);
4095 OMPInfoCache.setCallingConvention(BarrierFn, Barrier);
4098 if (HasBroadcastValues) {
4103 OMPInfoCache.setCallingConvention(BarrierFn, Barrier);
4107 auto &AllocSharedRFI = OMPInfoCache.RFIs[OMPRTL___kmpc_alloc_shared];
4108 SmallPtrSet<BasicBlock *, 8> Visited;
4109 for (Instruction *GuardedI : SPMDCompatibilityTracker) {
4111 if (!Visited.
insert(BB).second)
4117 while (++IP != IPEnd) {
4118 if (!IP->mayHaveSideEffects() && !IP->mayReadFromMemory())
4121 if (OpenMPOpt::getCallIfRegularCall(*
I, &AllocSharedRFI))
4123 if (!
I->user_empty() || !SPMDCompatibilityTracker.contains(
I)) {
4124 LastEffect =
nullptr;
4131 for (
auto &Reorder : Reorders)
4132 Reorder.first->moveBefore(Reorder.second->getIterator());
4137 for (Instruction *GuardedI : SPMDCompatibilityTracker) {
4139 auto *CalleeAA =
A.lookupAAFor<AAKernelInfo>(
4142 assert(CalleeAA !=
nullptr &&
"Expected Callee AAKernelInfo");
4145 if (CalleeAAFunction.getGuardedInstructions().contains(GuardedI))
4148 Instruction *GuardedRegionStart =
nullptr, *GuardedRegionEnd =
nullptr;
4149 for (Instruction &
I : *BB) {
4152 if (SPMDCompatibilityTracker.contains(&
I)) {
4153 CalleeAAFunction.getGuardedInstructions().insert(&
I);
4154 if (GuardedRegionStart)
4155 GuardedRegionEnd = &
I;
4157 GuardedRegionStart = GuardedRegionEnd = &
I;
4164 if (GuardedRegionStart) {
4166 std::make_pair(GuardedRegionStart, GuardedRegionEnd));
4167 GuardedRegionStart =
nullptr;
4168 GuardedRegionEnd =
nullptr;
4173 for (
auto &GR : GuardedRegions)
4174 CreateGuardedRegion(GR.first, GR.second);
4177 void forceSingleThreadPerWorkgroupHelper(Attributor &
A) {
4186 auto &Ctx = getAnchorValue().getContext();
4193 KernelInitCB->
getNextNode(),
"main.thread.user_code");
4198 A.registerManifestAddedBasicBlock(*InitBB);
4199 A.registerManifestAddedBasicBlock(*UserCodeBB);
4200 A.registerManifestAddedBasicBlock(*ReturnBB);
4209 auto &OMPInfoCache =
static_cast<OMPInformationCache &
>(
A.getInfoCache());
4210 FunctionCallee ThreadIdInBlockFn =
4212 M, OMPRTL___kmpc_get_hardware_thread_id_in_block);
4215 CallInst *ThreadIdInBlock =
4217 OMPInfoCache.setCallingConvention(ThreadIdInBlockFn, ThreadIdInBlock);
4223 ConstantInt::get(ThreadIdInBlock->
getType(), 0),
4224 "thread.is_main", InitBB);
4230 auto &OMPInfoCache =
static_cast<OMPInformationCache &
>(
A.getInfoCache());
4232 if (!SPMDCompatibilityTracker.isAssumed()) {
4233 for (Instruction *NonCompatibleI : SPMDCompatibilityTracker) {
4234 if (!NonCompatibleI)
4239 if (OMPInfoCache.RTLFunctions.contains(CB->getCalledFunction()))
4242 auto Remark = [&](OptimizationRemarkAnalysis ORA) {
4243 ORA <<
"Value has potential side effects preventing SPMD-mode "
4246 ORA <<
". Add `[[omp::assume(\"ompx_spmd_amenable\")]]` to "
4247 "the called function to override";
4251 A.emitRemark<OptimizationRemarkAnalysis>(NonCompatibleI,
"OMP121",
4255 << *NonCompatibleI <<
"\n");
4267 Kernel = CB->getCaller();
4272 ConstantStruct *ExistingKernelEnvC =
4275 KernelInfo::getExecModeFromKernelEnvironment(ExistingKernelEnvC);
4281 Changed = ChangeStatus::CHANGED;
4285 if (mayContainParallelRegion())
4286 insertInstructionGuardsHelper(
A);
4288 forceSingleThreadPerWorkgroupHelper(
A);
4293 "Initially non-SPMD kernel has SPMD exec mode!");
4294 setExecModeOfKernelEnvironment(
4298 ++NumOpenMPTargetRegionKernelsSPMD;
4300 auto Remark = [&](OptimizationRemark
OR) {
4301 return OR <<
"Transformed generic-mode kernel to SPMD-mode.";
4303 A.emitRemark<OptimizationRemark>(KernelInitCB,
"OMP120",
Remark);
4313 if (!ReachedKnownParallelRegions.isValidState())
4316 auto &OMPInfoCache =
static_cast<OMPInformationCache &
>(
A.getInfoCache());
4317 if (!OMPInfoCache.runtimeFnsAvailable(
4318 {OMPRTL___kmpc_get_hardware_num_threads_in_block,
4319 OMPRTL___kmpc_get_warp_size, OMPRTL___kmpc_barrier_simple_generic,
4320 OMPRTL___kmpc_kernel_parallel, OMPRTL___kmpc_kernel_end_parallel}))
4323 ConstantStruct *ExistingKernelEnvC =
4330 ConstantInt *UseStateMachineC =
4331 KernelInfo::getUseGenericStateMachineFromKernelEnvironment(
4332 ExistingKernelEnvC);
4333 ConstantInt *ModeC =
4334 KernelInfo::getExecModeFromKernelEnvironment(ExistingKernelEnvC);
4339 if (UseStateMachineC->
isZero() ||
4343 Changed = ChangeStatus::CHANGED;
4346 setUseGenericStateMachineOfKernelEnvironment(
4353 if (!mayContainParallelRegion()) {
4354 ++NumOpenMPTargetRegionKernelsWithoutStateMachine;
4356 auto Remark = [&](OptimizationRemark
OR) {
4357 return OR <<
"Removing unused state machine from generic-mode kernel.";
4359 A.emitRemark<OptimizationRemark>(KernelInitCB,
"OMP130",
Remark);
4365 if (ReachedUnknownParallelRegions.empty()) {
4366 ++NumOpenMPTargetRegionKernelsCustomStateMachineWithoutFallback;
4368 auto Remark = [&](OptimizationRemark
OR) {
4369 return OR <<
"Rewriting generic-mode kernel with a customized state "
4372 A.emitRemark<OptimizationRemark>(KernelInitCB,
"OMP131",
Remark);
4374 ++NumOpenMPTargetRegionKernelsCustomStateMachineWithFallback;
4376 auto Remark = [&](OptimizationRemarkAnalysis
OR) {
4377 return OR <<
"Generic-mode kernel is executed with a customized state "
4378 "machine that requires a fallback.";
4380 A.emitRemark<OptimizationRemarkAnalysis>(KernelInitCB,
"OMP132",
Remark);
4383 for (CallBase *UnknownParallelRegionCB : ReachedUnknownParallelRegions) {
4384 if (!UnknownParallelRegionCB)
4386 auto Remark = [&](OptimizationRemarkAnalysis ORA) {
4387 return ORA <<
"Call may contain unknown parallel regions. Use "
4388 <<
"`[[omp::assume(\"omp_no_parallelism\")]]` to "
4391 A.emitRemark<OptimizationRemarkAnalysis>(UnknownParallelRegionCB,
4426 auto &Ctx = getAnchorValue().getContext();
4430 BasicBlock *InitBB = KernelInitCB->getParent();
4432 KernelInitCB->getNextNode(),
"thread.user_code.check");
4436 Ctx,
"worker_state_machine.begin",
Kernel, UserCodeEntryBB);
4438 Ctx,
"worker_state_machine.finished",
Kernel, UserCodeEntryBB);
4440 Ctx,
"worker_state_machine.is_active.check",
Kernel, UserCodeEntryBB);
4443 Kernel, UserCodeEntryBB);
4446 Kernel, UserCodeEntryBB);
4448 Ctx,
"worker_state_machine.done.barrier",
Kernel, UserCodeEntryBB);
4449 A.registerManifestAddedBasicBlock(*InitBB);
4450 A.registerManifestAddedBasicBlock(*UserCodeEntryBB);
4451 A.registerManifestAddedBasicBlock(*IsWorkerCheckBB);
4452 A.registerManifestAddedBasicBlock(*StateMachineBeginBB);
4453 A.registerManifestAddedBasicBlock(*StateMachineFinishedBB);
4454 A.registerManifestAddedBasicBlock(*StateMachineIsActiveCheckBB);
4455 A.registerManifestAddedBasicBlock(*StateMachineIfCascadeCurrentBB);
4456 A.registerManifestAddedBasicBlock(*StateMachineEndParallelBB);
4457 A.registerManifestAddedBasicBlock(*StateMachineDoneBarrierBB);
4459 const DebugLoc &DLoc = KernelInitCB->getDebugLoc();
4465 ConstantInt::getAllOnesValue(KernelInitCB->getType()),
4466 "thread.is_worker", InitBB);
4471 FunctionCallee BlockHwSizeFn =
4473 M, OMPRTL___kmpc_get_hardware_num_threads_in_block);
4474 FunctionCallee WarpSizeFn =
4476 M, OMPRTL___kmpc_get_warp_size);
4477 CallInst *BlockHwSize =
4479 OMPInfoCache.setCallingConvention(BlockHwSizeFn, BlockHwSize);
4481 CallInst *WarpSize =
4483 OMPInfoCache.setCallingConvention(WarpSizeFn, WarpSize);
4486 BlockHwSize, WarpSize,
"block.size", IsWorkerCheckBB);
4490 "thread.is_main_or_worker", IsWorkerCheckBB);
4493 StateMachineFinishedBB, IsWorkerCheckBB);
4496 const DataLayout &
DL =
M.getDataLayout();
4497 Type *VoidPtrTy = PointerType::getUnqual(Ctx);
4499 new AllocaInst(VoidPtrTy,
DL.getAllocaAddrSpace(),
nullptr,
4504 OpenMPIRBuilder::LocationDescription(
4505 IRBuilder<>::InsertPoint(StateMachineBeginBB,
4506 StateMachineBeginBB->
end()),
4509 Value *Ident = KernelInfo::getIdentFromKernelEnvironment(KernelEnvC);
4510 Value *GTid = KernelInitCB;
4512 FunctionCallee BarrierFn =
4514 M, OMPRTL___kmpc_barrier_simple_generic);
4517 OMPInfoCache.setCallingConvention(BarrierFn, Barrier);
4521 (
unsigned int)AddressSpace::Generic) {
4522 WorkFnAI =
new AddrSpaceCastInst(
4523 WorkFnAI, PointerType::get(Ctx, (
unsigned int)AddressSpace::Generic),
4524 WorkFnAI->
getName() +
".generic", StateMachineBeginBB);
4528 FunctionCallee KernelParallelFn =
4530 M, OMPRTL___kmpc_kernel_parallel);
4532 KernelParallelFn, {WorkFnAI},
"worker.is_active", StateMachineBeginBB);
4533 OMPInfoCache.setCallingConvention(KernelParallelFn, IsActiveWorker);
4535 Instruction *WorkFn =
new LoadInst(VoidPtrTy, WorkFnAI,
"worker.work_fn",
4536 StateMachineBeginBB);
4539 FunctionType *ParallelRegionFnTy = FunctionType::get(
4540 Type::getVoidTy(Ctx), {Type::getInt16Ty(Ctx), Type::getInt32Ty(Ctx)},
4546 StateMachineBeginBB);
4547 IsDone->setDebugLoc(DLoc);
4549 StateMachineIsActiveCheckBB, StateMachineBeginBB)
4550 ->setDebugLoc(DLoc);
4553 StateMachineDoneBarrierBB, StateMachineIsActiveCheckBB)
4554 ->setDebugLoc(DLoc);
4559 const unsigned int WrapperFunctionArgNo = 6;
4564 for (
int I = 0,
E = ReachedKnownParallelRegions.size();
I <
E; ++
I) {
4565 auto *CB = ReachedKnownParallelRegions[
I];
4567 CB->getArgOperand(WrapperFunctionArgNo)->stripPointerCasts());
4569 Ctx,
"worker_state_machine.parallel_region.execute",
Kernel,
4570 StateMachineEndParallelBB);
4572 ->setDebugLoc(DLoc);
4574 ->setDebugLoc(DLoc);
4578 Kernel, StateMachineEndParallelBB);
4579 A.registerManifestAddedBasicBlock(*PRExecuteBB);
4580 A.registerManifestAddedBasicBlock(*PRNextBB);
4585 if (
I + 1 <
E || !ReachedUnknownParallelRegions.empty()) {
4588 "worker.check_parallel_region", StateMachineIfCascadeCurrentBB);
4596 StateMachineIfCascadeCurrentBB)
4597 ->setDebugLoc(DLoc);
4598 StateMachineIfCascadeCurrentBB = PRNextBB;
4604 if (!ReachedUnknownParallelRegions.empty()) {
4605 StateMachineIfCascadeCurrentBB->
setName(
4606 "worker_state_machine.parallel_region.fallback.execute");
4608 StateMachineIfCascadeCurrentBB)
4609 ->setDebugLoc(DLoc);
4612 StateMachineIfCascadeCurrentBB)
4613 ->setDebugLoc(DLoc);
4615 FunctionCallee EndParallelFn =
4617 M, OMPRTL___kmpc_kernel_end_parallel);
4618 CallInst *EndParallel =
4620 OMPInfoCache.setCallingConvention(EndParallelFn, EndParallel);
4623 ->setDebugLoc(DLoc);
4626 ->setDebugLoc(DLoc);
4628 ->setDebugLoc(DLoc);
4636 KernelInfoState StateBefore = getState();
4642 struct UpdateKernelEnvCRAII {
4643 AAKernelInfoFunction &AA;
4645 UpdateKernelEnvCRAII(AAKernelInfoFunction &AA) : AA(AA) {}
4647 ~UpdateKernelEnvCRAII() {
4651 ConstantStruct *ExistingKernelEnvC =
4654 if (!AA.isValidState()) {
4655 AA.KernelEnvC = ExistingKernelEnvC;
4659 if (!AA.ReachedKnownParallelRegions.isValidState())
4660 AA.setUseGenericStateMachineOfKernelEnvironment(
4661 KernelInfo::getUseGenericStateMachineFromKernelEnvironment(
4662 ExistingKernelEnvC));
4664 if (!AA.SPMDCompatibilityTracker.isValidState())
4665 AA.setExecModeOfKernelEnvironment(
4666 KernelInfo::getExecModeFromKernelEnvironment(ExistingKernelEnvC));
4668 ConstantInt *MayUseNestedParallelismC =
4669 KernelInfo::getMayUseNestedParallelismFromKernelEnvironment(
4671 ConstantInt *NewMayUseNestedParallelismC = ConstantInt::get(
4672 MayUseNestedParallelismC->
getIntegerType(), AA.NestedParallelism);
4673 AA.setMayUseNestedParallelismOfKernelEnvironment(
4674 NewMayUseNestedParallelismC);
4684 if (!
I.mayWriteToMemory())
4687 const auto *UnderlyingObjsAA =
A.getAAFor<AAUnderlyingObjects>(
4689 DepClassTy::OPTIONAL);
4690 auto *
HS =
A.getAAFor<AAHeapToStack>(
4692 DepClassTy::OPTIONAL);
4693 if (UnderlyingObjsAA &&
4694 UnderlyingObjsAA->forallUnderlyingObjects([&](
Value &Obj) {
4695 if (AA::isAssumedThreadLocalObject(A, Obj, *this))
4699 auto *CB = dyn_cast<CallBase>(&Obj);
4700 return CB && HS && HS->isAssumedHeapToStack(*CB);
4706 SPMDCompatibilityTracker.insert(&
I);
4710 bool UsedAssumedInformationInCheckRWInst =
false;
4711 if (!SPMDCompatibilityTracker.isAtFixpoint())
4712 if (!
A.checkForAllReadWriteInstructions(
4713 CheckRWInst, *
this, UsedAssumedInformationInCheckRWInst))
4714 SPMDCompatibilityTracker.indicatePessimisticFixpoint();
4716 bool UsedAssumedInformationFromReachingKernels =
false;
4717 if (!IsKernelEntry) {
4718 updateParallelLevels(
A);
4720 bool AllReachingKernelsKnown =
true;
4721 updateReachingKernelEntries(
A, AllReachingKernelsKnown);
4722 UsedAssumedInformationFromReachingKernels = !AllReachingKernelsKnown;
4724 if (!SPMDCompatibilityTracker.empty()) {
4725 if (!ParallelLevels.isValidState())
4726 SPMDCompatibilityTracker.indicatePessimisticFixpoint();
4727 else if (!ReachingKernelEntries.isValidState())
4728 SPMDCompatibilityTracker.indicatePessimisticFixpoint();
4734 for (
auto *
Kernel : ReachingKernelEntries) {
4735 auto *CBAA =
A.getAAFor<AAKernelInfo>(
4737 if (CBAA && CBAA->SPMDCompatibilityTracker.isValidState() &&
4738 CBAA->SPMDCompatibilityTracker.isAssumed())
4742 if (!CBAA || !CBAA->SPMDCompatibilityTracker.isAtFixpoint())
4743 UsedAssumedInformationFromReachingKernels =
true;
4745 if (SPMD != 0 &&
Generic != 0)
4746 SPMDCompatibilityTracker.indicatePessimisticFixpoint();
4752 bool AllParallelRegionStatesWereFixed =
true;
4753 bool AllSPMDStatesWereFixed =
true;
4756 auto *CBAA =
A.getAAFor<AAKernelInfo>(
4760 getState() ^= CBAA->getState();
4761 AllSPMDStatesWereFixed &= CBAA->SPMDCompatibilityTracker.isAtFixpoint();
4762 AllParallelRegionStatesWereFixed &=
4763 CBAA->ReachedKnownParallelRegions.isAtFixpoint();
4764 AllParallelRegionStatesWereFixed &=
4765 CBAA->ReachedUnknownParallelRegions.isAtFixpoint();
4769 bool UsedAssumedInformationInCheckCallInst =
false;
4770 if (!
A.checkForAllCallLikeInstructions(
4771 CheckCallInst, *
this, UsedAssumedInformationInCheckCallInst)) {
4773 <<
"Failed to visit all call-like instructions!\n";);
4774 return indicatePessimisticFixpoint();
4779 if (!UsedAssumedInformationInCheckCallInst &&
4780 AllParallelRegionStatesWereFixed) {
4781 ReachedKnownParallelRegions.indicateOptimisticFixpoint();
4782 ReachedUnknownParallelRegions.indicateOptimisticFixpoint();
4787 if (!UsedAssumedInformationInCheckRWInst &&
4788 !UsedAssumedInformationInCheckCallInst &&
4789 !UsedAssumedInformationFromReachingKernels && AllSPMDStatesWereFixed)
4790 SPMDCompatibilityTracker.indicateOptimisticFixpoint();
4792 return StateBefore == getState() ? ChangeStatus::UNCHANGED
4793 : ChangeStatus::CHANGED;
4798 void updateReachingKernelEntries(Attributor &
A,
4799 bool &AllReachingKernelsKnown) {
4800 auto PredCallSite = [&](AbstractCallSite ACS) {
4803 assert(Caller &&
"Caller is nullptr");
4805 auto *CAA =
A.getOrCreateAAFor<AAKernelInfo>(
4807 if (CAA && CAA->ReachingKernelEntries.isValidState()) {
4808 ReachingKernelEntries ^= CAA->ReachingKernelEntries;
4814 ReachingKernelEntries.indicatePessimisticFixpoint();
4819 if (!
A.checkForAllCallSites(PredCallSite, *
this,
4821 AllReachingKernelsKnown))
4822 ReachingKernelEntries.indicatePessimisticFixpoint();
4826 void updateParallelLevels(Attributor &
A) {
4827 auto &OMPInfoCache =
static_cast<OMPInformationCache &
>(
A.getInfoCache());
4828 OMPInformationCache::RuntimeFunctionInfo &Parallel60RFI =
4829 OMPInfoCache.RFIs[OMPRTL___kmpc_parallel_60];
4831 auto PredCallSite = [&](AbstractCallSite ACS) {
4834 assert(Caller &&
"Caller is nullptr");
4838 if (CAA && CAA->ParallelLevels.isValidState()) {
4844 if (Caller == Parallel60RFI.Declaration) {
4845 ParallelLevels.indicatePessimisticFixpoint();
4849 ParallelLevels ^= CAA->ParallelLevels;
4856 ParallelLevels.indicatePessimisticFixpoint();
4861 bool AllCallSitesKnown =
true;
4862 if (!
A.checkForAllCallSites(PredCallSite, *
this,
4865 ParallelLevels.indicatePessimisticFixpoint();
4872struct AAKernelInfoCallSite : AAKernelInfo {
4873 AAKernelInfoCallSite(
const IRPosition &IRP, Attributor &
A)
4874 : AAKernelInfo(IRP,
A) {}
4878 AAKernelInfo::initialize(
A);
4881 auto *AssumptionAA =
A.getAAFor<AAAssumptionInfo>(
4885 if (AssumptionAA && AssumptionAA->hasAssumption(
"ompx_spmd_amenable")) {
4886 indicateOptimisticFixpoint();
4894 indicateOptimisticFixpoint();
4903 auto &OMPInfoCache =
static_cast<OMPInformationCache &
>(
A.getInfoCache());
4904 const auto &It = OMPInfoCache.RuntimeFunctionIDMap.find(Callee);
4905 if (It == OMPInfoCache.RuntimeFunctionIDMap.end()) {
4907 if (!Callee || !
A.isFunctionIPOAmendable(*Callee)) {
4911 if (!AssumptionAA ||
4912 !(AssumptionAA->hasAssumption(
"omp_no_openmp") ||
4913 AssumptionAA->hasAssumption(
"omp_no_parallelism")))
4914 ReachedUnknownParallelRegions.insert(&CB);
4918 if (!SPMDCompatibilityTracker.isAtFixpoint()) {
4919 SPMDCompatibilityTracker.indicatePessimisticFixpoint();
4920 SPMDCompatibilityTracker.insert(&CB);
4925 indicateOptimisticFixpoint();
4931 if (NumCallees > 1) {
4932 indicatePessimisticFixpoint();
4939 case OMPRTL___kmpc_is_spmd_exec_mode:
4940 case OMPRTL___kmpc_distribute_static_fini:
4941 case OMPRTL___kmpc_for_static_fini:
4942 case OMPRTL___kmpc_global_thread_num:
4943 case OMPRTL___kmpc_get_hardware_num_threads_in_block:
4944 case OMPRTL___kmpc_get_hardware_num_blocks:
4945 case OMPRTL___kmpc_single:
4946 case OMPRTL___kmpc_end_single:
4947 case OMPRTL___kmpc_master:
4948 case OMPRTL___kmpc_end_master:
4949 case OMPRTL___kmpc_barrier:
4950 case OMPRTL___kmpc_nvptx_parallel_reduce_nowait_v2:
4951 case OMPRTL___kmpc_nvptx_teams_reduce_nowait_v2:
4952 case OMPRTL___kmpc_error:
4953 case OMPRTL___kmpc_flush:
4954 case OMPRTL___kmpc_get_hardware_thread_id_in_block:
4955 case OMPRTL___kmpc_get_warp_size:
4956 case OMPRTL_omp_get_thread_num:
4957 case OMPRTL_omp_get_num_threads:
4958 case OMPRTL_omp_get_max_threads:
4959 case OMPRTL_omp_in_parallel:
4960 case OMPRTL_omp_get_dynamic:
4961 case OMPRTL_omp_get_cancellation:
4962 case OMPRTL_omp_get_nested:
4963 case OMPRTL_omp_get_schedule:
4964 case OMPRTL_omp_get_thread_limit:
4965 case OMPRTL_omp_get_supported_active_levels:
4966 case OMPRTL_omp_get_max_active_levels:
4967 case OMPRTL_omp_get_level:
4968 case OMPRTL_omp_get_ancestor_thread_num:
4969 case OMPRTL_omp_get_team_size:
4970 case OMPRTL_omp_get_active_level:
4971 case OMPRTL_omp_in_final:
4972 case OMPRTL_omp_get_proc_bind:
4973 case OMPRTL_omp_get_num_places:
4974 case OMPRTL_omp_get_num_procs:
4975 case OMPRTL_omp_get_place_proc_ids:
4976 case OMPRTL_omp_get_place_num:
4977 case OMPRTL_omp_get_partition_num_places:
4978 case OMPRTL_omp_get_partition_place_nums:
4979 case OMPRTL_omp_get_wtime:
4981 case OMPRTL___kmpc_distribute_static_init_4:
4982 case OMPRTL___kmpc_distribute_static_init_4u:
4983 case OMPRTL___kmpc_distribute_static_init_8:
4984 case OMPRTL___kmpc_distribute_static_init_8u:
4985 case OMPRTL___kmpc_for_static_init_4:
4986 case OMPRTL___kmpc_for_static_init_4u:
4987 case OMPRTL___kmpc_for_static_init_8:
4988 case OMPRTL___kmpc_for_static_init_8u: {
4990 unsigned ScheduleArgOpNo = 2;
4991 auto *ScheduleTypeCI =
4993 unsigned ScheduleTypeVal =
4994 ScheduleTypeCI ? ScheduleTypeCI->getZExtValue() : 0;
4996 case OMPScheduleType::UnorderedStatic:
4997 case OMPScheduleType::UnorderedStaticChunked:
4998 case OMPScheduleType::OrderedDistribute:
4999 case OMPScheduleType::OrderedDistributeChunked:
5002 SPMDCompatibilityTracker.indicatePessimisticFixpoint();
5003 SPMDCompatibilityTracker.insert(&CB);
5007 case OMPRTL___kmpc_target_init:
5010 case OMPRTL___kmpc_target_deinit:
5011 KernelDeinitCB = &CB;
5013 case OMPRTL___kmpc_parallel_60:
5014 if (!handleParallel60(
A, CB))
5015 indicatePessimisticFixpoint();
5017 case OMPRTL___kmpc_omp_task:
5019 SPMDCompatibilityTracker.indicatePessimisticFixpoint();
5020 SPMDCompatibilityTracker.insert(&CB);
5021 ReachedUnknownParallelRegions.insert(&CB);
5023 case OMPRTL___kmpc_alloc_shared:
5024 case OMPRTL___kmpc_free_shared:
5027 case OMPRTL___kmpc_distribute_static_loop_4:
5028 case OMPRTL___kmpc_distribute_static_loop_4u:
5029 case OMPRTL___kmpc_distribute_static_loop_8:
5030 case OMPRTL___kmpc_distribute_static_loop_8u:
5031 case OMPRTL___kmpc_distribute_for_static_loop_4:
5032 case OMPRTL___kmpc_distribute_for_static_loop_4u:
5033 case OMPRTL___kmpc_distribute_for_static_loop_8:
5034 case OMPRTL___kmpc_distribute_for_static_loop_8u:
5035 case OMPRTL___kmpc_for_static_loop_4:
5036 case OMPRTL___kmpc_for_static_loop_4u:
5037 case OMPRTL___kmpc_for_static_loop_8:
5038 case OMPRTL___kmpc_for_static_loop_8u:
5042 ReachedUnknownParallelRegions.insert(&CB);
5047 SPMDCompatibilityTracker.indicatePessimisticFixpoint();
5048 SPMDCompatibilityTracker.insert(&CB);
5053 SPMDCompatibilityTracker.indicatePessimisticFixpoint();
5054 SPMDCompatibilityTracker.insert(&CB);
5060 indicateOptimisticFixpoint();
5064 A.getAAFor<AACallEdges>(*
this, getIRPosition(), DepClassTy::OPTIONAL);
5065 if (!AACE || !AACE->getState().isValidState() || AACE->hasUnknownCallee()) {
5066 CheckCallee(getAssociatedFunction(), 1);
5069 const auto &OptimisticEdges = AACE->getOptimisticEdges();
5070 for (
auto *Callee : OptimisticEdges) {
5071 CheckCallee(Callee, OptimisticEdges.size());
5082 auto &OMPInfoCache =
static_cast<OMPInformationCache &
>(
A.getInfoCache());
5083 KernelInfoState StateBefore = getState();
5085 auto CheckCallee = [&](
Function *
F,
int NumCallees) {
5086 const auto &It = OMPInfoCache.RuntimeFunctionIDMap.find(
F);
5090 if (It == OMPInfoCache.RuntimeFunctionIDMap.end()) {
5093 A.getAAFor<AAKernelInfo>(*
this, FnPos, DepClassTy::REQUIRED);
5095 return indicatePessimisticFixpoint();
5096 if (getState() == FnAA->getState())
5097 return ChangeStatus::UNCHANGED;
5098 getState() = FnAA->getState();
5099 return ChangeStatus::CHANGED;
5102 return indicatePessimisticFixpoint();
5105 if (It->getSecond() == OMPRTL___kmpc_parallel_60) {
5106 if (!handleParallel60(
A, CB))
5107 return indicatePessimisticFixpoint();
5108 return StateBefore == getState() ? ChangeStatus::UNCHANGED
5109 : ChangeStatus::CHANGED;
5115 (It->getSecond() == OMPRTL___kmpc_alloc_shared ||
5116 It->getSecond() == OMPRTL___kmpc_free_shared) &&
5117 "Expected a __kmpc_alloc_shared or __kmpc_free_shared runtime call");
5119 auto *HeapToStackAA =
A.getAAFor<AAHeapToStack>(
5121 auto *HeapToSharedAA =
A.getAAFor<AAHeapToShared>(
5129 case OMPRTL___kmpc_alloc_shared:
5130 if ((!HeapToStackAA || !HeapToStackAA->isAssumedHeapToStack(CB)) &&
5131 (!HeapToSharedAA || !HeapToSharedAA->isAssumedHeapToShared(CB)))
5132 SPMDCompatibilityTracker.insert(&CB);
5134 case OMPRTL___kmpc_free_shared:
5135 if ((!HeapToStackAA ||
5136 !HeapToStackAA->isAssumedHeapToStackRemovedFree(CB)) &&
5138 !HeapToSharedAA->isAssumedHeapToSharedRemovedFree(CB)))
5139 SPMDCompatibilityTracker.insert(&CB);
5142 SPMDCompatibilityTracker.indicatePessimisticFixpoint();
5143 SPMDCompatibilityTracker.insert(&CB);
5145 return ChangeStatus::CHANGED;
5149 A.getAAFor<AACallEdges>(*
this, getIRPosition(), DepClassTy::OPTIONAL);
5150 if (!AACE || !AACE->getState().isValidState() || AACE->hasUnknownCallee()) {
5151 if (Function *
F = getAssociatedFunction())
5154 const auto &OptimisticEdges = AACE->getOptimisticEdges();
5155 for (
auto *Callee : OptimisticEdges) {
5156 CheckCallee(Callee, OptimisticEdges.size());
5162 return StateBefore == getState() ? ChangeStatus::UNCHANGED
5163 : ChangeStatus::CHANGED;
5168 bool handleParallel60(Attributor &
A, CallBase &CB) {
5169 const unsigned int NonWrapperFunctionArgNo = 5;
5170 const unsigned int WrapperFunctionArgNo = 6;
5171 auto ParallelRegionOpArgNo = SPMDCompatibilityTracker.isAssumed()
5172 ? NonWrapperFunctionArgNo
5173 : WrapperFunctionArgNo;
5177 if (!ParallelRegion)
5180 ReachedKnownParallelRegions.insert(&CB);
5182 auto *FnAA =
A.getAAFor<AAKernelInfo>(
5184 NestedParallelism |= !FnAA || !FnAA->getState().isValidState() ||
5185 !FnAA->ReachedKnownParallelRegions.empty() ||
5186 !FnAA->ReachedKnownParallelRegions.isValidState() ||
5187 !FnAA->ReachedUnknownParallelRegions.isValidState() ||
5188 !FnAA->ReachedUnknownParallelRegions.empty();
5193struct AAFoldRuntimeCall
5194 :
public StateWrapper<BooleanState, AbstractAttribute> {
5195 using Base = StateWrapper<BooleanState, AbstractAttribute>;
5197 AAFoldRuntimeCall(
const IRPosition &IRP, Attributor &
A) :
Base(IRP) {}
5200 void trackStatistics()
const override {}
5203 static AAFoldRuntimeCall &createForPosition(
const IRPosition &IRP,
5207 StringRef
getName()
const override {
return "AAFoldRuntimeCall"; }
5210 const char *getIdAddr()
const override {
return &
ID; }
5214 static bool classof(
const AbstractAttribute *AA) {
5218 static const char ID;
5221struct AAFoldRuntimeCallCallSiteReturned : AAFoldRuntimeCall {
5222 AAFoldRuntimeCallCallSiteReturned(
const IRPosition &IRP, Attributor &
A)
5223 : AAFoldRuntimeCall(IRP,
A) {}
5226 const std::string getAsStr(Attributor *)
const override {
5227 if (!isValidState())
5230 std::string Str(
"simplified value: ");
5232 if (!SimplifiedValue)
5233 return Str + std::string(
"none");
5235 if (!*SimplifiedValue)
5236 return Str + std::string(
"nullptr");
5239 return Str + std::to_string(CI->getSExtValue());
5241 return Str + std::string(
"unknown");
5246 indicatePessimisticFixpoint();
5250 auto &OMPInfoCache =
static_cast<OMPInformationCache &
>(
A.getInfoCache());
5251 const auto &It = OMPInfoCache.RuntimeFunctionIDMap.find(Callee);
5252 assert(It != OMPInfoCache.RuntimeFunctionIDMap.end() &&
5253 "Expected a known OpenMP runtime function");
5255 RFKind = It->getSecond();
5258 A.registerSimplificationCallback(
5260 [&](
const IRPosition &IRP,
const AbstractAttribute *AA,
5261 bool &UsedAssumedInformation) -> std::optional<Value *> {
5262 assert((isValidState() || SimplifiedValue ==
nullptr) &&
5263 "Unexpected invalid state!");
5265 if (!isAtFixpoint()) {
5266 UsedAssumedInformation =
true;
5268 A.recordDependence(*
this, *AA, DepClassTy::OPTIONAL);
5270 return SimplifiedValue;
5277 case OMPRTL___kmpc_is_spmd_exec_mode:
5280 case OMPRTL___kmpc_parallel_level:
5283 case OMPRTL___kmpc_get_hardware_num_threads_in_block:
5284 Changed =
Changed | foldKernelFnAttribute(
A,
"omp_target_thread_limit");
5286 case OMPRTL___kmpc_get_hardware_num_blocks:
5299 if (SimplifiedValue && *SimplifiedValue) {
5302 A.deleteAfterManifest(
I);
5305 auto Remark = [&](OptimizationRemark
OR) {
5307 return OR <<
"Replacing OpenMP runtime call "
5309 <<
ore::NV(
"FoldedValue",
C->getZExtValue()) <<
".";
5310 return OR <<
"Replacing OpenMP runtime call "
5315 A.emitRemark<OptimizationRemark>(CB,
"OMP180",
Remark);
5318 << **SimplifiedValue <<
"\n");
5320 Changed = ChangeStatus::CHANGED;
5327 SimplifiedValue =
nullptr;
5328 return AAFoldRuntimeCall::indicatePessimisticFixpoint();
5334 std::optional<Value *> SimplifiedValueBefore = SimplifiedValue;
5336 unsigned AssumedSPMDCount = 0, KnownSPMDCount = 0;
5337 unsigned AssumedNonSPMDCount = 0, KnownNonSPMDCount = 0;
5338 auto *CallerKernelInfoAA =
A.getAAFor<AAKernelInfo>(
5341 if (!CallerKernelInfoAA ||
5342 !CallerKernelInfoAA->ReachingKernelEntries.isValidState())
5343 return indicatePessimisticFixpoint();
5345 for (
Kernel K : CallerKernelInfoAA->ReachingKernelEntries) {
5347 DepClassTy::REQUIRED);
5349 if (!AA || !AA->isValidState()) {
5350 SimplifiedValue =
nullptr;
5351 return indicatePessimisticFixpoint();
5354 if (AA->SPMDCompatibilityTracker.isAssumed()) {
5355 if (AA->SPMDCompatibilityTracker.isAtFixpoint())
5360 if (AA->SPMDCompatibilityTracker.isAtFixpoint())
5361 ++KnownNonSPMDCount;
5363 ++AssumedNonSPMDCount;
5367 if ((AssumedSPMDCount + KnownSPMDCount) &&
5368 (AssumedNonSPMDCount + KnownNonSPMDCount))
5369 return indicatePessimisticFixpoint();
5371 auto &Ctx = getAnchorValue().getContext();
5372 if (KnownSPMDCount || AssumedSPMDCount) {
5373 assert(KnownNonSPMDCount == 0 && AssumedNonSPMDCount == 0 &&
5374 "Expected only SPMD kernels!");
5377 SimplifiedValue = ConstantInt::get(Type::getInt8Ty(Ctx),
true);
5378 }
else if (KnownNonSPMDCount || AssumedNonSPMDCount) {
5379 assert(KnownSPMDCount == 0 && AssumedSPMDCount == 0 &&
5380 "Expected only non-SPMD kernels!");
5383 SimplifiedValue = ConstantInt::get(Type::getInt8Ty(Ctx),
false);
5388 assert(!SimplifiedValue &&
"SimplifiedValue should be none");
5391 return SimplifiedValue == SimplifiedValueBefore ? ChangeStatus::UNCHANGED
5392 : ChangeStatus::CHANGED;
5397 std::optional<Value *> SimplifiedValueBefore = SimplifiedValue;
5399 auto *CallerKernelInfoAA =
A.getAAFor<AAKernelInfo>(
5402 if (!CallerKernelInfoAA ||
5403 !CallerKernelInfoAA->ParallelLevels.isValidState())
5404 return indicatePessimisticFixpoint();
5406 if (!CallerKernelInfoAA->ReachingKernelEntries.isValidState())
5407 return indicatePessimisticFixpoint();
5409 if (CallerKernelInfoAA->ReachingKernelEntries.empty()) {
5410 assert(!SimplifiedValue &&
5411 "SimplifiedValue should keep none at this point");
5412 return ChangeStatus::UNCHANGED;
5415 unsigned AssumedSPMDCount = 0, KnownSPMDCount = 0;
5416 unsigned AssumedNonSPMDCount = 0, KnownNonSPMDCount = 0;
5417 for (
Kernel K : CallerKernelInfoAA->ReachingKernelEntries) {
5419 DepClassTy::REQUIRED);
5420 if (!AA || !AA->SPMDCompatibilityTracker.isValidState())
5421 return indicatePessimisticFixpoint();
5423 if (AA->SPMDCompatibilityTracker.isAssumed()) {
5424 if (AA->SPMDCompatibilityTracker.isAtFixpoint())
5429 if (AA->SPMDCompatibilityTracker.isAtFixpoint())
5430 ++KnownNonSPMDCount;
5432 ++AssumedNonSPMDCount;
5436 if ((AssumedSPMDCount + KnownSPMDCount) &&
5437 (AssumedNonSPMDCount + KnownNonSPMDCount))
5438 return indicatePessimisticFixpoint();
5440 auto &Ctx = getAnchorValue().getContext();
5444 if (AssumedSPMDCount || KnownSPMDCount) {
5445 assert(KnownNonSPMDCount == 0 && AssumedNonSPMDCount == 0 &&
5446 "Expected only SPMD kernels!");
5447 SimplifiedValue = ConstantInt::get(Type::getInt8Ty(Ctx), 1);
5449 assert(KnownSPMDCount == 0 && AssumedSPMDCount == 0 &&
5450 "Expected only non-SPMD kernels!");
5451 SimplifiedValue = ConstantInt::get(Type::getInt8Ty(Ctx), 0);
5453 return SimplifiedValue == SimplifiedValueBefore ? ChangeStatus::UNCHANGED
5454 : ChangeStatus::CHANGED;
5457 ChangeStatus foldKernelFnAttribute(Attributor &
A, llvm::StringRef Attr) {
5459 int32_t CurrentAttrValue = -1;
5460 std::optional<Value *> SimplifiedValueBefore = SimplifiedValue;
5462 auto *CallerKernelInfoAA =
A.getAAFor<AAKernelInfo>(
5465 if (!CallerKernelInfoAA ||
5466 !CallerKernelInfoAA->ReachingKernelEntries.isValidState())
5467 return indicatePessimisticFixpoint();
5470 for (
Kernel K : CallerKernelInfoAA->ReachingKernelEntries) {
5471 int32_t NextAttrVal =
K->getFnAttributeAsParsedInteger(Attr, -1);
5473 if (NextAttrVal == -1 ||
5474 (CurrentAttrValue != -1 && CurrentAttrValue != NextAttrVal))
5475 return indicatePessimisticFixpoint();
5476 CurrentAttrValue = NextAttrVal;
5479 if (CurrentAttrValue != -1) {
5480 auto &Ctx = getAnchorValue().getContext();
5482 ConstantInt::get(Type::getInt32Ty(Ctx), CurrentAttrValue);
5484 return SimplifiedValue == SimplifiedValueBefore ? ChangeStatus::UNCHANGED
5485 : ChangeStatus::CHANGED;
5491 std::optional<Value *> SimplifiedValue;
5501 auto &RFI = OMPInfoCache.RFIs[RF];
5502 RFI.foreachUse(SCC, [&](Use &U, Function &
F) {
5503 CallInst *CI = OpenMPOpt::getCallIfRegularCall(U, &RFI);
5506 A.getOrCreateAAFor<AAFoldRuntimeCall>(
5508 DepClassTy::NONE,
false,
5514void OpenMPOpt::registerAAs(
bool IsModulePass) {
5524 A.getOrCreateAAFor<AAKernelInfo>(
5526 DepClassTy::NONE,
false,
5530 OMPInformationCache::RuntimeFunctionInfo &InitRFI =
5531 OMPInfoCache.RFIs[OMPRTL___kmpc_target_init];
5532 InitRFI.foreachUse(SCC, CreateKernelInfoCB);
5534 registerFoldRuntimeCall(OMPRTL___kmpc_is_spmd_exec_mode);
5535 registerFoldRuntimeCall(OMPRTL___kmpc_parallel_level);
5536 registerFoldRuntimeCall(OMPRTL___kmpc_get_hardware_num_threads_in_block);
5537 registerFoldRuntimeCall(OMPRTL___kmpc_get_hardware_num_blocks);
5542 for (
int Idx = 0; Idx < OMPInfoCache.ICVs.size() - 1; ++Idx) {
5545 auto &GetterRFI = OMPInfoCache.RFIs[ICVInfo.Getter];
5548 CallInst *CI = OpenMPOpt::getCallIfRegularCall(U, &GetterRFI);
5555 A.getOrCreateAAFor<AAICVTracker>(CBPos);
5559 GetterRFI.foreachUse(SCC, CreateAA);
5568 for (
auto *
F : SCC) {
5569 if (
F->isDeclaration())
5575 if (
F->hasLocalLinkage()) {
5577 const auto *CB = dyn_cast<CallBase>(U.getUser());
5578 return CB && CB->isCallee(&U) &&
5579 A.isRunOn(const_cast<Function *>(CB->getCaller()));
5583 registerAAsForFunction(
A, *
F);
5587void OpenMPOpt::registerAAsForFunction(Attributor &
A,
const Function &
F) {
5593 if (
F.hasFnAttribute(Attribute::Convergent))
5598 bool UsedAssumedInformation =
false;
5601 A.getOrCreateAAFor<AAAddressSpace>(
5607 A.getOrCreateAAFor<AAIndirectCallInfo>(
5612 A.getOrCreateAAFor<AAAddressSpace>(
5621 if (
II->getIntrinsicID() == Intrinsic::assume) {
5622 A.getOrCreateAAFor<AAPotentialValues>(
5630const char AAICVTracker::ID = 0;
5631const char AAKernelInfo::ID = 0;
5633const char AAHeapToShared::ID = 0;
5634const char AAFoldRuntimeCall::ID = 0;
5636AAICVTracker &AAICVTracker::createForPosition(
const IRPosition &IRP,
5638 AAICVTracker *AA =
nullptr;
5646 AA =
new (
A.Allocator) AAICVTrackerFunctionReturned(IRP,
A);
5649 AA =
new (
A.Allocator) AAICVTrackerCallSiteReturned(IRP,
A);
5652 AA =
new (
A.Allocator) AAICVTrackerCallSite(IRP,
A);
5655 AA =
new (
A.Allocator) AAICVTrackerFunction(IRP,
A);
5664 AAExecutionDomainFunction *
AA =
nullptr;
5674 "AAExecutionDomain can only be created for function position!");
5676 AA =
new (
A.Allocator) AAExecutionDomainFunction(IRP,
A);
5683AAHeapToShared &AAHeapToShared::createForPosition(
const IRPosition &IRP,
5685 AAHeapToSharedFunction *
AA =
nullptr;
5695 "AAHeapToShared can only be created for function position!");
5697 AA =
new (
A.Allocator) AAHeapToSharedFunction(IRP,
A);
5704AAKernelInfo &AAKernelInfo::createForPosition(
const IRPosition &IRP,
5706 AAKernelInfo *AA =
nullptr;
5716 AA =
new (
A.Allocator) AAKernelInfoCallSite(IRP,
A);
5719 AA =
new (
A.Allocator) AAKernelInfoFunction(IRP,
A);
5726AAFoldRuntimeCall &AAFoldRuntimeCall::createForPosition(
const IRPosition &IRP,
5728 AAFoldRuntimeCall *AA =
nullptr;
5737 llvm_unreachable(
"KernelInfo can only be created for call site position!");
5739 AA =
new (
A.Allocator) AAFoldRuntimeCallCallSiteReturned(IRP,
A);
5760 if (Kernels.contains(&
F))
5762 return !
F.use_empty();
5769 return ORA <<
"Could not internalize function. "
5770 <<
"Some optimizations may not be possible. [OMP140]";
5782 if (!
F.isDeclaration() && !Kernels.contains(&
F) && IsCalled(
F) &&
5786 }
else if (!
F.hasLocalLinkage() && !
F.hasFnAttribute(Attribute::Cold)) {
5799 if (!
F.isDeclaration() && !InternalizedMap.
lookup(&
F)) {
5801 Functions.insert(&
F);
5819 OMPInformationCache InfoCache(M, AG, Allocator,
nullptr, PostLink);
5821 unsigned MaxFixpointIterations =
5833 return F.hasFnAttribute(
"kernel");
5838 OpenMPOpt OMPOpt(SCC, CGUpdater, OREGetter, InfoCache,
A);
5844 if (!
F.isDeclaration() && !Kernels.contains(&
F) &&
5845 !
F.hasFnAttribute(Attribute::NoInline))
5846 F.addFnAttr(Attribute::AlwaysInline);
5876 Module &M = *
C.begin()->getFunction().getParent();
5898 OMPInformationCache InfoCache(*(Functions.back()->getParent()), AG, Allocator,
5899 &Functions, PostLink);
5901 unsigned MaxFixpointIterations =
5915 OpenMPOpt OMPOpt(SCC, CGUpdater, OREGetter, InfoCache,
A);
5916 bool Changed = OMPOpt.run(
false);
5935 if (
F.hasKernelCallingConv()) {
5940 ++NumOpenMPTargetRegionKernels;
5943 ++NumNonOpenMPTargetRegionKernels;
5950 Metadata *MD = M.getModuleFlag(
"openmp");
5958 Metadata *MD = M.getModuleFlag(
"openmp-device");
assert(UImm &&(UImm !=~static_cast< T >(0)) &&"Invalid immediate!")
amdgpu next use AMDGPU Next Use Analysis Printer
MachineBasicBlock MachineBasicBlock::iterator DebugLoc DL
Expand Atomic instructions
static cl::opt< unsigned > SetFixpointIterations("attributor-max-iterations", cl::Hidden, cl::desc("Maximal number of fixpoint iterations."), cl::init(32))
static const Function * getParent(const Value *V)
static GCRegistry::Add< ErlangGC > A("erlang", "erlang-compatible garbage collector")
static GCRegistry::Add< CoreCLRGC > E("coreclr", "CoreCLR-compatible GC")
This file provides interfaces used to manipulate a call graph, regardless if it is a "old style" Call...
This file provides interfaces used to build and manipulate a call graph, which is a very useful tool ...
This file contains the declarations for the subclasses of Constant, which represent the different fla...
This file defines the DenseSet and SmallDenseSet classes.
This file defines an array type that can be indexed using scoped enum values.
static void emitRemark(const Function &F, OptimizationRemarkEmitter &ORE, bool Skip)
Loop::LoopBounds::Direction Direction
Machine Check Debug Module
This file provides utility analysis objects describing memory locations.
uint64_t IntrinsicInst * II
This file defines constans and helpers used when dealing with OpenMP.
This file defines constans that will be used by both host and device compilation.
static constexpr auto TAG
static cl::opt< bool > HideMemoryTransferLatency("openmp-hide-memory-transfer-latency", cl::desc("[WIP] Tries to hide the latency of host to device memory" " transfers"), cl::Hidden, cl::init(false))
static cl::opt< bool > DisableOpenMPOptStateMachineRewrite("openmp-opt-disable-state-machine-rewrite", cl::desc("Disable OpenMP optimizations that replace the state machine."), cl::Hidden, cl::init(false))
static cl::opt< bool > EnableParallelRegionMerging("openmp-opt-enable-merging", cl::desc("Enable the OpenMP region merging optimization."), cl::Hidden, cl::init(false))
static cl::opt< bool > PrintModuleAfterOptimizations("openmp-opt-print-module-after", cl::desc("Print the current module after OpenMP optimizations."), cl::Hidden, cl::init(false))
#define KERNEL_ENVIRONMENT_CONFIGURATION_GETTER(MEMBER)
#define KERNEL_ENVIRONMENT_CONFIGURATION_IDX(MEMBER, IDX)
#define KERNEL_ENVIRONMENT_CONFIGURATION_SETTER(MEMBER)
static cl::opt< bool > PrintOpenMPKernels("openmp-print-gpu-kernels", cl::init(false), cl::Hidden)
static cl::opt< bool > DisableOpenMPOptFolding("openmp-opt-disable-folding", cl::desc("Disable OpenMP optimizations involving folding."), cl::Hidden, cl::init(false))
static cl::opt< bool > PrintModuleBeforeOptimizations("openmp-opt-print-module-before", cl::desc("Print the current module before OpenMP optimizations."), cl::Hidden, cl::init(false))
static cl::opt< unsigned > SetFixpointIterations("openmp-opt-max-iterations", cl::Hidden, cl::desc("Maximal number of attributor iterations."), cl::init(256))
static cl::opt< bool > DisableInternalization("openmp-opt-disable-internalization", cl::desc("Disable function internalization."), cl::Hidden, cl::init(false))
static cl::opt< bool > PrintICVValues("openmp-print-icv-values", cl::init(false), cl::Hidden)
static cl::opt< bool > DisableOpenMPOptimizations("openmp-opt-disable", cl::desc("Disable OpenMP specific optimizations."), cl::Hidden, cl::init(false))
static cl::opt< unsigned > SharedMemoryLimit("openmp-opt-shared-limit", cl::Hidden, cl::desc("Maximum amount of shared memory to use."), cl::init(std::numeric_limits< unsigned >::max()))
static cl::opt< bool > EnableVerboseRemarks("openmp-opt-verbose-remarks", cl::desc("Enables more verbose remarks."), cl::Hidden, cl::init(false))
static cl::opt< bool > DisableOpenMPOptDeglobalization("openmp-opt-disable-deglobalization", cl::desc("Disable OpenMP optimizations involving deglobalization."), cl::Hidden, cl::init(false))
static cl::opt< bool > DisableOpenMPOptBarrierElimination("openmp-opt-disable-barrier-elimination", cl::desc("Disable OpenMP optimizations that eliminate barriers."), cl::Hidden, cl::init(false))
static cl::opt< bool > DeduceICVValues("openmp-deduce-icv-values", cl::init(false), cl::Hidden)
#define KERNEL_ENVIRONMENT_IDX(MEMBER, IDX)
#define KERNEL_ENVIRONMENT_GETTER(MEMBER, RETURNTYPE)
static cl::opt< bool > DisableOpenMPOptSPMDization("openmp-opt-disable-spmdization", cl::desc("Disable OpenMP optimizations involving SPMD-ization."), cl::Hidden, cl::init(false))
static cl::opt< bool > AlwaysInlineDeviceFunctions("openmp-opt-inline-device", cl::desc("Inline all applicable functions on the device."), cl::Hidden, cl::init(false))
FunctionAnalysisManager FAM
This file builds on the ADT/GraphTraits.h file to build a generic graph post order iterator.
static StringRef getName(Value *V)
std::pair< BasicBlock *, BasicBlock * > Edge
static bool contains(SmallPtrSetImpl< ConstantExpr * > &Cache, ConstantExpr *Expr, Constant *C)
This file implements a set that has insertion order iteration characteristics.
This file defines the SmallPtrSet class.
This file defines the SmallVector class.
This file defines the 'Statistic' class, which is designed to be an easy way to expose various metric...
#define STATISTIC(VARNAME, DESC)
static const int BlockSize
static void initialize(TargetLibraryInfoImpl &TLI, const Triple &T, const llvm::StringTable &StandardNames, VectorLibrary VecLib)
Initialize the set of available library functions based on the specified target triple.
static cl::opt< unsigned > MaxThreads("xcore-max-threads", cl::Optional, cl::desc("Maximum number of threads (for emulation thread-local storage)"), cl::Hidden, cl::value_desc("number"), cl::init(8))
PassT::Result & getResult(IRUnitT &IR, ExtraArgTs... ExtraArgs)
Get the result of an analysis pass for a given IR unit.
size_t size() const
size - Get the array size.
iterator begin()
Instruction iterator methods.
LLVM_ABI const_iterator getFirstInsertionPt() const
Returns an iterator to the first instruction in this block that is suitable for inserting a non-PHI i...
LLVM_ABI BasicBlock * splitBasicBlock(iterator I, const Twine &BBName="")
Split the basic block into two basic blocks at the specified instruction.
const Function * getParent() const
Return the enclosing method, or null if none.
reverse_iterator rbegin()
static BasicBlock * Create(LLVMContext &Context, const Twine &Name="", Function *Parent=nullptr, BasicBlock *InsertBefore=nullptr)
Creates a new BasicBlock.
LLVM_ABI const BasicBlock * getUniqueSuccessor() const
Return the successor of this block if it has a unique successor.
InstListType::reverse_iterator reverse_iterator
const Instruction * getTerminator() const LLVM_READONLY
Returns the terminator instruction; assumes that the block is well-formed.
Base class for all callable instructions (InvokeInst and CallInst) Holds everything related to callin...
void setCallingConv(CallingConv::ID CC)
Function * getCalledFunction() const
Returns the function called, or null if this is an indirect function invocation or the function signa...
bool doesNotAccessMemory(unsigned OpNo) const
LLVM_ABI bool isIndirectCall() const
Return true if the callsite is an indirect call.
bool isCallee(Value::const_user_iterator UI) const
Determine whether the passed iterator points to the callee operand's Use.
Value * getArgOperand(unsigned i) const
void setArgOperand(unsigned i, Value *v)
iterator_range< User::op_iterator > args()
Iteration adapter for range-for loops.
unsigned getArgOperandNo(const Use *U) const
Given a use for a arg operand, get the arg operand number that corresponds to it.
unsigned arg_size() const
AttributeList getAttributes() const
Return the attributes for this call.
void addParamAttr(unsigned ArgNo, Attribute::AttrKind Kind)
Adds the attribute to the indicated argument.
bool isArgOperand(const Use *U) const
bool hasOperandBundles() const
Return true if this User has any operand bundles.
LLVM_ABI Function * getCaller()
Helper to get the caller (the parent function).
Wrapper to unify "old style" CallGraph and "new style" LazyCallGraph.
void initialize(LazyCallGraph &LCG, LazyCallGraph::SCC &SCC, CGSCCAnalysisManager &AM, CGSCCUpdateResult &UR)
Initializers for usage outside of a CGSCC pass, inside a CGSCC pass in the old and new pass manager (...
static CallInst * Create(FunctionType *Ty, Value *F, const Twine &NameStr="", InsertPosition InsertBefore=nullptr)
@ ICMP_SLT
signed less than
static CondBrInst * Create(Value *Cond, BasicBlock *IfTrue, BasicBlock *IfFalse, InsertPosition InsertBefore=nullptr)
static LLVM_ABI Constant * getPointerCast(Constant *C, Type *Ty)
Create a BitCast, AddrSpaceCast, or a PtrToInt cast constant expression.
static LLVM_ABI Constant * getPointerBitCastOrAddrSpaceCast(Constant *C, Type *Ty)
Create a BitCast or AddrSpaceCast for a pointer type depending on the address space.
This is the shared class of boolean and integer constants.
IntegerType * getIntegerType() const
Variant of the getType() method to always return an IntegerType, which reduces the amount of casting ...
static LLVM_ABI ConstantInt * getTrue(LLVMContext &Context)
bool isZero() const
This is just a convenience method to make client code smaller for a common code.
int64_t getSExtValue() const
Return the constant as a 64-bit integer value after it has been sign extended as appropriate for the ...
This is an important base class in LLVM.
static LLVM_ABI Constant * getNullValue(Type *Ty)
Constructor to create a '0' constant of arbitrary type.
ValueT lookup(const_arg_type_t< KeyT > Val) const
lookup - Return the entry for the specified key, or a default constructed value if no such entry exis...
std::pair< iterator, bool > insert(const std::pair< KeyT, ValueT > &KV)
LLVM_ABI Instruction * findNearestCommonDominator(Instruction *I1, Instruction *I2) const
Find the nearest instruction I that dominates both I1 and I2, in the sense that a result produced bef...
static ErrorSuccess success()
Create a success value.
AtomicOrdering getOrdering() const
Returns the ordering constraint of this fence instruction.
A proxy from a FunctionAnalysisManager to an SCC.
const BasicBlock & getEntryBlock() const
const BasicBlock & front() const
LLVMContext & getContext() const
getContext - Return a reference to the LLVMContext associated with this function.
Argument * getArg(unsigned i) const
bool hasFnAttribute(Attribute::AttrKind Kind) const
Return true if the function has the attribute.
LLVM_ABI bool isDeclaration() const
Return true if the primary definition of this global value is outside of the current translation unit...
bool hasLocalLinkage() const
Module * getParent()
Get the module that this global value is contained inside of...
@ PrivateLinkage
Like Internal, but omit from symbol table.
@ InternalLinkage
Rename collisions when linking (static functions).
const Constant * getInitializer() const
getInitializer - Return the initializer for this global variable.
LLVM_ABI void setInitializer(Constant *InitVal)
setInitializer - Sets the initializer for this global variable, removing any existing initializer if ...
BasicBlock * getBlock() const
CondBrInst * CreateCondBr(Value *Cond, BasicBlock *True, BasicBlock *False, MDNode *BranchWeights=nullptr, MDNode *Unpredictable=nullptr)
Create a conditional 'br Cond, TrueDest, FalseDest' instruction.
CallInst * CreateCall(FunctionType *FTy, Value *Callee, ArrayRef< Value * > Args={}, const Twine &Name="", MDNode *FPMathTag=nullptr)
Value * CreateIsNull(Value *Arg, const Twine &Name="")
Return a boolean value testing if Arg == 0.
LLVM_ABI bool isLifetimeStartOrEnd() const LLVM_READONLY
Return true if the instruction is a llvm.lifetime.start or llvm.lifetime.end marker.
LLVM_ABI bool mayWriteToMemory() const LLVM_READONLY
Return true if this instruction may modify memory.
const DebugLoc & getDebugLoc() const
Return the debug location for this node as a DebugLoc.
LLVM_ABI const Module * getModule() const
Return the module owning the function this instruction belongs to or nullptr it the function does not...
LLVM_ABI InstListType::iterator eraseFromParent()
This method unlinks 'this' from the containing basic block and deletes it.
LLVM_ABI const Function * getFunction() const
Return the function this instruction belongs to.
LLVM_ABI bool mayHaveSideEffects() const LLVM_READONLY
Return true if the instruction may have side effects.
LLVM_ABI bool mayReadFromMemory() const LLVM_READONLY
Return true if this instruction may read memory.
void setDebugLoc(DebugLoc Loc)
Set the debug location information for this instruction.
LLVM_ABI void setSuccessor(unsigned Idx, BasicBlock *BB)
Update the specified successor to point at the provided block.
A node in the call graph.
An SCC of the call graph.
A lazily constructed view of the call graph of a module.
LLVM_ABI void eraseFromParent()
This method unlinks 'this' from the containing function and deletes it.
LLVM_ABI StringRef getName() const
Return the name of the corresponding LLVM basic block, or an empty string.
A Module instance is used to store all the information related to an LLVM module.
const Triple & getTargetTriple() const
Get the target triple which is a string describing the target host.
LLVM_ABI Constant * getOrCreateIdent(Constant *SrcLocStr, uint32_t SrcLocStrSize, omp::IdentFlag Flags=omp::IdentFlag(0), unsigned Reserve2Flags=0)
Return an ident_t* encoding the source location SrcLocStr and Flags.
LLVM_ABI FunctionCallee getOrCreateRuntimeFunction(Module &M, omp::RuntimeFunction FnID)
Return the function declaration for the runtime function with FnID.
static LLVM_ABI std::pair< int32_t, int32_t > readThreadBoundsForKernel(const Triple &T, Function &Kernel)
}
LLVM_ABI Constant * getOrCreateSrcLocStr(StringRef LocStr, uint32_t &SrcLocStrSize)
Return the (LLVM-IR) string describing the source location LocStr.
IRBuilder<>::InsertPoint InsertPointTy
Type used throughout for insertion points.
IRBuilder Builder
The LLVM-IR Builder used to create IR.
static LLVM_ABI std::pair< int32_t, int32_t > readTeamBoundsForKernel(const Triple &T, Function &Kernel)
Read/write a bounds on teams for Kernel.
bool updateToLocation(const LocationDescription &Loc)
Update the internal location to Loc.
PreservedAnalyses run(LazyCallGraph::SCC &C, CGSCCAnalysisManager &AM, LazyCallGraph &CG, CGSCCUpdateResult &UR)
PreservedAnalyses run(Module &M, ModuleAnalysisManager &AM)
static LLVM_ABI PoisonValue * get(Type *T)
Static factory methods - Return an 'poison' object of the specified type.
A set of analyses that are preserved following a run of a transformation pass.
static PreservedAnalyses none()
Convenience factory function for the empty preserved set.
static PreservedAnalyses all()
Construct a special preserved set that preserves all passes.
static ReturnInst * Create(LLVMContext &C, Value *retVal=nullptr, InsertPosition InsertBefore=nullptr)
A vector that has set insertion semantics.
size_type size() const
Determine the number of elements in the SetVector.
size_type count(const_arg_type key) const
Count the number of elements of a given key in the SetVector.
bool insert(const value_type &X)
Insert a new element into the SetVector.
size_type count(ConstPtrType Ptr) const
count - Return 1 if the specified pointer is in the set, 0 otherwise.
std::pair< iterator, bool > insert(PtrType Ptr)
Inserts Ptr if and only if there is no element in the container equal to Ptr.
SmallPtrSet - This class implements a set which is optimized for holding SmallSize or less elements.
reference emplace_back(ArgTypes &&... Args)
void append(ItTy in_start, ItTy in_end)
Add the specified range to the end of the SmallVector.
void push_back(const T &Elt)
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small.
bool starts_with(StringRef Prefix) const
Check if this string starts with the given Prefix.
Triple - Helper class for working with autoconf configuration names.
static LLVM_ABI IntegerType * getInt32Ty(LLVMContext &C)
LLVM_ABI unsigned getPointerAddressSpace() const
Get the address space of this pointer or pointer vector type.
static UncondBrInst * Create(BasicBlock *Target, InsertPosition InsertBefore=nullptr)
static LLVM_ABI UndefValue * get(Type *T)
Static factory methods - Return an 'undef' object of the specified type.
A Use represents the edge between a Value definition and its users.
LLVM_ABI bool replaceUsesOfWith(Value *From, Value *To)
Replace uses of one Value with another.
Type * getType() const
All values are typed, get the type of this value.
LLVM_ABI void setName(const Twine &Name)
Change the name of the value.
bool hasOneUse() const
Return true if there is exactly one use of this value.
LLVM_ABI void replaceAllUsesWith(Value *V)
Change all uses of this to point to a new Value.
iterator_range< user_iterator > users()
LLVM_ABI const Value * stripPointerCasts() const
Strip off pointer casts, all-zero GEPs and address space casts.
LLVM_ABI StringRef getName() const
Return a constant reference to the value's name.
const ParentTy * getParent() const
self_iterator getIterator()
NodeTy * getNextNode()
Get the next node, or nullptr for the list tail.
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
GlobalVariable * getKernelEnvironementGVFromKernelInitCB(CallBase *KernelInitCB)
ConstantStruct * getKernelEnvironementFromKernelInitCB(CallBase *KernelInitCB)
Abstract Attribute helper functions.
LLVM_ABI bool isValidAtPosition(const ValueAndContext &VAC, InformationCache &InfoCache)
Return true if the value of VAC is a valid at the position of VAC, that is a constant,...
LLVM_ABI bool isPotentiallyAffectedByBarrier(Attributor &A, const Instruction &I, const AbstractAttribute &QueryingAA)
Return true if I is potentially affected by a barrier.
LLVM_ABI bool isNoSyncInst(Attributor &A, const Instruction &I, const AbstractAttribute &QueryingAA)
Return true if I is a nosync instruction.
constexpr char Args[]
Key for Kernel::Metadata::mArgs.
E & operator^=(E &LHS, E RHS)
unsigned ID
LLVM IR allows to use arbitrary numbers as calling convention identifiers.
@ C
The default llvm calling convention, compatible with C.
@ BasicBlock
Various leaf nodes.
initializer< Ty > init(const Ty &Val)
DXILDebugInfoMap run(Module &M)
constexpr uint64_t PointerSize
aarch64 pointer size.
bool isOpenMPDevice(Module &M)
Helper to determine if M is a OpenMP target offloading device module.
bool containsOpenMP(Module &M)
Helper to determine if M contains OpenMP.
InternalControlVar
IDs for all Internal Control Variables (ICVs).
RuntimeFunction
IDs for all omp runtime library (RTL) functions.
KernelSet getDeviceKernels(Module &M)
Get OpenMP device kernels in M.
@ OMP_TGT_EXEC_MODE_GENERIC_SPMD
@ OMP_TGT_EXEC_MODE_GENERIC
SetVector< Kernel > KernelSet
Set of kernels in the module.
Function * Kernel
Summary of a kernel (=entry point for target offloading).
bool isOpenMPKernel(Function &Fn)
Return true iff Fn is an OpenMP GPU kernel; Fn has the "kernel" attribute.
DiagnosticInfoOptimizationBase::Argument NV
NodeAddr< UseNode * > Use
friend class Instruction
Iterator for Instructions in a `BasicBlock.
LLVM_ABI iterator begin() const
This is an optimization pass for GlobalISel generic memory operations.
auto drop_begin(T &&RangeOrContainer, size_t N=1)
Return a range covering RangeOrContainer with the first N elements excluded.
FunctionAddr VTableAddr Value
bool all_of(R &&range, UnaryPredicate P)
Provide wrappers to std::all_of which take ranges instead of having to pass begin/end explicitly.
auto size(R &&Range, std::enable_if_t< std::is_base_of< std::random_access_iterator_tag, typename std::iterator_traits< decltype(Range.begin())>::iterator_category >::value, void > *=nullptr)
Get the size of a range.
bool succ_empty(const Instruction *I)
decltype(auto) dyn_cast(const From &Val)
dyn_cast<X> - Return the argument parameter cast to the specified type.
FunctionAddr VTableAddr uintptr_t uintptr_t Int32Ty
bool operator!=(uint64_t V1, const APInt &V2)
constexpr from_range_t from_range
Value * GetPointerBaseWithConstantOffset(Value *Ptr, int64_t &Offset, const DataLayout &DL, bool AllowNonInbounds=true)
Analyze the specified pointer to see if it can be expressed as a base pointer plus a constant offset.
InnerAnalysisManagerProxy< FunctionAnalysisManager, Module > FunctionAnalysisManagerModuleProxy
Provide the FunctionAnalysisManager to Module proxy.
bool operator==(const AddressRangeValuePair &LHS, const AddressRangeValuePair &RHS)
AnalysisManager< LazyCallGraph::SCC, LazyCallGraph & > CGSCCAnalysisManager
The CGSCC analysis manager.
@ ThinLTOPostLink
ThinLTO postlink (backend compile) phase.
@ FullLTOPostLink
Full LTO postlink (backend compile) phase.
@ ThinLTOPreLink
ThinLTO prelink (summary) phase.
auto dyn_cast_or_null(const Y &Val)
LLVM_ABI raw_ostream & dbgs()
dbgs() - This returns a reference to a raw_ostream for debugging messages.
class LLVM_GSL_OWNER SmallVector
Forward declaration of SmallVector so that calculateSmallVectorDefaultInlinedElements can reference s...
bool isa(const From &Val)
isa<X> - Return true if the parameter to the template is an instance of one of the template type argu...
MutableArrayRef(T &OneElt) -> MutableArrayRef< T >
void cantFail(Error Err, const char *Msg=nullptr)
Report a fatal error if Err is a failure value.
IRBuilder(LLVMContext &, FolderTy, InserterTy, MDNode *, ArrayRef< OperandBundleDef >) -> IRBuilder< FolderTy, InserterTy >
bool operator&=(SparseBitVector< ElementSize > *LHS, const SparseBitVector< ElementSize > &RHS)
LLVM_ABI BasicBlock * SplitBlock(BasicBlock *Old, BasicBlock::iterator SplitPt, DominatorTree *DT, LoopInfo *LI=nullptr, MemorySSAUpdater *MSSAU=nullptr, const Twine &BBName="")
Split the specified block at the specified instruction.
auto count(R &&Range, const E &Element)
Wrapper function around std::count to count the number of times an element Element occurs in the give...
ArrayRef(const T &OneElt) -> ArrayRef< T >
std::string toString(const APInt &I, unsigned Radix, bool Signed, bool formatAsCLiteral=false, bool UpperCase=true, bool InsertSeparators=false)
decltype(auto) cast(const From &Val)
cast<X> - Return the argument parameter cast to the specified type.
auto predecessors(const MachineBasicBlock *BB)
LLVM_ABI Constant * ConstantFoldInsertValueInstruction(Constant *Agg, Constant *Val, ArrayRef< unsigned > Idxs)
Attempt to constant fold an insertvalue instruction with the specified operands and indices.
@ OPTIONAL
The target may be valid if the source is not.
AnalysisManager< Function > FunctionAnalysisManager
Convenience typedef for the Function analysis manager.
BumpPtrAllocatorImpl<> BumpPtrAllocator
The standard BumpPtrAllocator which just uses the default template parameters.
LLVM_ABI const Value * getUnderlyingObject(const Value *V, unsigned MaxLookup=MaxLookupSearchDepth)
This method strips off any GEP address adjustments, pointer casts or llvm.threadlocal....
AnalysisManager< Module > ModuleAnalysisManager
Convenience typedef for the Module analysis manager.
static LLVM_ABI AAExecutionDomain & createForPosition(const IRPosition &IRP, Attributor &A)
Create an abstract attribute view for the position IRP.
AAExecutionDomain(const IRPosition &IRP, Attributor &A)
static LLVM_ABI const char ID
Unique ID (due to the unique address)
AccessKind
Simple enum to distinguish read/write/read-write accesses.
StateType::base_t MemoryLocationsKind
static LLVM_ABI bool isAlignedBarrier(const CallBase &CB, bool ExecutedAligned)
Helper function to determine if CB is an aligned (GPU) barrier.
Base struct for all "concrete attribute" deductions.
virtual const char * getIdAddr() const =0
This function should return the address of the ID of the AbstractAttribute.
An interface to query the internal state of an abstract attribute.
Wrapper for FunctionAnalysisManager.
Configuration for the Attributor.
std::function< void(Attributor &A, const Function &F)> InitializationCallback
Callback function to be invoked on internal functions marked live.
std::optional< unsigned > MaxFixpointIterations
Maximum number of iterations to run until fixpoint.
bool RewriteSignatures
Flag to determine if we rewrite function signatures.
OptimizationRemarkGetter OREGetter
IPOAmendableCBTy IPOAmendableCB
bool IsModulePass
Is the user of the Attributor a module pass or not.
bool DefaultInitializeLiveInternals
Flag to determine if we want to initialize all default AAs for an internal function marked live.
The fixpoint analysis framework that orchestrates the attribute deduction.
static LLVM_ABI bool isInternalizable(Function &F)
Returns true if the function F can be internalized.
std::function< std::optional< Value * >( const IRPosition &, const AbstractAttribute *, bool &)> SimplifictionCallbackTy
Register CB as a simplification callback.
std::function< std::optional< Constant * >( const GlobalVariable &, const AbstractAttribute *, bool &)> GlobalVariableSimplifictionCallbackTy
Register CB as a simplification callback.
std::function< bool(Attributor &, const AbstractAttribute *)> VirtualUseCallbackTy
static LLVM_ABI bool internalizeFunctions(SmallPtrSetImpl< Function * > &FnSet, DenseMap< Function *, Function * > &FnMap)
Make copies of each function in the set FnSet such that the copied version has internal linkage after...
Simple wrapper for a single bit (boolean) state.
Support structure for SCC passes to communicate updates the call graph back to the CGSCC pass manager...
Helper to describe and deal with positions in the LLVM-IR.
static const IRPosition callsite_returned(const CallBase &CB)
Create a position describing the returned value of CB.
static const IRPosition returned(const Function &F, const CallBaseContext *CBContext=nullptr)
Create a position describing the returned value of F.
static const IRPosition value(const Value &V, const CallBaseContext *CBContext=nullptr)
Create a position describing the value of V.
static const IRPosition inst(const Instruction &I, const CallBaseContext *CBContext=nullptr)
Create a position describing the instruction I.
@ IRP_ARGUMENT
An attribute for a function argument.
@ IRP_RETURNED
An attribute for the function return value.
@ IRP_CALL_SITE
An attribute for a call site (function scope).
@ IRP_CALL_SITE_RETURNED
An attribute for a call site return value.
@ IRP_FUNCTION
An attribute for a function (scope).
@ IRP_FLOAT
A position that is not associated with a spot suitable for attributes.
@ IRP_CALL_SITE_ARGUMENT
An attribute for a call site argument.
@ IRP_INVALID
An invalid position.
static const IRPosition function(const Function &F, const CallBaseContext *CBContext=nullptr)
Create a position describing the function scope of F.
Kind getPositionKind() const
Return the associated position kind.
static const IRPosition callsite_function(const CallBase &CB)
Create a position describing the function scope of CB.
Defines various target-specific GPU grid values that must be consistent between host RTL (plugin),...