LLVM  12.0.0git
OpenMPOpt.cpp
Go to the documentation of this file.
1 //===-- IPO/OpenMPOpt.cpp - Collection of OpenMP specific optimizations ---===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 // OpenMP specific optimizations:
10 //
11 // - Deduplication of runtime calls, e.g., omp_get_thread_num.
12 //
13 //===----------------------------------------------------------------------===//
14 
16 
18 #include "llvm/ADT/Statistic.h"
24 #include "llvm/InitializePasses.h"
26 #include "llvm/Transforms/IPO.h"
29 
30 using namespace llvm;
31 using namespace omp;
32 
33 #define DEBUG_TYPE "openmp-opt"
34 
36  "openmp-opt-disable", cl::ZeroOrMore,
37  cl::desc("Disable OpenMP specific optimizations."), cl::Hidden,
38  cl::init(false));
39 
40 static cl::opt<bool> PrintICVValues("openmp-print-icv-values", cl::init(false),
41  cl::Hidden);
42 static cl::opt<bool> PrintOpenMPKernels("openmp-print-gpu-kernels",
43  cl::init(false), cl::Hidden);
44 
45 STATISTIC(NumOpenMPRuntimeCallsDeduplicated,
46  "Number of OpenMP runtime calls deduplicated");
47 STATISTIC(NumOpenMPParallelRegionsDeleted,
48  "Number of OpenMP parallel regions deleted");
49 STATISTIC(NumOpenMPRuntimeFunctionsIdentified,
50  "Number of OpenMP runtime functions identified");
51 STATISTIC(NumOpenMPRuntimeFunctionUsesIdentified,
52  "Number of OpenMP runtime function uses identified");
53 STATISTIC(NumOpenMPTargetRegionKernels,
54  "Number of OpenMP target region entry points (=kernels) identified");
55 STATISTIC(
56  NumOpenMPParallelRegionsReplacedInGPUStateMachine,
57  "Number of OpenMP parallel regions replaced with ID in GPU state machines");
58 
59 #if !defined(NDEBUG)
60 static constexpr auto TAG = "[" DEBUG_TYPE "]";
61 #endif
62 
63 /// Apply \p CB to all uses of \p F. If \p LookThroughConstantExprUses is
64 /// true, constant expression users are not given to \p CB but their uses are
65 /// traversed transitively.
66 template <typename CBTy>
67 static void foreachUse(Function &F, CBTy CB,
68  bool LookThroughConstantExprUses = true) {
70 
71  for (unsigned idx = 0; idx < Worklist.size(); ++idx) {
72  Use &U = *Worklist[idx];
73 
74  // Allow use in constant bitcasts and simply look through them.
75  if (LookThroughConstantExprUses && isa<ConstantExpr>(U.getUser())) {
76  for (Use &CEU : cast<ConstantExpr>(U.getUser())->uses())
77  Worklist.push_back(&CEU);
78  continue;
79  }
80 
81  CB(U);
82  }
83 }
84 
85 /// Helper struct to store tracked ICV values at specif instructions.
86 struct ICVValue {
89 
90  ICVValue(Instruction *I, Value *Val) : Inst(I), TrackedValue(Val) {}
91 };
92 
93 namespace llvm {
94 
95 // Provide DenseMapInfo for ICVValue
96 template <> struct DenseMapInfo<ICVValue> {
99 
100  static inline ICVValue getEmptyKey() {
101  return ICVValue(InstInfo::getEmptyKey(), ValueInfo::getEmptyKey());
102  };
103 
104  static inline ICVValue getTombstoneKey() {
105  return ICVValue(InstInfo::getTombstoneKey(), ValueInfo::getTombstoneKey());
106  };
107 
108  static unsigned getHashValue(const ICVValue &ICVVal) {
110  InstInfo::getHashValue(ICVVal.Inst),
111  ValueInfo::getHashValue(ICVVal.TrackedValue));
112  }
113 
114  static bool isEqual(const ICVValue &LHS, const ICVValue &RHS) {
115  return InstInfo::isEqual(LHS.Inst, RHS.Inst) &&
117  }
118 };
119 
120 } // end namespace llvm
121 
122 namespace {
123 
124 struct AAICVTracker;
125 
126 /// OpenMP specific information. For now, stores RFIs and ICVs also needed for
127 /// Attributor runs.
128 struct OMPInformationCache : public InformationCache {
129  OMPInformationCache(Module &M, AnalysisGetter &AG,
132  : InformationCache(M, AG, Allocator, &CGSCC), OMPBuilder(M),
133  Kernels(Kernels) {
134  initializeModuleSlice(CGSCC);
135 
136  OMPBuilder.initialize();
137  initializeRuntimeFunctions();
138  initializeInternalControlVars();
139  }
140 
141  /// Generic information that describes an internal control variable.
142  struct InternalControlVarInfo {
143  /// The kind, as described by InternalControlVar enum.
145 
146  /// The name of the ICV.
147  StringRef Name;
148 
149  /// Environment variable associated with this ICV.
150  StringRef EnvVarName;
151 
152  /// Initial value kind.
153  ICVInitValue InitKind;
154 
155  /// Initial value.
156  ConstantInt *InitValue;
157 
158  /// Setter RTL function associated with this ICV.
159  RuntimeFunction Setter;
160 
161  /// Getter RTL function associated with this ICV.
162  RuntimeFunction Getter;
163 
164  /// RTL Function corresponding to the override clause of this ICV
165  RuntimeFunction Clause;
166  };
167 
168  /// Generic information that describes a runtime function
169  struct RuntimeFunctionInfo {
170 
171  /// The kind, as described by the RuntimeFunction enum.
173 
174  /// The name of the function.
175  StringRef Name;
176 
177  /// Flag to indicate a variadic function.
178  bool IsVarArg;
179 
180  /// The return type of the function.
181  Type *ReturnType;
182 
183  /// The argument types of the function.
184  SmallVector<Type *, 8> ArgumentTypes;
185 
186  /// The declaration if available.
187  Function *Declaration = nullptr;
188 
189  /// Uses of this runtime function per function containing the use.
190  using UseVector = SmallVector<Use *, 16>;
191 
192  /// Clear UsesMap for runtime function.
193  void clearUsesMap() { UsesMap.clear(); }
194 
195  /// Boolean conversion that is true if the runtime function was found.
196  operator bool() const { return Declaration; }
197 
198  /// Return the vector of uses in function \p F.
199  UseVector &getOrCreateUseVector(Function *F) {
200  std::shared_ptr<UseVector> &UV = UsesMap[F];
201  if (!UV)
202  UV = std::make_shared<UseVector>();
203  return *UV;
204  }
205 
206  /// Return the vector of uses in function \p F or `nullptr` if there are
207  /// none.
208  const UseVector *getUseVector(Function &F) const {
209  auto I = UsesMap.find(&F);
210  if (I != UsesMap.end())
211  return I->second.get();
212  return nullptr;
213  }
214 
215  /// Return how many functions contain uses of this runtime function.
216  size_t getNumFunctionsWithUses() const { return UsesMap.size(); }
217 
218  /// Return the number of arguments (or the minimal number for variadic
219  /// functions).
220  size_t getNumArgs() const { return ArgumentTypes.size(); }
221 
222  /// Run the callback \p CB on each use and forget the use if the result is
223  /// true. The callback will be fed the function in which the use was
224  /// encountered as second argument.
226  function_ref<bool(Use &, Function &)> CB) {
227  for (Function *F : SCC)
228  foreachUse(CB, F);
229  }
230 
231  /// Run the callback \p CB on each use within the function \p F and forget
232  /// the use if the result is true.
233  void foreachUse(function_ref<bool(Use &, Function &)> CB, Function *F) {
234  SmallVector<unsigned, 8> ToBeDeleted;
235  ToBeDeleted.clear();
236 
237  unsigned Idx = 0;
238  UseVector &UV = getOrCreateUseVector(F);
239 
240  for (Use *U : UV) {
241  if (CB(*U, *F))
242  ToBeDeleted.push_back(Idx);
243  ++Idx;
244  }
245 
246  // Remove the to-be-deleted indices in reverse order as prior
247  // modifications will not modify the smaller indices.
248  while (!ToBeDeleted.empty()) {
249  unsigned Idx = ToBeDeleted.pop_back_val();
250  UV[Idx] = UV.back();
251  UV.pop_back();
252  }
253  }
254 
255  private:
256  /// Map from functions to all uses of this runtime function contained in
257  /// them.
259  };
260 
261  /// Initialize the ModuleSlice member based on \p SCC. ModuleSlices contains
262  /// (a subset of) all functions that we can look at during this SCC traversal.
263  /// This includes functions (transitively) called from the SCC and the
264  /// (transitive) callers of SCC functions. We also can look at a function if
265  /// there is a "reference edge", i.a., if the function somehow uses (!=calls)
266  /// a function in the SCC or a caller of a function in the SCC.
267  void initializeModuleSlice(SetVector<Function *> &SCC) {
268  ModuleSlice.insert(SCC.begin(), SCC.end());
269 
271  SmallVector<Function *, 16> Worklist(SCC.begin(), SCC.end());
272  while (!Worklist.empty()) {
273  Function *F = Worklist.pop_back_val();
274  ModuleSlice.insert(F);
275 
276  for (Instruction &I : instructions(*F))
277  if (auto *CB = dyn_cast<CallBase>(&I))
278  if (Function *Callee = CB->getCalledFunction())
279  if (Seen.insert(Callee).second)
280  Worklist.push_back(Callee);
281  }
282 
283  Seen.clear();
284  Worklist.append(SCC.begin(), SCC.end());
285  while (!Worklist.empty()) {
286  Function *F = Worklist.pop_back_val();
287  ModuleSlice.insert(F);
288 
289  // Traverse all transitive uses.
290  foreachUse(*F, [&](Use &U) {
291  if (auto *UsrI = dyn_cast<Instruction>(U.getUser()))
292  if (Seen.insert(UsrI->getFunction()).second)
293  Worklist.push_back(UsrI->getFunction());
294  });
295  }
296  }
297 
298  /// The slice of the module we are allowed to look at.
299  SmallPtrSet<Function *, 8> ModuleSlice;
300 
301  /// An OpenMP-IR-Builder instance
302  OpenMPIRBuilder OMPBuilder;
303 
304  /// Map from runtime function kind to the runtime function description.
305  EnumeratedArray<RuntimeFunctionInfo, RuntimeFunction,
306  RuntimeFunction::OMPRTL___last>
307  RFIs;
308 
309  /// Map from ICV kind to the ICV description.
310  EnumeratedArray<InternalControlVarInfo, InternalControlVar,
311  InternalControlVar::ICV___last>
312  ICVs;
313 
314  /// Helper to initialize all internal control variable information for those
315  /// defined in OMPKinds.def.
316  void initializeInternalControlVars() {
317 #define ICV_RT_SET(_Name, RTL) \
318  { \
319  auto &ICV = ICVs[_Name]; \
320  ICV.Setter = RTL; \
321  }
322 #define ICV_RT_GET(Name, RTL) \
323  { \
324  auto &ICV = ICVs[Name]; \
325  ICV.Getter = RTL; \
326  }
327 #define ICV_DATA_ENV(Enum, _Name, _EnvVarName, Init) \
328  { \
329  auto &ICV = ICVs[Enum]; \
330  ICV.Name = _Name; \
331  ICV.Kind = Enum; \
332  ICV.InitKind = Init; \
333  ICV.EnvVarName = _EnvVarName; \
334  switch (ICV.InitKind) { \
335  case ICV_IMPLEMENTATION_DEFINED: \
336  ICV.InitValue = nullptr; \
337  break; \
338  case ICV_ZERO: \
339  ICV.InitValue = ConstantInt::get( \
340  Type::getInt32Ty(OMPBuilder.Int32->getContext()), 0); \
341  break; \
342  case ICV_FALSE: \
343  ICV.InitValue = ConstantInt::getFalse(OMPBuilder.Int1->getContext()); \
344  break; \
345  case ICV_LAST: \
346  break; \
347  } \
348  }
349 #include "llvm/Frontend/OpenMP/OMPKinds.def"
350  }
351 
352  /// Returns true if the function declaration \p F matches the runtime
353  /// function types, that is, return type \p RTFRetType, and argument types
354  /// \p RTFArgTypes.
355  static bool declMatchesRTFTypes(Function *F, Type *RTFRetType,
356  SmallVector<Type *, 8> &RTFArgTypes) {
357  // TODO: We should output information to the user (under debug output
358  // and via remarks).
359 
360  if (!F)
361  return false;
362  if (F->getReturnType() != RTFRetType)
363  return false;
364  if (F->arg_size() != RTFArgTypes.size())
365  return false;
366 
367  auto RTFTyIt = RTFArgTypes.begin();
368  for (Argument &Arg : F->args()) {
369  if (Arg.getType() != *RTFTyIt)
370  return false;
371 
372  ++RTFTyIt;
373  }
374 
375  return true;
376  }
377 
378  // Helper to collect all uses of the declaration in the UsesMap.
379  unsigned collectUses(RuntimeFunctionInfo &RFI, bool CollectStats = true) {
380  unsigned NumUses = 0;
381  if (!RFI.Declaration)
382  return NumUses;
383  OMPBuilder.addAttributes(RFI.Kind, *RFI.Declaration);
384 
385  if (CollectStats) {
386  NumOpenMPRuntimeFunctionsIdentified += 1;
387  NumOpenMPRuntimeFunctionUsesIdentified += RFI.Declaration->getNumUses();
388  }
389 
390  // TODO: We directly convert uses into proper calls and unknown uses.
391  for (Use &U : RFI.Declaration->uses()) {
392  if (Instruction *UserI = dyn_cast<Instruction>(U.getUser())) {
393  if (ModuleSlice.count(UserI->getFunction())) {
394  RFI.getOrCreateUseVector(UserI->getFunction()).push_back(&U);
395  ++NumUses;
396  }
397  } else {
398  RFI.getOrCreateUseVector(nullptr).push_back(&U);
399  ++NumUses;
400  }
401  }
402  return NumUses;
403  }
404 
405  // Helper function to recollect uses of all runtime functions.
406  void recollectUses() {
407  for (int Idx = 0; Idx < RFIs.size(); ++Idx) {
408  auto &RFI = RFIs[static_cast<RuntimeFunction>(Idx)];
409  RFI.clearUsesMap();
410  collectUses(RFI, /*CollectStats*/ false);
411  }
412  }
413 
414  /// Helper to initialize all runtime function information for those defined
415  /// in OpenMPKinds.def.
416  void initializeRuntimeFunctions() {
417  Module &M = *((*ModuleSlice.begin())->getParent());
418 
419  // Helper macros for handling __VA_ARGS__ in OMP_RTL
420 #define OMP_TYPE(VarName, ...) \
421  Type *VarName = OMPBuilder.VarName; \
422  (void)VarName;
423 
424 #define OMP_ARRAY_TYPE(VarName, ...) \
425  ArrayType *VarName##Ty = OMPBuilder.VarName##Ty; \
426  (void)VarName##Ty; \
427  PointerType *VarName##PtrTy = OMPBuilder.VarName##PtrTy; \
428  (void)VarName##PtrTy;
429 
430 #define OMP_FUNCTION_TYPE(VarName, ...) \
431  FunctionType *VarName = OMPBuilder.VarName; \
432  (void)VarName; \
433  PointerType *VarName##Ptr = OMPBuilder.VarName##Ptr; \
434  (void)VarName##Ptr;
435 
436 #define OMP_STRUCT_TYPE(VarName, ...) \
437  StructType *VarName = OMPBuilder.VarName; \
438  (void)VarName; \
439  PointerType *VarName##Ptr = OMPBuilder.VarName##Ptr; \
440  (void)VarName##Ptr;
441 
442 #define OMP_RTL(_Enum, _Name, _IsVarArg, _ReturnType, ...) \
443  { \
444  SmallVector<Type *, 8> ArgsTypes({__VA_ARGS__}); \
445  Function *F = M.getFunction(_Name); \
446  if (declMatchesRTFTypes(F, OMPBuilder._ReturnType, ArgsTypes)) { \
447  auto &RFI = RFIs[_Enum]; \
448  RFI.Kind = _Enum; \
449  RFI.Name = _Name; \
450  RFI.IsVarArg = _IsVarArg; \
451  RFI.ReturnType = OMPBuilder._ReturnType; \
452  RFI.ArgumentTypes = std::move(ArgsTypes); \
453  RFI.Declaration = F; \
454  unsigned NumUses = collectUses(RFI); \
455  (void)NumUses; \
456  LLVM_DEBUG({ \
457  dbgs() << TAG << RFI.Name << (RFI.Declaration ? "" : " not") \
458  << " found\n"; \
459  if (RFI.Declaration) \
460  dbgs() << TAG << "-> got " << NumUses << " uses in " \
461  << RFI.getNumFunctionsWithUses() \
462  << " different functions.\n"; \
463  }); \
464  } \
465  }
466 #include "llvm/Frontend/OpenMP/OMPKinds.def"
467 
468  // TODO: We should attach the attributes defined in OMPKinds.def.
469  }
470 
471  /// Collection of known kernels (\see Kernel) in the module.
473 };
474 
475 struct OpenMPOpt {
476 
477  using OptimizationRemarkGetter =
479 
480  OpenMPOpt(SmallVectorImpl<Function *> &SCC, CallGraphUpdater &CGUpdater,
481  OptimizationRemarkGetter OREGetter,
482  OMPInformationCache &OMPInfoCache, Attributor &A)
483  : M(*(*SCC.begin())->getParent()), SCC(SCC), CGUpdater(CGUpdater),
484  OREGetter(OREGetter), OMPInfoCache(OMPInfoCache), A(A) {}
485 
486  /// Run all OpenMP optimizations on the underlying SCC/ModuleSlice.
487  bool run() {
488  if (SCC.empty())
489  return false;
490 
491  bool Changed = false;
492 
493  LLVM_DEBUG(dbgs() << TAG << "Run on SCC with " << SCC.size()
494  << " functions in a slice with "
495  << OMPInfoCache.ModuleSlice.size() << " functions\n");
496 
497  if (PrintICVValues)
498  printICVs();
499  if (PrintOpenMPKernels)
500  printKernels();
501 
502  Changed |= rewriteDeviceCodeStateMachine();
503 
504  Changed |= runAttributor();
505 
506  // Recollect uses, in case Attributor deleted any.
507  OMPInfoCache.recollectUses();
508 
509  Changed |= deduplicateRuntimeCalls();
510  Changed |= deleteParallelRegions();
511 
512  return Changed;
513  }
514 
515  /// Print initial ICV values for testing.
516  /// FIXME: This should be done from the Attributor once it is added.
517  void printICVs() const {
518  InternalControlVar ICVs[] = {ICV_nthreads, ICV_active_levels, ICV_cancel};
519 
520  for (Function *F : OMPInfoCache.ModuleSlice) {
521  for (auto ICV : ICVs) {
522  auto ICVInfo = OMPInfoCache.ICVs[ICV];
523  auto Remark = [&](OptimizationRemark OR) {
524  return OR << "OpenMP ICV " << ore::NV("OpenMPICV", ICVInfo.Name)
525  << " Value: "
526  << (ICVInfo.InitValue
527  ? ICVInfo.InitValue->getValue().toString(10, true)
528  : "IMPLEMENTATION_DEFINED");
529  };
530 
531  emitRemarkOnFunction(F, "OpenMPICVTracker", Remark);
532  }
533  }
534  }
535 
536  /// Print OpenMP GPU kernels for testing.
537  void printKernels() const {
538  for (Function *F : SCC) {
539  if (!OMPInfoCache.Kernels.count(F))
540  continue;
541 
542  auto Remark = [&](OptimizationRemark OR) {
543  return OR << "OpenMP GPU kernel "
544  << ore::NV("OpenMPGPUKernel", F->getName()) << "\n";
545  };
546 
547  emitRemarkOnFunction(F, "OpenMPGPU", Remark);
548  }
549  }
550 
551  /// Return the call if \p U is a callee use in a regular call. If \p RFI is
552  /// given it has to be the callee or a nullptr is returned.
553  static CallInst *getCallIfRegularCall(
554  Use &U, OMPInformationCache::RuntimeFunctionInfo *RFI = nullptr) {
555  CallInst *CI = dyn_cast<CallInst>(U.getUser());
556  if (CI && CI->isCallee(&U) && !CI->hasOperandBundles() &&
557  (!RFI || CI->getCalledFunction() == RFI->Declaration))
558  return CI;
559  return nullptr;
560  }
561 
562  /// Return the call if \p V is a regular call. If \p RFI is given it has to be
563  /// the callee or a nullptr is returned.
564  static CallInst *getCallIfRegularCall(
565  Value &V, OMPInformationCache::RuntimeFunctionInfo *RFI = nullptr) {
566  CallInst *CI = dyn_cast<CallInst>(&V);
567  if (CI && !CI->hasOperandBundles() &&
568  (!RFI || CI->getCalledFunction() == RFI->Declaration))
569  return CI;
570  return nullptr;
571  }
572 
573 private:
574  /// Try to delete parallel regions if possible.
575  bool deleteParallelRegions() {
576  const unsigned CallbackCalleeOperand = 2;
577 
578  OMPInformationCache::RuntimeFunctionInfo &RFI =
579  OMPInfoCache.RFIs[OMPRTL___kmpc_fork_call];
580 
581  if (!RFI.Declaration)
582  return false;
583 
584  bool Changed = false;
585  auto DeleteCallCB = [&](Use &U, Function &) {
586  CallInst *CI = getCallIfRegularCall(U);
587  if (!CI)
588  return false;
589  auto *Fn = dyn_cast<Function>(
590  CI->getArgOperand(CallbackCalleeOperand)->stripPointerCasts());
591  if (!Fn)
592  return false;
593  if (!Fn->onlyReadsMemory())
594  return false;
595  if (!Fn->hasFnAttribute(Attribute::WillReturn))
596  return false;
597 
598  LLVM_DEBUG(dbgs() << TAG << "Delete read-only parallel region in "
599  << CI->getCaller()->getName() << "\n");
600 
601  auto Remark = [&](OptimizationRemark OR) {
602  return OR << "Parallel region in "
603  << ore::NV("OpenMPParallelDelete", CI->getCaller()->getName())
604  << " deleted";
605  };
606  emitRemark<OptimizationRemark>(CI, "OpenMPParallelRegionDeletion",
607  Remark);
608 
609  CGUpdater.removeCallSite(*CI);
610  CI->eraseFromParent();
611  Changed = true;
612  ++NumOpenMPParallelRegionsDeleted;
613  return true;
614  };
615 
616  RFI.foreachUse(SCC, DeleteCallCB);
617 
618  return Changed;
619  }
620 
621  /// Try to eliminate runtime calls by reusing existing ones.
622  bool deduplicateRuntimeCalls() {
623  bool Changed = false;
624 
625  RuntimeFunction DeduplicableRuntimeCallIDs[] = {
626  OMPRTL_omp_get_num_threads,
627  OMPRTL_omp_in_parallel,
628  OMPRTL_omp_get_cancellation,
629  OMPRTL_omp_get_thread_limit,
630  OMPRTL_omp_get_supported_active_levels,
631  OMPRTL_omp_get_level,
632  OMPRTL_omp_get_ancestor_thread_num,
633  OMPRTL_omp_get_team_size,
634  OMPRTL_omp_get_active_level,
635  OMPRTL_omp_in_final,
636  OMPRTL_omp_get_proc_bind,
637  OMPRTL_omp_get_num_places,
638  OMPRTL_omp_get_num_procs,
639  OMPRTL_omp_get_place_num,
640  OMPRTL_omp_get_partition_num_places,
641  OMPRTL_omp_get_partition_place_nums};
642 
643  // Global-tid is handled separately.
645  collectGlobalThreadIdArguments(GTIdArgs);
646  LLVM_DEBUG(dbgs() << TAG << "Found " << GTIdArgs.size()
647  << " global thread ID arguments\n");
648 
649  for (Function *F : SCC) {
650  for (auto DeduplicableRuntimeCallID : DeduplicableRuntimeCallIDs)
651  deduplicateRuntimeCalls(*F,
652  OMPInfoCache.RFIs[DeduplicableRuntimeCallID]);
653 
654  // __kmpc_global_thread_num is special as we can replace it with an
655  // argument in enough cases to make it worth trying.
656  Value *GTIdArg = nullptr;
657  for (Argument &Arg : F->args())
658  if (GTIdArgs.count(&Arg)) {
659  GTIdArg = &Arg;
660  break;
661  }
662  Changed |= deduplicateRuntimeCalls(
663  *F, OMPInfoCache.RFIs[OMPRTL___kmpc_global_thread_num], GTIdArg);
664  }
665 
666  return Changed;
667  }
668 
669  static Value *combinedIdentStruct(Value *CurrentIdent, Value *NextIdent,
670  bool GlobalOnly, bool &SingleChoice) {
671  if (CurrentIdent == NextIdent)
672  return CurrentIdent;
673 
674  // TODO: Figure out how to actually combine multiple debug locations. For
675  // now we just keep an existing one if there is a single choice.
676  if (!GlobalOnly || isa<GlobalValue>(NextIdent)) {
677  SingleChoice = !CurrentIdent;
678  return NextIdent;
679  }
680  return nullptr;
681  }
682 
683  /// Return an `struct ident_t*` value that represents the ones used in the
684  /// calls of \p RFI inside of \p F. If \p GlobalOnly is true, we will not
685  /// return a local `struct ident_t*`. For now, if we cannot find a suitable
686  /// return value we create one from scratch. We also do not yet combine
687  /// information, e.g., the source locations, see combinedIdentStruct.
688  Value *
689  getCombinedIdentFromCallUsesIn(OMPInformationCache::RuntimeFunctionInfo &RFI,
690  Function &F, bool GlobalOnly) {
691  bool SingleChoice = true;
692  Value *Ident = nullptr;
693  auto CombineIdentStruct = [&](Use &U, Function &Caller) {
694  CallInst *CI = getCallIfRegularCall(U, &RFI);
695  if (!CI || &F != &Caller)
696  return false;
697  Ident = combinedIdentStruct(Ident, CI->getArgOperand(0),
698  /* GlobalOnly */ true, SingleChoice);
699  return false;
700  };
701  RFI.foreachUse(SCC, CombineIdentStruct);
702 
703  if (!Ident || !SingleChoice) {
704  // The IRBuilder uses the insertion block to get to the module, this is
705  // unfortunate but we work around it for now.
706  if (!OMPInfoCache.OMPBuilder.getInsertionPoint().getBlock())
707  OMPInfoCache.OMPBuilder.updateToLocation(OpenMPIRBuilder::InsertPointTy(
708  &F.getEntryBlock(), F.getEntryBlock().begin()));
709  // Create a fallback location if non was found.
710  // TODO: Use the debug locations of the calls instead.
711  Constant *Loc = OMPInfoCache.OMPBuilder.getOrCreateDefaultSrcLocStr();
712  Ident = OMPInfoCache.OMPBuilder.getOrCreateIdent(Loc);
713  }
714  return Ident;
715  }
716 
717  /// Try to eliminate calls of \p RFI in \p F by reusing an existing one or
718  /// \p ReplVal if given.
719  bool deduplicateRuntimeCalls(Function &F,
720  OMPInformationCache::RuntimeFunctionInfo &RFI,
721  Value *ReplVal = nullptr) {
722  auto *UV = RFI.getUseVector(F);
723  if (!UV || UV->size() + (ReplVal != nullptr) < 2)
724  return false;
725 
726  LLVM_DEBUG(
727  dbgs() << TAG << "Deduplicate " << UV->size() << " uses of " << RFI.Name
728  << (ReplVal ? " with an existing value\n" : "\n") << "\n");
729 
730  assert((!ReplVal || (isa<Argument>(ReplVal) &&
731  cast<Argument>(ReplVal)->getParent() == &F)) &&
732  "Unexpected replacement value!");
733 
734  // TODO: Use dominance to find a good position instead.
735  auto CanBeMoved = [this](CallBase &CB) {
736  unsigned NumArgs = CB.getNumArgOperands();
737  if (NumArgs == 0)
738  return true;
739  if (CB.getArgOperand(0)->getType() != OMPInfoCache.OMPBuilder.IdentPtr)
740  return false;
741  for (unsigned u = 1; u < NumArgs; ++u)
742  if (isa<Instruction>(CB.getArgOperand(u)))
743  return false;
744  return true;
745  };
746 
747  if (!ReplVal) {
748  for (Use *U : *UV)
749  if (CallInst *CI = getCallIfRegularCall(*U, &RFI)) {
750  if (!CanBeMoved(*CI))
751  continue;
752 
753  auto Remark = [&](OptimizationRemark OR) {
754  auto newLoc = &*F.getEntryBlock().getFirstInsertionPt();
755  return OR << "OpenMP runtime call "
756  << ore::NV("OpenMPOptRuntime", RFI.Name) << " moved to "
757  << ore::NV("OpenMPRuntimeMoves", newLoc->getDebugLoc());
758  };
759  emitRemark<OptimizationRemark>(CI, "OpenMPRuntimeCodeMotion", Remark);
760 
761  CI->moveBefore(&*F.getEntryBlock().getFirstInsertionPt());
762  ReplVal = CI;
763  break;
764  }
765  if (!ReplVal)
766  return false;
767  }
768 
769  // If we use a call as a replacement value we need to make sure the ident is
770  // valid at the new location. For now we just pick a global one, either
771  // existing and used by one of the calls, or created from scratch.
772  if (CallBase *CI = dyn_cast<CallBase>(ReplVal)) {
773  if (CI->getNumArgOperands() > 0 &&
774  CI->getArgOperand(0)->getType() == OMPInfoCache.OMPBuilder.IdentPtr) {
775  Value *Ident = getCombinedIdentFromCallUsesIn(RFI, F,
776  /* GlobalOnly */ true);
777  CI->setArgOperand(0, Ident);
778  }
779  }
780 
781  bool Changed = false;
782  auto ReplaceAndDeleteCB = [&](Use &U, Function &Caller) {
783  CallInst *CI = getCallIfRegularCall(U, &RFI);
784  if (!CI || CI == ReplVal || &F != &Caller)
785  return false;
786  assert(CI->getCaller() == &F && "Unexpected call!");
787 
788  auto Remark = [&](OptimizationRemark OR) {
789  return OR << "OpenMP runtime call "
790  << ore::NV("OpenMPOptRuntime", RFI.Name) << " deduplicated";
791  };
792  emitRemark<OptimizationRemark>(CI, "OpenMPRuntimeDeduplicated", Remark);
793 
794  CGUpdater.removeCallSite(*CI);
795  CI->replaceAllUsesWith(ReplVal);
796  CI->eraseFromParent();
797  ++NumOpenMPRuntimeCallsDeduplicated;
798  Changed = true;
799  return true;
800  };
801  RFI.foreachUse(SCC, ReplaceAndDeleteCB);
802 
803  return Changed;
804  }
805 
806  /// Collect arguments that represent the global thread id in \p GTIdArgs.
807  void collectGlobalThreadIdArguments(SmallSetVector<Value *, 16> &GTIdArgs) {
808  // TODO: Below we basically perform a fixpoint iteration with a pessimistic
809  // initialization. We could define an AbstractAttribute instead and
810  // run the Attributor here once it can be run as an SCC pass.
811 
812  // Helper to check the argument \p ArgNo at all call sites of \p F for
813  // a GTId.
814  auto CallArgOpIsGTId = [&](Function &F, unsigned ArgNo, CallInst &RefCI) {
815  if (!F.hasLocalLinkage())
816  return false;
817  for (Use &U : F.uses()) {
818  if (CallInst *CI = getCallIfRegularCall(U)) {
819  Value *ArgOp = CI->getArgOperand(ArgNo);
820  if (CI == &RefCI || GTIdArgs.count(ArgOp) ||
821  getCallIfRegularCall(
822  *ArgOp, &OMPInfoCache.RFIs[OMPRTL___kmpc_global_thread_num]))
823  continue;
824  }
825  return false;
826  }
827  return true;
828  };
829 
830  // Helper to identify uses of a GTId as GTId arguments.
831  auto AddUserArgs = [&](Value &GTId) {
832  for (Use &U : GTId.uses())
833  if (CallInst *CI = dyn_cast<CallInst>(U.getUser()))
834  if (CI->isArgOperand(&U))
835  if (Function *Callee = CI->getCalledFunction())
836  if (CallArgOpIsGTId(*Callee, U.getOperandNo(), *CI))
837  GTIdArgs.insert(Callee->getArg(U.getOperandNo()));
838  };
839 
840  // The argument users of __kmpc_global_thread_num calls are GTIds.
841  OMPInformationCache::RuntimeFunctionInfo &GlobThreadNumRFI =
842  OMPInfoCache.RFIs[OMPRTL___kmpc_global_thread_num];
843 
844  GlobThreadNumRFI.foreachUse(SCC, [&](Use &U, Function &F) {
845  if (CallInst *CI = getCallIfRegularCall(U, &GlobThreadNumRFI))
846  AddUserArgs(*CI);
847  return false;
848  });
849 
850  // Transitively search for more arguments by looking at the users of the
851  // ones we know already. During the search the GTIdArgs vector is extended
852  // so we cannot cache the size nor can we use a range based for.
853  for (unsigned u = 0; u < GTIdArgs.size(); ++u)
854  AddUserArgs(*GTIdArgs[u]);
855  }
856 
857  /// Kernel (=GPU) optimizations and utility functions
858  ///
859  ///{{
860 
861  /// Check if \p F is a kernel, hence entry point for target offloading.
862  bool isKernel(Function &F) { return OMPInfoCache.Kernels.count(&F); }
863 
864  /// Cache to remember the unique kernel for a function.
865  DenseMap<Function *, Optional<Kernel>> UniqueKernelMap;
866 
867  /// Find the unique kernel that will execute \p F, if any.
868  Kernel getUniqueKernelFor(Function &F);
869 
870  /// Find the unique kernel that will execute \p I, if any.
871  Kernel getUniqueKernelFor(Instruction &I) {
872  return getUniqueKernelFor(*I.getFunction());
873  }
874 
875  /// Rewrite the device (=GPU) code state machine create in non-SPMD mode in
876  /// the cases we can avoid taking the address of a function.
877  bool rewriteDeviceCodeStateMachine();
878 
879  ///
880  ///}}
881 
882  /// Emit a remark generically
883  ///
884  /// This template function can be used to generically emit a remark. The
885  /// RemarkKind should be one of the following:
886  /// - OptimizationRemark to indicate a successful optimization attempt
887  /// - OptimizationRemarkMissed to report a failed optimization attempt
888  /// - OptimizationRemarkAnalysis to provide additional information about an
889  /// optimization attempt
890  ///
891  /// The remark is built using a callback function provided by the caller that
892  /// takes a RemarkKind as input and returns a RemarkKind.
893  template <typename RemarkKind,
894  typename RemarkCallBack = function_ref<RemarkKind(RemarkKind &&)>>
895  void emitRemark(Instruction *Inst, StringRef RemarkName,
896  RemarkCallBack &&RemarkCB) const {
897  Function *F = Inst->getParent()->getParent();
898  auto &ORE = OREGetter(F);
899 
900  ORE.emit(
901  [&]() { return RemarkCB(RemarkKind(DEBUG_TYPE, RemarkName, Inst)); });
902  }
903 
904  /// Emit a remark on a function. Since only OptimizationRemark is supporting
905  /// this, it can't be made generic.
906  void
907  emitRemarkOnFunction(Function *F, StringRef RemarkName,
909  &&RemarkCB) const {
910  auto &ORE = OREGetter(F);
911 
912  ORE.emit([&]() {
913  return RemarkCB(OptimizationRemark(DEBUG_TYPE, RemarkName, F));
914  });
915  }
916 
917  /// The underlying module.
918  Module &M;
919 
920  /// The SCC we are operating on.
922 
923  /// Callback to update the call graph, the first argument is a removed call,
924  /// the second an optional replacement call.
925  CallGraphUpdater &CGUpdater;
926 
927  /// Callback to get an OptimizationRemarkEmitter from a Function *
928  OptimizationRemarkGetter OREGetter;
929 
930  /// OpenMP-specific information cache. Also Used for Attributor runs.
931  OMPInformationCache &OMPInfoCache;
932 
933  /// Attributor instance.
934  Attributor &A;
935 
936  /// Helper function to run Attributor on SCC.
937  bool runAttributor() {
938  if (SCC.empty())
939  return false;
940 
941  registerAAs();
942 
943  ChangeStatus Changed = A.run();
944 
945  LLVM_DEBUG(dbgs() << "[Attributor] Done with " << SCC.size()
946  << " functions, result: " << Changed << ".\n");
947 
948  return Changed == ChangeStatus::CHANGED;
949  }
950 
951  /// Populate the Attributor with abstract attribute opportunities in the
952  /// function.
953  void registerAAs() {
954  for (Function *F : SCC) {
955  if (F->isDeclaration())
956  continue;
957 
958  A.getOrCreateAAFor<AAICVTracker>(IRPosition::function(*F));
959  }
960  }
961 };
962 
Kernel OpenMPOpt::getUniqueKernelFor(Function &F) {
  // Functions outside the module slice cannot be reasoned about here.
  if (!OMPInfoCache.ModuleSlice.count(&F))
    return nullptr;

  // Use a scope to keep the lifetime of the CachedKernel short: the recursive
  // getUniqueKernelFor calls below may insert into UniqueKernelMap and thereby
  // invalidate the reference obtained here.
  {
    Optional<Kernel> &CachedKernel = UniqueKernelMap[&F];
    if (CachedKernel)
      return *CachedKernel;

    // TODO: We should use an AA to create an (optimistic and callback
    //       call-aware) call graph. For now we stick to simple patterns that
    //       are less powerful, basically the worst fixpoint.
    if (isKernel(F)) {
      CachedKernel = Kernel(&F);
      return *CachedKernel;
    }

    // Record the pessimistic default before looking at the uses. A non-kernel
    // with external linkage can be reached from anywhere; give up.
    CachedKernel = nullptr;
    if (!F.hasLocalLinkage())
      return nullptr;
  }

  // Map a single use of \p F to the unique kernel it (transitively) belongs
  // to, or nullptr if the use does not pin down a unique kernel.
  auto GetUniqueKernelForUse = [&](const Use &U) -> Kernel {
    if (auto *Cmp = dyn_cast<ICmpInst>(U.getUser())) {
      // Allow use in equality comparisons.
      if (Cmp->isEquality())
        return getUniqueKernelFor(*Cmp);
      return nullptr;
    }
    if (auto *CB = dyn_cast<CallBase>(U.getUser())) {
      // Allow direct calls.
      if (CB->isCallee(&U))
        return getUniqueKernelFor(*CB);
      // Allow the use in __kmpc_kernel_prepare_parallel calls.
      if (Function *Callee = CB->getCalledFunction())
        if (Callee->getName() == "__kmpc_kernel_prepare_parallel")
          return getUniqueKernelFor(*CB);
      return nullptr;
    }
    // Disallow every other use.
    return nullptr;
  };

  // TODO: In the future we want to track more than just a unique kernel.
  SmallPtrSet<Kernel, 2> PotentialKernels;
  foreachUse(F, [&](const Use &U) {
    PotentialKernels.insert(GetUniqueKernelForUse(U));
  });

  // Only a single candidate (which may be nullptr) means the answer is
  // unambiguous.
  Kernel K = nullptr;
  if (PotentialKernels.size() == 1)
    K = *PotentialKernels.begin();

  // Cache the result.
  UniqueKernelMap[&F] = K;

  return K;
}
1022 
1023 bool OpenMPOpt::rewriteDeviceCodeStateMachine() {
1024  OMPInformationCache::RuntimeFunctionInfo &KernelPrepareParallelRFI =
1025  OMPInfoCache.RFIs[OMPRTL___kmpc_kernel_prepare_parallel];
1026 
1027  bool Changed = false;
1028  if (!KernelPrepareParallelRFI)
1029  return Changed;
1030 
1031  for (Function *F : SCC) {
1032 
1033  // Check if the function is uses in a __kmpc_kernel_prepare_parallel call at
1034  // all.
1035  bool UnknownUse = false;
1036  bool KernelPrepareUse = false;
1037  unsigned NumDirectCalls = 0;
1038 
1039  SmallVector<Use *, 2> ToBeReplacedStateMachineUses;
1040  foreachUse(*F, [&](Use &U) {
1041  if (auto *CB = dyn_cast<CallBase>(U.getUser()))
1042  if (CB->isCallee(&U)) {
1043  ++NumDirectCalls;
1044  return;
1045  }
1046 
1047  if (isa<ICmpInst>(U.getUser())) {
1048  ToBeReplacedStateMachineUses.push_back(&U);
1049  return;
1050  }
1051  if (!KernelPrepareUse && OpenMPOpt::getCallIfRegularCall(
1052  *U.getUser(), &KernelPrepareParallelRFI)) {
1053  KernelPrepareUse = true;
1054  ToBeReplacedStateMachineUses.push_back(&U);
1055  return;
1056  }
1057  UnknownUse = true;
1058  });
1059 
1060  // Do not emit a remark if we haven't seen a __kmpc_kernel_prepare_parallel
1061  // use.
1062  if (!KernelPrepareUse)
1063  continue;
1064 
1065  {
1066  auto Remark = [&](OptimizationRemark OR) {
1067  return OR << "Found a parallel region that is called in a target "
1068  "region but not part of a combined target construct nor "
1069  "nesed inside a target construct without intermediate "
1070  "code. This can lead to excessive register usage for "
1071  "unrelated target regions in the same translation unit "
1072  "due to spurious call edges assumed by ptxas.";
1073  };
1074  emitRemarkOnFunction(F, "OpenMPParallelRegionInNonSPMD", Remark);
1075  }
1076 
1077  // If this ever hits, we should investigate.
1078  // TODO: Checking the number of uses is not a necessary restriction and
1079  // should be lifted.
1080  if (UnknownUse || NumDirectCalls != 1 ||
1081  ToBeReplacedStateMachineUses.size() != 2) {
1082  {
1083  auto Remark = [&](OptimizationRemark OR) {
1084  return OR << "Parallel region is used in "
1085  << (UnknownUse ? "unknown" : "unexpected")
1086  << " ways; will not attempt to rewrite the state machine.";
1087  };
1088  emitRemarkOnFunction(F, "OpenMPParallelRegionInNonSPMD", Remark);
1089  }
1090  continue;
1091  }
1092 
1093  // Even if we have __kmpc_kernel_prepare_parallel calls, we (for now) give
1094  // up if the function is not called from a unique kernel.
1095  Kernel K = getUniqueKernelFor(*F);
1096  if (!K) {
1097  {
1098  auto Remark = [&](OptimizationRemark OR) {
1099  return OR << "Parallel region is not known to be called from a "
1100  "unique single target region, maybe the surrounding "
1101  "function has external linkage?; will not attempt to "
1102  "rewrite the state machine use.";
1103  };
1104  emitRemarkOnFunction(F, "OpenMPParallelRegionInMultipleKernesl",
1105  Remark);
1106  }
1107  continue;
1108  }
1109 
1110  // We now know F is a parallel body function called only from the kernel K.
1111  // We also identified the state machine uses in which we replace the
1112  // function pointer by a new global symbol for identification purposes. This
1113  // ensures only direct calls to the function are left.
1114 
1115  {
1116  auto RemarkParalleRegion = [&](OptimizationRemark OR) {
1117  return OR << "Specialize parallel region that is only reached from a "
1118  "single target region to avoid spurious call edges and "
1119  "excessive register usage in other target regions. "
1120  "(parallel region ID: "
1121  << ore::NV("OpenMPParallelRegion", F->getName())
1122  << ", kernel ID: "
1123  << ore::NV("OpenMPTargetRegion", K->getName()) << ")";
1124  };
1125  emitRemarkOnFunction(F, "OpenMPParallelRegionInNonSPMD",
1126  RemarkParalleRegion);
1127  auto RemarkKernel = [&](OptimizationRemark OR) {
1128  return OR << "Target region containing the parallel region that is "
1129  "specialized. (parallel region ID: "
1130  << ore::NV("OpenMPParallelRegion", F->getName())
1131  << ", kernel ID: "
1132  << ore::NV("OpenMPTargetRegion", K->getName()) << ")";
1133  };
1134  emitRemarkOnFunction(K, "OpenMPParallelRegionInNonSPMD", RemarkKernel);
1135  }
1136 
1137  Module &M = *F->getParent();
1138  Type *Int8Ty = Type::getInt8Ty(M.getContext());
1139 
1140  auto *ID = new GlobalVariable(
1141  M, Int8Ty, /* isConstant */ true, GlobalValue::PrivateLinkage,
1142  UndefValue::get(Int8Ty), F->getName() + ".ID");
1143 
1144  for (Use *U : ToBeReplacedStateMachineUses)
1145  U->set(ConstantExpr::getBitCast(ID, U->get()->getType()));
1146 
1147  ++NumOpenMPParallelRegionsReplacedInGPUStateMachine;
1148 
1149  Changed = true;
1150  }
1151 
1152  return Changed;
1153 }
1154 
/// Abstract Attribute for tracking ICV (internal control variable) values.
struct AAICVTracker : public StateWrapper<BooleanState, AbstractAttribute> {
  // NOTE(review): a `using Base = StateWrapper<BooleanState,
  // AbstractAttribute>;` line appears to be missing from this extract; the
  // constructor below relies on it.
  AAICVTracker(const IRPosition &IRP, Attributor &A) : Base(IRP) {}

  /// Returns true if value is assumed to be tracked.
  bool isAssumedTracked() const { return getAssumed(); }

  /// Returns true if value is known to be tracked.
  // NOTE(review): this returns getAssumed() rather than getKnown() -- confirm
  // this is intentional.
  bool isKnownTracked() const { return getAssumed(); }

  /// Create an abstract attribute view for the position \p IRP.
  static AAICVTracker &createForPosition(const IRPosition &IRP, Attributor &A);

  /// Return the value with which \p I can be replaced for specific \p ICV.
  virtual Value *getReplacementValue(InternalControlVar ICV,
                                     const Instruction *I, Attributor &A) = 0;

  /// See AbstractAttribute::getName()
  const std::string getName() const override { return "AAICVTracker"; }

  /// See AbstractAttribute::getIdAddr()
  const char *getIdAddr() const override { return &ID; }

  /// This function should return true if the type of the \p AA is AAICVTracker
  static bool classof(const AbstractAttribute *AA) {
    return (AA->getIdAddr() == &ID);
  }

  /// Unique ID used to identify this attribute class (by address).
  static const char ID;
};
1186 
/// Function-scope implementation of the ICV tracker: records the values ICV
/// setter calls establish and uses them to fold/deduplicate getter calls.
struct AAICVTrackerFunction : public AAICVTracker {
  AAICVTrackerFunction(const IRPosition &IRP, Attributor &A)
      : AAICVTracker(IRP, A) {}

  // FIXME: come up with better string.
  const std::string getAsStr() const override { return "ICVTracker"; }

  // FIXME: come up with some stats.
  void trackStatistics() const override {}

  /// TODO: decide whether to deduplicate here, or use current
  /// deduplicateRuntimeCalls function.
  ChangeStatus manifest(Attributor &A) override {
    // NOTE(review): the initialization of `Changed` (presumably
    // `ChangeStatus Changed = ChangeStatus::UNCHANGED;`) is missing from this
    // extract.

    for (InternalControlVar &ICV : TrackableICVs)
      if (deduplicateICVGetters(ICV, A))
        Changed = ChangeStatus::CHANGED;

    return Changed;
  }

  /// Replace getter calls of \p ICV whose value is known at that program
  /// point with the tracked value, and erase them. Returns true if any call
  /// was removed.
  bool deduplicateICVGetters(InternalControlVar &ICV, Attributor &A) {
    auto &OMPInfoCache = static_cast<OMPInformationCache &>(A.getInfoCache());
    auto &ICVInfo = OMPInfoCache.ICVs[ICV];
    auto &GetterRFI = OMPInfoCache.RFIs[ICVInfo.Getter];

    bool Changed = false;

    // For each regular getter call, query the replacement value; if known,
    // rewrite all users and delete the call.
    auto ReplaceAndDeleteCB = [&](Use &U, Function &Caller) {
      CallInst *CI = OpenMPOpt::getCallIfRegularCall(U, &GetterRFI);
      Instruction *UserI = cast<Instruction>(U.getUser());
      Value *ReplVal = getReplacementValue(ICV, UserI, A);

      if (!ReplVal || !CI)
        return false;

      A.removeCallSite(CI);
      CI->replaceAllUsesWith(ReplVal);
      CI->eraseFromParent();
      Changed = true;
      return true;
    };

    GetterRFI.foreachUse(ReplaceAndDeleteCB, getAnchorScope());
    return Changed;
  }

  // Map of ICV to their values at specific program points.
  // NOTE(review): the first line of this declaration (presumably an
  // EnumeratedArray of value sets indexed by InternalControlVar) is missing
  // from this extract.
  InternalControlVar::ICV___last>
      ICVValuesMap;

  // Currently only nthreads is being tracked.
  // This array will only grow with time.
  InternalControlVar TrackableICVs[1] = {ICV_nthreads};

  ChangeStatus updateImpl(Attributor &A) override {
    // NOTE(review): the initialization of `HasChanged` (presumably
    // `ChangeStatus HasChanged = ChangeStatus::UNCHANGED;`) is missing from
    // this extract.

    Function *F = getAnchorScope();

    auto &OMPInfoCache = static_cast<OMPInformationCache &>(A.getInfoCache());

    for (InternalControlVar ICV : TrackableICVs) {
      auto &SetterRFI = OMPInfoCache.RFIs[OMPInfoCache.ICVs[ICV].Setter];

      // Remember the value each regular setter call establishes at its
      // program point.
      auto TrackValues = [&](Use &U, Function &) {
        CallInst *CI = OpenMPOpt::getCallIfRegularCall(U);
        if (!CI)
          return false;

        // FIXME: handle setters with more than 1 argument.
        /// Track new value.
        if (ICVValuesMap[ICV].insert(ICVValue(CI, CI->getArgOperand(0))))
          HasChanged = ChangeStatus::CHANGED;

        return false;
      };

      SetterRFI.foreachUse(TrackValues, F);
    }

    return HasChanged;
  }

  /// Return the value with which \p I can be replaced for specific \p ICV.
  Value *getReplacementValue(InternalControlVar ICV, const Instruction *I,
                             Attributor &A) override {
    const BasicBlock *CurrBB = I->getParent();

    auto &ValuesSet = ICVValuesMap[ICV];
    auto &OMPInfoCache = static_cast<OMPInformationCache &>(A.getInfoCache());
    auto &GetterRFI = OMPInfoCache.RFIs[OMPInfoCache.ICVs[ICV].Getter];

    for (const auto &ICVVal : ValuesSet) {
      if (CurrBB == ICVVal.Inst->getParent()) {
        if (!ICVVal.Inst->comesBefore(I))
          continue;

        // Both instructions are in the same BB and at \p I we know the ICV
        // value.
        while (I != ICVVal.Inst) {
          // We don't yet know if a call might update an ICV.
          // TODO: check callsite AA for value.
          if (const auto *CB = dyn_cast<CallBase>(I))
            if (CB->getCalledFunction() != GetterRFI.Declaration)
              return nullptr;

          I = I->getPrevNode();
        }

        // No call in between, return the value.
        return ICVVal.TrackedValue;
      }
    }

    // No value was tracked.
    return nullptr;
  }
};
1308 } // namespace
1309 
1310 const char AAICVTracker::ID = 0;
1311 
/// Factory: create the concrete ICV tracker for the given position.
/// Only function positions are supported; anything else is a usage error.
AAICVTracker &AAICVTracker::createForPosition(const IRPosition &IRP,
                                              Attributor &A) {
  AAICVTracker *AA = nullptr;
  switch (IRP.getPositionKind()) {
  // NOTE(review): several case labels (invalid/argument/return/call-site
  // kinds, and the IRP_FUNCTION label before the allocation) are missing from
  // this extract -- confirm against the original file.
  case IRPosition::IRP_FLOAT:
    llvm_unreachable("ICVTracker can only be created for function position!");
    AA = new (A.Allocator) AAICVTrackerFunction(IRP, A);
    break;
  }

  return *AA;
}
1331 
// NOTE(review): the opening of this definition (presumably
// `PreservedAnalyses OpenMPOptPass::run(LazyCallGraph::SCC &C,
// CGSCCAnalysisManager &AM,`) is missing from this extract, together with
// several declarations flagged below.
                                      LazyCallGraph &CG, CGSCCUpdateResult &UR) {
  // Nothing to do for modules without OpenMP runtime calls.
  if (!containsOpenMP(*C.begin()->getFunction().getParent(), OMPInModule))
    return PreservedAnalyses::all();

  // NOTE(review): the guard condition for this early return (presumably
  // `if (DisableOpenMPOptimizations)`) is missing from this extract.
    return PreservedAnalyses::all();

  // Collect the functions of the SCC.
  // NOTE(review): the declaration of `SCC` is missing from this extract.
  for (LazyCallGraph::Node &N : C)
    SCC.push_back(&N.getFunction());

  if (SCC.empty())
    return PreservedAnalyses::all();

  // NOTE(review): the left-hand side of this statement (presumably a
  // `FunctionAnalysisManager &FAM =` declaration) is missing from this
  // extract.
      AM.getResult<FunctionAnalysisManagerCGSCCProxy>(C, CG).getManager();

  AnalysisGetter AG(FAM);

  // Hand out remark emitters via the function analysis manager.
  auto OREGetter = [&FAM](Function *F) -> OptimizationRemarkEmitter & {
    // NOTE(review): the lambda body (presumably returning
    // FAM.getResult<OptimizationRemarkEmitterAnalysis>(*F)) is missing from
    // this extract.
  };

  CallGraphUpdater CGUpdater;
  CGUpdater.initialize(CG, C, AM, UR);

  SetVector<Function *> Functions(SCC.begin(), SCC.end());
  // NOTE(review): the declaration of `Allocator` (a BumpPtrAllocator) is
  // missing from this extract.
  OMPInformationCache InfoCache(*(Functions.back()->getParent()), AG, Allocator,
                                /*CGSCC*/ Functions, OMPInModule.getKernels());

  Attributor A(Functions, InfoCache, CGUpdater);

  // TODO: Compute the module slice we are allowed to look at.
  OpenMPOpt OMPOpt(SCC, CGUpdater, OREGetter, InfoCache, A);
  bool Changed = OMPOpt.run();
  if (Changed)
    return PreservedAnalyses::none();

  return PreservedAnalyses::all();
}
1375 
1376 namespace {
1377 
/// Legacy pass manager wrapper around the OpenMP optimizations.
struct OpenMPOptLegacyPass : public CallGraphSCCPass {
  CallGraphUpdater CGUpdater;
  OpenMPInModule OMPInModule;
  static char ID;

  OpenMPOptLegacyPass() : CallGraphSCCPass(ID) {
    // NOTE(review): the pass-registry initialization call (presumably
    // initializeOpenMPOptLegacyPassPass(...)) is missing from this extract.
  }

  void getAnalysisUsage(AnalysisUsage &AU) const override {
    // NOTE(review): the delegation to CallGraphSCCPass::getAnalysisUsage(AU)
    // is missing from this extract.
  }

  bool doInitialization(CallGraph &CG) override {
    // Disable the pass if there is no OpenMP (runtime call) in the module.
    containsOpenMP(CG.getModule(), OMPInModule);
    return false;
  }

  bool runOnSCC(CallGraphSCC &CGSCC) override {
    if (!containsOpenMP(CGSCC.getCallGraph().getModule(), OMPInModule))
      return false;
    if (DisableOpenMPOptimizations || skipSCC(CGSCC))
      return false;

    // Collect the defined functions of the SCC.
    // NOTE(review): the declaration of `SCC` is missing from this extract.
    for (CallGraphNode *CGN : CGSCC)
      if (Function *Fn = CGN->getFunction())
        if (!Fn->isDeclaration())
          SCC.push_back(Fn);

    if (SCC.empty())
      return false;

    CallGraph &CG = getAnalysis<CallGraphWrapperPass>().getCallGraph();
    CGUpdater.initialize(CG, CGSCC);

    // Maintain a map of functions to avoid rebuilding the ORE.
    // NOTE(review): the declaration of `OREMap` is missing from this extract.
    auto OREGetter = [&OREMap](Function *F) -> OptimizationRemarkEmitter & {
      std::unique_ptr<OptimizationRemarkEmitter> &ORE = OREMap[F];
      if (!ORE)
        ORE = std::make_unique<OptimizationRemarkEmitter>(F);
      return *ORE;
    };

    AnalysisGetter AG;
    SetVector<Function *> Functions(SCC.begin(), SCC.end());
    // NOTE(review): the declaration of `Allocator` (a BumpPtrAllocator) is
    // missing from this extract.
    OMPInformationCache InfoCache(
        *(Functions.back()->getParent()), AG, Allocator,
        /*CGSCC*/ Functions, OMPInModule.getKernels());

    Attributor A(Functions, InfoCache, CGUpdater);

    // TODO: Compute the module slice we are allowed to look at.
    OpenMPOpt OMPOpt(SCC, CGUpdater, OREGetter, InfoCache, A);
    return OMPOpt.run();
  }

  bool doFinalization(CallGraph &CG) override { return CGUpdater.finalize(); }
};
1440 
1441 } // end anonymous namespace
1442 
// NOTE(review): the signature of this definition (presumably
// `void OpenMPInModule::identifyKernels(Module &M) {`) is missing from this
// extract.

  // Kernel entry points are recorded as "kernel"-tagged operands of the
  // "nvvm.annotations" named metadata.
  NamedMDNode *MD = M.getOrInsertNamedMetadata("nvvm.annotations");
  if (!MD)
    return;

  for (auto *Op : MD->operands()) {
    if (Op->getNumOperands() < 2)
      continue;
    // The second operand names the annotation kind; skip non-kernel entries.
    MDString *KindID = dyn_cast<MDString>(Op->getOperand(1));
    if (!KindID || KindID->getString() != "kernel")
      continue;

    // The first operand is the annotated function (may be gone already).
    Function *KernelFn =
        mdconst::dyn_extract_or_null<Function>(Op->getOperand(0));
    if (!KernelFn)
      continue;

    ++NumOpenMPTargetRegionKernels;

    Kernels.insert(KernelFn);
  }
}
1466 
// NOTE(review): the opening of this definition (presumably
// `bool llvm::omp::containsOpenMP(Module &M, OpenMPInModule &OMPInModule) {`)
// is missing from this extract.
  // Return the cached answer if the module was inspected before.
  if (OMPInModule.isKnown())
    return OMPInModule;

  // MSVC doesn't like long if-else chains for some reason and instead just
  // issues an error. Work around it.
  do {
#define OMP_RTL(_Enum, _Name, ...)                                             \
  if (M.getFunction(_Name)) {                                                  \
    OMPInModule = true;                                                        \
    break;                                                                     \
  }
#include "llvm/Frontend/OpenMP/OMPKinds.def"
  } while (false);

  // Identify kernels once. TODO: We should split the OMPInformationCache into
  // a module and an SCC part. The kernel information, among other things,
  // could go into the module part.
  if (OMPInModule.isKnown() && OMPInModule) {
    OMPInModule.identifyKernels(M);
    return true;
  }

  return OMPInModule = false;
}
1492 
char OpenMPOptLegacyPass::ID = 0;

// Legacy pass registration.
// NOTE(review): an INITIALIZE_PASS_DEPENDENCY(CallGraphWrapperPass) line
// appears to be missing between BEGIN and END in this extract.
INITIALIZE_PASS_BEGIN(OpenMPOptLegacyPass, "openmpopt",
                      "OpenMP specific optimizations", false, false)
INITIALIZE_PASS_END(OpenMPOptLegacyPass, "openmpopt",
                    "OpenMP specific optimizations", false, false)

// Factory for the legacy pass manager.
Pass *llvm::createOpenMPOptLegacyPass() { return new OpenMPOptLegacyPass(); }
Pass interface - Implemented by all &#39;passes&#39;.
Definition: Pass.h:77
const Function & getFunction() const
Definition: Function.h:135
uint64_t CallInst * C
bool hasOperandBundles() const
Return true if this User has any operand bundles.
Definition: InstrTypes.h:1786
SymbolTableList< Instruction >::iterator eraseFromParent()
This method unlinks &#39;this&#39; from the containing basic block and deletes it.
Definition: Instruction.cpp:80
void initializeOpenMPOptLegacyPassPass(PassRegistry &)
iterator_range< use_iterator > uses()
Definition: Value.h:373
LLVM_NODISCARD std::enable_if_t< !is_simple_type< Y >::value, typename cast_retty< X, const Y >::ret_type > dyn_cast(const Y &Val)
Definition: Casting.h:334
bool hasLocalLinkage() const
Definition: GlobalValue.h:445
RuntimeFunction
IDs for all omp runtime library (RTL) functions.
Definition: OMPConstants.h:53
static PassRegistry * getPassRegistry()
getPassRegistry - Access the global registry object, which is automatically initialized at applicatio...
An attribute for a function argument.
Definition: Attributor.h:167
This class represents an incoming formal argument to a Function.
Definition: Argument.h:29
An attribute for the function return value.
Definition: Attributor.h:163
INITIALIZE_PASS_BEGIN(OpenMPOptLegacyPass, "openmpopt", "OpenMP specific optimizations", false, false) INITIALIZE_PASS_END(OpenMPOptLegacyPass
DiagnosticInfoOptimizationBase::Argument NV
PassT::Result & getResult(IRUnitT &IR, ExtraArgTs... ExtraArgs)
Get the result of an analysis pass for a given IR unit.
Definition: PassManager.h:769
SmallPtrSetImpl< Kernel > & getKernels()
Return the known kernels (=GPU entry points) in the module.
Definition: OpenMPOpt.h:37
This class represents lattice values for constants.
Definition: AllocatorList.h:23
size_type size() const
Determine the number of elements in the SetVector.
Definition: SetVector.h:77
A Module instance is used to store all the information related to an LLVM module. ...
Definition: Module.h:67
LLVM_NODISCARD bool empty() const
Definition: SmallVector.h:69
amdgpu Simplify well known AMD library false FunctionCallee Value const Twine & Name
Function * getCaller()
Helper to get the caller (the parent function).
ChangeStatus
Simple enum classes that forces properties to be spelled out explicitly.
Definition: Attributor.h:133
static cl::opt< bool > DisableOpenMPOptimizations("openmp-opt-disable", cl::ZeroOrMore, cl::desc("Disable OpenMP specific optimizations."), cl::Hidden, cl::init(false))
This class represents a function call, abstracting a target machine&#39;s calling convention.
An efficient, type-erasing, non-owning reference to a callable.
Definition: STLExtras.h:176
Like Internal, but omit from symbol table.
Definition: GlobalValue.h:56
NamedMDNode * getOrInsertNamedMetadata(StringRef Name)
Return the named MDNode in the module with the specified name.
Definition: Module.cpp:259
Function * Kernel
Summary of a kernel (=entry point for target offloading).
Definition: OpenMPOpt.h:21
STATISTIC(NumFunctions, "Total number of functions")
Base class for all callable instructions (InvokeInst and CallInst) Holds everything related to callin...
Definition: InstrTypes.h:1100
F(f)
static void foreachUse(Function &F, CBTy CB, bool LookThroughConstantExprUses=true)
Apply CB to all uses of F.
Definition: OpenMPOpt.cpp:67
bool containsOpenMP(Module &M, OpenMPInModule &OMPInModule)
Helper to determine if M contains OpenMP (runtime calls).
Definition: OpenMPOpt.cpp:1467
Helper to tie a abstract state implementation to an abstract attribute.
Definition: Attributor.h:1909
OpenMP specific optimizations
Definition: OpenMPOpt.cpp:1498
Wrapper for FunctoinAnalysisManager.
Definition: Attributor.h:607
static ICVValue getTombstoneKey()
Definition: OpenMPOpt.cpp:104
iterator end()
Get an iterator to the end of the SetVector.
Definition: SetVector.h:92
A proxy from a FunctionAnalysisManager to an SCC.
A node in the call graph for a module.
Definition: CallGraph.h:174
void getAnalysisUsage(AnalysisUsage &Info) const override
getAnalysisUsage - For this class, we declare that we require and preserve the call graph...
Support structure for SCC passes to communicate updates the call graph back to the CGSCC pass manager...
Module & getModule() const
Returns the module the call graph corresponds to.
Definition: CallGraph.h:109
Wrapper to unify "old style" CallGraph and "new style" LazyCallGraph.
const AAType & getOrCreateAAFor(const IRPosition &IRP, const AbstractAttribute *QueryingAA=nullptr, bool TrackDependence=false, DepClassTy DepClass=DepClassTy::OPTIONAL, bool ForceUpdate=false)
The version of getAAFor that allows to omit a querying abstract attribute.
Definition: Attributor.h:880
A templated base class for SmallPtrSet which provides the typesafe interface that is common across al...
Definition: SmallPtrSet.h:343
iterator begin()
Instruction iterator methods.
Definition: BasicBlock.h:289
Value * getArgOperand(unsigned i) const
Definition: InstrTypes.h:1259
std::pair< iterator, bool > insert(const std::pair< KeyT, ValueT > &KV)
Definition: DenseMap.h:207
A tuple of MDNodes.
Definition: Metadata.h:1330
void removeCallSite(CallBase &CS)
Remove the call site CS from the call graph.
IRBuilder<>::InsertPoint InsertPointTy
Type used throughout for insertion points.
Definition: OMPIRBuilder.h:45
LLVMContext & getContext() const
Get the global data context.
Definition: Module.h:253
A Use represents the edge between a Value definition and its users.
Definition: Use.h:44
iterator begin() const
This class consists of common code factored out of the SmallVector class to reduce code duplication b...
Definition: APFloat.h:43
const CallGraph & getCallGraph()
static StringRef getName(Value *V)
void identifyKernels(Module &M)
Identify kernels in the module and populate the Kernels set.
Definition: OpenMPOpt.cpp:1443
Value * TrackedValue
Definition: OpenMPOpt.cpp:88
This file provides interfaces used to build and manipulate a call graph, which is a very useful tool ...
Type * getType() const
All values are typed, get the type of this value.
Definition: Value.h:244
bool insert(const value_type &X)
Insert a new element into the SetVector.
Definition: SetVector.h:141
static bool isEqual(const Function &Caller, const Function &Callee)
iterator begin()
Get an iterator to the beginning of the SetVector.
Definition: SetVector.h:82
Instruction * Inst
Definition: OpenMPOpt.cpp:87
A lazily constructed view of the call graph of a module.
#define DEBUG_TYPE
Definition: OpenMPOpt.cpp:33
This file provides interfaces used to manipulate a call graph, regardless if it is a "old style" Call...
void replaceAllUsesWith(Value *V)
Change all uses of this to point to a new Value.
Definition: Value.cpp:486
unsigned getOperandNo() const
Return the operand # of this use in its User.
Definition: Use.cpp:40
iterator_range< op_iterator > operands()
Definition: Metadata.h:1422
A position that is not associated with a spot suitable for attributes.
Definition: Attributor.h:161
void addAttributes(omp::RuntimeFunction FnID, Function &Fn)
Add attributes known for FnID to Fn.
size_type count(const key_type &key) const
Count the number of elements of a given key in the SetVector.
Definition: SetVector.h:210
ChangeStatus run()
Run the analyses until a fixpoint is reached or enforced (timeout).
static PreservedAnalyses none()
Convenience factory function for the empty preserved set.
Definition: PassManager.h:157
Helper to remember if the module contains OpenMP (runtime calls), to be used foremost with containsOp...
Definition: OpenMPOpt.h:25
StringRef getString() const
Definition: Metadata.cpp:463
static Constant * getBitCast(Constant *C, Type *Ty, bool OnlyIfReduced=false)
Definition: Constants.cpp:2025
const BasicBlock & getEntryBlock() const
Definition: Function.h:689
initializer< Ty > init(const Ty &Val)
Definition: CommandLine.h:434
Type * getReturnType() const
Returns the type of the ret val.
Definition: Function.h:170
A set of analyses that are preserved following a run of a transformation pass.
Definition: PassManager.h:154
const_iterator getFirstInsertionPt() const
Returns an iterator to the first instruction in this block that is suitable for inserting a non-PHI i...
Definition: BasicBlock.cpp:241
The ModulePass which wraps up a CallGraph and the logic to build it.
Definition: CallGraph.h:344
LLVM Basic Block Representation.
Definition: BasicBlock.h:58
ICVValue(Instruction *I, Value *Val)
Definition: OpenMPOpt.cpp:90
The instances of the Type class are immutable: once they are created, they are never changed...
Definition: Type.h:46
Allocate memory in an ever growing pool, as if by bump-pointer.
Definition: Allocator.h:67
This is an important base class in LLVM.
Definition: Constant.h:41
std::pair< iterator, bool > insert(PtrType Ptr)
Inserts Ptr if and only if there is no element in the container equal to Ptr.
Definition: SmallPtrSet.h:364
Diagnostic information for applied optimization remarks.
Represent the analysis usage information of a pass.
The fixpoint analysis framework that orchestrates the attribute deduction.
Definition: Attributor.h:809
amdgpu Simplify well known AMD library false FunctionCallee Value * Arg
size_t arg_size() const
Definition: Function.h:753
A node in the call graph.
const Function * getFunction() const
Return the function this instruction belongs to.
Definition: Instruction.cpp:72
virtual const char * getIdAddr() const =0
This function should return the address of the ID of the AbstractAttribute.
An attribute for a call site return value.
Definition: Attributor.h:164
This file defines constans and helpers used when dealing with OpenMP.
Used in the streaming interface as the general argument type.
static UndefValue * get(Type *T)
Static factory methods - Return an &#39;undef&#39; object of the specified type.
Definition: Constants.cpp:1665
const Value * stripPointerCasts() const
Strip off pointer casts, all-zero GEPs and address space casts.
Definition: Value.cpp:593
static PreservedAnalyses all()
Construct a special preserved set that preserves all passes.
Definition: PassManager.h:160
Base struct for all "concrete attribute" deductions.
Definition: Attributor.h:2014
INITIALIZE_PASS_END(RegBankSelect, DEBUG_TYPE, "Assign register bank of generic virtual registers", false, false) RegBankSelect
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
An attribute for a function (scope).
Definition: Attributor.h:165
size_type size() const
Definition: SmallPtrSet.h:92
Pass * createOpenMPOptLegacyPass()
createOpenMPOptLegacyPass - OpenMP specific optimizations.
Definition: OpenMPOpt.cpp:1501
User * getUser() const
Returns the User that contains this Use.
Definition: Use.h:73
static cl::opt< bool > PrintICVValues("openmp-print-icv-values", cl::init(false), cl::Hidden)
Basic Register Allocator
A SetVector that performs no allocations if smaller than a certain size.
Definition: SetVector.h:302
void removeCallSite(CallInst *CI)
Helper function to remove callsite.
Definition: Attributor.h:1054
SmallPtrSet - This class implements a set which is optimized for holding SmallSize or less elements...
Definition: SmallPtrSet.h:439
An attribute for a call site argument.
Definition: Attributor.h:168
bool isCallee(Value::const_user_iterator UI) const
Determine whether the passed iterator points to the callee operand&#39;s Use.
Definition: InstrTypes.h:1322
This is the shared class of boolean and integer constants.
Definition: Constants.h:77
static cl::opt< bool > PrintOpenMPKernels("openmp-print-gpu-kernels", cl::init(false), cl::Hidden)
An interface to create LLVM-IR for OpenMP directives.
Definition: OMPIRBuilder.h:27
BumpPtrAllocator & Allocator
The allocator used to allocate memory, e.g. for AbstractAttributes.
Definition: Attributor.h:1317
This is a &#39;vector&#39; (really, a variable-sized array), optimized for the case when the array is small...
Definition: SmallVector.h:883
Data structure to hold cached (LLVM-IR) information.
Definition: Attributor.h:634
LLVM_NODISCARD T pop_back_val()
Definition: SmallVector.h:420
An attribute for a call site (function scope).
Definition: Attributor.h:166
LLVM_READNONE bool isKernel(CallingConv::ID CC)
Helper struct to store tracked ICV values at specif instructions.
Definition: OpenMPOpt.cpp:86
raw_ostream & dbgs()
dbgs() - This returns a reference to a raw_ostream for debugging messages.
Definition: Debug.cpp:132
amdgpu Simplify well known AMD library false FunctionCallee Callee
constexpr char Kernels[]
Key for HSA::Metadata::mKernels.
InternalControlVar
IDs for all Internal Control Variables (ICVs).
Definition: OMPConstants.h:34
The basic data container for the call graph of a Module of IR.
Definition: CallGraph.h:73
iterator begin() const
Definition: SmallPtrSet.h:392
InformationCache & getInfoCache()
Return the internal information cache.
Definition: Attributor.h:1009
StringRef getName() const
Return a constant reference to the value&#39;s name.
Definition: Value.cpp:270
Function * getCalledFunction() const
Returns the function called, or null if this is an indirect function invocation.
Definition: InstrTypes.h:1314
const Function * getParent() const
Return the enclosing method, or null if none.
Definition: BasicBlock.h:107
#define I(x, y, z)
Definition: MD5.cpp:59
#define N
openmpopt
Definition: OpenMPOpt.cpp:1498
An invalid position.
Definition: Attributor.h:160
static bool isEqual(const ICVValue &LHS, const ICVValue &RHS)
Definition: OpenMPOpt.cpp:114
bool isDeclaration() const
Return true if the primary definition of this global value is outside of the current translation unit...
Definition: Globals.cpp:227
assert(ImpDefSCC.getReg()==AMDGPU::SCC &&ImpDefSCC.isDef())
Module * getParent()
Get the module that this global value is contained inside of...
Definition: GlobalValue.h:572
LLVM Value Representation.
Definition: Value.h:74
An SCC of the call graph.
CallGraphSCC - This is a single SCC that a CallGraphSCCPass is run on.
static ICVValue getEmptyKey()
Definition: OpenMPOpt.cpp:100
A vector that has set insertion semantics.
Definition: SetVector.h:40
static constexpr auto TAG
Definition: OpenMPOpt.cpp:60
static const Function * getParent(const Value *V)
StringRef - Represent a constant reference to a string, i.e.
Definition: StringRef.h:57
inst_range instructions(Function *F)
Definition: InstIterator.h:133
A single uniqued string.
Definition: Metadata.h:602
A container for analyses that lazily runs them and caches their results.
static unsigned getHashValue(const ICVValue &ICVVal)
Definition: OpenMPOpt.cpp:108
#define LLVM_DEBUG(X)
Definition: Debug.h:122
void initialize(CallGraph &CG, CallGraphSCC &SCC)
Initializers for usage outside of a CGSCC pass, inside a CGSCC pass in the old and new pass manager (...
static unsigned combineHashValue(unsigned a, unsigned b)
Simplistic combination of 32-bit hash values into 32-bit hash values.
Definition: DenseMapInfo.h:29
static IntegerType * getInt8Ty(LLVMContext &C)
Definition: Type.cpp:184
The optimization diagnostic interface.
PreservedAnalyses run(LazyCallGraph::SCC &C, CGSCCAnalysisManager &AM, LazyCallGraph &CG, CGSCCUpdateResult &UR)
Definition: OpenMPOpt.cpp:1332
static const IRPosition function(const Function &F)
Create a position describing the function scope of F.
Definition: Attributor.h:186
iterator_range< pointer_iterator< WrappedIteratorT > > make_pointer_range(RangeT &&Range)
Definition: iterator.h:330
Kind getPositionKind() const
Return the associated position kind.
Definition: Attributor.h:364
iterator_range< arg_iterator > args()
Definition: Function.h:744
const BasicBlock * getParent() const
Definition: Instruction.h:94
INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)