//===-- IPO/OpenMPOpt.cpp - Collection of OpenMP specific optimizations ---===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// OpenMP specific optimizations:
//
// - Deduplication of runtime calls, e.g., omp_get_thread_num.
//
//===----------------------------------------------------------------------===//
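
// As a small illustration of the deduplication mentioned above (hypothetical
// input, not part of this file): in a function containing
//
//   %a = call i32 @omp_get_thread_num()
//   ...
//   %b = call i32 @omp_get_thread_num()
//
// the second call can be removed and its uses replaced by %a, since the
// result is invariant within the function for a given thread.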

#include "llvm/Transforms/IPO/OpenMPOpt.h"

#include "llvm/ADT/EnumeratedArray.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/Analysis/CallGraph.h"
#include "llvm/Analysis/CallGraphSCCPass.h"
#include "llvm/Analysis/OptimizationRemarkEmitter.h"
#include "llvm/Analysis/ValueTracking.h"
#include "llvm/Frontend/OpenMP/OMPConstants.h"
#include "llvm/Frontend/OpenMP/OMPIRBuilder.h"
#include "llvm/InitializePasses.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Transforms/IPO.h"
#include "llvm/Transforms/IPO/Attributor.h"
#include "llvm/Transforms/Utils/BasicBlockUtils.h"
#include "llvm/Transforms/Utils/CallGraphUpdater.h"
#include "llvm/Transforms/Utils/CodeExtractor.h"

using namespace llvm;
using namespace omp;

#define DEBUG_TYPE "openmp-opt"

static cl::opt<bool> DisableOpenMPOptimizations(
    "openmp-opt-disable", cl::ZeroOrMore,
    cl::desc("Disable OpenMP specific optimizations."), cl::Hidden,
    cl::init(false));

static cl::opt<bool> EnableParallelRegionMerging(
    "openmp-opt-enable-merging", cl::ZeroOrMore,
    cl::desc("Enable the OpenMP region merging optimization."), cl::Hidden,
    cl::init(false));

static cl::opt<bool> PrintICVValues("openmp-print-icv-values", cl::init(false),
                                    cl::Hidden);
static cl::opt<bool> PrintOpenMPKernels("openmp-print-gpu-kernels",
                                        cl::init(false), cl::Hidden);

static cl::opt<bool> HideMemoryTransferLatency(
    "openmp-hide-memory-transfer-latency",
    cl::desc("[WIP] Tries to hide the latency of host to device memory"
             " transfers"),
    cl::Hidden, cl::init(false));

STATISTIC(NumOpenMPRuntimeCallsDeduplicated,
          "Number of OpenMP runtime calls deduplicated");
STATISTIC(NumOpenMPParallelRegionsDeleted,
          "Number of OpenMP parallel regions deleted");
STATISTIC(NumOpenMPRuntimeFunctionsIdentified,
          "Number of OpenMP runtime functions identified");
STATISTIC(NumOpenMPRuntimeFunctionUsesIdentified,
          "Number of OpenMP runtime function uses identified");
STATISTIC(NumOpenMPTargetRegionKernels,
          "Number of OpenMP target region entry points (=kernels) identified");
STATISTIC(
    NumOpenMPParallelRegionsReplacedInGPUStateMachine,
    "Number of OpenMP parallel regions replaced with ID in GPU state machines");
STATISTIC(NumOpenMPParallelRegionsMerged,
          "Number of OpenMP parallel regions merged");

#if !defined(NDEBUG)
static constexpr auto TAG = "[" DEBUG_TYPE "]";
#endif

namespace {

struct AAICVTracker;

/// OpenMP specific information. For now, stores RFIs and ICVs also needed for
/// Attributor runs.
struct OMPInformationCache : public InformationCache {
  OMPInformationCache(Module &M, AnalysisGetter &AG,
                      BumpPtrAllocator &Allocator, SetVector<Function *> &CGSCC,
                      SmallPtrSetImpl<Kernel> &Kernels)
      : InformationCache(M, AG, Allocator, &CGSCC), OMPBuilder(M),
        Kernels(Kernels) {

    OMPBuilder.initialize();
    initializeRuntimeFunctions();
    initializeInternalControlVars();
  }

  /// Generic information that describes an internal control variable.
  struct InternalControlVarInfo {
    /// The kind, as described by the InternalControlVar enum.
    InternalControlVar Kind;

    /// The name of the ICV.
    StringRef Name;

    /// Environment variable associated with this ICV.
    StringRef EnvVarName;

    /// Initial value kind.
    ICVInitValue InitKind;

    /// Initial value.
    ConstantInt *InitValue;

    /// Setter RTL function associated with this ICV.
    RuntimeFunction Setter;

    /// Getter RTL function associated with this ICV.
    RuntimeFunction Getter;

    /// RTL function corresponding to the override clause of this ICV.
    RuntimeFunction Clause;
  };

  /// Generic information that describes a runtime function.
  struct RuntimeFunctionInfo {

    /// The kind, as described by the RuntimeFunction enum.
    RuntimeFunction Kind;

    /// The name of the function.
    StringRef Name;

    /// Flag to indicate a variadic function.
    bool IsVarArg;

    /// The return type of the function.
    Type *ReturnType;

    /// The argument types of the function.
    SmallVector<Type *, 8> ArgumentTypes;

    /// The declaration if available.
    Function *Declaration = nullptr;

    /// Uses of this runtime function per function containing the use.
    using UseVector = SmallVector<Use *, 16>;

    /// Clear the UsesMap for this runtime function.
    void clearUsesMap() { UsesMap.clear(); }

    /// Boolean conversion that is true if the runtime function was found.
    operator bool() const { return Declaration; }

    /// Return the vector of uses in function \p F.
    UseVector &getOrCreateUseVector(Function *F) {
      std::shared_ptr<UseVector> &UV = UsesMap[F];
      if (!UV)
        UV = std::make_shared<UseVector>();
      return *UV;
    }

    /// Return the vector of uses in function \p F or `nullptr` if there are
    /// none.
    const UseVector *getUseVector(Function &F) const {
      auto I = UsesMap.find(&F);
      if (I != UsesMap.end())
        return I->second.get();
      return nullptr;
    }

    /// Return how many functions contain uses of this runtime function.
    size_t getNumFunctionsWithUses() const { return UsesMap.size(); }

    /// Return the number of arguments (or the minimal number for variadic
    /// functions).
    size_t getNumArgs() const { return ArgumentTypes.size(); }

    /// Run the callback \p CB on each use and forget the use if the result is
    /// true. The callback will be fed the function in which the use was
    /// encountered as second argument.
    void foreachUse(SmallVectorImpl<Function *> &SCC,
                    function_ref<bool(Use &, Function &)> CB) {
      for (Function *F : SCC)
        foreachUse(CB, F);
    }

    /// Run the callback \p CB on each use within the function \p F and forget
    /// the use if the result is true.
    void foreachUse(function_ref<bool(Use &, Function &)> CB, Function *F) {
      SmallVector<unsigned, 8> ToBeDeleted;
      ToBeDeleted.clear();

      unsigned Idx = 0;
      UseVector &UV = getOrCreateUseVector(F);

      for (Use *U : UV) {
        if (CB(*U, *F))
          ToBeDeleted.push_back(Idx);
        ++Idx;
      }

      // Remove the to-be-deleted indices in reverse order as prior
      // modifications will not modify the smaller indices.
      while (!ToBeDeleted.empty()) {
        unsigned Idx = ToBeDeleted.pop_back_val();
        UV[Idx] = UV.back();
        UV.pop_back();
      }
    }
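
    /// A minimal usage sketch for foreachUse (illustrative only; `MyRFI` and
    /// the SCC vector are assumed to exist in the caller):
    /// \code
    ///   // Count the call-site uses of the runtime function and forget
    ///   // (drop) every non-call use by returning true for it.
    ///   unsigned NumCalls = 0;
    ///   MyRFI.foreachUse(SCC, [&](Use &U, Function &F) {
    ///     if (isa<CallBase>(U.getUser())) {
    ///       ++NumCalls;
    ///       return false; // Keep this use.
    ///     }
    ///     return true;    // Forget this use.
    ///   });
    /// \endcode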

  private:
    /// Map from functions to all uses of this runtime function contained in
    /// them.
    DenseMap<Function *, std::shared_ptr<UseVector>> UsesMap;
  };

  /// An OpenMP-IR-Builder instance
  OpenMPIRBuilder OMPBuilder;

  /// Map from runtime function kind to the runtime function description.
  EnumeratedArray<RuntimeFunctionInfo, RuntimeFunction,
                  RuntimeFunction::OMPRTL___last>
      RFIs;

  /// Map from ICV kind to the ICV description.
  EnumeratedArray<InternalControlVarInfo, InternalControlVar,
                  InternalControlVar::ICV___last>
      ICVs;

  /// Helper to initialize all internal control variable information for those
  /// defined in OMPKinds.def.
  void initializeInternalControlVars() {
#define ICV_RT_SET(_Name, RTL)                                                 \
  {                                                                            \
    auto &ICV = ICVs[_Name];                                                   \
    ICV.Setter = RTL;                                                          \
  }
#define ICV_RT_GET(Name, RTL)                                                  \
  {                                                                            \
    auto &ICV = ICVs[Name];                                                    \
    ICV.Getter = RTL;                                                          \
  }
#define ICV_DATA_ENV(Enum, _Name, _EnvVarName, Init)                           \
  {                                                                            \
    auto &ICV = ICVs[Enum];                                                    \
    ICV.Name = _Name;                                                          \
    ICV.Kind = Enum;                                                           \
    ICV.InitKind = Init;                                                       \
    ICV.EnvVarName = _EnvVarName;                                              \
    switch (ICV.InitKind) {                                                    \
    case ICV_IMPLEMENTATION_DEFINED:                                           \
      ICV.InitValue = nullptr;                                                 \
      break;                                                                   \
    case ICV_ZERO:                                                             \
      ICV.InitValue = ConstantInt::get(                                        \
          Type::getInt32Ty(OMPBuilder.Int32->getContext()), 0);                \
      break;                                                                   \
    case ICV_FALSE:                                                            \
      ICV.InitValue = ConstantInt::getFalse(OMPBuilder.Int1->getContext());    \
      break;                                                                   \
    case ICV_LAST:                                                             \
      break;                                                                   \
    }                                                                          \
  }
#include "llvm/Frontend/OpenMP/OMPKinds.def"
  }

  /// Returns true if the function declaration \p F matches the runtime
  /// function types, that is, return type \p RTFRetType, and argument types
  /// \p RTFArgTypes.
  static bool declMatchesRTFTypes(Function *F, Type *RTFRetType,
                                  SmallVector<Type *, 8> &RTFArgTypes) {
    // TODO: We should output information to the user (under debug output
    //       and via remarks).

    if (!F)
      return false;
    if (F->getReturnType() != RTFRetType)
      return false;
    if (F->arg_size() != RTFArgTypes.size())
      return false;

    auto RTFTyIt = RTFArgTypes.begin();
    for (Argument &Arg : F->args()) {
      if (Arg.getType() != *RTFTyIt)
        return false;

      ++RTFTyIt;
    }

    return true;
  }

  // Helper to collect all uses of the declaration in the UsesMap.
  unsigned collectUses(RuntimeFunctionInfo &RFI, bool CollectStats = true) {
    unsigned NumUses = 0;
    if (!RFI.Declaration)
      return NumUses;
    OMPBuilder.addAttributes(RFI.Kind, *RFI.Declaration);

    if (CollectStats) {
      NumOpenMPRuntimeFunctionsIdentified += 1;
      NumOpenMPRuntimeFunctionUsesIdentified += RFI.Declaration->getNumUses();
    }

    // TODO: We directly convert uses into proper calls and unknown uses.
    for (Use &U : RFI.Declaration->uses()) {
      if (Instruction *UserI = dyn_cast<Instruction>(U.getUser())) {
        if (ModuleSlice.count(UserI->getFunction())) {
          RFI.getOrCreateUseVector(UserI->getFunction()).push_back(&U);
          ++NumUses;
        }
      } else {
        RFI.getOrCreateUseVector(nullptr).push_back(&U);
        ++NumUses;
      }
    }
    return NumUses;
  }

  // Helper function to recollect uses of a runtime function.
  void recollectUsesForFunction(RuntimeFunction RTF) {
    auto &RFI = RFIs[RTF];
    RFI.clearUsesMap();
    collectUses(RFI, /*CollectStats*/ false);
  }

  // Helper function to recollect uses of all runtime functions.
  void recollectUses() {
    for (int Idx = 0; Idx < RFIs.size(); ++Idx)
      recollectUsesForFunction(static_cast<RuntimeFunction>(Idx));
  }

  /// Helper to initialize all runtime function information for those defined
  /// in OpenMPKinds.def.
  void initializeRuntimeFunctions() {
    Module &M = *((*ModuleSlice.begin())->getParent());

    // Helper macros for handling __VA_ARGS__ in OMP_RTL
#define OMP_TYPE(VarName, ...)                                                 \
  Type *VarName = OMPBuilder.VarName;                                          \
  (void)VarName;

#define OMP_ARRAY_TYPE(VarName, ...)                                           \
  ArrayType *VarName##Ty = OMPBuilder.VarName##Ty;                             \
  (void)VarName##Ty;                                                           \
  PointerType *VarName##PtrTy = OMPBuilder.VarName##PtrTy;                     \
  (void)VarName##PtrTy;

#define OMP_FUNCTION_TYPE(VarName, ...)                                        \
  FunctionType *VarName = OMPBuilder.VarName;                                  \
  (void)VarName;                                                               \
  PointerType *VarName##Ptr = OMPBuilder.VarName##Ptr;                         \
  (void)VarName##Ptr;

#define OMP_STRUCT_TYPE(VarName, ...)                                          \
  StructType *VarName = OMPBuilder.VarName;                                    \
  (void)VarName;                                                               \
  PointerType *VarName##Ptr = OMPBuilder.VarName##Ptr;                         \
  (void)VarName##Ptr;

#define OMP_RTL(_Enum, _Name, _IsVarArg, _ReturnType, ...)                     \
  {                                                                            \
    SmallVector<Type *, 8> ArgsTypes({__VA_ARGS__});                           \
    Function *F = M.getFunction(_Name);                                        \
    if (declMatchesRTFTypes(F, OMPBuilder._ReturnType, ArgsTypes)) {           \
      auto &RFI = RFIs[_Enum];                                                 \
      RFI.Kind = _Enum;                                                        \
      RFI.Name = _Name;                                                        \
      RFI.IsVarArg = _IsVarArg;                                                \
      RFI.ReturnType = OMPBuilder._ReturnType;                                 \
      RFI.ArgumentTypes = std::move(ArgsTypes);                                \
      RFI.Declaration = F;                                                     \
      unsigned NumUses = collectUses(RFI);                                     \
      (void)NumUses;                                                           \
      LLVM_DEBUG({                                                             \
        dbgs() << TAG << RFI.Name << (RFI.Declaration ? "" : " not")           \
               << " found\n";                                                  \
        if (RFI.Declaration)                                                   \
          dbgs() << TAG << "-> got " << NumUses << " uses in "                 \
                 << RFI.getNumFunctionsWithUses()                              \
                 << " different functions.\n";                                 \
      });                                                                      \
    }                                                                          \
  }
#include "llvm/Frontend/OpenMP/OMPKinds.def"

    // TODO: We should attach the attributes defined in OMPKinds.def.
  }

  /// Collection of known kernels (\see Kernel) in the module.
  SmallPtrSetImpl<Kernel> &Kernels;
};

/// Used to map the values physically (in the IR) stored in an offload
/// array, to a vector in memory.
struct OffloadArray {
  /// Physical array (in the IR).
  AllocaInst *Array = nullptr;
  /// Mapped values.
  SmallVector<Value *, 8> StoredValues;
  /// Last stores made in the offload array.
  SmallVector<StoreInst *, 8> LastAccesses;

  OffloadArray() = default;

  /// Initializes the OffloadArray with the values stored in \p Array before
  /// instruction \p Before is reached. Returns false if the initialization
  /// fails.
  /// This MUST be used immediately after the construction of the object.
  bool initialize(AllocaInst &Array, Instruction &Before) {
    if (!Array.getAllocatedType()->isArrayTy())
      return false;

    if (!getValues(Array, Before))
      return false;

    this->Array = &Array;
    return true;
  }

  static const unsigned DeviceIDArgNum = 1;
  static const unsigned BasePtrsArgNum = 3;
  static const unsigned PtrsArgNum = 4;
  static const unsigned SizesArgNum = 5;

private:
  /// Traverses the BasicBlock where \p Array is, collecting the stores made to
  /// \p Array, leaving StoredValues with the values stored before the
  /// instruction \p Before is reached.
  bool getValues(AllocaInst &Array, Instruction &Before) {
    // Initialize container.
    const uint64_t NumValues = Array.getAllocatedType()->getArrayNumElements();
    StoredValues.assign(NumValues, nullptr);
    LastAccesses.assign(NumValues, nullptr);

    // TODO: This assumes the instruction \p Before is in the same
    //  BasicBlock as Array. Make it general, for any control flow graph.
    BasicBlock *BB = Array.getParent();
    if (BB != Before.getParent())
      return false;

    const DataLayout &DL = Array.getModule()->getDataLayout();
    const unsigned int PointerSize = DL.getPointerSize();

    for (Instruction &I : *BB) {
      if (&I == &Before)
        break;

      if (!isa<StoreInst>(&I))
        continue;

      auto *S = cast<StoreInst>(&I);
      int64_t Offset = -1;
      auto *Dst =
          GetPointerBaseWithConstantOffset(S->getPointerOperand(), Offset, DL);
      if (Dst == &Array) {
        int64_t Idx = Offset / PointerSize;
        StoredValues[Idx] = getUnderlyingObject(S->getValueOperand());
        LastAccesses[Idx] = S;
      }
    }

    return isFilled();
  }

  /// Returns true if all values in StoredValues and
  /// LastAccesses are not nullptrs.
  bool isFilled() {
    const unsigned NumValues = StoredValues.size();
    for (unsigned I = 0; I < NumValues; ++I) {
      if (!StoredValues[I] || !LastAccesses[I])
        return false;
    }

    return true;
  }
};
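
/// A minimal usage sketch for OffloadArray (illustrative only; `ArrayAlloca`
/// and `RTCall` are assumed to be an alloca of an array type and the runtime
/// call that consumes it, respectively):
/// \code
///   OffloadArray OA;
///   if (OA.initialize(*ArrayAlloca, *RTCall)) {
///     // OA.StoredValues[I] is the (underlying object of the) value stored
///     // at slot I; OA.LastAccesses[I] is the store that wrote it.
///     for (Value *V : OA.StoredValues)
///       V->dump();
///   }
/// \endcode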

struct OpenMPOpt {

  using OptimizationRemarkGetter =
      function_ref<OptimizationRemarkEmitter &(Function *)>;

  OpenMPOpt(SmallVectorImpl<Function *> &SCC, CallGraphUpdater &CGUpdater,
            OptimizationRemarkGetter OREGetter,
            OMPInformationCache &OMPInfoCache, Attributor &A)
      : M(*(*SCC.begin())->getParent()), SCC(SCC), CGUpdater(CGUpdater),
        OREGetter(OREGetter), OMPInfoCache(OMPInfoCache), A(A) {}

  /// Check if any remarks are enabled for openmp-opt
  bool remarksEnabled() {
    auto &Ctx = M.getContext();
    return Ctx.getDiagHandlerPtr()->isAnyRemarkEnabled(DEBUG_TYPE);
  }

  /// Run all OpenMP optimizations on the underlying SCC/ModuleSlice.
  bool run() {
    if (SCC.empty())
      return false;

    bool Changed = false;

    LLVM_DEBUG(dbgs() << TAG << "Run on SCC with " << SCC.size()
                      << " functions in a slice with "
                      << OMPInfoCache.ModuleSlice.size() << " functions\n");

    if (PrintICVValues)
      printICVs();
    if (PrintOpenMPKernels)
      printKernels();

    Changed |= rewriteDeviceCodeStateMachine();

    Changed |= runAttributor();

    // Recollect uses, in case Attributor deleted any.
    OMPInfoCache.recollectUses();

    Changed |= deleteParallelRegions();
    if (HideMemoryTransferLatency)
      Changed |= hideMemTransfersLatency();
    if (remarksEnabled())
      analysisGlobalization();
    Changed |= deduplicateRuntimeCalls();
    if (EnableParallelRegionMerging) {
      if (mergeParallelRegions()) {
        deduplicateRuntimeCalls();
        Changed = true;
      }
    }

    return Changed;
  }

  /// Print initial ICV values for testing.
  /// FIXME: This should be done from the Attributor once it is added.
  void printICVs() const {
    InternalControlVar ICVs[] = {ICV_nthreads, ICV_active_levels, ICV_cancel,
                                 ICV_proc_bind};

    for (Function *F : OMPInfoCache.ModuleSlice) {
      for (auto ICV : ICVs) {
        auto ICVInfo = OMPInfoCache.ICVs[ICV];
        auto Remark = [&](OptimizationRemark OR) {
          return OR << "OpenMP ICV " << ore::NV("OpenMPICV", ICVInfo.Name)
                    << " Value: "
                    << (ICVInfo.InitValue
                            ? ICVInfo.InitValue->getValue().toString(10, true)
                            : "IMPLEMENTATION_DEFINED");
        };

        emitRemarkOnFunction(F, "OpenMPICVTracker", Remark);
      }
    }
  }

  /// Print OpenMP GPU kernels for testing.
  void printKernels() const {
    for (Function *F : SCC) {
      if (!OMPInfoCache.Kernels.count(F))
        continue;

      auto Remark = [&](OptimizationRemark OR) {
        return OR << "OpenMP GPU kernel "
                  << ore::NV("OpenMPGPUKernel", F->getName()) << "\n";
      };

      emitRemarkOnFunction(F, "OpenMPGPU", Remark);
    }
  }

  /// Return the call if \p U is a callee use in a regular call. If \p RFI is
  /// given it has to be the callee; otherwise a nullptr is returned.
  static CallInst *getCallIfRegularCall(
      Use &U, OMPInformationCache::RuntimeFunctionInfo *RFI = nullptr) {
    CallInst *CI = dyn_cast<CallInst>(U.getUser());
    if (CI && CI->isCallee(&U) && !CI->hasOperandBundles() &&
        (!RFI || CI->getCalledFunction() == RFI->Declaration))
      return CI;
    return nullptr;
  }

  /// Return the call if \p V is a regular call. If \p RFI is given it has to
  /// be the callee; otherwise a nullptr is returned.
  static CallInst *getCallIfRegularCall(
      Value &V, OMPInformationCache::RuntimeFunctionInfo *RFI = nullptr) {
    CallInst *CI = dyn_cast<CallInst>(&V);
    if (CI && !CI->hasOperandBundles() &&
        (!RFI || CI->getCalledFunction() == RFI->Declaration))
      return CI;
    return nullptr;
  }
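
  /// A minimal usage sketch for getCallIfRegularCall (illustrative only;
  /// `ForkRFI` is assumed to be the RuntimeFunctionInfo of
  /// __kmpc_fork_call):
  /// \code
  ///   ForkRFI.foreachUse(SCC, [&](Use &U, Function &F) {
  ///     // Only direct, bundle-free calls of __kmpc_fork_call survive this
  ///     // filter; address-taken or cast uses yield nullptr.
  ///     if (CallInst *CI = getCallIfRegularCall(U, &ForkRFI))
  ///       LLVM_DEBUG(dbgs() << "fork call in " << F.getName() << "\n");
  ///     return false;
  ///   });
  /// \endcode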

private:
  /// Merge parallel regions when it is safe.
  bool mergeParallelRegions() {
    const unsigned CallbackCalleeOperand = 2;
    const unsigned CallbackFirstArgOperand = 3;
    using InsertPointTy = OpenMPIRBuilder::InsertPointTy;

    // Check if there are any __kmpc_fork_call calls to merge.
    OMPInformationCache::RuntimeFunctionInfo &RFI =
        OMPInfoCache.RFIs[OMPRTL___kmpc_fork_call];

    if (!RFI.Declaration)
      return false;

    // Unmergable calls that prevent merging a parallel region.
    OMPInformationCache::RuntimeFunctionInfo UnmergableCallsInfo[] = {
        OMPInfoCache.RFIs[OMPRTL___kmpc_push_proc_bind],
        OMPInfoCache.RFIs[OMPRTL___kmpc_push_num_threads],
    };

    bool Changed = false;
    LoopInfo *LI = nullptr;
    DominatorTree *DT = nullptr;

    SmallDenseMap<BasicBlock *, SmallPtrSet<Instruction *, 4>> BB2PRMap;

    BasicBlock *StartBB = nullptr, *EndBB = nullptr;
    auto BodyGenCB = [&](InsertPointTy AllocaIP, InsertPointTy CodeGenIP,
                         BasicBlock &ContinuationIP) {
      BasicBlock *CGStartBB = CodeGenIP.getBlock();
      BasicBlock *CGEndBB =
          SplitBlock(CGStartBB, &*CodeGenIP.getPoint(), DT, LI);
      assert(StartBB != nullptr && "StartBB should not be null");
      CGStartBB->getTerminator()->setSuccessor(0, StartBB);
      assert(EndBB != nullptr && "EndBB should not be null");
      EndBB->getTerminator()->setSuccessor(0, CGEndBB);
    };

    auto PrivCB = [&](InsertPointTy AllocaIP, InsertPointTy CodeGenIP, Value &,
                      Value &Inner, Value *&ReplacementValue) -> InsertPointTy {
      ReplacementValue = &Inner;
      return CodeGenIP;
    };

    auto FiniCB = [&](InsertPointTy CodeGenIP) {};

    /// Create a sequential execution region within a merged parallel region,
    /// encapsulated in a master construct with a barrier for synchronization.
    auto CreateSequentialRegion = [&](Function *OuterFn,
                                      BasicBlock *OuterPredBB,
                                      Instruction *SeqStartI,
                                      Instruction *SeqEndI) {
      // Isolate the instructions of the sequential region to a separate
      // block.
      BasicBlock *ParentBB = SeqStartI->getParent();
      BasicBlock *SeqEndBB =
          SplitBlock(ParentBB, SeqEndI->getNextNode(), DT, LI);
      BasicBlock *SeqAfterBB =
          SplitBlock(SeqEndBB, &*SeqEndBB->getFirstInsertionPt(), DT, LI);
      BasicBlock *SeqStartBB =
          SplitBlock(ParentBB, SeqStartI, DT, LI, nullptr, "seq.par.merged");

      assert(ParentBB->getUniqueSuccessor() == SeqStartBB &&
             "Expected a different CFG");
      const DebugLoc DL = ParentBB->getTerminator()->getDebugLoc();
      ParentBB->getTerminator()->eraseFromParent();

      auto BodyGenCB = [&](InsertPointTy AllocaIP, InsertPointTy CodeGenIP,
                           BasicBlock &ContinuationIP) {
        BasicBlock *CGStartBB = CodeGenIP.getBlock();
        BasicBlock *CGEndBB =
            SplitBlock(CGStartBB, &*CodeGenIP.getPoint(), DT, LI);
        assert(SeqStartBB != nullptr && "SeqStartBB should not be null");
        CGStartBB->getTerminator()->setSuccessor(0, SeqStartBB);
        assert(SeqEndBB != nullptr && "SeqEndBB should not be null");
        SeqEndBB->getTerminator()->setSuccessor(0, CGEndBB);
      };
      auto FiniCB = [&](InsertPointTy CodeGenIP) {};

      // Find outputs from the sequential region to outside users and
      // broadcast their values to them.
      for (Instruction &I : *SeqStartBB) {
        SmallPtrSet<Instruction *, 4> OutsideUsers;
        for (User *Usr : I.users()) {
          Instruction &UsrI = *cast<Instruction>(Usr);
          // Ignore outputs to LT intrinsics, code extraction for the merged
          // parallel region will fix them.
          if (UsrI.isLifetimeStartOrEnd())
            continue;

          if (UsrI.getParent() != SeqStartBB)
            OutsideUsers.insert(&UsrI);
        }

        if (OutsideUsers.empty())
          continue;

        // Emit an alloca in the outer region to store the broadcasted
        // value.
        const DataLayout &DL = M.getDataLayout();
        AllocaInst *AllocaI = new AllocaInst(
            I.getType(), DL.getAllocaAddrSpace(), nullptr,
            I.getName() + ".seq.output.alloc", &OuterFn->front().front());

        // Emit a store instruction in the sequential BB to update the
        // value.
        new StoreInst(&I, AllocaI, SeqStartBB->getTerminator());

        // Emit a load instruction and replace the use of the output value
        // with it.
        for (Instruction *UsrI : OutsideUsers) {
          LoadInst *LoadI = new LoadInst(I.getType(), AllocaI,
                                         I.getName() + ".seq.output.load", UsrI);
          UsrI->replaceUsesOfWith(&I, LoadI);
        }
      }

      OpenMPIRBuilder::LocationDescription Loc(
          InsertPointTy(ParentBB, ParentBB->end()), DL);
      InsertPointTy SeqAfterIP =
          OMPInfoCache.OMPBuilder.createMaster(Loc, BodyGenCB, FiniCB);

      OMPInfoCache.OMPBuilder.createBarrier(SeqAfterIP, OMPD_parallel);

      BranchInst::Create(SeqAfterBB, SeqAfterIP.getBlock());

      LLVM_DEBUG(dbgs() << TAG << "After sequential inlining " << *OuterFn
                        << "\n");
    };

    // Helper to merge the __kmpc_fork_call calls in MergableCIs. They are all
    // contained in BB and only separated by instructions that can be
    // redundantly executed in parallel. The block BB is split before the first
    // call (in MergableCIs) and after the last so the entire region we merge
    // into a single parallel region is contained in a single basic block
    // without any other instructions. We use the OpenMPIRBuilder to outline
    // that block and call the resulting function via __kmpc_fork_call.
    auto Merge = [&](SmallVectorImpl<CallInst *> &MergableCIs, BasicBlock *BB) {
      // TODO: Change the interface to allow single CIs expanded, e.g., to
      //       include an outer loop.
      assert(MergableCIs.size() > 1 && "Assumed multiple mergable CIs");

      auto Remark = [&](OptimizationRemark OR) {
        OR << "Parallel region at "
           << ore::NV("OpenMPParallelMergeFront",
                      MergableCIs.front()->getDebugLoc())
           << " merged with parallel regions at ";
        for (auto *CI : llvm::drop_begin(MergableCIs, 1)) {
          OR << ore::NV("OpenMPParallelMerge", CI->getDebugLoc());
          if (CI != MergableCIs.back())
            OR << ", ";
        }
        return OR;
      };

      emitRemark<OptimizationRemark>(MergableCIs.front(),
                                     "OpenMPParallelRegionMerging", Remark);

      Function *OriginalFn = BB->getParent();
      LLVM_DEBUG(dbgs() << TAG << "Merge " << MergableCIs.size()
                        << " parallel regions in " << OriginalFn->getName()
                        << "\n");

      // Isolate the calls to merge in a separate block.
      EndBB = SplitBlock(BB, MergableCIs.back()->getNextNode(), DT, LI);
      BasicBlock *AfterBB =
          SplitBlock(EndBB, &*EndBB->getFirstInsertionPt(), DT, LI);
      StartBB = SplitBlock(BB, MergableCIs.front(), DT, LI, nullptr,
                           "omp.par.merged");

      assert(BB->getUniqueSuccessor() == StartBB && "Expected a different CFG");
      const DebugLoc DL = BB->getTerminator()->getDebugLoc();
      BB->getTerminator()->eraseFromParent();

      // Create sequential regions for sequential instructions that are
      // in-between mergable parallel regions.
      for (auto *It = MergableCIs.begin(), *End = MergableCIs.end() - 1;
           It != End; ++It) {
        Instruction *ForkCI = *It;
        Instruction *NextForkCI = *(It + 1);

        // Continue if there are no in-between instructions.
        if (ForkCI->getNextNode() == NextForkCI)
          continue;

        CreateSequentialRegion(OriginalFn, BB, ForkCI->getNextNode(),
                               NextForkCI->getPrevNode());
      }

      OpenMPIRBuilder::LocationDescription Loc(InsertPointTy(BB, BB->end()),
                                               DL);
      IRBuilder<>::InsertPoint AllocaIP(
          &OriginalFn->getEntryBlock(),
          OriginalFn->getEntryBlock().getFirstInsertionPt());
      // Create the merged parallel region with default proc binding, to
      // avoid overriding binding settings, and without explicit cancellation.
      InsertPointTy AfterIP = OMPInfoCache.OMPBuilder.createParallel(
          Loc, AllocaIP, BodyGenCB, PrivCB, FiniCB, nullptr, nullptr,
          OMP_PROC_BIND_default, /* IsCancellable */ false);
      BranchInst::Create(AfterBB, AfterIP.getBlock());

      // Perform the actual outlining.
      OMPInfoCache.OMPBuilder.finalize(/* AllowExtractorSinking */ true);

      Function *OutlinedFn = MergableCIs.front()->getCaller();

      // Replace the __kmpc_fork_call calls with direct calls to the outlined
      // callbacks.
      SmallVector<Value *, 8> Args;
      for (auto *CI : MergableCIs) {
        Value *Callee =
            CI->getArgOperand(CallbackCalleeOperand)->stripPointerCasts();
        FunctionType *FT =
            cast<FunctionType>(Callee->getType()->getPointerElementType());
        Args.clear();
        Args.push_back(OutlinedFn->getArg(0));
        Args.push_back(OutlinedFn->getArg(1));
        for (unsigned U = CallbackFirstArgOperand, E = CI->getNumArgOperands();
             U < E; ++U)
          Args.push_back(CI->getArgOperand(U));

        CallInst *NewCI = CallInst::Create(FT, Callee, Args, "", CI);
        if (CI->getDebugLoc())
          NewCI->setDebugLoc(CI->getDebugLoc());

        // Forward parameter attributes from the callback to the callee.
        for (unsigned U = CallbackFirstArgOperand, E = CI->getNumArgOperands();
             U < E; ++U)
          for (const Attribute &A : CI->getAttributes().getParamAttributes(U))
            NewCI->addParamAttr(
                U - (CallbackFirstArgOperand - CallbackCalleeOperand), A);

        // Emit an explicit barrier to replace the implicit fork-join barrier.
        if (CI != MergableCIs.back()) {
          // TODO: Remove barrier if the merged parallel region includes the
          //       'nowait' clause.
          OMPInfoCache.OMPBuilder.createBarrier(
              InsertPointTy(NewCI->getParent(),
                            NewCI->getNextNode()->getIterator()),
              OMPD_parallel);
        }

        auto Remark = [&](OptimizationRemark OR) {
          return OR << "Parallel region at "
                    << ore::NV("OpenMPParallelMerge", CI->getDebugLoc())
                    << " merged with "
                    << ore::NV("OpenMPParallelMergeFront",
                               MergableCIs.front()->getDebugLoc());
        };
        if (CI != MergableCIs.front())
          emitRemark<OptimizationRemark>(CI, "OpenMPParallelRegionMerging",
                                         Remark);

        CI->eraseFromParent();
      }

      assert(OutlinedFn != OriginalFn && "Outlining failed");
      CGUpdater.registerOutlinedFunction(*OriginalFn, *OutlinedFn);
      CGUpdater.reanalyzeFunction(*OriginalFn);

      NumOpenMPParallelRegionsMerged += MergableCIs.size();

      return true;
    };

    // Helper function that identifies sequences of
    // __kmpc_fork_call uses in a basic block.
    auto DetectPRsCB = [&](Use &U, Function &F) {
      CallInst *CI = getCallIfRegularCall(U, &RFI);
      BB2PRMap[CI->getParent()].insert(CI);

      return false;
    };

    BB2PRMap.clear();
    RFI.foreachUse(SCC, DetectPRsCB);
    SmallVector<SmallVector<CallInst *, 4>, 4> MergableCIsVector;
    // Find mergable parallel regions within a basic block that are
    // safe to merge, that is any in-between instructions can safely
    // execute in parallel after merging.
    // TODO: support merging across basic-blocks.
    for (auto &It : BB2PRMap) {
      auto &CIs = It.getSecond();
      if (CIs.size() < 2)
        continue;

      BasicBlock *BB = It.getFirst();
      SmallVector<CallInst *, 4> MergableCIs;

      /// Returns true if the instruction is mergable, false otherwise.
      /// A terminator instruction is unmergable by definition since merging
      /// works within a BB. Instructions before the mergable region are
      /// mergable if they are not calls to OpenMP runtime functions that may
      /// set different execution parameters for subsequent parallel regions.
      /// Instructions in-between parallel regions are mergable if they are not
      /// calls to any non-intrinsic function since that may call a non-mergable
      /// OpenMP runtime function.
      auto IsMergable = [&](Instruction &I, bool IsBeforeMergableRegion) {
        // We do not merge across BBs, hence return false (unmergable) if the
        // instruction is a terminator.
        if (I.isTerminator())
          return false;

        if (!isa<CallInst>(&I))
          return true;

        CallInst *CI = cast<CallInst>(&I);
        if (IsBeforeMergableRegion) {
          Function *CalledFunction = CI->getCalledFunction();
          if (!CalledFunction)
            return false;
          // Return false (unmergable) if the call before the parallel
          // region calls an explicit affinity (proc_bind) or number of
          // threads (num_threads) compiler-generated function. Those settings
          // may be incompatible with following parallel regions.
          // TODO: ICV tracking to detect compatibility.
          for (const auto &RFI : UnmergableCallsInfo) {
            if (CalledFunction == RFI.Declaration)
              return false;
          }
        } else {
          // Return false (unmergable) if there is a call instruction
          // in-between parallel regions when it is not an intrinsic. It
          // may call an unmergable OpenMP runtime function in its callpath.
          // TODO: Keep track of possible OpenMP calls in the callpath.
          if (!isa<IntrinsicInst>(CI))
            return false;
        }

        return true;
      };
      // Find maximal number of parallel region CIs that are safe to merge.
      for (auto It = BB->begin(), End = BB->end(); It != End;) {
        Instruction &I = *It;
        ++It;

        if (CIs.count(&I)) {
          MergableCIs.push_back(cast<CallInst>(&I));
          continue;
        }

        // Continue expanding if the instruction is mergable.
        if (IsMergable(I, MergableCIs.empty()))
          continue;

        // Forward the instruction iterator to skip the next parallel region
        // since there is an unmergable instruction which can affect it.
        for (; It != End; ++It) {
          Instruction &SkipI = *It;
          if (CIs.count(&SkipI)) {
            LLVM_DEBUG(dbgs() << TAG << "Skip parallel region " << SkipI
                              << " due to " << I << "\n");
            ++It;
            break;
          }
        }

        // Store mergable regions found.
        if (MergableCIs.size() > 1) {
          MergableCIsVector.push_back(MergableCIs);
          LLVM_DEBUG(dbgs() << TAG << "Found " << MergableCIs.size()
                            << " parallel regions in block " << BB->getName()
                            << " of function " << BB->getParent()->getName()
                            << "\n";);
        }

        MergableCIs.clear();
      }

      if (!MergableCIsVector.empty()) {
        Changed = true;

        for (auto &MergableCIs : MergableCIsVector)
          Merge(MergableCIs, BB);
      }
    }

    if (Changed) {
      /// Re-collect use for fork calls, emitted barrier calls, and
      /// any emitted master/end_master calls.
      OMPInfoCache.recollectUsesForFunction(OMPRTL___kmpc_fork_call);
      OMPInfoCache.recollectUsesForFunction(OMPRTL___kmpc_barrier);
      OMPInfoCache.recollectUsesForFunction(OMPRTL___kmpc_master);
      OMPInfoCache.recollectUsesForFunction(OMPRTL___kmpc_end_master);
    }

    return Changed;
  }
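
  /// At the source level, the merging above corresponds (roughly, and only
  /// under the safety conditions checked in IsMergable) to rewriting
  /// \code
  ///   #pragma omp parallel
  ///   { work1(); }
  ///   sequential();
  ///   #pragma omp parallel
  ///   { work2(); }
  /// \endcode
  /// into a single fork with the in-between code guarded by a master
  /// construct and barriers:
  /// \code
  ///   #pragma omp parallel
  ///   {
  ///     work1();
  ///     #pragma omp barrier
  ///     #pragma omp master
  ///     { sequential(); }
  ///     #pragma omp barrier
  ///     work2();
  ///   }
  /// \endcode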

  /// Try to delete parallel regions if possible.
  bool deleteParallelRegions() {
    const unsigned CallbackCalleeOperand = 2;

    OMPInformationCache::RuntimeFunctionInfo &RFI =
        OMPInfoCache.RFIs[OMPRTL___kmpc_fork_call];

    if (!RFI.Declaration)
      return false;

    bool Changed = false;
    auto DeleteCallCB = [&](Use &U, Function &) {
      CallInst *CI = getCallIfRegularCall(U);
      if (!CI)
        return false;
      auto *Fn = dyn_cast<Function>(
          CI->getArgOperand(CallbackCalleeOperand)->stripPointerCasts());
      if (!Fn)
        return false;
      if (!Fn->onlyReadsMemory())
        return false;
      if (!Fn->hasFnAttribute(Attribute::WillReturn))
        return false;

      LLVM_DEBUG(dbgs() << TAG << "Delete read-only parallel region in "
                        << CI->getCaller()->getName() << "\n");

      auto Remark = [&](OptimizationRemark OR) {
        return OR << "Parallel region in "
                  << ore::NV("OpenMPParallelDelete", CI->getCaller()->getName())
                  << " deleted";
      };
      emitRemark<OptimizationRemark>(CI, "OpenMPParallelRegionDeletion",
                                     Remark);

      CGUpdater.removeCallSite(*CI);
      CI->eraseFromParent();
      Changed = true;
      ++NumOpenMPParallelRegionsDeleted;
      return true;
    };

    RFI.foreachUse(SCC, DeleteCallCB);

    return Changed;
  }
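
  /// As an illustration of the deletion above: a region whose outlined body
  /// is provably `readonly` and `willreturn`, e.g. (hypothetical input)
  /// \code
  ///   #pragma omp parallel
  ///   { int x = global_constant * 2; (void)x; }
  /// \endcode
  /// has no observable effect, so the whole __kmpc_fork_call can be erased.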

  /// Try to eliminate runtime calls by reusing existing ones.
  bool deduplicateRuntimeCalls() {
    bool Changed = false;

    RuntimeFunction DeduplicableRuntimeCallIDs[] = {
        OMPRTL_omp_get_num_threads,
        OMPRTL_omp_in_parallel,
        OMPRTL_omp_get_cancellation,
        OMPRTL_omp_get_thread_limit,
        OMPRTL_omp_get_supported_active_levels,
        OMPRTL_omp_get_level,
        OMPRTL_omp_get_ancestor_thread_num,
        OMPRTL_omp_get_team_size,
        OMPRTL_omp_get_active_level,
        OMPRTL_omp_in_final,
        OMPRTL_omp_get_proc_bind,
        OMPRTL_omp_get_num_places,
        OMPRTL_omp_get_num_procs,
        OMPRTL_omp_get_place_num,
        OMPRTL_omp_get_partition_num_places,
        OMPRTL_omp_get_partition_place_nums};

    // Global-tid is handled separately.
    SmallSetVector<Value *, 16> GTIdArgs;
    collectGlobalThreadIdArguments(GTIdArgs);
    LLVM_DEBUG(dbgs() << TAG << "Found " << GTIdArgs.size()
                      << " global thread ID arguments\n");

    for (Function *F : SCC) {
      for (auto DeduplicableRuntimeCallID : DeduplicableRuntimeCallIDs)
        Changed |= deduplicateRuntimeCalls(
            *F, OMPInfoCache.RFIs[DeduplicableRuntimeCallID]);

      // __kmpc_global_thread_num is special as we can replace it with an
      // argument in enough cases to make it worth trying.
      Value *GTIdArg = nullptr;
      for (Argument &Arg : F->args())
        if (GTIdArgs.count(&Arg)) {
          GTIdArg = &Arg;
          break;
        }
      Changed |= deduplicateRuntimeCalls(
          *F, OMPInfoCache.RFIs[OMPRTL___kmpc_global_thread_num], GTIdArg);
    }

    return Changed;
  }

  /// Tries to hide the latency of runtime calls that involve host to
  /// device memory transfers by splitting them into their "issue" and "wait"
  /// versions. The "issue" is moved upwards as much as possible. The "wait" is
  /// moved downwards as much as possible. The "issue" starts the memory
  /// transfer asynchronously, returning a handle. The "wait" waits on the
  /// returned handle for the memory transfer to finish.
  bool hideMemTransfersLatency() {
    auto &RFI = OMPInfoCache.RFIs[OMPRTL___tgt_target_data_begin_mapper];
    bool Changed = false;
    auto SplitMemTransfers = [&](Use &U, Function &Decl) {
      auto *RTCall = getCallIfRegularCall(U, &RFI);
      if (!RTCall)
        return false;

      OffloadArray OffloadArrays[3];
      if (!getValuesInOffloadArrays(*RTCall, OffloadArrays))
        return false;

      LLVM_DEBUG(dumpValuesInOffloadArrays(OffloadArrays));

      // TODO: Check if can be moved upwards.
      bool WasSplit = false;
      Instruction *WaitMovementPoint = canBeMovedDownwards(*RTCall);
      if (WaitMovementPoint)
        WasSplit = splitTargetDataBeginRTC(*RTCall, *WaitMovementPoint);

      Changed |= WasSplit;
      return WasSplit;
    };
    RFI.foreachUse(SCC, SplitMemTransfers);

    return Changed;
  }

  void analysisGlobalization() {
    RuntimeFunction GlobalizationRuntimeIDs[] = {
        OMPRTL___kmpc_data_sharing_coalesced_push_stack,
        OMPRTL___kmpc_data_sharing_push_stack};

    for (const auto GlobalizationCallID : GlobalizationRuntimeIDs) {
      auto &RFI = OMPInfoCache.RFIs[GlobalizationCallID];

      auto CheckGlobalization = [&](Use &U, Function &Decl) {
        if (CallInst *CI = getCallIfRegularCall(U, &RFI)) {
          auto Remark = [&](OptimizationRemarkAnalysis ORA) {
            return ORA
                   << "Found thread data sharing on the GPU. "
                   << "Expect degraded performance due to data globalization.";
          };
          emitRemark<OptimizationRemarkAnalysis>(CI, "OpenMPGlobalization",
                                                 Remark);
        }

        return false;
      };

      RFI.foreachUse(SCC, CheckGlobalization);
    }
  }

  /// Maps the values stored in the offload arrays passed as arguments to
  /// \p RuntimeCall into the offload arrays in \p OAs.
  bool getValuesInOffloadArrays(CallInst &RuntimeCall,
                                MutableArrayRef<OffloadArray> OAs) {
    assert(OAs.size() == 3 && "Need space for three offload arrays!");

    // A runtime call that involves memory offloading looks something like:
    // call void @__tgt_target_data_begin_mapper(arg0, arg1,
    //   i8** %offload_baseptrs, i8** %offload_ptrs, i64* %offload_sizes,
    //   ...)
    // So, the idea is to access the allocas that allocate space for these
    // offload arrays, offload_baseptrs, offload_ptrs, offload_sizes.
    // Therefore:
    // i8** %offload_baseptrs.
    Value *BasePtrsArg =
        RuntimeCall.getArgOperand(OffloadArray::BasePtrsArgNum);
    // i8** %offload_ptrs.
    Value *PtrsArg = RuntimeCall.getArgOperand(OffloadArray::PtrsArgNum);
    // i8** %offload_sizes.
    Value *SizesArg = RuntimeCall.getArgOperand(OffloadArray::SizesArgNum);

    // Get values stored in **offload_baseptrs.
    auto *V = getUnderlyingObject(BasePtrsArg);
    if (!isa<AllocaInst>(V))
      return false;
    auto *BasePtrsArray = cast<AllocaInst>(V);
    if (!OAs[0].initialize(*BasePtrsArray, RuntimeCall))
      return false;

    // Get values stored in **offload_ptrs.
    V = getUnderlyingObject(PtrsArg);
    if (!isa<AllocaInst>(V))
      return false;
    auto *PtrsArray = cast<AllocaInst>(V);
    if (!OAs[1].initialize(*PtrsArray, RuntimeCall))
      return false;

    // Get values stored in **offload_sizes.
    V = getUnderlyingObject(SizesArg);
    // If it's a [constant] global array don't analyze it.
    if (isa<GlobalValue>(V))
      return isa<Constant>(V);
    if (!isa<AllocaInst>(V))
      return false;

    auto *SizesArray = cast<AllocaInst>(V);
    if (!OAs[2].initialize(*SizesArray, RuntimeCall))
      return false;

    return true;
  }

  /// Prints the values in the OffloadArrays \p OAs using LLVM_DEBUG.
  /// For now this is a way to test that the function getValuesInOffloadArrays
  /// is working properly.
  /// TODO: Move this to a unittest when unittests are available for OpenMPOpt.
  void dumpValuesInOffloadArrays(ArrayRef<OffloadArray> OAs) {
    assert(OAs.size() == 3 && "There are three offload arrays to debug!");

    LLVM_DEBUG(dbgs() << TAG << " Successfully got offload values:\n");
    std::string ValuesStr;
    raw_string_ostream Printer(ValuesStr);
    std::string Separator = " --- ";

    for (auto *BP : OAs[0].StoredValues) {
      BP->print(Printer);
      Printer << Separator;
    }
    LLVM_DEBUG(dbgs() << "\t\toffload_baseptrs: " << Printer.str() << "\n");
    ValuesStr.clear();

    for (auto *P : OAs[1].StoredValues) {
      P->print(Printer);
      Printer << Separator;
    }
    LLVM_DEBUG(dbgs() << "\t\toffload_ptrs: " << Printer.str() << "\n");
    ValuesStr.clear();

    for (auto *S : OAs[2].StoredValues) {
      S->print(Printer);
      Printer << Separator;
    }
    LLVM_DEBUG(dbgs() << "\t\toffload_sizes: " << Printer.str() << "\n");
  }

  /// Returns the instruction where the "wait" counterpart of \p RuntimeCall
  /// can be moved. Returns nullptr if the movement is not possible, or not
  /// worth it.
  Instruction *canBeMovedDownwards(CallInst &RuntimeCall) {
    // FIXME: This traverses only the BasicBlock where RuntimeCall is.
    //  Make it traverse the CFG.

    Instruction *CurrentI = &RuntimeCall;
    bool IsWorthIt = false;
    while ((CurrentI = CurrentI->getNextNode())) {

      // TODO: Once we detect the regions to be offloaded we should use the
      //  alias analysis manager to check if CurrentI may modify one of
      //  the offloaded regions.
      if (CurrentI->mayHaveSideEffects() || CurrentI->mayReadFromMemory()) {
        if (IsWorthIt)
          return CurrentI;

        return nullptr;
      }

      // FIXME: For now, moving the call over anything without side effects
      //  is considered worth it.
      IsWorthIt = true;
    }

    // Return end of BasicBlock.
    return RuntimeCall.getParent()->getTerminator();
  }

  /// Splits \p RuntimeCall into its "issue" and "wait" counterparts.
  bool splitTargetDataBeginRTC(CallInst &RuntimeCall,
                               Instruction &WaitMovementPoint) {
    // Create stack allocated handle (__tgt_async_info) at the beginning of
    // the function. Used for storing information of the async transfer,
    // allowing us to wait on it later.
    auto &IRBuilder = OMPInfoCache.OMPBuilder;
    auto *F = RuntimeCall.getCaller();
    Instruction *FirstInst = &(F->getEntryBlock().front());
    AllocaInst *Handle = new AllocaInst(
        IRBuilder.AsyncInfo, F->getAddressSpace(), "handle", FirstInst);

    // Add "issue" runtime call declaration:
    // declare %struct.tgt_async_info @__tgt_target_data_begin_issue(i64, i32,
    //   i8**, i8**, i64*, i64*)
    FunctionCallee IssueDecl = IRBuilder.getOrCreateRuntimeFunction(
        M, OMPRTL___tgt_target_data_begin_mapper_issue);

    // Change RuntimeCall call site for its asynchronous version.
    SmallVector<Value *, 16> Args;
    for (auto &Arg : RuntimeCall.args())
      Args.push_back(Arg.get());
    Args.push_back(Handle);

    CallInst *IssueCallsite =
        CallInst::Create(IssueDecl, Args, /*NameStr=*/"", &RuntimeCall);
    RuntimeCall.eraseFromParent();

    // Add "wait" runtime call declaration:
    // declare void @__tgt_target_data_begin_wait(i64, %struct.__tgt_async_info)
    FunctionCallee WaitDecl = IRBuilder.getOrCreateRuntimeFunction(
        M, OMPRTL___tgt_target_data_begin_mapper_wait);

    Value *WaitParams[2] = {
        IssueCallsite->getArgOperand(
            OffloadArray::DeviceIDArgNum), // device_id.
        Handle                             // handle to wait on.
    };
    CallInst::Create(WaitDecl, WaitParams, /*NameStr=*/"", &WaitMovementPoint);

    return true;
  }
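
  /// Schematically, the split above turns (operands abbreviated; sketch only,
  /// the exact runtime signatures are the ones declared through
  /// getOrCreateRuntimeFunction above):
  /// \code
  ///   call void @__tgt_target_data_begin_mapper(i64 %dev, ...)
  ///   ... ; independent instructions
  ///   %use = load ... ; first instruction that may touch memory
  /// \endcode
  /// into
  /// \code
  ///   %handle = alloca %struct.__tgt_async_info
  ///   call void @__tgt_target_data_begin_mapper_issue(i64 %dev, ...,
  ///       %struct.__tgt_async_info* %handle)
  ///   ... ; the transfer overlaps with these instructions
  ///   call void @__tgt_target_data_begin_mapper_wait(i64 %dev,
  ///       %struct.__tgt_async_info* %handle)
  ///   %use = load ...
  /// \endcode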

  static Value *combinedIdentStruct(Value *CurrentIdent, Value *NextIdent,
                                    bool GlobalOnly, bool &SingleChoice) {
    if (CurrentIdent == NextIdent)
      return CurrentIdent;

    // TODO: Figure out how to actually combine multiple debug locations. For
    //       now we just keep an existing one if there is a single choice.
    if (!GlobalOnly || isa<GlobalValue>(NextIdent)) {
      SingleChoice = !CurrentIdent;
      return NextIdent;
    }
    return nullptr;
  }

  /// Return a `struct ident_t*` value that represents the ones used in the
  /// calls of \p RFI inside of \p F. If \p GlobalOnly is true, we will not
  /// return a local `struct ident_t*`. For now, if we cannot find a suitable
  /// return value we create one from scratch. We also do not yet combine
  /// information, e.g., the source locations, see combinedIdentStruct.
  Value *
  getCombinedIdentFromCallUsesIn(OMPInformationCache::RuntimeFunctionInfo &RFI,
                                 Function &F, bool GlobalOnly) {
    bool SingleChoice = true;
    Value *Ident = nullptr;
    auto CombineIdentStruct = [&](Use &U, Function &Caller) {
      CallInst *CI = getCallIfRegularCall(U, &RFI);
      if (!CI || &F != &Caller)
        return false;
      Ident = combinedIdentStruct(Ident, CI->getArgOperand(0),
                                  /* GlobalOnly */ true, SingleChoice);
      return false;
    };
    RFI.foreachUse(SCC, CombineIdentStruct);

    if (!Ident || !SingleChoice) {
      // The IRBuilder uses the insertion block to get to the module, this is
      // unfortunate but we work around it for now.
      if (!OMPInfoCache.OMPBuilder.getInsertionPoint().getBlock())
        OMPInfoCache.OMPBuilder.updateToLocation(OpenMPIRBuilder::InsertPointTy(
            &F.getEntryBlock(), F.getEntryBlock().begin()));
      // Create a fallback location if none was found.
      // TODO: Use the debug locations of the calls instead.
      Constant *Loc = OMPInfoCache.OMPBuilder.getOrCreateDefaultSrcLocStr();
      Ident = OMPInfoCache.OMPBuilder.getOrCreateIdent(Loc);
    }
    return Ident;
  }

  /// Try to eliminate calls of \p RFI in \p F by reusing an existing one or
  /// \p ReplVal if given.
  bool deduplicateRuntimeCalls(Function &F,
                               OMPInformationCache::RuntimeFunctionInfo &RFI,
                               Value *ReplVal = nullptr) {
    auto *UV = RFI.getUseVector(F);
    if (!UV || UV->size() + (ReplVal != nullptr) < 2)
      return false;

    LLVM_DEBUG(
        dbgs() << TAG << "Deduplicate " << UV->size() << " uses of " << RFI.Name
               << (ReplVal ? " with an existing value\n" : "\n") << "\n");

    assert((!ReplVal || (isa<Argument>(ReplVal) &&
                         cast<Argument>(ReplVal)->getParent() == &F)) &&
           "Unexpected replacement value!");

    // TODO: Use dominance to find a good position instead.
    auto CanBeMoved = [this](CallBase &CB) {
      unsigned NumArgs = CB.getNumArgOperands();
      if (NumArgs == 0)
        return true;
      if (CB.getArgOperand(0)->getType() != OMPInfoCache.OMPBuilder.IdentPtr)
        return false;
      for (unsigned u = 1; u < NumArgs; ++u)
        if (isa<Instruction>(CB.getArgOperand(u)))
          return false;
      return true;
    };

    if (!ReplVal) {
      for (Use *U : *UV)
        if (CallInst *CI = getCallIfRegularCall(*U, &RFI)) {
          if (!CanBeMoved(*CI))
            continue;

          auto Remark = [&](OptimizationRemark OR) {
            auto newLoc = &*F.getEntryBlock().getFirstInsertionPt();
            return OR << "OpenMP runtime call "
                      << ore::NV("OpenMPOptRuntime", RFI.Name) << " moved to "
                      << ore::NV("OpenMPRuntimeMoves", newLoc->getDebugLoc());
          };
          emitRemark<OptimizationRemark>(CI, "OpenMPRuntimeCodeMotion", Remark);

          CI->moveBefore(&*F.getEntryBlock().getFirstInsertionPt());
          ReplVal = CI;
          break;
        }
      if (!ReplVal)
        return false;
    }

    // If we use a call as a replacement value we need to make sure the ident
    // is valid at the new location. For now we just pick a global one, either
    // existing and used by one of the calls, or created from scratch.
    if (CallBase *CI = dyn_cast<CallBase>(ReplVal)) {
      if (CI->getNumArgOperands() > 0 &&
          CI->getArgOperand(0)->getType() == OMPInfoCache.OMPBuilder.IdentPtr) {
        Value *Ident = getCombinedIdentFromCallUsesIn(RFI, F,
                                                      /* GlobalOnly */ true);
        CI->setArgOperand(0, Ident);
      }
    }

    bool Changed = false;
    auto ReplaceAndDeleteCB = [&](Use &U, Function &Caller) {
      CallInst *CI = getCallIfRegularCall(U, &RFI);
      if (!CI || CI == ReplVal || &F != &Caller)
        return false;
      assert(CI->getCaller() == &F && "Unexpected call!");

      auto Remark = [&](OptimizationRemark OR) {
        return OR << "OpenMP runtime call "
                  << ore::NV("OpenMPOptRuntime", RFI.Name) << " deduplicated";
      };
      emitRemark<OptimizationRemark>(CI, "OpenMPRuntimeDeduplicated", Remark);

      CGUpdater.removeCallSite(*CI);
      CI->replaceAllUsesWith(ReplVal);
      CI->eraseFromParent();
      ++NumOpenMPRuntimeCallsDeduplicated;
      Changed = true;
      return true;
    };
    RFI.foreachUse(SCC, ReplaceAndDeleteCB);

    return Changed;
  }

  /// Collect arguments that represent the global thread id in \p GTIdArgs.
  void collectGlobalThreadIdArguments(SmallSetVector<Value *, 16> &GTIdArgs) {
    // TODO: Below we basically perform a fixpoint iteration with a pessimistic
    //       initialization. We could define an AbstractAttribute instead and
    //       run the Attributor here once it can be run as an SCC pass.

    // Helper to check the argument \p ArgNo at all call sites of \p F for
    // a GTId.
    auto CallArgOpIsGTId = [&](Function &F, unsigned ArgNo, CallInst &RefCI) {
      if (!F.hasLocalLinkage())
        return false;
      for (Use &U : F.uses()) {
        if (CallInst *CI = getCallIfRegularCall(U)) {
          Value *ArgOp = CI->getArgOperand(ArgNo);
          if (CI == &RefCI || GTIdArgs.count(ArgOp) ||
              getCallIfRegularCall(
                  *ArgOp, &OMPInfoCache.RFIs[OMPRTL___kmpc_global_thread_num]))
            continue;
        }
        return false;
      }
      return true;
    };

    // Helper to identify uses of a GTId as GTId arguments.
    auto AddUserArgs = [&](Value &GTId) {
      for (Use &U : GTId.uses())
        if (CallInst *CI = dyn_cast<CallInst>(U.getUser()))
          if (CI->isArgOperand(&U))
            if (Function *Callee = CI->getCalledFunction())
              if (CallArgOpIsGTId(*Callee, U.getOperandNo(), *CI))
                GTIdArgs.insert(Callee->getArg(U.getOperandNo()));
    };

    // The argument users of __kmpc_global_thread_num calls are GTIds.
    OMPInformationCache::RuntimeFunctionInfo &GlobThreadNumRFI =
        OMPInfoCache.RFIs[OMPRTL___kmpc_global_thread_num];

    GlobThreadNumRFI.foreachUse(SCC, [&](Use &U, Function &F) {
      if (CallInst *CI = getCallIfRegularCall(U, &GlobThreadNumRFI))
        AddUserArgs(*CI);
      return false;
    });

    // Transitively search for more arguments by looking at the users of the
    // ones we know already. During the search the GTIdArgs vector is extended
    // so we cannot cache the size nor can we use a range based for.
    for (unsigned u = 0; u < GTIdArgs.size(); ++u)
      AddUserArgs(*GTIdArgs[u]);
  }
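
  /// An illustration of the propagation above (hypothetical input): given
  /// \code
  ///   %gtid = call i32 @__kmpc_global_thread_num(%struct.ident_t* @loc)
  ///   call void @local_helper(i32 %gtid)
  /// \endcode
  /// where @local_helper has local linkage and is only ever passed a GTId in
  /// that position, its first argument is itself recorded as a GTId; later,
  /// __kmpc_global_thread_num calls inside @local_helper can be replaced by
  /// that argument.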

  /// Kernel (=GPU) optimizations and utility functions
  ///
  ///{{

  /// Check if \p F is a kernel, hence entry point for target offloading.
  bool isKernel(Function &F) { return OMPInfoCache.Kernels.count(&F); }

  /// Cache to remember the unique kernel for a function.
  DenseMap<Function *, Optional<Kernel>> UniqueKernelMap;

  /// Find the unique kernel that will execute \p F, if any.
  Kernel getUniqueKernelFor(Function &F);

  /// Find the unique kernel that will execute \p I, if any.
  Kernel getUniqueKernelFor(Instruction &I) {
    return getUniqueKernelFor(*I.getFunction());
  }

  /// Rewrite the device (=GPU) code state machine created in non-SPMD mode in
  /// the cases where we can avoid taking the address of a function.
  bool rewriteDeviceCodeStateMachine();

  ///
  ///}}

  /// Emit a remark generically
  ///
  /// This template function can be used to generically emit a remark. The
  /// RemarkKind should be one of the following:
  ///   - OptimizationRemark to indicate a successful optimization attempt
  ///   - OptimizationRemarkMissed to report a failed optimization attempt
  ///   - OptimizationRemarkAnalysis to provide additional information about an
  ///     optimization attempt
  ///
  /// The remark is built using a callback function provided by the caller that
  /// takes a RemarkKind as input and returns a RemarkKind.
  template <typename RemarkKind,
            typename RemarkCallBack = function_ref<RemarkKind(RemarkKind &&)>>
  void emitRemark(Instruction *Inst, StringRef RemarkName,
                  RemarkCallBack &&RemarkCB) const {
    Function *F = Inst->getParent()->getParent();
    auto &ORE = OREGetter(F);

    ORE.emit(
        [&]() { return RemarkCB(RemarkKind(DEBUG_TYPE, RemarkName, Inst)); });
  }
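
  /// A minimal usage sketch (illustrative only; `CI` is assumed to be the
  /// instruction the remark is attached to):
  /// \code
  ///   emitRemark<OptimizationRemarkMissed>(
  ///       CI, "OpenMPExample", [&](OptimizationRemarkMissed ORM) {
  ///         return ORM << "could not optimize this call";
  ///       });
  /// \endcode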

  /// Emit a remark on a function. Since only OptimizationRemark is supporting
  /// this, it can't be made generic.
  void
  emitRemarkOnFunction(Function *F, StringRef RemarkName,
                       function_ref<OptimizationRemark(OptimizationRemark &&)>
                           &&RemarkCB) const {
    auto &ORE = OREGetter(F);

    ORE.emit([&]() {
      return RemarkCB(OptimizationRemark(DEBUG_TYPE, RemarkName, F));
    });
  }

  /// The underlying module.
  Module &M;

  /// The SCC we are operating on.
  SmallVectorImpl<Function *> &SCC;

  /// Callback to update the call graph, the first argument is a removed call,
  /// the second an optional replacement call.
  CallGraphUpdater &CGUpdater;

  /// Callback to get an OptimizationRemarkEmitter from a Function *
  OptimizationRemarkGetter OREGetter;

  /// OpenMP-specific information cache. Also used for Attributor runs.
  OMPInformationCache &OMPInfoCache;

  /// Attributor instance.
  Attributor &A;

  /// Helper function to run Attributor on SCC.
  bool runAttributor() {
    if (SCC.empty())
      return false;

    registerAAs();

    ChangeStatus Changed = A.run();

    LLVM_DEBUG(dbgs() << "[Attributor] Done with " << SCC.size()
                      << " functions, result: " << Changed << ".\n");

    return Changed == ChangeStatus::CHANGED;
  }

  /// Populate the Attributor with abstract attribute opportunities in the
  /// function.
  void registerAAs() {
    if (SCC.empty())
      return;

    // Create CallSite AA for all Getters.
    for (int Idx = 0; Idx < OMPInfoCache.ICVs.size() - 1; ++Idx) {
      auto ICVInfo = OMPInfoCache.ICVs[static_cast<InternalControlVar>(Idx)];

      auto &GetterRFI = OMPInfoCache.RFIs[ICVInfo.Getter];

      auto CreateAA = [&](Use &U, Function &Caller) {
        CallInst *CI = OpenMPOpt::getCallIfRegularCall(U, &GetterRFI);
        if (!CI)
          return false;

        auto &CB = cast<CallBase>(*CI);

        IRPosition CBPos = IRPosition::callsite_function(CB);
        A.getOrCreateAAFor<AAICVTracker>(CBPos);
        return false;
      };

      GetterRFI.foreachUse(SCC, CreateAA);
    }
  }
};
1606 
1607 Kernel OpenMPOpt::getUniqueKernelFor(Function &F) {
1608  if (!OMPInfoCache.ModuleSlice.count(&F))
1609  return nullptr;
1610 
1611  // Use a scope to keep the lifetime of the CachedKernel short.
1612  {
1613  Optional<Kernel> &CachedKernel = UniqueKernelMap[&F];
1614  if (CachedKernel)
1615  return *CachedKernel;
1616 
1617  // TODO: We should use an AA to create an (optimistic and callback
1618  // call-aware) call graph. For now we stick to simple patterns that
1619  // are less powerful, basically the worst fixpoint.
1620  if (isKernel(F)) {
1621  CachedKernel = Kernel(&F);
1622  return *CachedKernel;
1623  }
1624 
1625  CachedKernel = nullptr;
1626  if (!F.hasLocalLinkage()) {
1627 
1628  // See https://openmp.llvm.org/remarks/OptimizationRemarks.html
1629  auto Remark = [&](OptimizationRemark OR) {
1630  return OR << "[OMP100] Potentially unknown OpenMP target region caller";
1631  };
1632  emitRemarkOnFunction(&F, "OMP100", Remark);
1633 
1634  return nullptr;
1635  }
1636  }
1637 
1638  auto GetUniqueKernelForUse = [&](const Use &U) -> Kernel {
1639  if (auto *Cmp = dyn_cast<ICmpInst>(U.getUser())) {
1640  // Allow use in equality comparisons.
1641  if (Cmp->isEquality())
1642  return getUniqueKernelFor(*Cmp);
1643  return nullptr;
1644  }
1645  if (auto *CB = dyn_cast<CallBase>(U.getUser())) {
1646  // Allow direct calls.
1647  if (CB->isCallee(&U))
1648  return getUniqueKernelFor(*CB);
1649  // Allow the use in __kmpc_kernel_prepare_parallel calls.
1650  if (Function *Callee = CB->getCalledFunction())
1651  if (Callee->getName() == "__kmpc_kernel_prepare_parallel")
1652  return getUniqueKernelFor(*CB);
1653  return nullptr;
1654  }
1655  // Disallow every other use.
1656  return nullptr;
1657  };
1658 
1659  // TODO: In the future we want to track more than just a unique kernel.
1660  SmallPtrSet<Kernel, 2> PotentialKernels;
1661  OMPInformationCache::foreachUse(F, [&](const Use &U) {
1662  PotentialKernels.insert(GetUniqueKernelForUse(U));
1663  });
1664 
1665  Kernel K = nullptr;
1666  if (PotentialKernels.size() == 1)
1667  K = *PotentialKernels.begin();
1668 
1669  // Cache the result.
1670  UniqueKernelMap[&F] = K;
1671 
1672  return K;
1673 }
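// Illustration of the use patterns accepted above (editorial sketch, IR names
// hypothetical): an equality comparison against the function pointer and a
// __kmpc_kernel_prepare_parallel call are both traced back to the containing
// kernel; any other use yields nullptr.
//
//   %eq = icmp eq i8* %work_fn, bitcast (void ()* @par_fn to i8*)
//   call void @__kmpc_kernel_prepare_parallel(
//       i8* bitcast (void ()* @par_fn to i8*))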
1674 
1675 bool OpenMPOpt::rewriteDeviceCodeStateMachine() {
1676  OMPInformationCache::RuntimeFunctionInfo &KernelPrepareParallelRFI =
1677  OMPInfoCache.RFIs[OMPRTL___kmpc_kernel_prepare_parallel];
1678 
1679  bool Changed = false;
1680  if (!KernelPrepareParallelRFI)
1681  return Changed;
1682 
1683  for (Function *F : SCC) {
1684 
1685  // Check if the function is used in a __kmpc_kernel_prepare_parallel call
1686  // at all.
1687  bool UnknownUse = false;
1688  bool KernelPrepareUse = false;
1689  unsigned NumDirectCalls = 0;
1690 
1691  SmallVector<Use *, 2> ToBeReplacedStateMachineUses;
1692  OMPInformationCache::foreachUse(*F, [&](Use &U) {
1693  if (auto *CB = dyn_cast<CallBase>(U.getUser()))
1694  if (CB->isCallee(&U)) {
1695  ++NumDirectCalls;
1696  return;
1697  }
1698 
1699  if (isa<ICmpInst>(U.getUser())) {
1700  ToBeReplacedStateMachineUses.push_back(&U);
1701  return;
1702  }
1703  if (!KernelPrepareUse && OpenMPOpt::getCallIfRegularCall(
1704  *U.getUser(), &KernelPrepareParallelRFI)) {
1705  KernelPrepareUse = true;
1706  ToBeReplacedStateMachineUses.push_back(&U);
1707  return;
1708  }
1709  UnknownUse = true;
1710  });
1711 
1712  // Do not emit a remark if we haven't seen a __kmpc_kernel_prepare_parallel
1713  // use.
1714  if (!KernelPrepareUse)
1715  continue;
1716 
1717  {
1718  auto Remark = [&](OptimizationRemark OR) {
1719  return OR << "Found a parallel region that is called in a target "
1720  "region but not part of a combined target construct nor "
1721  "nested inside a target construct without intermediate "
1722  "code. This can lead to excessive register usage for "
1723  "unrelated target regions in the same translation unit "
1724  "due to spurious call edges assumed by ptxas.";
1725  };
1726  emitRemarkOnFunction(F, "OpenMPParallelRegionInNonSPMD", Remark);
1727  }
1728 
1729  // If this ever hits, we should investigate.
1730  // TODO: Checking the number of uses is not a necessary restriction and
1731  // should be lifted.
1732  if (UnknownUse || NumDirectCalls != 1 ||
1733  ToBeReplacedStateMachineUses.size() != 2) {
1734  {
1735  auto Remark = [&](OptimizationRemark OR) {
1736  return OR << "Parallel region is used in "
1737  << (UnknownUse ? "unknown" : "unexpected")
1738  << " ways; will not attempt to rewrite the state machine.";
1739  };
1740  emitRemarkOnFunction(F, "OpenMPParallelRegionInNonSPMD", Remark);
1741  }
1742  continue;
1743  }
1744 
1745  // Even if we have __kmpc_kernel_prepare_parallel calls, we (for now) give
1746  // up if the function is not called from a unique kernel.
1747  Kernel K = getUniqueKernelFor(*F);
1748  if (!K) {
1749  {
1750  auto Remark = [&](OptimizationRemark OR) {
1751  return OR << "Parallel region is not known to be called from a "
1752  "unique single target region (the surrounding function "
1753  "may have external linkage); will not attempt to "
1754  "rewrite the state machine use.";
1755  };
1756  emitRemarkOnFunction(F, "OpenMPParallelRegionInMultipleKernels",
1757  Remark);
1758  }
1759  continue;
1760  }
1761 
1762  // We now know F is a parallel body function called only from the kernel K.
1763  // We also identified the state machine uses in which we replace the
1764  // function pointer by a new global symbol for identification purposes. This
1765  // ensures only direct calls to the function are left.
1766 
1767  {
1768  auto RemarkParallelRegion = [&](OptimizationRemark OR) {
1769  return OR << "Specialize parallel region that is only reached from a "
1770  "single target region to avoid spurious call edges and "
1771  "excessive register usage in other target regions. "
1772  "(parallel region ID: "
1773  << ore::NV("OpenMPParallelRegion", F->getName())
1774  << ", kernel ID: "
1775  << ore::NV("OpenMPTargetRegion", K->getName()) << ")";
1776  };
1777  emitRemarkOnFunction(F, "OpenMPParallelRegionInNonSPMD",
1778  RemarkParallelRegion);
1779  auto RemarkKernel = [&](OptimizationRemark OR) {
1780  return OR << "Target region containing the parallel region that is "
1781  "specialized. (parallel region ID: "
1782  << ore::NV("OpenMPParallelRegion", F->getName())
1783  << ", kernel ID: "
1784  << ore::NV("OpenMPTargetRegion", K->getName()) << ")";
1785  };
1786  emitRemarkOnFunction(K, "OpenMPParallelRegionInNonSPMD", RemarkKernel);
1787  }
1788 
1789  Module &M = *F->getParent();
1790  Type *Int8Ty = Type::getInt8Ty(M.getContext());
1791 
1792  auto *ID = new GlobalVariable(
1793  M, Int8Ty, /* isConstant */ true, GlobalValue::PrivateLinkage,
1794  UndefValue::get(Int8Ty), F->getName() + ".ID");
1795 
1796  for (Use *U : ToBeReplacedStateMachineUses)
1797  U->set(ConstantExpr::getBitCast(ID, U->get()->getType()));
1798 
1799  ++NumOpenMPParallelRegionsReplacedInGPUStateMachine;
1800 
1801  Changed = true;
1802  }
1803 
1804  return Changed;
1805 }
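// Net effect of the rewrite above (editorial sketch, IR names hypothetical):
//
//   call void @__kmpc_kernel_prepare_parallel(
//       i8* bitcast (void ()* @par_fn to i8*))
//
// becomes
//
//   call void @__kmpc_kernel_prepare_parallel(i8* @par_fn.ID)
//
// so that afterwards @par_fn is only reached through direct calls.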
1806 
1807 /// Abstract Attribute for tracking ICV values.
1808 struct AAICVTracker : public StateWrapper<BooleanState, AbstractAttribute> {
1809  using Base = StateWrapper<BooleanState, AbstractAttribute>;
1810  AAICVTracker(const IRPosition &IRP, Attributor &A) : Base(IRP) {}
1811 
1812  void initialize(Attributor &A) override {
1813  Function *F = getAnchorScope();
1814  if (!F || !A.isFunctionIPOAmendable(*F))
1815  indicatePessimisticFixpoint();
1816  }
1817 
1818  /// Returns true if value is assumed to be tracked.
1819  bool isAssumedTracked() const { return getAssumed(); }
1820 
1821  /// Returns true if value is known to be tracked.
1822  bool isKnownTracked() const { return getKnown(); }
1823 
1824  /// Create an abstract attribute view for the position \p IRP.
1825  static AAICVTracker &createForPosition(const IRPosition &IRP, Attributor &A);
1826 
1827  /// Return the value with which \p I can be replaced for specific \p ICV.
1828  virtual Optional<Value *> getReplacementValue(InternalControlVar ICV,
1829  const Instruction *I,
1830  Attributor &A) const {
1831  return None;
1832  }
1833 
1834  /// Return an assumed unique ICV value if a single candidate is found. If
1835  /// there cannot be one, return nullptr. If it is not clear yet, return
1836  /// None.
1837  virtual Optional<Value *>
1838  getUniqueReplacementValue(InternalControlVar ICV) const = 0;
1839 
1840  // Currently only nthreads is being tracked.
1841  // This array will only grow over time.
1842  InternalControlVar TrackableICVs[1] = {ICV_nthreads};
1843 
1844  /// See AbstractAttribute::getName()
1845  const std::string getName() const override { return "AAICVTracker"; }
1846 
1847  /// See AbstractAttribute::getIdAddr()
1848  const char *getIdAddr() const override { return &ID; }
1849 
1850  /// This function should return true if the type of the \p AA is AAICVTracker
1851  static bool classof(const AbstractAttribute *AA) {
1852  return (AA->getIdAddr() == &ID);
1853  }
1854 
1855  static const char ID;
1856 };
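// Source-level intuition for what the AAICVTracker hierarchy deduces
// (editorial sketch; the actual replacement happens in
// AAICVTrackerCallSite::manifest below):
//
//   omp_set_num_threads(4);         // setter: nthreads ICV is now 4
//   int N = omp_get_max_threads();  // getter: replaceable by the constant 4
//   foo();                          // unknown callee: value assumed clobbered
//   int M = omp_get_max_threads();  // getter: no longer replaceable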
1857 
1858 struct AAICVTrackerFunction : public AAICVTracker {
1859  AAICVTrackerFunction(const IRPosition &IRP, Attributor &A)
1860  : AAICVTracker(IRP, A) {}
1861 
1862  // FIXME: come up with better string.
1863  const std::string getAsStr() const override { return "ICVTrackerFunction"; }
1864 
1865  // FIXME: come up with some stats.
1866  void trackStatistics() const override {}
1867 
1868  /// We don't manifest anything for this AA.
1869  ChangeStatus manifest(Attributor &A) override {
1870  return ChangeStatus::UNCHANGED;
1871  }
1872 
1873  // Map of ICVs to their values at specific program points.
1874  EnumeratedArray<DenseMap<Instruction *, Value *>, InternalControlVar,
1875  InternalControlVar::ICV___last>
1876  ICVReplacementValuesMap;
1877 
1878  ChangeStatus updateImpl(Attributor &A) override {
1879  ChangeStatus HasChanged = ChangeStatus::UNCHANGED;
1880 
1881  Function *F = getAnchorScope();
1882 
1883  auto &OMPInfoCache = static_cast<OMPInformationCache &>(A.getInfoCache());
1884 
1885  for (InternalControlVar ICV : TrackableICVs) {
1886  auto &SetterRFI = OMPInfoCache.RFIs[OMPInfoCache.ICVs[ICV].Setter];
1887 
1888  auto &ValuesMap = ICVReplacementValuesMap[ICV];
1889  auto TrackValues = [&](Use &U, Function &) {
1890  CallInst *CI = OpenMPOpt::getCallIfRegularCall(U);
1891  if (!CI)
1892  return false;
1893 
1894  // FIXME: handle setters with more than one argument.
1895  /// Track new value.
1896  if (ValuesMap.insert(std::make_pair(CI, CI->getArgOperand(0))).second)
1897  HasChanged = ChangeStatus::CHANGED;
1898 
1899  return false;
1900  };
1901 
1902  auto CallCheck = [&](Instruction &I) {
1903  Optional<Value *> ReplVal = getValueForCall(A, &I, ICV);
1904  if (ReplVal.hasValue() &&
1905  ValuesMap.insert(std::make_pair(&I, *ReplVal)).second)
1906  HasChanged = ChangeStatus::CHANGED;
1907 
1908  return true;
1909  };
1910 
1911  // Track all changes of an ICV.
1912  SetterRFI.foreachUse(TrackValues, F);
1913 
1914  A.checkForAllInstructions(CallCheck, *this, {Instruction::Call},
1915  /* CheckBBLivenessOnly */ true);
1916 
1917  /// TODO: Figure out a way to avoid adding an entry to
1918  /// ICVReplacementValuesMap.
1919  Instruction *Entry = &F->getEntryBlock().front();
1920  if (HasChanged == ChangeStatus::CHANGED && !ValuesMap.count(Entry))
1921  ValuesMap.insert(std::make_pair(Entry, nullptr));
1922  }
1923 
1924  return HasChanged;
1925  }
1926 
1927  /// Helper to check if \p I is a call and get the value for it if it is
1928  /// unique.
1929  Optional<Value *> getValueForCall(Attributor &A, const Instruction *I,
1930  InternalControlVar &ICV) const {
1931 
1932  const auto *CB = dyn_cast<CallBase>(I);
1933  if (!CB || CB->hasFnAttr("no_openmp") ||
1934  CB->hasFnAttr("no_openmp_routines"))
1935  return None;
1936 
1937  auto &OMPInfoCache = static_cast<OMPInformationCache &>(A.getInfoCache());
1938  auto &GetterRFI = OMPInfoCache.RFIs[OMPInfoCache.ICVs[ICV].Getter];
1939  auto &SetterRFI = OMPInfoCache.RFIs[OMPInfoCache.ICVs[ICV].Setter];
1940  Function *CalledFunction = CB->getCalledFunction();
1941 
1942  // Indirect call, assume ICV changes.
1943  if (CalledFunction == nullptr)
1944  return nullptr;
1945  if (CalledFunction == GetterRFI.Declaration)
1946  return None;
1947  if (CalledFunction == SetterRFI.Declaration) {
1948  if (ICVReplacementValuesMap[ICV].count(I))
1949  return ICVReplacementValuesMap[ICV].lookup(I);
1950 
1951  return nullptr;
1952  }
1953 
1954  // Since we don't know, assume it changes the ICV.
1955  if (CalledFunction->isDeclaration())
1956  return nullptr;
1957 
1958  const auto &ICVTrackingAA =
1959  A.getAAFor<AAICVTracker>(*this, IRPosition::callsite_returned(*CB));
1960 
1961  if (ICVTrackingAA.isAssumedTracked())
1962  return ICVTrackingAA.getUniqueReplacementValue(ICV);
1963 
1964  // If we don't know, assume it changes.
1965  return nullptr;
1966  }
1967 
1968  // We don't check unique value for a function, so return None.
1969  Optional<Value *>
1970  getUniqueReplacementValue(InternalControlVar ICV) const override {
1971  return None;
1972  }
1973 
1974  /// Return the value with which \p I can be replaced for specific \p ICV.
1975  Optional<Value *> getReplacementValue(InternalControlVar ICV,
1976  const Instruction *I,
1977  Attributor &A) const override {
1978  const auto &ValuesMap = ICVReplacementValuesMap[ICV];
1979  if (ValuesMap.count(I))
1980  return ValuesMap.lookup(I);
1981 
1982  SmallVector<const Instruction *, 16> Worklist;
1983  SmallPtrSet<const Instruction *, 16> Visited;
1984  Worklist.push_back(I);
1985 
1986  Optional<Value *> ReplVal;
1987 
1988  while (!Worklist.empty()) {
1989  const Instruction *CurrInst = Worklist.pop_back_val();
1990  if (!Visited.insert(CurrInst).second)
1991  continue;
1992 
1993  const BasicBlock *CurrBB = CurrInst->getParent();
1994 
1995  // Go up and look for all potential setters/calls that might change the
1996  // ICV.
1997  while ((CurrInst = CurrInst->getPrevNode())) {
1998  if (ValuesMap.count(CurrInst)) {
1999  Optional<Value *> NewReplVal = ValuesMap.lookup(CurrInst);
2000  // Unknown value, track new.
2001  if (!ReplVal.hasValue()) {
2002  ReplVal = NewReplVal;
2003  break;
2004  }
2005 
2006  // If we found a new value, we can't know the ICV value anymore.
2007  if (NewReplVal.hasValue())
2008  if (ReplVal != NewReplVal)
2009  return nullptr;
2010 
2011  break;
2012  }
2013 
2014  Optional<Value *> NewReplVal = getValueForCall(A, CurrInst, ICV);
2015  if (!NewReplVal.hasValue())
2016  continue;
2017 
2018  // Unknown value, track new.
2019  if (!ReplVal.hasValue()) {
2020  ReplVal = NewReplVal;
2021  break;
2022  }
2023 
2024  // NewReplVal is known to have a value here; if it differs from the one
2025  // tracked so far, we can't know the ICV value anymore.
2026  if (ReplVal != NewReplVal)
2027  return nullptr;
2028  }
2029 
2030  // If we are in the same BB and we have a value, we are done.
2031  if (CurrBB == I->getParent() && ReplVal.hasValue())
2032  return ReplVal;
2033 
2034  // Go through all predecessors and add terminators for analysis.
2035  for (const BasicBlock *Pred : predecessors(CurrBB))
2036  if (const Instruction *Terminator = Pred->getTerminator())
2037  Worklist.push_back(Terminator);
2038  }
2039 
2040  return ReplVal;
2041  }
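// Walk intuition (editorial note): starting at the getter, scan backwards in
// the block for a recorded setter value or an ICV-clobbering call; when the
// block start is reached without an answer, continue from every predecessor's
// terminator. Conflicting values on different paths yield nullptr ("unknown");
// a single consistent value is returned as-is.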
2042 };
2043 
2044 struct AAICVTrackerFunctionReturned : AAICVTracker {
2045  AAICVTrackerFunctionReturned(const IRPosition &IRP, Attributor &A)
2046  : AAICVTracker(IRP, A) {}
2047 
2048  // FIXME: come up with better string.
2049  const std::string getAsStr() const override {
2050  return "ICVTrackerFunctionReturned";
2051  }
2052 
2053  // FIXME: come up with some stats.
2054  void trackStatistics() const override {}
2055 
2056  /// We don't manifest anything for this AA.
2057  ChangeStatus manifest(Attributor &A) override {
2058  return ChangeStatus::UNCHANGED;
2059  }
2060 
2061  // Map of ICVs to their values at specific program points.
2062  EnumeratedArray<Optional<Value *>, InternalControlVar,
2063  InternalControlVar::ICV___last>
2064  ICVReplacementValuesMap;
2065 
2066  /// Return the unique replacement value for \p ICV, if any.
2067  Optional<Value *>
2068  getUniqueReplacementValue(InternalControlVar ICV) const override {
2069  return ICVReplacementValuesMap[ICV];
2070  }
2071 
2072  ChangeStatus updateImpl(Attributor &A) override {
2073  ChangeStatus Changed = ChangeStatus::UNCHANGED;
2074  const auto &ICVTrackingAA = A.getAAFor<AAICVTracker>(
2075  *this, IRPosition::function(*getAnchorScope()));
2076 
2077  if (!ICVTrackingAA.isAssumedTracked())
2078  return indicatePessimisticFixpoint();
2079 
2080  for (InternalControlVar ICV : TrackableICVs) {
2081  Optional<Value *> &ReplVal = ICVReplacementValuesMap[ICV];
2082  Optional<Value *> UniqueICVValue;
2083 
2084  auto CheckReturnInst = [&](Instruction &I) {
2085  Optional<Value *> NewReplVal =
2086  ICVTrackingAA.getReplacementValue(ICV, &I, A);
2087 
2088  // If we found a second ICV value there is no unique returned value.
2089  if (UniqueICVValue.hasValue() && UniqueICVValue != NewReplVal)
2090  return false;
2091 
2092  UniqueICVValue = NewReplVal;
2093 
2094  return true;
2095  };
2096 
2097  if (!A.checkForAllInstructions(CheckReturnInst, *this, {Instruction::Ret},
2098  /* CheckBBLivenessOnly */ true))
2099  UniqueICVValue = nullptr;
2100 
2101  if (UniqueICVValue == ReplVal)
2102  continue;
2103 
2104  ReplVal = UniqueICVValue;
2105  Changed = ChangeStatus::CHANGED;
2106  }
2107 
2108  return Changed;
2109  }
2110 };
2111 
2112 struct AAICVTrackerCallSite : AAICVTracker {
2113  AAICVTrackerCallSite(const IRPosition &IRP, Attributor &A)
2114  : AAICVTracker(IRP, A) {}
2115 
2116  void initialize(Attributor &A) override {
2117  Function *F = getAnchorScope();
2118  if (!F || !A.isFunctionIPOAmendable(*F))
2119  indicatePessimisticFixpoint();
2120 
2121  // We only initialize this AA for getters, so we need to know which ICV it
2122  // gets.
2123  auto &OMPInfoCache = static_cast<OMPInformationCache &>(A.getInfoCache());
2124  for (InternalControlVar ICV : TrackableICVs) {
2125  auto ICVInfo = OMPInfoCache.ICVs[ICV];
2126  auto &Getter = OMPInfoCache.RFIs[ICVInfo.Getter];
2127  if (Getter.Declaration == getAssociatedFunction()) {
2128  AssociatedICV = ICVInfo.Kind;
2129  return;
2130  }
2131  }
2132 
2133  /// Unknown ICV.
2134  indicatePessimisticFixpoint();
2135  }
2136 
2137  ChangeStatus manifest(Attributor &A) override {
2138  if (!ReplVal.hasValue() || !ReplVal.getValue())
2139  return ChangeStatus::UNCHANGED;
2140 
2141  A.changeValueAfterManifest(*getCtxI(), **ReplVal);
2142  A.deleteAfterManifest(*getCtxI());
2143 
2144  return ChangeStatus::CHANGED;
2145  }
2146 
2147  // FIXME: come up with better string.
2148  const std::string getAsStr() const override { return "ICVTrackerCallSite"; }
2149 
2150  // FIXME: come up with some stats.
2151  void trackStatistics() const override {}
2152 
2153  InternalControlVar AssociatedICV;
2154  Optional<Value *> ReplVal;
2155 
2156  ChangeStatus updateImpl(Attributor &A) override {
2157  const auto &ICVTrackingAA = A.getAAFor<AAICVTracker>(
2158  *this, IRPosition::function(*getAnchorScope()));
2159 
2160  // We don't have any information, so we assume it changes the ICV.
2161  if (!ICVTrackingAA.isAssumedTracked())
2162  return indicatePessimisticFixpoint();
2163 
2164  Optional<Value *> NewReplVal =
2165  ICVTrackingAA.getReplacementValue(AssociatedICV, getCtxI(), A);
2166 
2167  if (ReplVal == NewReplVal)
2168  return ChangeStatus::UNCHANGED;
2169 
2170  ReplVal = NewReplVal;
2171  return ChangeStatus::CHANGED;
2172  }
2173 
2174  // Return the value with which the associated value can be replaced for
2175  // the specific \p ICV.
2176  Optional<Value *>
2177  getUniqueReplacementValue(InternalControlVar ICV) const override {
2178  return ReplVal;
2179  }
2180 };
2181 
2182 struct AAICVTrackerCallSiteReturned : AAICVTracker {
2183  AAICVTrackerCallSiteReturned(const IRPosition &IRP, Attributor &A)
2184  : AAICVTracker(IRP, A) {}
2185 
2186  // FIXME: come up with better string.
2187  const std::string getAsStr() const override {
2188  return "ICVTrackerCallSiteReturned";
2189  }
2190 
2191  // FIXME: come up with some stats.
2192  void trackStatistics() const override {}
2193 
2194  /// We don't manifest anything for this AA.
2195  ChangeStatus manifest(Attributor &A) override {
2196  return ChangeStatus::UNCHANGED;
2197  }
2198 
2199  // Map of ICVs to their values at specific program points.
2200  EnumeratedArray<Optional<Value *>, InternalControlVar,
2201  InternalControlVar::ICV___last>
2202  ICVReplacementValuesMap;
2203 
2204  /// Return the value with which the associated value can be replaced for
2205  /// the specific \p ICV.
2206  Optional<Value *>
2207  getUniqueReplacementValue(InternalControlVar ICV) const override {
2208  return ICVReplacementValuesMap[ICV];
2209  }
2210 
2211  ChangeStatus updateImpl(Attributor &A) override {
2212  ChangeStatus Changed = ChangeStatus::UNCHANGED;
2213  const auto &ICVTrackingAA = A.getAAFor<AAICVTracker>(
2214  *this, IRPosition::returned(*getAssociatedFunction()));
2215 
2216  // We don't have any information, so we assume it changes the ICV.
2217  if (!ICVTrackingAA.isAssumedTracked())
2218  return indicatePessimisticFixpoint();
2219 
2220  for (InternalControlVar ICV : TrackableICVs) {
2221  Optional<Value *> &ReplVal = ICVReplacementValuesMap[ICV];
2222  Optional<Value *> NewReplVal =
2223  ICVTrackingAA.getUniqueReplacementValue(ICV);
2224 
2225  if (ReplVal == NewReplVal)
2226  continue;
2227 
2228  ReplVal = NewReplVal;
2229  Changed = ChangeStatus::CHANGED;
2230  }
2231  return Changed;
2232  }
2233 };
2234 } // namespace
2235 
2236 const char AAICVTracker::ID = 0;
2237 
2238 AAICVTracker &AAICVTracker::createForPosition(const IRPosition &IRP,
2239  Attributor &A) {
2240  AAICVTracker *AA = nullptr;
2241  switch (IRP.getPositionKind()) {
2242  case IRPosition::IRP_INVALID:
2243  case IRPosition::IRP_FLOAT:
2244  case IRPosition::IRP_ARGUMENT:
2245  case IRPosition::IRP_CALL_SITE_ARGUMENT:
2246  llvm_unreachable("ICVTracker can only be created for function position!");
2247  case IRPosition::IRP_RETURNED:
2248  AA = new (A.Allocator) AAICVTrackerFunctionReturned(IRP, A);
2249  break;
2250  case IRPosition::IRP_CALL_SITE_RETURNED:
2251  AA = new (A.Allocator) AAICVTrackerCallSiteReturned(IRP, A);
2252  break;
2253  case IRPosition::IRP_CALL_SITE:
2254  AA = new (A.Allocator) AAICVTrackerCallSite(IRP, A);
2255  break;
2256  case IRPosition::IRP_FUNCTION:
2257  AA = new (A.Allocator) AAICVTrackerFunction(IRP, A);
2258  break;
2259  }
2260 
2261  return *AA;
2262 }
2263 
2264 PreservedAnalyses OpenMPOptPass::run(LazyCallGraph::SCC &C,
2265  CGSCCAnalysisManager &AM,
2266  LazyCallGraph &CG, CGSCCUpdateResult &UR) {
2267  if (!containsOpenMP(*C.begin()->getFunction().getParent(), OMPInModule))
2268  return PreservedAnalyses::all();
2269 
2270  if (DisableOpenMPOptimizations)
2271  return PreservedAnalyses::all();
2272 
2273  SmallVector<Function *, 16> SCC;
2274  // If there are kernels in the module, we have to run on all SCCs.
2275  bool SCCIsInteresting = !OMPInModule.getKernels().empty();
2276  for (LazyCallGraph::Node &N : C) {
2277  Function *Fn = &N.getFunction();
2278  SCC.push_back(Fn);
2279 
2280  // Do we already know that the SCC contains kernels,
2281  // or that OpenMP functions are called from this SCC?
2282  if (SCCIsInteresting)
2283  continue;
2284  // If not, let's check that.
2285  SCCIsInteresting |= OMPInModule.containsOMPRuntimeCalls(Fn);
2286  }
2287 
2288  if (!SCCIsInteresting || SCC.empty())
2289  return PreservedAnalyses::all();
2290 
2291  FunctionAnalysisManager &FAM =
2292  AM.getResult<FunctionAnalysisManagerCGSCCProxy>(C, CG).getManager();
2293 
2294  AnalysisGetter AG(FAM);
2295 
2296  auto OREGetter = [&FAM](Function *F) -> OptimizationRemarkEmitter & {
2297  return FAM.getResult<OptimizationRemarkEmitterAnalysis>(*F);
2298  };
2299 
2300  CallGraphUpdater CGUpdater;
2301  CGUpdater.initialize(CG, C, AM, UR);
2302 
2303  SetVector<Function *> Functions(SCC.begin(), SCC.end());
2304  BumpPtrAllocator Allocator;
2305  OMPInformationCache InfoCache(*(Functions.back()->getParent()), AG, Allocator,
2306  /*CGSCC*/ Functions, OMPInModule.getKernels());
2307 
2308  Attributor A(Functions, InfoCache, CGUpdater);
2309 
2310  OpenMPOpt OMPOpt(SCC, CGUpdater, OREGetter, InfoCache, A);
2311  bool Changed = OMPOpt.run();
2312  if (Changed)
2313  return PreservedAnalyses::none();
2314 
2315  return PreservedAnalyses::all();
2316 }
2317 
2318 namespace {
2319 
2320 struct OpenMPOptLegacyPass : public CallGraphSCCPass {
2321  CallGraphUpdater CGUpdater;
2322  OpenMPInModule OMPInModule;
2323  static char ID;
2324 
2325  OpenMPOptLegacyPass() : CallGraphSCCPass(ID) {
2326  initializeOpenMPOptLegacyPassPass(*PassRegistry::getPassRegistry());
2327  }
2328 
2329  void getAnalysisUsage(AnalysisUsage &AU) const override {
2330  CallGraphSCCPass::getAnalysisUsage(AU);
2331  }
2332 
2333  bool doInitialization(CallGraph &CG) override {
2334  // Disable the pass if there is no OpenMP (runtime call) in the module.
2335  containsOpenMP(CG.getModule(), OMPInModule);
2336  return false;
2337  }
2338 
2339  bool runOnSCC(CallGraphSCC &CGSCC) override {
2340  if (!containsOpenMP(CGSCC.getCallGraph().getModule(), OMPInModule))
2341  return false;
2342  if (DisableOpenMPOptimizations || skipSCC(CGSCC))
2343  return false;
2344 
2345  SmallVector<Function *, 16> SCC;
2346  // If there are kernels in the module, we have to run on all SCCs.
2347  bool SCCIsInteresting = !OMPInModule.getKernels().empty();
2348  for (CallGraphNode *CGN : CGSCC) {
2349  Function *Fn = CGN->getFunction();
2350  if (!Fn || Fn->isDeclaration())
2351  continue;
2352  SCC.push_back(Fn);
2353 
2354  // Do we already know that the SCC contains kernels,
2355  // or that OpenMP functions are called from this SCC?
2356  if (SCCIsInteresting)
2357  continue;
2358  // If not, let's check that.
2359  SCCIsInteresting |= OMPInModule.containsOMPRuntimeCalls(Fn);
2360  }
2361 
2362  if (!SCCIsInteresting || SCC.empty())
2363  return false;
2364 
2365  CallGraph &CG = getAnalysis<CallGraphWrapperPass>().getCallGraph();
2366  CGUpdater.initialize(CG, CGSCC);
2367 
2368  // Maintain a map of functions to avoid rebuilding the ORE
2369  DenseMap<Function *, std::unique_ptr<OptimizationRemarkEmitter>> OREMap;
2370  auto OREGetter = [&OREMap](Function *F) -> OptimizationRemarkEmitter & {
2371  std::unique_ptr<OptimizationRemarkEmitter> &ORE = OREMap[F];
2372  if (!ORE)
2373  ORE = std::make_unique<OptimizationRemarkEmitter>(F);
2374  return *ORE;
2375  };
2376 
2377  AnalysisGetter AG;
2378  SetVector<Function *> Functions(SCC.begin(), SCC.end());
2379  BumpPtrAllocator Allocator;
2380  OMPInformationCache InfoCache(
2381  *(Functions.back()->getParent()), AG, Allocator,
2382  /*CGSCC*/ Functions, OMPInModule.getKernels());
2383 
2384  Attributor A(Functions, InfoCache, CGUpdater);
2385 
2386  OpenMPOpt OMPOpt(SCC, CGUpdater, OREGetter, InfoCache, A);
2387  return OMPOpt.run();
2388  }
2389 
2390  bool doFinalization(CallGraph &CG) override { return CGUpdater.finalize(); }
2391 };
2392 
2393 } // end anonymous namespace
2394 
2395 void OpenMPInModule::identifyKernels(Module &M) {
2396 
2397  NamedMDNode *MD = M.getOrInsertNamedMetadata("nvvm.annotations");
2398  if (!MD)
2399  return;
2400 
2401  for (auto *Op : MD->operands()) {
2402  if (Op->getNumOperands() < 2)
2403  continue;
2404  MDString *KindID = dyn_cast<MDString>(Op->getOperand(1));
2405  if (!KindID || KindID->getString() != "kernel")
2406  continue;
2407 
2408  Function *KernelFn =
2409  mdconst::dyn_extract_or_null<Function>(Op->getOperand(0));
2410  if (!KernelFn)
2411  continue;
2412 
2413  ++NumOpenMPTargetRegionKernels;
2414 
2415  Kernels.insert(KernelFn);
2416  }
2417 }
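// Metadata shape consumed above (editorial sketch of the usual NVVM form):
//
//   !nvvm.annotations = !{!0}
//   !0 = !{void ()* @kernel_fn, !"kernel", i32 1}
//
// Operand 0 is the kernel function, operand 1 the "kernel" tag.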
2418 
2419 bool llvm::omp::containsOpenMP(Module &M, OpenMPInModule &OMPInModule) {
2420  if (OMPInModule.isKnown())
2421  return OMPInModule;
2422 
2423  auto RecordFunctionsContainingUsesOf = [&](Function *F) {
2424  for (User *U : F->users())
2425  if (auto *I = dyn_cast<Instruction>(U))
2426  OMPInModule.FuncsWithOMPRuntimeCalls.insert(I->getFunction());
2427  };
2428 
2429  // MSVC doesn't like long if-else chains for some reason and instead just
2430  // issues an error. Work around it.
2431  do {
2432 #define OMP_RTL(_Enum, _Name, ...) \
2433  if (Function *F = M.getFunction(_Name)) { \
2434  RecordFunctionsContainingUsesOf(F); \
2435  OMPInModule = true; \
2436  }
2437 #include "llvm/Frontend/OpenMP/OMPKinds.def"
2438  } while (false);
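// One expansion of the OMP_RTL(...) instantiation above (illustrative, using
// omp_get_thread_num as the runtime call):
//
//   if (Function *F = M.getFunction("omp_get_thread_num")) {
//     RecordFunctionsContainingUsesOf(F);
//     OMPInModule = true;
//   }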
2439 
2440  // Identify kernels once. TODO: We should split the OMPInformationCache into a
2441  // module and an SCC part. The kernel information, among other things, could
2442  // go into the module part.
2443  if (OMPInModule.isKnown() && OMPInModule) {
2444  OMPInModule.identifyKernels(M);
2445  return true;
2446  }
2447 
2448  return OMPInModule = false;
2449 }
2450 
2451 char OpenMPOptLegacyPass::ID = 0;
2452 
2453 INITIALIZE_PASS_BEGIN(OpenMPOptLegacyPass, "openmpopt",
2454  "OpenMP specific optimizations", false, false)
2455 INITIALIZE_PASS_DEPENDENCY(CallGraphWrapperPass)
2456 INITIALIZE_PASS_END(OpenMPOptLegacyPass, "openmpopt",
2457  "OpenMP specific optimizations", false, false)
2458 
2459 Pass *llvm::createOpenMPOptLegacyPass() { return new OpenMPOptLegacyPass(); }
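// Invocation sketch (editorial note): under the legacy pass manager this pass
// runs as `opt -openmpopt ...`, using the name registered above; the new pass
// manager entry point OpenMPOptPass::run is wired into the CGSCC pipeline via
// PassRegistry.def (pipeline name assumed to be "openmp-opt").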