LLVM 14.0.0git
OpenMPOpt.cpp
1 //===-- IPO/OpenMPOpt.cpp - Collection of OpenMP specific optimizations ---===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 // OpenMP specific optimizations:
10 //
11 // - Deduplication of runtime calls, e.g., omp_get_thread_num.
12 // - Replacing globalized device memory with stack memory.
13 // - Replacing globalized device memory with shared memory.
14 // - Parallel region merging.
15 // - Transforming generic-mode device kernels to SPMD mode.
16 // - Specializing the state machine for generic-mode device kernels.
17 //
18 //===----------------------------------------------------------------------===//
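//
// Illustrative example (added for exposition, not part of the original
// source) of the first item above: two queries of runtime state whose result
// cannot change in between are collapsed into one, e.g.
//
//   void foo() {
//     int A = omp_get_thread_num();
//     // ... code that cannot change the thread number ...
//     int B = omp_get_thread_num(); // after the optimization, reuses A
//   }
//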
19 
20 #include "llvm/Transforms/IPO/OpenMPOpt.h"
21 
22 #include "llvm/ADT/EnumeratedArray.h"
23 #include "llvm/ADT/PostOrderIterator.h"
24 #include "llvm/ADT/Statistic.h"
25 #include "llvm/ADT/StringRef.h"
26 #include "llvm/Analysis/CallGraph.h"
27 #include "llvm/Analysis/CallGraphSCCPass.h"
28 #include "llvm/Analysis/OptimizationRemarkEmitter.h"
29 #include "llvm/Analysis/ValueTracking.h"
30 #include "llvm/Frontend/OpenMP/OMPConstants.h"
31 #include "llvm/Frontend/OpenMP/OMPIRBuilder.h"
32 #include "llvm/IR/Assumptions.h"
33 #include "llvm/IR/DiagnosticInfo.h"
34 #include "llvm/IR/GlobalValue.h"
35 #include "llvm/IR/Instruction.h"
36 #include "llvm/IR/IntrinsicInst.h"
37 #include "llvm/IR/IntrinsicsAMDGPU.h"
38 #include "llvm/IR/IntrinsicsNVPTX.h"
39 #include "llvm/InitializePasses.h"
40 #include "llvm/Support/CommandLine.h"
41 #include "llvm/Transforms/IPO.h"
42 #include "llvm/Transforms/IPO/Attributor.h"
43 #include "llvm/Transforms/Utils/BasicBlockUtils.h"
44 #include "llvm/Transforms/Utils/CallGraphUpdater.h"
45 #include "llvm/Transforms/Utils/CodeExtractor.h"
46 
47 #include <algorithm>
48 
49 using namespace llvm;
50 using namespace omp;
51 
52 #define DEBUG_TYPE "openmp-opt"
53 
54 static cl::opt<bool> DisableOpenMPOptimizations(
55  "openmp-opt-disable", cl::ZeroOrMore,
56  cl::desc("Disable OpenMP specific optimizations."), cl::Hidden,
57  cl::init(false));
58 
59 static cl::opt<bool> EnableParallelRegionMerging(
60  "openmp-opt-enable-merging", cl::ZeroOrMore,
61  cl::desc("Enable the OpenMP region merging optimization."), cl::Hidden,
62  cl::init(false));
63 
64 static cl::opt<bool>
65  DisableInternalization("openmp-opt-disable-internalization", cl::ZeroOrMore,
66  cl::desc("Disable function internalization."),
67  cl::Hidden, cl::init(false));
68 
69 static cl::opt<bool> PrintICVValues("openmp-print-icv-values", cl::init(false),
70  cl::Hidden);
71 static cl::opt<bool> PrintOpenMPKernels("openmp-print-gpu-kernels",
72  cl::init(false), cl::Hidden);
73 
74 static cl::opt<bool> HideMemoryTransferLatency(
75  "openmp-hide-memory-transfer-latency",
76  cl::desc("[WIP] Tries to hide the latency of host to device memory"
77  " transfers"),
78  cl::Hidden, cl::init(false));
79 
80 static cl::opt<bool> DisableOpenMPOptDeglobalization(
81  "openmp-opt-disable-deglobalization", cl::ZeroOrMore,
82  cl::desc("Disable OpenMP optimizations involving deglobalization."),
83  cl::Hidden, cl::init(false));
84 
85 static cl::opt<bool> DisableOpenMPOptSPMDization(
86  "openmp-opt-disable-spmdization", cl::ZeroOrMore,
87  cl::desc("Disable OpenMP optimizations involving SPMD-ization."),
88  cl::Hidden, cl::init(false));
89 
90 static cl::opt<bool> DisableOpenMPOptFolding(
91  "openmp-opt-disable-folding", cl::ZeroOrMore,
92  cl::desc("Disable OpenMP optimizations involving folding."), cl::Hidden,
93  cl::init(false));
94 
95 static cl::opt<bool> DisableOpenMPOptStateMachineRewrite(
96  "openmp-opt-disable-state-machine-rewrite", cl::ZeroOrMore,
97  cl::desc("Disable OpenMP optimizations that replace the state machine."),
98  cl::Hidden, cl::init(false));
99 
100 static cl::opt<bool> PrintModuleAfterOptimizations(
101  "openmp-opt-print-module", cl::ZeroOrMore,
102  cl::desc("Print the current module after OpenMP optimizations."),
103  cl::Hidden, cl::init(false));
104 
105 static cl::opt<bool> AlwaysInlineDeviceFunctions(
106  "openmp-opt-inline-device", cl::ZeroOrMore,
107  cl::desc("Inline all applicable functions on the device."), cl::Hidden,
108  cl::init(false));
109 
110 static cl::opt<bool>
111  EnableVerboseRemarks("openmp-opt-verbose-remarks", cl::ZeroOrMore,
112  cl::desc("Enables more verbose remarks."), cl::Hidden,
113  cl::init(false));
114 
115 static cl::opt<unsigned>
116  SetFixpointIterations("openmp-opt-max-iterations", cl::Hidden,
117  cl::desc("Maximal number of attributor iterations."),
118  cl::init(256));
119 
120 STATISTIC(NumOpenMPRuntimeCallsDeduplicated,
121  "Number of OpenMP runtime calls deduplicated");
122 STATISTIC(NumOpenMPParallelRegionsDeleted,
123  "Number of OpenMP parallel regions deleted");
124 STATISTIC(NumOpenMPRuntimeFunctionsIdentified,
125  "Number of OpenMP runtime functions identified");
126 STATISTIC(NumOpenMPRuntimeFunctionUsesIdentified,
127  "Number of OpenMP runtime function uses identified");
128 STATISTIC(NumOpenMPTargetRegionKernels,
129  "Number of OpenMP target region entry points (=kernels) identified");
130 STATISTIC(NumOpenMPTargetRegionKernelsSPMD,
131  "Number of OpenMP target region entry points (=kernels) executed in "
132  "SPMD-mode instead of generic-mode");
133 STATISTIC(NumOpenMPTargetRegionKernelsWithoutStateMachine,
134  "Number of OpenMP target region entry points (=kernels) executed in "
135  "generic-mode without a state machine");
136 STATISTIC(NumOpenMPTargetRegionKernelsCustomStateMachineWithFallback,
137  "Number of OpenMP target region entry points (=kernels) executed in "
138  "generic-mode with customized state machines with fallback");
139 STATISTIC(NumOpenMPTargetRegionKernelsCustomStateMachineWithoutFallback,
140  "Number of OpenMP target region entry points (=kernels) executed in "
141  "generic-mode with customized state machines without fallback");
142 STATISTIC(
143  NumOpenMPParallelRegionsReplacedInGPUStateMachine,
144  "Number of OpenMP parallel regions replaced with ID in GPU state machines");
145 STATISTIC(NumOpenMPParallelRegionsMerged,
146  "Number of OpenMP parallel regions merged");
147 STATISTIC(NumBytesMovedToSharedMemory,
148  "Amount of memory pushed to shared memory");
149 
150 #if !defined(NDEBUG)
151 static constexpr auto TAG = "[" DEBUG_TYPE "]";
152 #endif
153 
154 namespace {
155 
156 enum class AddressSpace : unsigned {
157  Generic = 0,
158  Global = 1,
159  Shared = 3,
160  Constant = 4,
161  Local = 5,
162 };
163 
164 struct AAHeapToShared;
165 
166 struct AAICVTracker;
167 
168 /// OpenMP specific information. For now, stores RFIs and ICVs also needed for
169 /// Attributor runs.
170 struct OMPInformationCache : public InformationCache {
171  OMPInformationCache(Module &M, AnalysisGetter &AG,
172  BumpPtrAllocator &Allocator, SetVector<Function *> &CGSCC,
173  KernelSet &Kernels)
174  : InformationCache(M, AG, Allocator, &CGSCC), OMPBuilder(M),
175  Kernels(Kernels) {
176 
177  OMPBuilder.initialize();
178  initializeRuntimeFunctions();
179  initializeInternalControlVars();
180  }
181 
182  /// Generic information that describes an internal control variable.
183  struct InternalControlVarInfo {
184  /// The kind, as described by InternalControlVar enum.
185  InternalControlVar Kind;
186 
187  /// The name of the ICV.
188  StringRef Name;
189 
190  /// Environment variable associated with this ICV.
191  StringRef EnvVarName;
192 
193  /// Initial value kind.
194  ICVInitValue InitKind;
195 
196  /// Initial value.
197  ConstantInt *InitValue;
198 
199  /// Setter RTL function associated with this ICV.
200  RuntimeFunction Setter;
201 
202  /// Getter RTL function associated with this ICV.
203  RuntimeFunction Getter;
204 
205  /// RTL Function corresponding to the override clause of this ICV
206  RuntimeFunction Clause;
207  };
208 
209  /// Generic information that describes a runtime function
210  struct RuntimeFunctionInfo {
211 
212  /// The kind, as described by the RuntimeFunction enum.
213  RuntimeFunction Kind;
214 
215  /// The name of the function.
216  StringRef Name;
217 
218  /// Flag to indicate a variadic function.
219  bool IsVarArg;
220 
221  /// The return type of the function.
222  Type *ReturnType;
223 
224  /// The argument types of the function.
225  SmallVector<Type *, 8> ArgumentTypes;
226 
227  /// The declaration if available.
228  Function *Declaration = nullptr;
229 
230  /// Uses of this runtime function per function containing the use.
231  using UseVector = SmallVector<Use *, 16>;
232 
233  /// Clear UsesMap for runtime function.
234  void clearUsesMap() { UsesMap.clear(); }
235 
236  /// Boolean conversion that is true if the runtime function was found.
237  operator bool() const { return Declaration; }
238 
239  /// Return the vector of uses in function \p F.
240  UseVector &getOrCreateUseVector(Function *F) {
241  std::shared_ptr<UseVector> &UV = UsesMap[F];
242  if (!UV)
243  UV = std::make_shared<UseVector>();
244  return *UV;
245  }
246 
247  /// Return the vector of uses in function \p F or `nullptr` if there are
248  /// none.
249  const UseVector *getUseVector(Function &F) const {
250  auto I = UsesMap.find(&F);
251  if (I != UsesMap.end())
252  return I->second.get();
253  return nullptr;
254  }
255 
256  /// Return how many functions contain uses of this runtime function.
257  size_t getNumFunctionsWithUses() const { return UsesMap.size(); }
258 
259  /// Return the number of arguments (or the minimal number for variadic
260  /// functions).
261  size_t getNumArgs() const { return ArgumentTypes.size(); }
262 
263  /// Run the callback \p CB on each use and forget the use if the result is
264  /// true. The callback will be fed the function in which the use was
265  /// encountered as second argument.
266  void foreachUse(SmallVectorImpl<Function *> &SCC,
267  function_ref<bool(Use &, Function &)> CB) {
268  for (Function *F : SCC)
269  foreachUse(CB, F);
270  }
271 
272  /// Run the callback \p CB on each use within the function \p F and forget
273  /// the use if the result is true.
274  void foreachUse(function_ref<bool(Use &, Function &)> CB, Function *F) {
275  SmallVector<unsigned, 8> ToBeDeleted;
276  ToBeDeleted.clear();
277 
278  unsigned Idx = 0;
279  UseVector &UV = getOrCreateUseVector(F);
280 
281  for (Use *U : UV) {
282  if (CB(*U, *F))
283  ToBeDeleted.push_back(Idx);
284  ++Idx;
285  }
286 
287  // Remove the to-be-deleted indices in reverse order as prior
288  // modifications will not modify the smaller indices.
289  while (!ToBeDeleted.empty()) {
290  unsigned Idx = ToBeDeleted.pop_back_val();
291  UV[Idx] = UV.back();
292  UV.pop_back();
293  }
294  }
295 
296  private:
297  /// Map from functions to all uses of this runtime function contained in
298  /// them.
299  DenseMap<Function *, std::shared_ptr<UseVector>> UsesMap;
300 
301  public:
302  /// Iterators for the uses of this runtime function.
303  decltype(UsesMap)::iterator begin() { return UsesMap.begin(); }
304  decltype(UsesMap)::iterator end() { return UsesMap.end(); }
305  };
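 // Illustrative sketch (added for exposition, not part of the original
 // source): how a RuntimeFunctionInfo is typically consumed. A transformation
 // looks up the info for one runtime function and visits every recorded
 // call-site use, e.g.:
 //
 //   auto &RFI = OMPInfoCache.RFIs[OMPRTL___kmpc_fork_call];
 //   RFI.foreachUse(SCC, [&](Use &U, Function &F) {
 //     if (CallInst *CI = OpenMPOpt::getCallIfRegularCall(U, &RFI)) {
 //       // ... inspect or rewrite CI ...
 //     }
 //     return false; // returning true would forget this use
 //   });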
306 
307  /// An OpenMP-IR-Builder instance
308  OpenMPIRBuilder OMPBuilder;
309 
310  /// Map from runtime function kind to the runtime function description.
311  EnumeratedArray<RuntimeFunctionInfo, RuntimeFunction,
312  RuntimeFunction::OMPRTL___last>
313  RFIs;
314 
315  /// Map from function declarations/definitions to their runtime enum type.
316  DenseMap<Function *, RuntimeFunction> RuntimeFunctionIDMap;
317 
318  /// Map from ICV kind to the ICV description.
319  EnumeratedArray<InternalControlVarInfo, InternalControlVar,
320  InternalControlVar::ICV___last>
321  ICVs;
322 
323  /// Helper to initialize all internal control variable information for those
324  /// defined in OMPKinds.def.
325  void initializeInternalControlVars() {
326 #define ICV_RT_SET(_Name, RTL) \
327  { \
328  auto &ICV = ICVs[_Name]; \
329  ICV.Setter = RTL; \
330  }
331 #define ICV_RT_GET(Name, RTL) \
332  { \
333  auto &ICV = ICVs[Name]; \
334  ICV.Getter = RTL; \
335  }
336 #define ICV_DATA_ENV(Enum, _Name, _EnvVarName, Init) \
337  { \
338  auto &ICV = ICVs[Enum]; \
339  ICV.Name = _Name; \
340  ICV.Kind = Enum; \
341  ICV.InitKind = Init; \
342  ICV.EnvVarName = _EnvVarName; \
343  switch (ICV.InitKind) { \
344  case ICV_IMPLEMENTATION_DEFINED: \
345  ICV.InitValue = nullptr; \
346  break; \
347  case ICV_ZERO: \
348  ICV.InitValue = ConstantInt::get( \
349  Type::getInt32Ty(OMPBuilder.Int32->getContext()), 0); \
350  break; \
351  case ICV_FALSE: \
352  ICV.InitValue = ConstantInt::getFalse(OMPBuilder.Int1->getContext()); \
353  break; \
354  case ICV_LAST: \
355  break; \
356  } \
357  }
358 #include "llvm/Frontend/OpenMP/OMPKinds.def"
359  }
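 // Illustrative note (added for exposition, not part of the original source):
 // for a hypothetical OMPKinds.def entry along the lines of
 //
 //   ICV_DATA_ENV(ICV_nthreads, "nthreads-var", "OMP_NUM_THREADS",
 //                ICV_IMPLEMENTATION_DEFINED)
 //
 // the ICV_DATA_ENV expansion above fills ICVs[ICV_nthreads] with
 // Name = "nthreads-var", EnvVarName = "OMP_NUM_THREADS",
 // InitKind = ICV_IMPLEMENTATION_DEFINED, and InitValue = nullptr, while
 // ICV_RT_SET/ICV_RT_GET attach the associated setter and getter runtime
 // functions.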
360 
361  /// Returns true if the function declaration \p F matches the runtime
362  /// function types, that is, return type \p RTFRetType, and argument types
363  /// \p RTFArgTypes.
364  static bool declMatchesRTFTypes(Function *F, Type *RTFRetType,
365  SmallVector<Type *, 8> &RTFArgTypes) {
366  // TODO: We should output information to the user (under debug output
367  // and via remarks).
368 
369  if (!F)
370  return false;
371  if (F->getReturnType() != RTFRetType)
372  return false;
373  if (F->arg_size() != RTFArgTypes.size())
374  return false;
375 
376  auto *RTFTyIt = RTFArgTypes.begin();
377  for (Argument &Arg : F->args()) {
378  if (Arg.getType() != *RTFTyIt)
379  return false;
380 
381  ++RTFTyIt;
382  }
383 
384  return true;
385  }
386 
387  // Helper to collect all uses of the declaration in the UsesMap.
388  unsigned collectUses(RuntimeFunctionInfo &RFI, bool CollectStats = true) {
389  unsigned NumUses = 0;
390  if (!RFI.Declaration)
391  return NumUses;
392  OMPBuilder.addAttributes(RFI.Kind, *RFI.Declaration);
393 
394  if (CollectStats) {
395  NumOpenMPRuntimeFunctionsIdentified += 1;
396  NumOpenMPRuntimeFunctionUsesIdentified += RFI.Declaration->getNumUses();
397  }
398 
399  // TODO: We directly convert uses into proper calls and unknown uses.
400  for (Use &U : RFI.Declaration->uses()) {
401  if (Instruction *UserI = dyn_cast<Instruction>(U.getUser())) {
402  if (ModuleSlice.count(UserI->getFunction())) {
403  RFI.getOrCreateUseVector(UserI->getFunction()).push_back(&U);
404  ++NumUses;
405  }
406  } else {
407  RFI.getOrCreateUseVector(nullptr).push_back(&U);
408  ++NumUses;
409  }
410  }
411  return NumUses;
412  }
413 
414  // Helper function to recollect uses of a runtime function.
415  void recollectUsesForFunction(RuntimeFunction RTF) {
416  auto &RFI = RFIs[RTF];
417  RFI.clearUsesMap();
418  collectUses(RFI, /*CollectStats*/ false);
419  }
420 
421  // Helper function to recollect uses of all runtime functions.
422  void recollectUses() {
423  for (int Idx = 0; Idx < RFIs.size(); ++Idx)
424  recollectUsesForFunction(static_cast<RuntimeFunction>(Idx));
425  }
426 
427  /// Helper to initialize all runtime function information for those defined
428  /// in OMPKinds.def.
429  void initializeRuntimeFunctions() {
430  Module &M = *((*ModuleSlice.begin())->getParent());
431 
432  // Helper macros for handling __VA_ARGS__ in OMP_RTL
433 #define OMP_TYPE(VarName, ...) \
434  Type *VarName = OMPBuilder.VarName; \
435  (void)VarName;
436 
437 #define OMP_ARRAY_TYPE(VarName, ...) \
438  ArrayType *VarName##Ty = OMPBuilder.VarName##Ty; \
439  (void)VarName##Ty; \
440  PointerType *VarName##PtrTy = OMPBuilder.VarName##PtrTy; \
441  (void)VarName##PtrTy;
442 
443 #define OMP_FUNCTION_TYPE(VarName, ...) \
444  FunctionType *VarName = OMPBuilder.VarName; \
445  (void)VarName; \
446  PointerType *VarName##Ptr = OMPBuilder.VarName##Ptr; \
447  (void)VarName##Ptr;
448 
449 #define OMP_STRUCT_TYPE(VarName, ...) \
450  StructType *VarName = OMPBuilder.VarName; \
451  (void)VarName; \
452  PointerType *VarName##Ptr = OMPBuilder.VarName##Ptr; \
453  (void)VarName##Ptr;
454 
455 #define OMP_RTL(_Enum, _Name, _IsVarArg, _ReturnType, ...) \
456  { \
457  SmallVector<Type *, 8> ArgsTypes({__VA_ARGS__}); \
458  Function *F = M.getFunction(_Name); \
459  RTLFunctions.insert(F); \
460  if (declMatchesRTFTypes(F, OMPBuilder._ReturnType, ArgsTypes)) { \
461  RuntimeFunctionIDMap[F] = _Enum; \
462  F->removeFnAttr(Attribute::NoInline); \
463  auto &RFI = RFIs[_Enum]; \
464  RFI.Kind = _Enum; \
465  RFI.Name = _Name; \
466  RFI.IsVarArg = _IsVarArg; \
467  RFI.ReturnType = OMPBuilder._ReturnType; \
468  RFI.ArgumentTypes = std::move(ArgsTypes); \
469  RFI.Declaration = F; \
470  unsigned NumUses = collectUses(RFI); \
471  (void)NumUses; \
472  LLVM_DEBUG({ \
473  dbgs() << TAG << RFI.Name << (RFI.Declaration ? "" : " not") \
474  << " found\n"; \
475  if (RFI.Declaration) \
476  dbgs() << TAG << "-> got " << NumUses << " uses in " \
477  << RFI.getNumFunctionsWithUses() \
478  << " different functions.\n"; \
479  }); \
480  } \
481  }
482 #include "llvm/Frontend/OpenMP/OMPKinds.def"
483 
484  // TODO: We should attach the attributes defined in OMPKinds.def.
485  }
486 
487  /// Collection of known kernels (\see Kernel) in the module.
488  KernelSet &Kernels;
489 
490  /// Collection of known OpenMP runtime functions.
491  DenseSet<const Function *> RTLFunctions;
492 };
493 
494 template <typename Ty, bool InsertInvalidates = true>
495 struct BooleanStateWithSetVector : public BooleanState {
496  bool contains(const Ty &Elem) const { return Set.contains(Elem); }
497  bool insert(const Ty &Elem) {
498  if (InsertInvalidates)
499  BooleanState::indicatePessimisticFixpoint();
500  return Set.insert(Elem);
501  }
502 
503  const Ty &operator[](int Idx) const { return Set[Idx]; }
504  bool operator==(const BooleanStateWithSetVector &RHS) const {
505  return BooleanState::operator==(RHS) && Set == RHS.Set;
506  }
507  bool operator!=(const BooleanStateWithSetVector &RHS) const {
508  return !(*this == RHS);
509  }
510 
511  bool empty() const { return Set.empty(); }
512  size_t size() const { return Set.size(); }
513 
514  /// "Clamp" this state with \p RHS.
515  BooleanStateWithSetVector &operator^=(const BooleanStateWithSetVector &RHS) {
516  BooleanState::operator^=(RHS);
517  Set.insert(RHS.Set.begin(), RHS.Set.end());
518  return *this;
519  }
520 
521 private:
522  /// A set to keep track of elements.
523  SetVector<Ty> Set;
524 
525 public:
526  typename decltype(Set)::iterator begin() { return Set.begin(); }
527  typename decltype(Set)::iterator end() { return Set.end(); }
528  typename decltype(Set)::const_iterator begin() const { return Set.begin(); }
529  typename decltype(Set)::const_iterator end() const { return Set.end(); }
530 };
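// Illustrative sketch (added for exposition, not part of the original source)
// of the "clamp" semantics above, assuming F0 and F1 are functions reached
// along two different paths:
//
//   BooleanStateWithSetVector<Function *, /*InsertInvalidates=*/false> A, B;
//   A.insert(&F0);
//   B.insert(&F1);
//   A ^= B; // A now tracks {&F0, &F1}; the boolean validity is clamped via
//           // BooleanState::operator^= as well.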
531 
532 template <typename Ty, bool InsertInvalidates = true>
533 using BooleanStateWithPtrSetVector =
534  BooleanStateWithSetVector<Ty *, InsertInvalidates>;
535 
536 struct KernelInfoState : AbstractState {
537  /// Flag to track if we reached a fixpoint.
538  bool IsAtFixpoint = false;
539 
540  /// The parallel regions (identified by the outlined parallel functions) that
541  /// can be reached from the associated function.
542  BooleanStateWithPtrSetVector<Function, /* InsertInvalidates */ false>
543  ReachedKnownParallelRegions;
544 
545  /// State to track what parallel region we might reach.
546  BooleanStateWithPtrSetVector<CallBase> ReachedUnknownParallelRegions;
547 
548  /// State to track if we are in SPMD-mode, assumed or known, and why we decided
549  /// we cannot be. If it is assumed, then RequiresFullRuntime should also be
550  /// false.
551  BooleanStateWithPtrSetVector<Instruction, false> SPMDCompatibilityTracker;
552 
553  /// The __kmpc_target_init call in this kernel, if any. If we find more than
554  /// one we abort as the kernel is malformed.
555  CallBase *KernelInitCB = nullptr;
556 
557  /// The __kmpc_target_deinit call in this kernel, if any. If we find more than
558  /// one we abort as the kernel is malformed.
559  CallBase *KernelDeinitCB = nullptr;
560 
561  /// Flag to indicate if the associated function is a kernel entry.
562  bool IsKernelEntry = false;
563 
564  /// State to track what kernel entries can reach the associated function.
565  BooleanStateWithPtrSetVector<Function, false> ReachingKernelEntries;
566 
567  /// State to indicate if we can track the parallel level of the associated
568  /// function. We will give up tracking if we encounter an unknown caller or
569  /// the caller is __kmpc_parallel_51.
570  BooleanStateWithSetVector<uint8_t> ParallelLevels;
571 
572  /// Abstract State interface
573  ///{
574 
575  KernelInfoState() {}
576  KernelInfoState(bool BestState) {
577  if (!BestState)
578  indicatePessimisticFixpoint();
579  }
580 
581  /// See AbstractState::isValidState(...)
582  bool isValidState() const override { return true; }
583 
584  /// See AbstractState::isAtFixpoint(...)
585  bool isAtFixpoint() const override { return IsAtFixpoint; }
586 
587  /// See AbstractState::indicatePessimisticFixpoint(...)
588  ChangeStatus indicatePessimisticFixpoint() override {
589  IsAtFixpoint = true;
590  ReachingKernelEntries.indicatePessimisticFixpoint();
591  SPMDCompatibilityTracker.indicatePessimisticFixpoint();
592  ReachedKnownParallelRegions.indicatePessimisticFixpoint();
593  ReachedUnknownParallelRegions.indicatePessimisticFixpoint();
594  return ChangeStatus::CHANGED;
595  }
596 
597  /// See AbstractState::indicateOptimisticFixpoint(...)
598  ChangeStatus indicateOptimisticFixpoint() override {
599  IsAtFixpoint = true;
600  ReachingKernelEntries.indicateOptimisticFixpoint();
601  SPMDCompatibilityTracker.indicateOptimisticFixpoint();
602  ReachedKnownParallelRegions.indicateOptimisticFixpoint();
603  ReachedUnknownParallelRegions.indicateOptimisticFixpoint();
604  return ChangeStatus::UNCHANGED;
605  }
606 
607  /// Return the assumed state
608  KernelInfoState &getAssumed() { return *this; }
609  const KernelInfoState &getAssumed() const { return *this; }
610 
611  bool operator==(const KernelInfoState &RHS) const {
612  if (SPMDCompatibilityTracker != RHS.SPMDCompatibilityTracker)
613  return false;
614  if (ReachedKnownParallelRegions != RHS.ReachedKnownParallelRegions)
615  return false;
616  if (ReachedUnknownParallelRegions != RHS.ReachedUnknownParallelRegions)
617  return false;
618  if (ReachingKernelEntries != RHS.ReachingKernelEntries)
619  return false;
620  return true;
621  }
622 
623  /// Returns true if this kernel contains any OpenMP parallel regions.
624  bool mayContainParallelRegion() {
625  return !ReachedKnownParallelRegions.empty() ||
626  !ReachedUnknownParallelRegions.empty();
627  }
628 
629  /// Return empty set as the best state of potential values.
630  static KernelInfoState getBestState() { return KernelInfoState(true); }
631 
632  static KernelInfoState getBestState(KernelInfoState &KIS) {
633  return getBestState();
634  }
635 
636  /// Return full set as the worst state of potential values.
637  static KernelInfoState getWorstState() { return KernelInfoState(false); }
638 
639  /// "Clamp" this state with \p KIS.
640  KernelInfoState operator^=(const KernelInfoState &KIS) {
641  // Do not merge two different _init and _deinit call sites.
642  if (KIS.KernelInitCB) {
643  if (KernelInitCB && KernelInitCB != KIS.KernelInitCB)
644  llvm_unreachable("Kernel that calls another kernel violates OpenMP-Opt "
645  "assumptions.");
646  KernelInitCB = KIS.KernelInitCB;
647  }
648  if (KIS.KernelDeinitCB) {
649  if (KernelDeinitCB && KernelDeinitCB != KIS.KernelDeinitCB)
650  llvm_unreachable("Kernel that calls another kernel violates OpenMP-Opt "
651  "assumptions.");
652  KernelDeinitCB = KIS.KernelDeinitCB;
653  }
654  SPMDCompatibilityTracker ^= KIS.SPMDCompatibilityTracker;
655  ReachedKnownParallelRegions ^= KIS.ReachedKnownParallelRegions;
656  ReachedUnknownParallelRegions ^= KIS.ReachedUnknownParallelRegions;
657  return *this;
658  }
659 
660  KernelInfoState operator&=(const KernelInfoState &KIS) {
661  return (*this ^= KIS);
662  }
663 
664  ///}
665 };
666 
667 /// Used to map the values physically (in the IR) stored in an offload
668 /// array, to a vector in memory.
669 struct OffloadArray {
670  /// Physical array (in the IR).
671  AllocaInst *Array = nullptr;
672  /// Mapped values.
673  SmallVector<Value *, 8> StoredValues;
674  /// Last stores made in the offload array.
675  SmallVector<StoreInst *, 8> LastAccesses;
676 
677  OffloadArray() = default;
678 
679  /// Initializes the OffloadArray with the values stored in \p Array before
680  /// instruction \p Before is reached. Returns false if the initialization
681  /// fails.
682  /// This MUST be used immediately after the construction of the object.
683  bool initialize(AllocaInst &Array, Instruction &Before) {
684  if (!Array.getAllocatedType()->isArrayTy())
685  return false;
686 
687  if (!getValues(Array, Before))
688  return false;
689 
690  this->Array = &Array;
691  return true;
692  }
693 
694  static const unsigned DeviceIDArgNum = 1;
695  static const unsigned BasePtrsArgNum = 3;
696  static const unsigned PtrsArgNum = 4;
697  static const unsigned SizesArgNum = 5;
698 
699 private:
700  /// Traverses the BasicBlock where \p Array is, collecting the stores made to
701  /// \p Array, leaving StoredValues with the values stored before the
702  /// instruction \p Before is reached.
703  bool getValues(AllocaInst &Array, Instruction &Before) {
704  // Initialize container.
705  const uint64_t NumValues = Array.getAllocatedType()->getArrayNumElements();
706  StoredValues.assign(NumValues, nullptr);
707  LastAccesses.assign(NumValues, nullptr);
708 
709  // TODO: This assumes the instruction \p Before is in the same
710  // BasicBlock as Array. Make it general, for any control flow graph.
711  BasicBlock *BB = Array.getParent();
712  if (BB != Before.getParent())
713  return false;
714 
715  const DataLayout &DL = Array.getModule()->getDataLayout();
716  const unsigned int PointerSize = DL.getPointerSize();
717 
718  for (Instruction &I : *BB) {
719  if (&I == &Before)
720  break;
721 
722  if (!isa<StoreInst>(&I))
723  continue;
724 
725  auto *S = cast<StoreInst>(&I);
726  int64_t Offset = -1;
727  auto *Dst =
728  GetPointerBaseWithConstantOffset(S->getPointerOperand(), Offset, DL);
729  if (Dst == &Array) {
730  int64_t Idx = Offset / PointerSize;
731  StoredValues[Idx] = getUnderlyingObject(S->getValueOperand());
732  LastAccesses[Idx] = S;
733  }
734  }
735 
736  return isFilled();
737  }
738 
739  /// Returns true if all values in StoredValues and
740  /// LastAccesses are not nullptrs.
741  bool isFilled() {
742  const unsigned NumValues = StoredValues.size();
743  for (unsigned I = 0; I < NumValues; ++I) {
744  if (!StoredValues[I] || !LastAccesses[I])
745  return false;
746  }
747 
748  return true;
749  }
750 };
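// Illustrative sketch (added for exposition, not part of the original
// source): typical use of OffloadArray, mirroring getValuesInOffloadArrays
// further down in this file:
//
//   OffloadArray OA;
//   auto *BasePtrsArray = dyn_cast<AllocaInst>(getUnderlyingObject(Arg));
//   if (BasePtrsArray && OA.initialize(*BasePtrsArray, RuntimeCall)) {
//     // OA.StoredValues[i] is the value last stored to slot i before the
//     // runtime call; OA.LastAccesses[i] is the corresponding store.
//   }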
751 
752 struct OpenMPOpt {
753 
754  using OptimizationRemarkGetter =
755  function_ref<OptimizationRemarkEmitter &(Function *)>;
756 
757  OpenMPOpt(SmallVectorImpl<Function *> &SCC, CallGraphUpdater &CGUpdater,
758  OptimizationRemarkGetter OREGetter,
759  OMPInformationCache &OMPInfoCache, Attributor &A)
760  : M(*(*SCC.begin())->getParent()), SCC(SCC), CGUpdater(CGUpdater),
761  OREGetter(OREGetter), OMPInfoCache(OMPInfoCache), A(A) {}
762 
763  /// Check if any remarks are enabled for openmp-opt
764  bool remarksEnabled() {
765  auto &Ctx = M.getContext();
766  return Ctx.getDiagHandlerPtr()->isAnyRemarkEnabled(DEBUG_TYPE);
767  }
768 
769  /// Run all OpenMP optimizations on the underlying SCC/ModuleSlice.
770  bool run(bool IsModulePass) {
771  if (SCC.empty())
772  return false;
773 
774  bool Changed = false;
775 
776  LLVM_DEBUG(dbgs() << TAG << "Run on SCC with " << SCC.size()
777  << " functions in a slice with "
778  << OMPInfoCache.ModuleSlice.size() << " functions\n");
779 
780  if (IsModulePass) {
781  Changed |= runAttributor(IsModulePass);
782 
783  // Recollect uses, in case Attributor deleted any.
784  OMPInfoCache.recollectUses();
785 
786  // TODO: This should be folded into buildCustomStateMachine.
787  Changed |= rewriteDeviceCodeStateMachine();
788 
789  if (remarksEnabled())
790  analysisGlobalization();
791  } else {
792  if (PrintICVValues)
793  printICVs();
794  if (PrintOpenMPKernels)
795  printKernels();
796 
797  Changed |= runAttributor(IsModulePass);
798 
799  // Recollect uses, in case Attributor deleted any.
800  OMPInfoCache.recollectUses();
801 
802  Changed |= deleteParallelRegions();
803 
804  if (HideMemoryTransferLatency)
805  Changed |= hideMemTransfersLatency();
806  Changed |= deduplicateRuntimeCalls();
807  if (EnableParallelRegionMerging) {
808  if (mergeParallelRegions()) {
809  deduplicateRuntimeCalls();
810  Changed = true;
811  }
812  }
813  }
814 
815  return Changed;
816  }
817 
818  /// Print initial ICV values for testing.
819  /// FIXME: This should be done from the Attributor once it is added.
820  void printICVs() const {
821  InternalControlVar ICVs[] = {ICV_nthreads, ICV_active_levels, ICV_cancel,
822  ICV_proc_bind};
823 
824  for (Function *F : OMPInfoCache.ModuleSlice) {
825  for (auto ICV : ICVs) {
826  auto ICVInfo = OMPInfoCache.ICVs[ICV];
827  auto Remark = [&](OptimizationRemarkAnalysis ORA) {
828  return ORA << "OpenMP ICV " << ore::NV("OpenMPICV", ICVInfo.Name)
829  << " Value: "
830  << (ICVInfo.InitValue
831  ? toString(ICVInfo.InitValue->getValue(), 10, true)
832  : "IMPLEMENTATION_DEFINED");
833  };
834 
835  emitRemark<OptimizationRemarkAnalysis>(F, "OpenMPICVTracker", Remark);
836  }
837  }
838  }
839 
840  /// Print OpenMP GPU kernels for testing.
841  void printKernels() const {
842  for (Function *F : SCC) {
843  if (!OMPInfoCache.Kernels.count(F))
844  continue;
845 
846  auto Remark = [&](OptimizationRemarkAnalysis ORA) {
847  return ORA << "OpenMP GPU kernel "
848  << ore::NV("OpenMPGPUKernel", F->getName()) << "\n";
849  };
850 
851  emitRemark<OptimizationRemarkAnalysis>(F, "OpenMPGPU", Remark);
852  }
853  }
854 
855  /// Return the call if \p U is a callee use in a regular call. If \p RFI is
856  /// given it has to be the callee or a nullptr is returned.
857  static CallInst *getCallIfRegularCall(
858  Use &U, OMPInformationCache::RuntimeFunctionInfo *RFI = nullptr) {
859  CallInst *CI = dyn_cast<CallInst>(U.getUser());
860  if (CI && CI->isCallee(&U) && !CI->hasOperandBundles() &&
861  (!RFI ||
862  (RFI->Declaration && CI->getCalledFunction() == RFI->Declaration)))
863  return CI;
864  return nullptr;
865  }
866 
867  /// Return the call if \p V is a regular call. If \p RFI is given it has to be
868  /// the callee or a nullptr is returned.
869  static CallInst *getCallIfRegularCall(
870  Value &V, OMPInformationCache::RuntimeFunctionInfo *RFI = nullptr) {
871  CallInst *CI = dyn_cast<CallInst>(&V);
872  if (CI && !CI->hasOperandBundles() &&
873  (!RFI ||
874  (RFI->Declaration && CI->getCalledFunction() == RFI->Declaration)))
875  return CI;
876  return nullptr;
877  }
878 
879 private:
880  /// Merge parallel regions when it is safe.
881  bool mergeParallelRegions() {
882  const unsigned CallbackCalleeOperand = 2;
883  const unsigned CallbackFirstArgOperand = 3;
884  using InsertPointTy = OpenMPIRBuilder::InsertPointTy;
885 
886  // Check if there are any __kmpc_fork_call calls to merge.
887  OMPInformationCache::RuntimeFunctionInfo &RFI =
888  OMPInfoCache.RFIs[OMPRTL___kmpc_fork_call];
889 
890  if (!RFI.Declaration)
891  return false;
892 
893  // Unmergable calls that prevent merging a parallel region.
894  OMPInformationCache::RuntimeFunctionInfo UnmergableCallsInfo[] = {
895  OMPInfoCache.RFIs[OMPRTL___kmpc_push_proc_bind],
896  OMPInfoCache.RFIs[OMPRTL___kmpc_push_num_threads],
897  };
898 
899  bool Changed = false;
900  LoopInfo *LI = nullptr;
901  DominatorTree *DT = nullptr;
902 
903  DenseMap<BasicBlock *, SmallPtrSet<Instruction *, 4>> BB2PRMap;
904 
905  BasicBlock *StartBB = nullptr, *EndBB = nullptr;
906  auto BodyGenCB = [&](InsertPointTy AllocaIP, InsertPointTy CodeGenIP,
907  BasicBlock &ContinuationIP) {
908  BasicBlock *CGStartBB = CodeGenIP.getBlock();
909  BasicBlock *CGEndBB =
910  SplitBlock(CGStartBB, &*CodeGenIP.getPoint(), DT, LI);
911  assert(StartBB != nullptr && "StartBB should not be null");
912  CGStartBB->getTerminator()->setSuccessor(0, StartBB);
913  assert(EndBB != nullptr && "EndBB should not be null");
914  EndBB->getTerminator()->setSuccessor(0, CGEndBB);
915  };
916 
917  auto PrivCB = [&](InsertPointTy AllocaIP, InsertPointTy CodeGenIP, Value &,
918  Value &Inner, Value *&ReplacementValue) -> InsertPointTy {
919  ReplacementValue = &Inner;
920  return CodeGenIP;
921  };
922 
923  auto FiniCB = [&](InsertPointTy CodeGenIP) {};
924 
925  /// Create a sequential execution region within a merged parallel region,
926  /// encapsulated in a master construct with a barrier for synchronization.
927  auto CreateSequentialRegion = [&](Function *OuterFn,
928  BasicBlock *OuterPredBB,
929  Instruction *SeqStartI,
930  Instruction *SeqEndI) {
931  // Isolate the instructions of the sequential region to a separate
932  // block.
933  BasicBlock *ParentBB = SeqStartI->getParent();
934  BasicBlock *SeqEndBB =
935  SplitBlock(ParentBB, SeqEndI->getNextNode(), DT, LI);
936  BasicBlock *SeqAfterBB =
937  SplitBlock(SeqEndBB, &*SeqEndBB->getFirstInsertionPt(), DT, LI);
938  BasicBlock *SeqStartBB =
939  SplitBlock(ParentBB, SeqStartI, DT, LI, nullptr, "seq.par.merged");
940 
941  assert(ParentBB->getUniqueSuccessor() == SeqStartBB &&
942  "Expected a different CFG");
943  const DebugLoc DL = ParentBB->getTerminator()->getDebugLoc();
944  ParentBB->getTerminator()->eraseFromParent();
945 
946  auto BodyGenCB = [&](InsertPointTy AllocaIP, InsertPointTy CodeGenIP,
947  BasicBlock &ContinuationIP) {
948  BasicBlock *CGStartBB = CodeGenIP.getBlock();
949  BasicBlock *CGEndBB =
950  SplitBlock(CGStartBB, &*CodeGenIP.getPoint(), DT, LI);
951  assert(SeqStartBB != nullptr && "SeqStartBB should not be null");
952  CGStartBB->getTerminator()->setSuccessor(0, SeqStartBB);
953  assert(SeqEndBB != nullptr && "SeqEndBB should not be null");
954  SeqEndBB->getTerminator()->setSuccessor(0, CGEndBB);
955  };
956  auto FiniCB = [&](InsertPointTy CodeGenIP) {};
957 
958  // Find outputs from the sequential region to outside users and
959  // broadcast their values to them.
960  for (Instruction &I : *SeqStartBB) {
961  SmallPtrSet<Instruction *, 4> OutsideUsers;
962  for (User *Usr : I.users()) {
963  Instruction &UsrI = *cast<Instruction>(Usr);
964  // Ignore outputs to LT intrinsics, code extraction for the merged
965  // parallel region will fix them.
966  if (UsrI.isLifetimeStartOrEnd())
967  continue;
968 
969  if (UsrI.getParent() != SeqStartBB)
970  OutsideUsers.insert(&UsrI);
971  }
972 
973  if (OutsideUsers.empty())
974  continue;
975 
976  // Emit an alloca in the outer region to store the broadcasted
977  // value.
978  const DataLayout &DL = M.getDataLayout();
979  AllocaInst *AllocaI = new AllocaInst(
980  I.getType(), DL.getAllocaAddrSpace(), nullptr,
981  I.getName() + ".seq.output.alloc", &OuterFn->front().front());
982 
983  // Emit a store instruction in the sequential BB to update the
984  // value.
985  new StoreInst(&I, AllocaI, SeqStartBB->getTerminator());
986 
987  // Emit a load instruction and replace the use of the output value
988  // with it.
989  for (Instruction *UsrI : OutsideUsers) {
990  LoadInst *LoadI = new LoadInst(
991  I.getType(), AllocaI, I.getName() + ".seq.output.load", UsrI);
992  UsrI->replaceUsesOfWith(&I, LoadI);
993  }
994  }
995 
996  OpenMPIRBuilder::LocationDescription Loc(
997  InsertPointTy(ParentBB, ParentBB->end()), DL);
998  InsertPointTy SeqAfterIP =
999  OMPInfoCache.OMPBuilder.createMaster(Loc, BodyGenCB, FiniCB);
1000 
1001  OMPInfoCache.OMPBuilder.createBarrier(SeqAfterIP, OMPD_parallel);
1002 
1003  BranchInst::Create(SeqAfterBB, SeqAfterIP.getBlock());
1004 
1005  LLVM_DEBUG(dbgs() << TAG << "After sequential inlining " << *OuterFn
1006  << "\n");
1007  };
1008 
1009  // Helper to merge the __kmpc_fork_call calls in MergableCIs. They are all
1010  // contained in BB and only separated by instructions that can be
1011  // redundantly executed in parallel. The block BB is split before the first
1012  // call (in MergableCIs) and after the last so the entire region we merge
1013  // into a single parallel region is contained in a single basic block
1014  // without any other instructions. We use the OpenMPIRBuilder to outline
1015  // that block and call the resulting function via __kmpc_fork_call.
1016  auto Merge = [&](SmallVectorImpl<CallInst *> &MergableCIs, BasicBlock *BB) {
1017  // TODO: Change the interface to allow single CIs expanded, e.g, to
1018  // include an outer loop.
1019  assert(MergableCIs.size() > 1 && "Assumed multiple mergable CIs");
1020 
1021  auto Remark = [&](OptimizationRemark OR) {
1022  OR << "Parallel region merged with parallel region"
1023  << (MergableCIs.size() > 2 ? "s" : "") << " at ";
1024  for (auto *CI : llvm::drop_begin(MergableCIs)) {
1025  OR << ore::NV("OpenMPParallelMerge", CI->getDebugLoc());
1026  if (CI != MergableCIs.back())
1027  OR << ", ";
1028  }
1029  return OR << ".";
1030  };
1031 
1032  emitRemark<OptimizationRemark>(MergableCIs.front(), "OMP150", Remark);
1033 
1034  Function *OriginalFn = BB->getParent();
1035  LLVM_DEBUG(dbgs() << TAG << "Merge " << MergableCIs.size()
1036  << " parallel regions in " << OriginalFn->getName()
1037  << "\n");
1038 
1039  // Isolate the calls to merge in a separate block.
1040  EndBB = SplitBlock(BB, MergableCIs.back()->getNextNode(), DT, LI);
1041  BasicBlock *AfterBB =
1042  SplitBlock(EndBB, &*EndBB->getFirstInsertionPt(), DT, LI);
1043  StartBB = SplitBlock(BB, MergableCIs.front(), DT, LI, nullptr,
1044  "omp.par.merged");
1045 
1046  assert(BB->getUniqueSuccessor() == StartBB && "Expected a different CFG");
1047  const DebugLoc DL = BB->getTerminator()->getDebugLoc();
1048  BB->getTerminator()->eraseFromParent();
1049 
1050  // Create sequential regions for sequential instructions that are
1051  // in-between mergable parallel regions.
1052  for (auto *It = MergableCIs.begin(), *End = MergableCIs.end() - 1;
1053  It != End; ++It) {
1054  Instruction *ForkCI = *It;
1055  Instruction *NextForkCI = *(It + 1);
1056 
1057  // Continue if there are no in-between instructions.
1058  if (ForkCI->getNextNode() == NextForkCI)
1059  continue;
1060 
1061  CreateSequentialRegion(OriginalFn, BB, ForkCI->getNextNode(),
1062  NextForkCI->getPrevNode());
1063  }
1064 
1065  OpenMPIRBuilder::LocationDescription Loc(InsertPointTy(BB, BB->end()),
1066  DL);
1067  IRBuilder<>::InsertPoint AllocaIP(
1068  &OriginalFn->getEntryBlock(),
1069  OriginalFn->getEntryBlock().getFirstInsertionPt());
1070  // Create the merged parallel region with default proc binding, to
1071  // avoid overriding binding settings, and without explicit cancellation.
1072  InsertPointTy AfterIP = OMPInfoCache.OMPBuilder.createParallel(
1073  Loc, AllocaIP, BodyGenCB, PrivCB, FiniCB, nullptr, nullptr,
1074  OMP_PROC_BIND_default, /* IsCancellable */ false);
1075  BranchInst::Create(AfterBB, AfterIP.getBlock());
1076 
1077  // Perform the actual outlining.
1078  OMPInfoCache.OMPBuilder.finalize(OriginalFn,
1079  /* AllowExtractorSinking */ true);
1080 
1081  Function *OutlinedFn = MergableCIs.front()->getCaller();
1082 
1083  // Replace the __kmpc_fork_call calls with direct calls to the outlined
1084  // callbacks.
1085  SmallVector<Value *, 8> Args;
1086  for (auto *CI : MergableCIs) {
1087  Value *Callee =
1088  CI->getArgOperand(CallbackCalleeOperand)->stripPointerCasts();
1089  FunctionType *FT =
1090  cast<FunctionType>(Callee->getType()->getPointerElementType());
1091  Args.clear();
1092  Args.push_back(OutlinedFn->getArg(0));
1093  Args.push_back(OutlinedFn->getArg(1));
1094  for (unsigned U = CallbackFirstArgOperand, E = CI->arg_size(); U < E;
1095  ++U)
1096  Args.push_back(CI->getArgOperand(U));
1097 
1098  CallInst *NewCI = CallInst::Create(FT, Callee, Args, "", CI);
1099  if (CI->getDebugLoc())
1100  NewCI->setDebugLoc(CI->getDebugLoc());
1101 
1102  // Forward parameter attributes from the callback to the callee.
1103  for (unsigned U = CallbackFirstArgOperand, E = CI->arg_size(); U < E;
1104  ++U)
1105  for (const Attribute &A : CI->getAttributes().getParamAttrs(U))
1106  NewCI->addParamAttr(
1107  U - (CallbackFirstArgOperand - CallbackCalleeOperand), A);
1108 
1109  // Emit an explicit barrier to replace the implicit fork-join barrier.
1110  if (CI != MergableCIs.back()) {
1111  // TODO: Remove barrier if the merged parallel region includes the
1112  // 'nowait' clause.
1113  OMPInfoCache.OMPBuilder.createBarrier(
1114  InsertPointTy(NewCI->getParent(),
1115  NewCI->getNextNode()->getIterator()),
1116  OMPD_parallel);
1117  }
1118 
1119  CI->eraseFromParent();
1120  }
1121 
1122  assert(OutlinedFn != OriginalFn && "Outlining failed");
1123  CGUpdater.registerOutlinedFunction(*OriginalFn, *OutlinedFn);
1124  CGUpdater.reanalyzeFunction(*OriginalFn);
1125 
1126  NumOpenMPParallelRegionsMerged += MergableCIs.size();
1127 
1128  return true;
1129  };
1130 
1131  // Helper function that identifies sequences of
1132  // __kmpc_fork_call uses in a basic block.
1133  auto DetectPRsCB = [&](Use &U, Function &F) {
1134  CallInst *CI = getCallIfRegularCall(U, &RFI);
1135  BB2PRMap[CI->getParent()].insert(CI);
1136 
1137  return false;
1138  };
1139 
1140  BB2PRMap.clear();
1141  RFI.foreachUse(SCC, DetectPRsCB);
1142  SmallVector<SmallVector<CallInst *, 4>, 4> MergableCIsVector;
1143  // Find mergable parallel regions within a basic block that are
1144  // safe to merge, that is any in-between instructions can safely
1145  // execute in parallel after merging.
1146  // TODO: support merging across basic-blocks.
1147  for (auto &It : BB2PRMap) {
1148  auto &CIs = It.getSecond();
1149  if (CIs.size() < 2)
1150  continue;
1151 
1152  BasicBlock *BB = It.getFirst();
1153  SmallVector<CallInst *, 4> MergableCIs;
1154 
1155  /// Returns true if the instruction is mergable, false otherwise.
1156  /// A terminator instruction is unmergable by definition since merging
1157  /// works within a BB. Instructions before the mergable region are
1158  /// mergable if they are not calls to OpenMP runtime functions that may
1159  /// set different execution parameters for subsequent parallel regions.
1160  /// Instructions in-between parallel regions are mergable if they are not
1161  /// calls to any non-intrinsic function since that may call a non-mergable
1162  /// OpenMP runtime function.
1163  auto IsMergable = [&](Instruction &I, bool IsBeforeMergableRegion) {
1164  // We do not merge across BBs, hence return false (unmergable) if the
1165  // instruction is a terminator.
1166  if (I.isTerminator())
1167  return false;
1168 
1169  if (!isa<CallInst>(&I))
1170  return true;
1171 
1172  CallInst *CI = cast<CallInst>(&I);
1173  if (IsBeforeMergableRegion) {
1174  Function *CalledFunction = CI->getCalledFunction();
1175  if (!CalledFunction)
1176  return false;
1177  // Return false (unmergable) if the call before the parallel
1178  // region calls an explicit affinity (proc_bind) or number of
1179  // threads (num_threads) compiler-generated function. Those settings
1180  // may be incompatible with following parallel regions.
1181  // TODO: ICV tracking to detect compatibility.
1182  for (const auto &RFI : UnmergableCallsInfo) {
1183  if (CalledFunction == RFI.Declaration)
1184  return false;
1185  }
1186  } else {
1187  // Return false (unmergable) if there is a call instruction
1188  // in-between parallel regions when it is not an intrinsic. It
1189  // may call an unmergable OpenMP runtime function in its callpath.
1190  // TODO: Keep track of possible OpenMP calls in the callpath.
1191  if (!isa<IntrinsicInst>(CI))
1192  return false;
1193  }
1194 
1195  return true;
1196  };
1197  // Find maximal number of parallel region CIs that are safe to merge.
1198  for (auto It = BB->begin(), End = BB->end(); It != End;) {
1199  Instruction &I = *It;
1200  ++It;
1201 
1202  if (CIs.count(&I)) {
1203  MergableCIs.push_back(cast<CallInst>(&I));
1204  continue;
1205  }
1206 
1207  // Continue expanding if the instruction is mergable.
1208  if (IsMergable(I, MergableCIs.empty()))
1209  continue;
1210 
1211  // Forward the instruction iterator to skip the next parallel region
1212  // since there is an unmergable instruction which can affect it.
1213  for (; It != End; ++It) {
1214  Instruction &SkipI = *It;
1215  if (CIs.count(&SkipI)) {
1216  LLVM_DEBUG(dbgs() << TAG << "Skip parallel region " << SkipI
1217  << " due to " << I << "\n");
1218  ++It;
1219  break;
1220  }
1221  }
1222 
1223  // Store mergable regions found.
1224  if (MergableCIs.size() > 1) {
1225  MergableCIsVector.push_back(MergableCIs);
1226  LLVM_DEBUG(dbgs() << TAG << "Found " << MergableCIs.size()
1227  << " parallel regions in block " << BB->getName()
1228  << " of function " << BB->getParent()->getName()
1229  << "\n";);
1230  }
1231 
1232  MergableCIs.clear();
1233  }
1234 
1235  if (!MergableCIsVector.empty()) {
1236  Changed = true;
1237 
1238  for (auto &MergableCIs : MergableCIsVector)
1239  Merge(MergableCIs, BB);
1240  MergableCIsVector.clear();
1241  }
1242  }
1243 
1244  if (Changed) {
1245  /// Re-collect use for fork calls, emitted barrier calls, and
1246  /// any emitted master/end_master calls.
1247  OMPInfoCache.recollectUsesForFunction(OMPRTL___kmpc_fork_call);
1248  OMPInfoCache.recollectUsesForFunction(OMPRTL___kmpc_barrier);
1249  OMPInfoCache.recollectUsesForFunction(OMPRTL___kmpc_master);
1250  OMPInfoCache.recollectUsesForFunction(OMPRTL___kmpc_end_master);
1251  }
1252 
1253  return Changed;
1254  }
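 // Illustrative sketch (added for exposition, not part of the original
 // source) of the source-level effect of mergeParallelRegions, assuming the
 // code between the two regions is safe to execute inside a parallel region:
 //
 //   #pragma omp parallel
 //   { work1(); }
 //   sequential_code();
 //   #pragma omp parallel
 //   { work2(); }
 //
 // conceptually becomes a single fork-join region:
 //
 //   #pragma omp parallel
 //   {
 //     work1();
 //     #pragma omp barrier
 //     #pragma omp master
 //     { sequential_code(); }
 //     #pragma omp barrier
 //     work2();
 //   }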
1255 
1256  /// Try to delete parallel regions if possible.
1257  bool deleteParallelRegions() {
1258  const unsigned CallbackCalleeOperand = 2;
1259 
1260  OMPInformationCache::RuntimeFunctionInfo &RFI =
1261  OMPInfoCache.RFIs[OMPRTL___kmpc_fork_call];
1262 
1263  if (!RFI.Declaration)
1264  return false;
1265 
1266  bool Changed = false;
1267  auto DeleteCallCB = [&](Use &U, Function &) {
1268  CallInst *CI = getCallIfRegularCall(U);
1269  if (!CI)
1270  return false;
1271  auto *Fn = dyn_cast<Function>(
1272  CI->getArgOperand(CallbackCalleeOperand)->stripPointerCasts());
1273  if (!Fn)
1274  return false;
1275  if (!Fn->onlyReadsMemory())
1276  return false;
1277  if (!Fn->hasFnAttribute(Attribute::WillReturn))
1278  return false;
1279 
1280  LLVM_DEBUG(dbgs() << TAG << "Delete read-only parallel region in "
1281  << CI->getCaller()->getName() << "\n");
1282 
1283  auto Remark = [&](OptimizationRemark OR) {
1284  return OR << "Removing parallel region with no side-effects.";
1285  };
1286  emitRemark<OptimizationRemark>(CI, "OMP160", Remark);
1287 
1288  CGUpdater.removeCallSite(*CI);
1289  CI->eraseFromParent();
1290  Changed = true;
1291  ++NumOpenMPParallelRegionsDeleted;
1292  return true;
1293  };
1294 
1295  RFI.foreachUse(SCC, DeleteCallCB);
1296 
1297  return Changed;
1298  }
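 // Illustrative example (added for exposition, not part of the original
 // source): a parallel region whose outlined callback only reads memory and
 // is known to return, e.g.
 //
 //   #pragma omp parallel
 //   { int Tmp = ReadOnlyData[0]; (void)Tmp; }
 //
 // has no observable effect and can be deleted once the callback is proven
 // readonly and willreturn, which is exactly what the checks above look for.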
1299 
1300  /// Try to eliminate runtime calls by reusing existing ones.
1301  bool deduplicateRuntimeCalls() {
1302  bool Changed = false;
1303 
1304  RuntimeFunction DeduplicableRuntimeCallIDs[] = {
1305  OMPRTL_omp_get_num_threads,
1306  OMPRTL_omp_in_parallel,
1307  OMPRTL_omp_get_cancellation,
1308  OMPRTL_omp_get_thread_limit,
1309  OMPRTL_omp_get_supported_active_levels,
1310  OMPRTL_omp_get_level,
1311  OMPRTL_omp_get_ancestor_thread_num,
1312  OMPRTL_omp_get_team_size,
1313  OMPRTL_omp_get_active_level,
1314  OMPRTL_omp_in_final,
1315  OMPRTL_omp_get_proc_bind,
1316  OMPRTL_omp_get_num_places,
1317  OMPRTL_omp_get_num_procs,
1318  OMPRTL_omp_get_place_num,
1319  OMPRTL_omp_get_partition_num_places,
1320  OMPRTL_omp_get_partition_place_nums};
1321 
1322  // Global-tid is handled separately.
1323  SmallSetVector<Value *, 16> GTIdArgs;
1324  collectGlobalThreadIdArguments(GTIdArgs);
1325  LLVM_DEBUG(dbgs() << TAG << "Found " << GTIdArgs.size()
1326  << " global thread ID arguments\n");
1327 
1328  for (Function *F : SCC) {
1329  for (auto DeduplicableRuntimeCallID : DeduplicableRuntimeCallIDs)
1330  Changed |= deduplicateRuntimeCalls(
1331  *F, OMPInfoCache.RFIs[DeduplicableRuntimeCallID]);
1332 
1333  // __kmpc_global_thread_num is special as we can replace it with an
1334  // argument in enough cases to make it worth trying.
1335  Value *GTIdArg = nullptr;
1336  for (Argument &Arg : F->args())
1337  if (GTIdArgs.count(&Arg)) {
1338  GTIdArg = &Arg;
1339  break;
1340  }
1341  Changed |= deduplicateRuntimeCalls(
1342  *F, OMPInfoCache.RFIs[OMPRTL___kmpc_global_thread_num], GTIdArg);
1343  }
1344 
1345  return Changed;
1346  }
1347 
1348  /// Tries to hide the latency of runtime calls that involve host to
1349  /// device memory transfers by splitting them into their "issue" and "wait"
1350  /// versions. The "issue" is moved upwards as much as possible. The "wait" is
1351  /// moved downwards as much as possible. The "issue" issues the memory transfer
1352  /// asynchronously, returning a handle. The "wait" waits on the returned
1353  /// handle for the memory transfer to finish.
1354  bool hideMemTransfersLatency() {
1355  auto &RFI = OMPInfoCache.RFIs[OMPRTL___tgt_target_data_begin_mapper];
1356  bool Changed = false;
1357  auto SplitMemTransfers = [&](Use &U, Function &Decl) {
1358  auto *RTCall = getCallIfRegularCall(U, &RFI);
1359  if (!RTCall)
1360  return false;
1361 
1362  OffloadArray OffloadArrays[3];
1363  if (!getValuesInOffloadArrays(*RTCall, OffloadArrays))
1364  return false;
1365 
1366  LLVM_DEBUG(dumpValuesInOffloadArrays(OffloadArrays));
1367 
1368  // TODO: Check if can be moved upwards.
1369  bool WasSplit = false;
1370  Instruction *WaitMovementPoint = canBeMovedDownwards(*RTCall);
1371  if (WaitMovementPoint)
1372  WasSplit = splitTargetDataBeginRTC(*RTCall, *WaitMovementPoint);
1373 
1374  Changed |= WasSplit;
1375  return WasSplit;
1376  };
1377  RFI.foreachUse(SCC, SplitMemTransfers);
1378 
1379  return Changed;
1380  }
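 // Illustrative sketch (added for exposition, not part of the original
 // source) of the issue/wait split performed by splitTargetDataBeginRTC
 // below, written at the runtime-call level:
 //
 //   __tgt_target_data_begin_mapper(Loc, DeviceID, ...);
 //   /* code that does not touch the mapped memory */
 //
 // conceptually becomes
 //
 //   __tgt_async_info Handle;
 //   __tgt_target_data_begin_mapper_issue(Loc, DeviceID, ..., &Handle);
 //   /* code that does not touch the mapped memory */
 //   __tgt_target_data_begin_mapper_wait(DeviceID, &Handle);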
1381 
1382  void analysisGlobalization() {
1383  auto &RFI = OMPInfoCache.RFIs[OMPRTL___kmpc_alloc_shared];
1384 
1385  auto CheckGlobalization = [&](Use &U, Function &Decl) {
1386  if (CallInst *CI = getCallIfRegularCall(U, &RFI)) {
1387  auto Remark = [&](OptimizationRemarkMissed ORM) {
1388  return ORM
1389  << "Found thread data sharing on the GPU. "
1390  << "Expect degraded performance due to data globalization.";
1391  };
1392  emitRemark<OptimizationRemarkMissed>(CI, "OMP112", Remark);
1393  }
1394 
1395  return false;
1396  };
1397 
1398  RFI.foreachUse(SCC, CheckGlobalization);
1399  }
1400 
1401  /// Maps the values stored in the offload arrays passed as arguments to
1402  /// \p RuntimeCall into the offload arrays in \p OAs.
1403  bool getValuesInOffloadArrays(CallInst &RuntimeCall,
1404  MutableArrayRef<OffloadArray> OAs) {
1405  assert(OAs.size() == 3 && "Need space for three offload arrays!");
1406 
1407  // A runtime call that involves memory offloading looks something like:
1408  // call void @__tgt_target_data_begin_mapper(arg0, arg1,
1409  // i8** %offload_baseptrs, i8** %offload_ptrs, i64* %offload_sizes,
1410  // ...)
1411  // So, the idea is to access the allocas that allocate space for these
1412  // offload arrays, offload_baseptrs, offload_ptrs, offload_sizes.
1413  // Therefore:
1414  // i8** %offload_baseptrs.
1415  Value *BasePtrsArg =
1416  RuntimeCall.getArgOperand(OffloadArray::BasePtrsArgNum);
1417  // i8** %offload_ptrs.
1418  Value *PtrsArg = RuntimeCall.getArgOperand(OffloadArray::PtrsArgNum);
1419  // i64* %offload_sizes.
1420  Value *SizesArg = RuntimeCall.getArgOperand(OffloadArray::SizesArgNum);
1421 
1422  // Get values stored in **offload_baseptrs.
1423  auto *V = getUnderlyingObject(BasePtrsArg);
1424  if (!isa<AllocaInst>(V))
1425  return false;
1426  auto *BasePtrsArray = cast<AllocaInst>(V);
1427  if (!OAs[0].initialize(*BasePtrsArray, RuntimeCall))
1428  return false;
1429 
1430  // Get values stored in **offload_ptrs.
1431  V = getUnderlyingObject(PtrsArg);
1432  if (!isa<AllocaInst>(V))
1433  return false;
1434  auto *PtrsArray = cast<AllocaInst>(V);
1435  if (!OAs[1].initialize(*PtrsArray, RuntimeCall))
1436  return false;
1437 
1438  // Get values stored in **offload_sizes.
1439  V = getUnderlyingObject(SizesArg);
1440  // If it's a [constant] global array don't analyze it.
1441  if (isa<GlobalValue>(V))
1442  return isa<Constant>(V);
1443  if (!isa<AllocaInst>(V))
1444  return false;
1445 
1446  auto *SizesArray = cast<AllocaInst>(V);
1447  if (!OAs[2].initialize(*SizesArray, RuntimeCall))
1448  return false;
1449 
1450  return true;
1451  }
1452 
1453  /// Prints the values in the OffloadArrays \p OAs using LLVM_DEBUG.
1454  /// For now this is a way to test that the function getValuesInOffloadArrays
1455  /// is working properly.
1456  /// TODO: Move this to a unittest when unittests are available for OpenMPOpt.
1457  void dumpValuesInOffloadArrays(ArrayRef<OffloadArray> OAs) {
1458  assert(OAs.size() == 3 && "There are three offload arrays to debug!");
1459 
1460  LLVM_DEBUG(dbgs() << TAG << " Successfully got offload values:\n");
1461  std::string ValuesStr;
1462  raw_string_ostream Printer(ValuesStr);
1463  std::string Separator = " --- ";
1464 
1465  for (auto *BP : OAs[0].StoredValues) {
1466  BP->print(Printer);
1467  Printer << Separator;
1468  }
1469  LLVM_DEBUG(dbgs() << "\t\toffload_baseptrs: " << Printer.str() << "\n");
1470  ValuesStr.clear();
1471 
1472  for (auto *P : OAs[1].StoredValues) {
1473  P->print(Printer);
1474  Printer << Separator;
1475  }
1476  LLVM_DEBUG(dbgs() << "\t\toffload_ptrs: " << Printer.str() << "\n");
1477  ValuesStr.clear();
1478 
1479  for (auto *S : OAs[2].StoredValues) {
1480  S->print(Printer);
1481  Printer << Separator;
1482  }
1483  LLVM_DEBUG(dbgs() << "\t\toffload_sizes: " << Printer.str() << "\n");
1484  }
1485 
1486  /// Returns the instruction where the "wait" counterpart of \p RuntimeCall can
1487  /// be moved. Returns nullptr if the movement is not possible, or not worth it.
1488  Instruction *canBeMovedDownwards(CallInst &RuntimeCall) {
1489  // FIXME: This traverses only the BasicBlock where RuntimeCall is.
1490  // Make it traverse the CFG.
1491 
1492  Instruction *CurrentI = &RuntimeCall;
1493  bool IsWorthIt = false;
1494  while ((CurrentI = CurrentI->getNextNode())) {
1495 
1496  // TODO: Once we detect the regions to be offloaded we should use the
1497  // alias analysis manager to check if CurrentI may modify one of
1498  // the offloaded regions.
1499  if (CurrentI->mayHaveSideEffects() || CurrentI->mayReadFromMemory()) {
1500  if (IsWorthIt)
1501  return CurrentI;
1502 
1503  return nullptr;
1504  }
1505 
1506  // FIXME: For now, moving the call over anything without side effects is
1507  // considered worth it.
1508  IsWorthIt = true;
1509  }
1510 
1511  // Return end of BasicBlock.
1512  return RuntimeCall.getParent()->getTerminator();
1513  }
1514 
1515  /// Splits \p RuntimeCall into its "issue" and "wait" counterparts.
1516  bool splitTargetDataBeginRTC(CallInst &RuntimeCall,
1517  Instruction &WaitMovementPoint) {
1518  // Create a stack-allocated handle (__tgt_async_info) at the beginning of the
1519  // function. Used for storing information about the async transfer, allowing us
1520  // to wait on it later.
1521  auto &IRBuilder = OMPInfoCache.OMPBuilder;
1522  auto *F = RuntimeCall.getCaller();
1523  Instruction *FirstInst = &(F->getEntryBlock().front());
1524  AllocaInst *Handle = new AllocaInst(
1525  IRBuilder.AsyncInfo, F->getAddressSpace(), "handle", FirstInst);
1526 
1527  // Add "issue" runtime call declaration:
1528  // declare %struct.tgt_async_info @__tgt_target_data_begin_issue(i64, i32,
1529  // i8**, i8**, i64*, i64*)
1530  FunctionCallee IssueDecl = IRBuilder.getOrCreateRuntimeFunction(
1531  M, OMPRTL___tgt_target_data_begin_mapper_issue);
1532 
1533  // Change RuntimeCall call site for its asynchronous version.
1534  SmallVector<Value *, 16> Args;
1535  for (auto &Arg : RuntimeCall.args())
1536  Args.push_back(Arg.get());
1537  Args.push_back(Handle);
1538 
1539  CallInst *IssueCallsite =
1540  CallInst::Create(IssueDecl, Args, /*NameStr=*/"", &RuntimeCall);
1541  RuntimeCall.eraseFromParent();
1542 
1543  // Add "wait" runtime call declaration:
1544  // declare void @__tgt_target_data_begin_wait(i64, %struct.__tgt_async_info)
1545  FunctionCallee WaitDecl = IRBuilder.getOrCreateRuntimeFunction(
1546  M, OMPRTL___tgt_target_data_begin_mapper_wait);
1547 
1548  Value *WaitParams[2] = {
1549  IssueCallsite->getArgOperand(
1550  OffloadArray::DeviceIDArgNum), // device_id.
1551  Handle // handle to wait on.
1552  };
1553  CallInst::Create(WaitDecl, WaitParams, /*NameStr=*/"", &WaitMovementPoint);
1554 
1555  return true;
1556  }
1557 
1558  static Value *combinedIdentStruct(Value *CurrentIdent, Value *NextIdent,
1559  bool GlobalOnly, bool &SingleChoice) {
1560  if (CurrentIdent == NextIdent)
1561  return CurrentIdent;
1562 
1563  // TODO: Figure out how to actually combine multiple debug locations. For
1564  // now we just keep an existing one if there is a single choice.
1565  if (!GlobalOnly || isa<GlobalValue>(NextIdent)) {
1566  SingleChoice = !CurrentIdent;
1567  return NextIdent;
1568  }
1569  return nullptr;
1570  }
1571 
1572  /// Return a `struct ident_t*` value that represents the ones used in the
1573  /// calls of \p RFI inside of \p F. If \p GlobalOnly is true, we will not
1574  /// return a local `struct ident_t*`. For now, if we cannot find a suitable
1575  /// return value we create one from scratch. We also do not yet combine
1576  /// information, e.g., the source locations, see combinedIdentStruct.
1577  Value *
1578  getCombinedIdentFromCallUsesIn(OMPInformationCache::RuntimeFunctionInfo &RFI,
1579  Function &F, bool GlobalOnly) {
1580  bool SingleChoice = true;
1581  Value *Ident = nullptr;
1582  auto CombineIdentStruct = [&](Use &U, Function &Caller) {
1583  CallInst *CI = getCallIfRegularCall(U, &RFI);
1584  if (!CI || &F != &Caller)
1585  return false;
1586  Ident = combinedIdentStruct(Ident, CI->getArgOperand(0),
1587  /* GlobalOnly */ true, SingleChoice);
1588  return false;
1589  };
1590  RFI.foreachUse(SCC, CombineIdentStruct);
1591 
1592  if (!Ident || !SingleChoice) {
1593  // The IRBuilder uses the insertion block to get to the module, this is
1594  // unfortunate but we work around it for now.
1595  if (!OMPInfoCache.OMPBuilder.getInsertionPoint().getBlock())
1596  OMPInfoCache.OMPBuilder.updateToLocation(OpenMPIRBuilder::InsertPointTy(
1597  &F.getEntryBlock(), F.getEntryBlock().begin()));
1598  // Create a fallback location if none was found.
1599  // TODO: Use the debug locations of the calls instead.
1600  Constant *Loc = OMPInfoCache.OMPBuilder.getOrCreateDefaultSrcLocStr();
1601  Ident = OMPInfoCache.OMPBuilder.getOrCreateIdent(Loc);
1602  }
1603  return Ident;
1604  }
1605 
1606  /// Try to eliminate calls of \p RFI in \p F by reusing an existing one or
1607  /// \p ReplVal if given.
1608  bool deduplicateRuntimeCalls(Function &F,
1609  OMPInformationCache::RuntimeFunctionInfo &RFI,
1610  Value *ReplVal = nullptr) {
1611  auto *UV = RFI.getUseVector(F);
1612  if (!UV || UV->size() + (ReplVal != nullptr) < 2)
1613  return false;
1614 
1615  LLVM_DEBUG(
1616  dbgs() << TAG << "Deduplicate " << UV->size() << " uses of " << RFI.Name
1617  << (ReplVal ? " with an existing value\n" : "\n") << "\n");
1618 
1619  assert((!ReplVal || (isa<Argument>(ReplVal) &&
1620  cast<Argument>(ReplVal)->getParent() == &F)) &&
1621  "Unexpected replacement value!");
1622 
1623  // TODO: Use dominance to find a good position instead.
1624  auto CanBeMoved = [this](CallBase &CB) {
1625  unsigned NumArgs = CB.arg_size();
1626  if (NumArgs == 0)
1627  return true;
1628  if (CB.getArgOperand(0)->getType() != OMPInfoCache.OMPBuilder.IdentPtr)
1629  return false;
1630  for (unsigned U = 1; U < NumArgs; ++U)
1631  if (isa<Instruction>(CB.getArgOperand(U)))
1632  return false;
1633  return true;
1634  };
1635 
1636  if (!ReplVal) {
1637  for (Use *U : *UV)
1638  if (CallInst *CI = getCallIfRegularCall(*U, &RFI)) {
1639  if (!CanBeMoved(*CI))
1640  continue;
1641 
1642  // If the function is a kernel, dedup will move
1643  // the runtime call right after the kernel init callsite. Otherwise,
1644  // it will move it to the beginning of the caller function.
1645  if (isKernel(F)) {
1646  auto &KernelInitRFI = OMPInfoCache.RFIs[OMPRTL___kmpc_target_init];
1647  auto *KernelInitUV = KernelInitRFI.getUseVector(F);
1648 
1649  if (KernelInitUV->empty())
1650  continue;
1651 
1652  assert(KernelInitUV->size() == 1 &&
1653  "Expected a single __kmpc_target_init in kernel\n");
1654 
1655  CallInst *KernelInitCI =
1656  getCallIfRegularCall(*KernelInitUV->front(), &KernelInitRFI);
1657  assert(KernelInitCI &&
1658  "Expected a call to __kmpc_target_init in kernel\n");
1659 
1660  CI->moveAfter(KernelInitCI);
1661  } else
1662  CI->moveBefore(&*F.getEntryBlock().getFirstInsertionPt());
1663  ReplVal = CI;
1664  break;
1665  }
1666  if (!ReplVal)
1667  return false;
1668  }
1669 
1670  // If we use a call as a replacement value we need to make sure the ident is
1671  // valid at the new location. For now we just pick a global one, either
1672  // existing and used by one of the calls, or created from scratch.
1673  if (CallBase *CI = dyn_cast<CallBase>(ReplVal)) {
1674  if (!CI->arg_empty() &&
1675  CI->getArgOperand(0)->getType() == OMPInfoCache.OMPBuilder.IdentPtr) {
1676  Value *Ident = getCombinedIdentFromCallUsesIn(RFI, F,
1677  /* GlobalOnly */ true);
1678  CI->setArgOperand(0, Ident);
1679  }
1680  }
1681 
1682  bool Changed = false;
1683  auto ReplaceAndDeleteCB = [&](Use &U, Function &Caller) {
1684  CallInst *CI = getCallIfRegularCall(U, &RFI);
1685  if (!CI || CI == ReplVal || &F != &Caller)
1686  return false;
1687  assert(CI->getCaller() == &F && "Unexpected call!");
1688 
1689  auto Remark = [&](OptimizationRemark OR) {
1690  return OR << "OpenMP runtime call "
1691  << ore::NV("OpenMPOptRuntime", RFI.Name) << " deduplicated.";
1692  };
1693  if (CI->getDebugLoc())
1694  emitRemark<OptimizationRemark>(CI, "OMP170", Remark);
1695  else
1696  emitRemark<OptimizationRemark>(&F, "OMP170", Remark);
1697 
1698  CGUpdater.removeCallSite(*CI);
1699  CI->replaceAllUsesWith(ReplVal);
1700  CI->eraseFromParent();
1701  ++NumOpenMPRuntimeCallsDeduplicated;
1702  Changed = true;
1703  return true;
1704  };
1705  RFI.foreachUse(SCC, ReplaceAndDeleteCB);
1706 
1707  return Changed;
1708  }
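  // Illustrative sketch (hypothetical IR, not taken from a real test): given
  //   %a = call i32 @omp_get_thread_num()
  //   ...
  //   %b = call i32 @omp_get_thread_num()
  // deduplication keeps a single movable call, e.g. the one defining %a, and
  // replaces all uses of %b with %a before erasing the second call site.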
1709 
1710  /// Collect arguments that represent the global thread id in \p GTIdArgs.
1711  void collectGlobalThreadIdArguments(SmallSetVector<Value *, 16> &GTIdArgs) {
1712  // TODO: Below we basically perform a fixpoint iteration with a pessimistic
1713  // initialization. We could define an AbstractAttribute instead and
1714  // run the Attributor here once it can be run as an SCC pass.
1715 
1716  // Helper to check the argument \p ArgNo at all call sites of \p F for
1717  // a GTId.
1718  auto CallArgOpIsGTId = [&](Function &F, unsigned ArgNo, CallInst &RefCI) {
1719  if (!F.hasLocalLinkage())
1720  return false;
1721  for (Use &U : F.uses()) {
1722  if (CallInst *CI = getCallIfRegularCall(U)) {
1723  Value *ArgOp = CI->getArgOperand(ArgNo);
1724  if (CI == &RefCI || GTIdArgs.count(ArgOp) ||
1725  getCallIfRegularCall(
1726  *ArgOp, &OMPInfoCache.RFIs[OMPRTL___kmpc_global_thread_num]))
1727  continue;
1728  }
1729  return false;
1730  }
1731  return true;
1732  };
1733 
1734  // Helper to identify uses of a GTId as GTId arguments.
1735  auto AddUserArgs = [&](Value &GTId) {
1736  for (Use &U : GTId.uses())
1737  if (CallInst *CI = dyn_cast<CallInst>(U.getUser()))
1738  if (CI->isArgOperand(&U))
1739  if (Function *Callee = CI->getCalledFunction())
1740  if (CallArgOpIsGTId(*Callee, U.getOperandNo(), *CI))
1741  GTIdArgs.insert(Callee->getArg(U.getOperandNo()));
1742  };
1743 
1744  // The argument users of __kmpc_global_thread_num calls are GTIds.
1745  OMPInformationCache::RuntimeFunctionInfo &GlobThreadNumRFI =
1746  OMPInfoCache.RFIs[OMPRTL___kmpc_global_thread_num];
1747 
1748  GlobThreadNumRFI.foreachUse(SCC, [&](Use &U, Function &F) {
1749  if (CallInst *CI = getCallIfRegularCall(U, &GlobThreadNumRFI))
1750  AddUserArgs(*CI);
1751  return false;
1752  });
1753 
1754  // Transitively search for more arguments by looking at the users of the
1755  // ones we know already. During the search the GTIdArgs vector is extended
1756  // so we cannot cache the size nor can we use a range based for.
1757  for (unsigned U = 0; U < GTIdArgs.size(); ++U)
1758  AddUserArgs(*GTIdArgs[U]);
1759  }
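  // Illustrative sketch (names are hypothetical): given
  //   %gtid = call i32 @__kmpc_global_thread_num(%struct.ident_t* @loc)
  //   call void @bar(i32 %gtid)
  // the first argument of the internal function @bar is added to \p GTIdArgs,
  // provided every call site of @bar passes a known GTId for that argument.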
1760 
1761  /// Kernel (=GPU) optimizations and utility functions
1762  ///
1763  ///{{
1764 
1765  /// Check if \p F is a kernel, hence entry point for target offloading.
1766  bool isKernel(Function &F) { return OMPInfoCache.Kernels.count(&F); }
1767 
1768  /// Cache to remember the unique kernel for a function.
1769  DenseMap<Function *, Optional<Kernel>> UniqueKernelMap;
1770 
1771  /// Find the unique kernel that will execute \p F, if any.
1772  Kernel getUniqueKernelFor(Function &F);
1773 
1774  /// Find the unique kernel that will execute \p I, if any.
1775  Kernel getUniqueKernelFor(Instruction &I) {
1776  return getUniqueKernelFor(*I.getFunction());
1777  }
1778 
1779  /// Rewrite the device (=GPU) code state machine created in non-SPMD mode in
1780  /// the cases where we can avoid taking the address of a function.
1781  bool rewriteDeviceCodeStateMachine();
1782 
1783  ///
1784  ///}}
1785 
1786  /// Emit a remark generically
1787  ///
1788  /// This template function can be used to generically emit a remark. The
1789  /// RemarkKind should be one of the following:
1790  /// - OptimizationRemark to indicate a successful optimization attempt
1791  /// - OptimizationRemarkMissed to report a failed optimization attempt
1792  /// - OptimizationRemarkAnalysis to provide additional information about an
1793  /// optimization attempt
1794  ///
1795  /// The remark is built using a callback function provided by the caller that
1796  /// takes a RemarkKind as input and returns a RemarkKind.
1797  template <typename RemarkKind, typename RemarkCallBack>
1798  void emitRemark(Instruction *I, StringRef RemarkName,
1799  RemarkCallBack &&RemarkCB) const {
1800  Function *F = I->getParent()->getParent();
1801  auto &ORE = OREGetter(F);
1802 
1803  if (RemarkName.startswith("OMP"))
1804  ORE.emit([&]() {
1805  return RemarkCB(RemarkKind(DEBUG_TYPE, RemarkName, I))
1806  << " [" << RemarkName << "]";
1807  });
1808  else
1809  ORE.emit(
1810  [&]() { return RemarkCB(RemarkKind(DEBUG_TYPE, RemarkName, I)); });
1811  }
1812 
1813  /// Emit a remark on a function.
1814  template <typename RemarkKind, typename RemarkCallBack>
1815  void emitRemark(Function *F, StringRef RemarkName,
1816  RemarkCallBack &&RemarkCB) const {
1817  auto &ORE = OREGetter(F);
1818 
1819  if (RemarkName.startswith("OMP"))
1820  ORE.emit([&]() {
1821  return RemarkCB(RemarkKind(DEBUG_TYPE, RemarkName, F))
1822  << " [" << RemarkName << "]";
1823  });
1824  else
1825  ORE.emit(
1826  [&]() { return RemarkCB(RemarkKind(DEBUG_TYPE, RemarkName, F)); });
1827  }
1828 
1829  /// RAII struct to temporarily change an RTL function's linkage to external.
1830  /// This prevents it from being mistakenly removed by other optimizations.
1831  struct ExternalizationRAII {
1832  ExternalizationRAII(OMPInformationCache &OMPInfoCache,
1833  RuntimeFunction RFKind)
1834  : Declaration(OMPInfoCache.RFIs[RFKind].Declaration) {
1835  if (!Declaration)
1836  return;
1837 
1838  LinkageType = Declaration->getLinkage();
1839  Declaration->setLinkage(GlobalValue::ExternalLinkage);
1840  }
1841 
1842  ~ExternalizationRAII() {
1843  if (!Declaration)
1844  return;
1845 
1846  Declaration->setLinkage(LinkageType);
1847  }
1848 
1849  Function *Declaration;
1850  GlobalValue::LinkageTypes LinkageType;
1851  };
1852 
1853  /// The underlying module.
1854  Module &M;
1855 
1856  /// The SCC we are operating on.
1857  SmallVectorImpl<Function *> &SCC;
1858 
1859  /// Callback to update the call graph; the first argument is a removed call,
1860  /// the second an optional replacement call.
1861  CallGraphUpdater &CGUpdater;
1862 
1863  /// Callback to get an OptimizationRemarkEmitter from a Function *
1864  OptimizationRemarkGetter OREGetter;
1865 
1866  /// OpenMP-specific information cache. Also used for Attributor runs.
1867  OMPInformationCache &OMPInfoCache;
1868 
1869  /// Attributor instance.
1870  Attributor &A;
1871 
1872  /// Helper function to run Attributor on SCC.
1873  bool runAttributor(bool IsModulePass) {
1874  if (SCC.empty())
1875  return false;
1876 
1877  // Temporarily make these functions have external linkage so the Attributor
1878  // doesn't remove them when we try to look them up later.
1879  ExternalizationRAII Parallel(OMPInfoCache, OMPRTL___kmpc_kernel_parallel);
1880  ExternalizationRAII EndParallel(OMPInfoCache,
1881  OMPRTL___kmpc_kernel_end_parallel);
1882  ExternalizationRAII BarrierSPMD(OMPInfoCache,
1883  OMPRTL___kmpc_barrier_simple_spmd);
1884  ExternalizationRAII BarrierGeneric(OMPInfoCache,
1885  OMPRTL___kmpc_barrier_simple_generic);
1886  ExternalizationRAII ThreadId(OMPInfoCache,
1887  OMPRTL___kmpc_get_hardware_thread_id_in_block);
1888 
1889  registerAAs(IsModulePass);
1890 
1891  ChangeStatus Changed = A.run();
1892 
1893  LLVM_DEBUG(dbgs() << "[Attributor] Done with " << SCC.size()
1894  << " functions, result: " << Changed << ".\n");
1895 
1896  return Changed == ChangeStatus::CHANGED;
1897  }
1898 
1899  void registerFoldRuntimeCall(RuntimeFunction RF);
1900 
1901  /// Populate the Attributor with abstract attribute opportunities in the
1902  /// function.
1903  void registerAAs(bool IsModulePass);
1904 };
1905 
1906 Kernel OpenMPOpt::getUniqueKernelFor(Function &F) {
1907  if (!OMPInfoCache.ModuleSlice.count(&F))
1908  return nullptr;
1909 
1910  // Use a scope to keep the lifetime of the CachedKernel short.
1911  {
1912  Optional<Kernel> &CachedKernel = UniqueKernelMap[&F];
1913  if (CachedKernel)
1914  return *CachedKernel;
1915 
1916  // TODO: We should use an AA to create an (optimistic and callback
1917  // call-aware) call graph. For now we stick to simple patterns that
1918  // are less powerful, basically the worst fixpoint.
1919  if (isKernel(F)) {
1920  CachedKernel = Kernel(&F);
1921  return *CachedKernel;
1922  }
1923 
1924  CachedKernel = nullptr;
1925  if (!F.hasLocalLinkage()) {
1926 
1927  // See https://openmp.llvm.org/remarks/OptimizationRemarks.html
1928  auto Remark = [&](OptimizationRemarkAnalysis ORA) {
1929  return ORA << "Potentially unknown OpenMP target region caller.";
1930  };
1931  emitRemark<OptimizationRemarkAnalysis>(&F, "OMP100", Remark);
1932 
1933  return nullptr;
1934  }
1935  }
1936 
1937  auto GetUniqueKernelForUse = [&](const Use &U) -> Kernel {
1938  if (auto *Cmp = dyn_cast<ICmpInst>(U.getUser())) {
1939  // Allow use in equality comparisons.
1940  if (Cmp->isEquality())
1941  return getUniqueKernelFor(*Cmp);
1942  return nullptr;
1943  }
1944  if (auto *CB = dyn_cast<CallBase>(U.getUser())) {
1945  // Allow direct calls.
1946  if (CB->isCallee(&U))
1947  return getUniqueKernelFor(*CB);
1948 
1949  OMPInformationCache::RuntimeFunctionInfo &KernelParallelRFI =
1950  OMPInfoCache.RFIs[OMPRTL___kmpc_parallel_51];
1951  // Allow the use in __kmpc_parallel_51 calls.
1952  if (OpenMPOpt::getCallIfRegularCall(*U.getUser(), &KernelParallelRFI))
1953  return getUniqueKernelFor(*CB);
1954  return nullptr;
1955  }
1956  // Disallow every other use.
1957  return nullptr;
1958  };
1959 
1960  // TODO: In the future we want to track more than just a unique kernel.
1961  SmallPtrSet<Kernel, 2> PotentialKernels;
1962  OMPInformationCache::foreachUse(F, [&](const Use &U) {
1963  PotentialKernels.insert(GetUniqueKernelForUse(U));
1964  });
1965 
1966  Kernel K = nullptr;
1967  if (PotentialKernels.size() == 1)
1968  K = *PotentialKernels.begin();
1969 
1970  // Cache the result.
1971  UniqueKernelMap[&F] = K;
1972 
1973  return K;
1974 }
1975 
1976 bool OpenMPOpt::rewriteDeviceCodeStateMachine() {
1977  OMPInformationCache::RuntimeFunctionInfo &KernelParallelRFI =
1978  OMPInfoCache.RFIs[OMPRTL___kmpc_parallel_51];
1979 
1980  bool Changed = false;
1981  if (!KernelParallelRFI)
1982  return Changed;
1983 
1984  // If we have disabled state machine changes, exit
1985  if (DisableOpenMPOptStateMachineRewrite)
1986  return Changed;
1987 
1988  for (Function *F : SCC) {
1989 
1990  // Check whether the function is used in a __kmpc_parallel_51 call at
1991  // all.
1992  bool UnknownUse = false;
1993  bool KernelParallelUse = false;
1994  unsigned NumDirectCalls = 0;
1995 
1996  SmallVector<Use *, 2> ToBeReplacedStateMachineUses;
1997  OMPInformationCache::foreachUse(*F, [&](Use &U) {
1998  if (auto *CB = dyn_cast<CallBase>(U.getUser()))
1999  if (CB->isCallee(&U)) {
2000  ++NumDirectCalls;
2001  return;
2002  }
2003 
2004  if (isa<ICmpInst>(U.getUser())) {
2005  ToBeReplacedStateMachineUses.push_back(&U);
2006  return;
2007  }
2008 
2009  // Find wrapper functions that represent parallel kernels.
2010  CallInst *CI =
2011  OpenMPOpt::getCallIfRegularCall(*U.getUser(), &KernelParallelRFI);
2012  const unsigned int WrapperFunctionArgNo = 6;
2013  if (!KernelParallelUse && CI &&
2014  CI->getArgOperandNo(&U) == WrapperFunctionArgNo) {
2015  KernelParallelUse = true;
2016  ToBeReplacedStateMachineUses.push_back(&U);
2017  return;
2018  }
2019  UnknownUse = true;
2020  });
2021 
2022  // Do not emit a remark if we haven't seen a __kmpc_parallel_51
2023  // use.
2024  if (!KernelParallelUse)
2025  continue;
2026 
2027  // If this ever hits, we should investigate.
2028  // TODO: Checking the number of uses is not a necessary restriction and
2029  // should be lifted.
2030  if (UnknownUse || NumDirectCalls != 1 ||
2031  ToBeReplacedStateMachineUses.size() > 2) {
2032  auto Remark = [&](OptimizationRemarkAnalysis ORA) {
2033  return ORA << "Parallel region is used in "
2034  << (UnknownUse ? "unknown" : "unexpected")
2035  << " ways. Will not attempt to rewrite the state machine.";
2036  };
2037  emitRemark<OptimizationRemarkAnalysis>(F, "OMP101", Remark);
2038  continue;
2039  }
2040 
2041  // Even if we have __kmpc_parallel_51 calls, we (for now) give
2042  // up if the function is not called from a unique kernel.
2043  Kernel K = getUniqueKernelFor(*F);
2044  if (!K) {
2045  auto Remark = [&](OptimizationRemarkAnalysis ORA) {
2046  return ORA << "Parallel region is not called from a unique kernel. "
2047  "Will not attempt to rewrite the state machine.";
2048  };
2049  emitRemark<OptimizationRemarkAnalysis>(F, "OMP102", Remark);
2050  continue;
2051  }
2052 
2053  // We now know F is a parallel body function called only from the kernel K.
2054  // We also identified the state machine uses in which we replace the
2055  // function pointer by a new global symbol for identification purposes. This
2056  // ensures only direct calls to the function are left.
2057 
2058  Module &M = *F->getParent();
2059  Type *Int8Ty = Type::getInt8Ty(M.getContext());
2060 
2061  auto *ID = new GlobalVariable(
2062  M, Int8Ty, /* isConstant */ true, GlobalValue::PrivateLinkage,
2063  UndefValue::get(Int8Ty), F->getName() + ".ID");
2064 
2065  for (Use *U : ToBeReplacedStateMachineUses)
2067  ID, U->get()->getType()));
2068 
2069  ++NumOpenMPParallelRegionsReplacedInGPUStateMachine;
2070 
2071  Changed = true;
2072  }
2073 
2074  return Changed;
2075 }
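// Illustrative sketch (hypothetical IR): if @par_region_wrapper is only
// reached from a single kernel, a state machine comparison such as
//   icmp eq i8* %work_fn, bitcast (void ()* @par_region_wrapper to i8*)
// is rewritten to compare against the new private global
//   icmp eq i8* %work_fn, @par_region_wrapper.ID
// so that only direct calls to @par_region_wrapper remain.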
2076 
2077 /// Abstract Attribute for tracking ICV values.
2078 struct AAICVTracker : public StateWrapper<BooleanState, AbstractAttribute> {
2079  using Base = StateWrapper<BooleanState, AbstractAttribute>;
2080  AAICVTracker(const IRPosition &IRP, Attributor &A) : Base(IRP) {}
2081 
2082  void initialize(Attributor &A) override {
2083  Function *F = getAnchorScope();
2084  if (!F || !A.isFunctionIPOAmendable(*F))
2085  indicatePessimisticFixpoint();
2086  }
2087 
2088  /// Returns true if value is assumed to be tracked.
2089  bool isAssumedTracked() const { return getAssumed(); }
2090 
2091  /// Returns true if value is known to be tracked.
2092  bool isKnownTracked() const { return getAssumed(); }
2093 
2094  /// Create an abstract attribute view for the position \p IRP.
2095  static AAICVTracker &createForPosition(const IRPosition &IRP, Attributor &A);
2096 
2097  /// Return the value with which \p I can be replaced for specific \p ICV.
2098  virtual Optional<Value *> getReplacementValue(InternalControlVar ICV,
2099  const Instruction *I,
2100  Attributor &A) const {
2101  return None;
2102  }
2103 
2104  /// Return an assumed unique ICV value if a single candidate is found. If
2105  /// there cannot be one, return a nullptr. If it is not clear yet, return the
2106  /// Optional::NoneType.
2107  virtual Optional<Value *>
2108  getUniqueReplacementValue(InternalControlVar ICV) const = 0;
2109 
2110  // Currently only nthreads is being tracked.
2111  // This array will only grow over time.
2112  InternalControlVar TrackableICVs[1] = {ICV_nthreads};
2113 
2114  /// See AbstractAttribute::getName()
2115  const std::string getName() const override { return "AAICVTracker"; }
2116 
2117  /// See AbstractAttribute::getIdAddr()
2118  const char *getIdAddr() const override { return &ID; }
2119 
2120  /// This function should return true if the type of the \p AA is AAICVTracker
2121  static bool classof(const AbstractAttribute *AA) {
2122  return (AA->getIdAddr() == &ID);
2123  }
2124 
2125  static const char ID;
2126 };
2127 
2128 struct AAICVTrackerFunction : public AAICVTracker {
2129  AAICVTrackerFunction(const IRPosition &IRP, Attributor &A)
2130  : AAICVTracker(IRP, A) {}
2131 
2132  // FIXME: come up with better string.
2133  const std::string getAsStr() const override { return "ICVTrackerFunction"; }
2134 
2135  // FIXME: come up with some stats.
2136  void trackStatistics() const override {}
2137 
2138  /// We don't manifest anything for this AA.
2139  ChangeStatus manifest(Attributor &A) override {
2140  return ChangeStatus::UNCHANGED;
2141  }
2142 
2143  // Map of ICVs to their values at specific program points.
2144  EnumeratedArray<DenseMap<Instruction *, Value *>, InternalControlVar,
2145  InternalControlVar::ICV___last>
2146  ICVReplacementValuesMap;
2147 
2148  ChangeStatus updateImpl(Attributor &A) override {
2149  ChangeStatus HasChanged = ChangeStatus::UNCHANGED;
2150 
2151  Function *F = getAnchorScope();
2152 
2153  auto &OMPInfoCache = static_cast<OMPInformationCache &>(A.getInfoCache());
2154 
2155  for (InternalControlVar ICV : TrackableICVs) {
2156  auto &SetterRFI = OMPInfoCache.RFIs[OMPInfoCache.ICVs[ICV].Setter];
2157 
2158  auto &ValuesMap = ICVReplacementValuesMap[ICV];
2159  auto TrackValues = [&](Use &U, Function &) {
2160  CallInst *CI = OpenMPOpt::getCallIfRegularCall(U);
2161  if (!CI)
2162  return false;
2163 
2164  // FIXME: handle setters with more than one argument.
2165  /// Track new value.
2166  if (ValuesMap.insert(std::make_pair(CI, CI->getArgOperand(0))).second)
2167  HasChanged = ChangeStatus::CHANGED;
2168 
2169  return false;
2170  };
2171 
2172  auto CallCheck = [&](Instruction &I) {
2173  Optional<Value *> ReplVal = getValueForCall(A, &I, ICV);
2174  if (ReplVal.hasValue() &&
2175  ValuesMap.insert(std::make_pair(&I, *ReplVal)).second)
2176  HasChanged = ChangeStatus::CHANGED;
2177 
2178  return true;
2179  };
2180 
2181  // Track all changes of an ICV.
2182  SetterRFI.foreachUse(TrackValues, F);
2183 
2184  bool UsedAssumedInformation = false;
2185  A.checkForAllInstructions(CallCheck, *this, {Instruction::Call},
2186  UsedAssumedInformation,
2187  /* CheckBBLivenessOnly */ true);
2188 
2189  /// TODO: Figure out a way to avoid adding entry in
2190  /// ICVReplacementValuesMap
2191  Instruction *Entry = &F->getEntryBlock().front();
2192  if (HasChanged == ChangeStatus::CHANGED && !ValuesMap.count(Entry))
2193  ValuesMap.insert(std::make_pair(Entry, nullptr));
2194  }
2195 
2196  return HasChanged;
2197  }
2198 
2199  /// Helper to check if \p I is a call and get the value for it if it is
2200  /// unique.
2201  Optional<Value *> getValueForCall(Attributor &A, const Instruction *I,
2202  InternalControlVar &ICV) const {
2203 
2204  const auto *CB = dyn_cast<CallBase>(I);
2205  if (!CB || CB->hasFnAttr("no_openmp") ||
2206  CB->hasFnAttr("no_openmp_routines"))
2207  return None;
2208 
2209  auto &OMPInfoCache = static_cast<OMPInformationCache &>(A.getInfoCache());
2210  auto &GetterRFI = OMPInfoCache.RFIs[OMPInfoCache.ICVs[ICV].Getter];
2211  auto &SetterRFI = OMPInfoCache.RFIs[OMPInfoCache.ICVs[ICV].Setter];
2212  Function *CalledFunction = CB->getCalledFunction();
2213 
2214  // Indirect call, assume ICV changes.
2215  if (CalledFunction == nullptr)
2216  return nullptr;
2217  if (CalledFunction == GetterRFI.Declaration)
2218  return None;
2219  if (CalledFunction == SetterRFI.Declaration) {
2220  if (ICVReplacementValuesMap[ICV].count(I))
2221  return ICVReplacementValuesMap[ICV].lookup(I);
2222 
2223  return nullptr;
2224  }
2225 
2226  // Since we don't know, assume it changes the ICV.
2227  if (CalledFunction->isDeclaration())
2228  return nullptr;
2229 
2230  const auto &ICVTrackingAA = A.getAAFor<AAICVTracker>(
2231  *this, IRPosition::callsite_returned(*CB), DepClassTy::REQUIRED);
2232 
2233  if (ICVTrackingAA.isAssumedTracked())
2234  return ICVTrackingAA.getUniqueReplacementValue(ICV);
2235 
2236  // If we don't know, assume it changes.
2237  return nullptr;
2238  }
2239 
2240  // We don't check for a unique value for a function, so return None.
2241  Optional<Value *>
2242  getUniqueReplacementValue(InternalControlVar ICV) const override {
2243  return None;
2244  }
2245 
2246  /// Return the value with which \p I can be replaced for specific \p ICV.
2247  Optional<Value *> getReplacementValue(InternalControlVar ICV,
2248  const Instruction *I,
2249  Attributor &A) const override {
2250  const auto &ValuesMap = ICVReplacementValuesMap[ICV];
2251  if (ValuesMap.count(I))
2252  return ValuesMap.lookup(I);
2253 
2254  SmallVector<const Instruction *, 16> Worklist;
2255  SmallPtrSet<const Instruction *, 16> Visited;
2256  Worklist.push_back(I);
2257 
2258  Optional<Value *> ReplVal;
2259 
2260  while (!Worklist.empty()) {
2261  const Instruction *CurrInst = Worklist.pop_back_val();
2262  if (!Visited.insert(CurrInst).second)
2263  continue;
2264 
2265  const BasicBlock *CurrBB = CurrInst->getParent();
2266 
2267  // Go up and look for all potential setters/calls that might change the
2268  // ICV.
2269  while ((CurrInst = CurrInst->getPrevNode())) {
2270  if (ValuesMap.count(CurrInst)) {
2271  Optional<Value *> NewReplVal = ValuesMap.lookup(CurrInst);
2272  // Unknown value, track new.
2273  if (!ReplVal.hasValue()) {
2274  ReplVal = NewReplVal;
2275  break;
2276  }
2277 
2278  // If we found a new value, we can't know the ICV value anymore.
2279  if (NewReplVal.hasValue())
2280  if (ReplVal != NewReplVal)
2281  return nullptr;
2282 
2283  break;
2284  }
2285 
2286  Optional<Value *> NewReplVal = getValueForCall(A, CurrInst, ICV);
2287  if (!NewReplVal.hasValue())
2288  continue;
2289 
2290  // Unknown value, track new.
2291  if (!ReplVal.hasValue()) {
2292  ReplVal = NewReplVal;
2293  break;
2294  }
2295 
2296  // if (NewReplVal.hasValue())
2297  // We found a new value, we can't know the ICV value anymore.
2298  if (ReplVal != NewReplVal)
2299  return nullptr;
2300  }
2301 
2302  // If we are in the same BB and we have a value, we are done.
2303  if (CurrBB == I->getParent() && ReplVal.hasValue())
2304  return ReplVal;
2305 
2306  // Go through all predecessors and add terminators for analysis.
2307  for (const BasicBlock *Pred : predecessors(CurrBB))
2308  if (const Instruction *Terminator = Pred->getTerminator())
2309  Worklist.push_back(Terminator);
2310  }
2311 
2312  return ReplVal;
2313  }
2314 };
2315 
2316 struct AAICVTrackerFunctionReturned : AAICVTracker {
2317  AAICVTrackerFunctionReturned(const IRPosition &IRP, Attributor &A)
2318  : AAICVTracker(IRP, A) {}
2319 
2320  // FIXME: come up with better string.
2321  const std::string getAsStr() const override {
2322  return "ICVTrackerFunctionReturned";
2323  }
2324 
2325  // FIXME: come up with some stats.
2326  void trackStatistics() const override {}
2327 
2328  /// We don't manifest anything for this AA.
2329  ChangeStatus manifest(Attributor &A) override {
2330  return ChangeStatus::UNCHANGED;
2331  }
2332 
2333  // Map of ICVs to their values at specific program points.
2334  EnumeratedArray<Optional<Value *>, InternalControlVar,
2335  InternalControlVar::ICV___last>
2336  ICVReplacementValuesMap;
2337 
2338  /// Return the value with which \p I can be replaced for specific \p ICV.
2339  Optional<Value *>
2340  getUniqueReplacementValue(InternalControlVar ICV) const override {
2341  return ICVReplacementValuesMap[ICV];
2342  }
2343 
2344  ChangeStatus updateImpl(Attributor &A) override {
2345  ChangeStatus Changed = ChangeStatus::UNCHANGED;
2346  const auto &ICVTrackingAA = A.getAAFor<AAICVTracker>(
2347  *this, IRPosition::function(*getAnchorScope()), DepClassTy::REQUIRED);
2348 
2349  if (!ICVTrackingAA.isAssumedTracked())
2350  return indicatePessimisticFixpoint();
2351 
2352  for (InternalControlVar ICV : TrackableICVs) {
2353  Optional<Value *> &ReplVal = ICVReplacementValuesMap[ICV];
2354  Optional<Value *> UniqueICVValue;
2355 
2356  auto CheckReturnInst = [&](Instruction &I) {
2357  Optional<Value *> NewReplVal =
2358  ICVTrackingAA.getReplacementValue(ICV, &I, A);
2359 
2360  // If we found a second ICV value there is no unique returned value.
2361  if (UniqueICVValue.hasValue() && UniqueICVValue != NewReplVal)
2362  return false;
2363 
2364  UniqueICVValue = NewReplVal;
2365 
2366  return true;
2367  };
2368 
2369  bool UsedAssumedInformation = false;
2370  if (!A.checkForAllInstructions(CheckReturnInst, *this, {Instruction::Ret},
2371  UsedAssumedInformation,
2372  /* CheckBBLivenessOnly */ true))
2373  UniqueICVValue = nullptr;
2374 
2375  if (UniqueICVValue == ReplVal)
2376  continue;
2377 
2378  ReplVal = UniqueICVValue;
2379  Changed = ChangeStatus::CHANGED;
2380  }
2381 
2382  return Changed;
2383  }
2384 };
2385 
2386 struct AAICVTrackerCallSite : AAICVTracker {
2387  AAICVTrackerCallSite(const IRPosition &IRP, Attributor &A)
2388  : AAICVTracker(IRP, A) {}
2389 
2390  void initialize(Attributor &A) override {
2391  Function *F = getAnchorScope();
2392  if (!F || !A.isFunctionIPOAmendable(*F))
2393  indicatePessimisticFixpoint();
2394 
2395  // We only initialize this AA for getters, so we need to know which ICV it
2396  // gets.
2397  auto &OMPInfoCache = static_cast<OMPInformationCache &>(A.getInfoCache());
2398  for (InternalControlVar ICV : TrackableICVs) {
2399  auto ICVInfo = OMPInfoCache.ICVs[ICV];
2400  auto &Getter = OMPInfoCache.RFIs[ICVInfo.Getter];
2401  if (Getter.Declaration == getAssociatedFunction()) {
2402  AssociatedICV = ICVInfo.Kind;
2403  return;
2404  }
2405  }
2406 
2407  /// Unknown ICV.
2408  indicatePessimisticFixpoint();
2409  }
2410 
2411  ChangeStatus manifest(Attributor &A) override {
2412  if (!ReplVal.hasValue() || !ReplVal.getValue())
2413  return ChangeStatus::UNCHANGED;
2414 
2415  A.changeValueAfterManifest(*getCtxI(), **ReplVal);
2416  A.deleteAfterManifest(*getCtxI());
2417 
2418  return ChangeStatus::CHANGED;
2419  }
2420 
2421  // FIXME: come up with better string.
2422  const std::string getAsStr() const override { return "ICVTrackerCallSite"; }
2423 
2424  // FIXME: come up with some stats.
2425  void trackStatistics() const override {}
2426 
2427  InternalControlVar AssociatedICV;
2428  Optional<Value *> ReplVal;
2429 
2430  ChangeStatus updateImpl(Attributor &A) override {
2431  const auto &ICVTrackingAA = A.getAAFor<AAICVTracker>(
2432  *this, IRPosition::function(*getAnchorScope()), DepClassTy::REQUIRED);
2433 
2434  // We don't have any information, so we assume it changes the ICV.
2435  if (!ICVTrackingAA.isAssumedTracked())
2436  return indicatePessimisticFixpoint();
2437 
2438  Optional<Value *> NewReplVal =
2439  ICVTrackingAA.getReplacementValue(AssociatedICV, getCtxI(), A);
2440 
2441  if (ReplVal == NewReplVal)
2442  return ChangeStatus::UNCHANGED;
2443 
2444  ReplVal = NewReplVal;
2445  return ChangeStatus::CHANGED;
2446  }
2447 
2448  // Return the value with which the associated value can be replaced for specific
2449  // \p ICV.
2450  Optional<Value *>
2451  getUniqueReplacementValue(InternalControlVar ICV) const override {
2452  return ReplVal;
2453  }
2454 };
2455 
2456 struct AAICVTrackerCallSiteReturned : AAICVTracker {
2457  AAICVTrackerCallSiteReturned(const IRPosition &IRP, Attributor &A)
2458  : AAICVTracker(IRP, A) {}
2459 
2460  // FIXME: come up with better string.
2461  const std::string getAsStr() const override {
2462  return "ICVTrackerCallSiteReturned";
2463  }
2464 
2465  // FIXME: come up with some stats.
2466  void trackStatistics() const override {}
2467 
2468  /// We don't manifest anything for this AA.
2469  ChangeStatus manifest(Attributor &A) override {
2470  return ChangeStatus::UNCHANGED;
2471  }
2472 
2473  // Map of ICVs to their values at specific program points.
2474  EnumeratedArray<Optional<Value *>, InternalControlVar,
2475  InternalControlVar::ICV___last>
2476  ICVReplacementValuesMap;
2477 
2478  /// Return the value with which the associated value can be replaced for specific
2479  /// \p ICV.
2480  Optional<Value *>
2481  getUniqueReplacementValue(InternalControlVar ICV) const override {
2482  return ICVReplacementValuesMap[ICV];
2483  }
2484 
2485  ChangeStatus updateImpl(Attributor &A) override {
2486  ChangeStatus Changed = ChangeStatus::UNCHANGED;
2487  const auto &ICVTrackingAA = A.getAAFor<AAICVTracker>(
2488  *this, IRPosition::returned(*getAssociatedFunction()),
2489  DepClassTy::REQUIRED);
2490 
2491  // We don't have any information, so we assume it changes the ICV.
2492  if (!ICVTrackingAA.isAssumedTracked())
2493  return indicatePessimisticFixpoint();
2494 
2495  for (InternalControlVar ICV : TrackableICVs) {
2496  Optional<Value *> &ReplVal = ICVReplacementValuesMap[ICV];
2497  Optional<Value *> NewReplVal =
2498  ICVTrackingAA.getUniqueReplacementValue(ICV);
2499 
2500  if (ReplVal == NewReplVal)
2501  continue;
2502 
2503  ReplVal = NewReplVal;
2504  Changed = ChangeStatus::CHANGED;
2505  }
2506  return Changed;
2507  }
2508 };
2509 
2510 struct AAExecutionDomainFunction : public AAExecutionDomain {
2511  AAExecutionDomainFunction(const IRPosition &IRP, Attributor &A)
2512  : AAExecutionDomain(IRP, A) {}
2513 
2514  const std::string getAsStr() const override {
2515  return "[AAExecutionDomain] " + std::to_string(SingleThreadedBBs.size()) +
2516  "/" + std::to_string(NumBBs) + " BBs thread 0 only.";
2517  }
2518 
2519  /// See AbstractAttribute::trackStatistics().
2520  void trackStatistics() const override {}
2521 
2522  void initialize(Attributor &A) override {
2523  Function *F = getAnchorScope();
2524  for (const auto &BB : *F)
2525  SingleThreadedBBs.insert(&BB);
2526  NumBBs = SingleThreadedBBs.size();
2527  }
2528 
2529  ChangeStatus manifest(Attributor &A) override {
2530  LLVM_DEBUG({
2531  for (const BasicBlock *BB : SingleThreadedBBs)
2532  dbgs() << TAG << " Basic block @" << getAnchorScope()->getName() << " "
2533  << BB->getName() << " is executed by a single thread.\n";
2534  });
2535  return ChangeStatus::UNCHANGED;
2536  }
2537 
2538  ChangeStatus updateImpl(Attributor &A) override;
2539 
2540  /// Check if an instruction is executed by a single thread.
2541  bool isExecutedByInitialThreadOnly(const Instruction &I) const override {
2542  return isExecutedByInitialThreadOnly(*I.getParent());
2543  }
2544 
2545  bool isExecutedByInitialThreadOnly(const BasicBlock &BB) const override {
2546  return isValidState() && SingleThreadedBBs.contains(&BB);
2547  }
2548 
2549  /// Set of basic blocks that are executed by a single thread.
2550  DenseSet<const BasicBlock *> SingleThreadedBBs;
2551 
2552  /// Total number of basic blocks in this function.
2553  long unsigned NumBBs;
2554 };
2555 
2556 ChangeStatus AAExecutionDomainFunction::updateImpl(Attributor &A) {
2557  Function *F = getAnchorScope();
2558  ReversePostOrderTraversal<Function *> RPOT(F);
2559  auto NumSingleThreadedBBs = SingleThreadedBBs.size();
2560 
2561  bool AllCallSitesKnown;
2562  auto PredForCallSite = [&](AbstractCallSite ACS) {
2563  const auto &ExecutionDomainAA = A.getAAFor<AAExecutionDomain>(
2564  *this, IRPosition::function(*ACS.getInstruction()->getFunction()),
2565  DepClassTy::REQUIRED);
2566  return ACS.isDirectCall() &&
2567  ExecutionDomainAA.isExecutedByInitialThreadOnly(
2568  *ACS.getInstruction());
2569  };
2570 
2571  if (!A.checkForAllCallSites(PredForCallSite, *this,
2572  /* RequiresAllCallSites */ true,
2573  AllCallSitesKnown))
2574  SingleThreadedBBs.erase(&F->getEntryBlock());
2575 
2576  auto &OMPInfoCache = static_cast<OMPInformationCache &>(A.getInfoCache());
2577  auto &RFI = OMPInfoCache.RFIs[OMPRTL___kmpc_target_init];
2578 
2579  // Check if the edge into the successor block contains a condition that only
2580  // lets the main thread execute it.
2581  auto IsInitialThreadOnly = [&](BranchInst *Edge, BasicBlock *SuccessorBB) {
2582  if (!Edge || !Edge->isConditional())
2583  return false;
2584  if (Edge->getSuccessor(0) != SuccessorBB)
2585  return false;
2586 
2587  auto *Cmp = dyn_cast<CmpInst>(Edge->getCondition());
2588  if (!Cmp || !Cmp->isTrueWhenEqual() || !Cmp->isEquality())
2589  return false;
2590 
2591  ConstantInt *C = dyn_cast<ConstantInt>(Cmp->getOperand(1));
2592  if (!C)
2593  return false;
2594 
2595  // Match: -1 == __kmpc_target_init (for non-SPMD kernels only!)
2596  if (C->isAllOnesValue()) {
2597  auto *CB = dyn_cast<CallBase>(Cmp->getOperand(0));
2598  CB = CB ? OpenMPOpt::getCallIfRegularCall(*CB, &RFI) : nullptr;
2599  if (!CB)
2600  return false;
2601  const int InitModeArgNo = 1;
2602  auto *ModeCI = dyn_cast<ConstantInt>(CB->getOperand(InitModeArgNo));
2603  return ModeCI && (ModeCI->getSExtValue() & OMP_TGT_EXEC_MODE_GENERIC);
2604  }
2605 
2606  if (C->isZero()) {
2607  // Match: 0 == llvm.nvvm.read.ptx.sreg.tid.x()
2608  if (auto *II = dyn_cast<IntrinsicInst>(Cmp->getOperand(0)))
2609  if (II->getIntrinsicID() == Intrinsic::nvvm_read_ptx_sreg_tid_x)
2610  return true;
2611 
2612  // Match: 0 == llvm.amdgcn.workitem.id.x()
2613  if (auto *II = dyn_cast<IntrinsicInst>(Cmp->getOperand(0)))
2614  if (II->getIntrinsicID() == Intrinsic::amdgcn_workitem_id_x)
2615  return true;
2616  }
2617 
2618  return false;
2619  };
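  // For example (sketch), a guard of the form
  //   %tid = call i32 @llvm.nvvm.read.ptx.sreg.tid.x()
  //   %cmp = icmp eq i32 %tid, 0
  //   br i1 %cmp, label %master, label %rest
  // lets IsInitialThreadOnly classify the %master successor as executed by
  // the initial thread only.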
2620 
2621  // Merge all the predecessor states into the current basic block. A basic
2622  // block is executed by a single thread if all of its predecessors are.
2623  auto MergePredecessorStates = [&](BasicBlock *BB) {
2624  if (pred_empty(BB))
2625  return SingleThreadedBBs.contains(BB);
2626 
2627  bool IsInitialThread = true;
2628  for (BasicBlock *PredBB : predecessors(BB)) {
2629  if (!IsInitialThreadOnly(dyn_cast<BranchInst>(PredBB->getTerminator()),
2630  BB))
2631  IsInitialThread &= SingleThreadedBBs.contains(PredBB);
2632  }
2633 
2634  return IsInitialThread;
2635  };
2636 
2637  for (auto *BB : RPOT) {
2638  if (!MergePredecessorStates(BB))
2639  SingleThreadedBBs.erase(BB);
2640  }
2641 
2642  return (NumSingleThreadedBBs == SingleThreadedBBs.size())
2643  ? ChangeStatus::UNCHANGED
2644  : ChangeStatus::CHANGED;
2645 }
2646 
2647 /// Try to replace memory allocation calls executed by a single thread with a
2648 /// static buffer of shared memory.
2649 struct AAHeapToShared : public StateWrapper<BooleanState, AbstractAttribute> {
2650  using Base = StateWrapper<BooleanState, AbstractAttribute>;
2651  AAHeapToShared(const IRPosition &IRP, Attributor &A) : Base(IRP) {}
2652 
2653  /// Create an abstract attribute view for the position \p IRP.
2654  static AAHeapToShared &createForPosition(const IRPosition &IRP,
2655  Attributor &A);
2656 
2657  /// Returns true if HeapToShared conversion is assumed to be possible.
2658  virtual bool isAssumedHeapToShared(CallBase &CB) const = 0;
2659 
2660  /// Returns true if HeapToShared conversion is assumed and the CB is a
2661  /// callsite to a free operation to be removed.
2662  virtual bool isAssumedHeapToSharedRemovedFree(CallBase &CB) const = 0;
2663 
2664  /// See AbstractAttribute::getName().
2665  const std::string getName() const override { return "AAHeapToShared"; }
2666 
2667  /// See AbstractAttribute::getIdAddr().
2668  const char *getIdAddr() const override { return &ID; }
2669 
2670  /// This function should return true if the type of the \p AA is
2671  /// AAHeapToShared.
2672  static bool classof(const AbstractAttribute *AA) {
2673  return (AA->getIdAddr() == &ID);
2674  }
2675 
2676  /// Unique ID (due to the unique address)
2677  static const char ID;
2678 };
2679 
2680 struct AAHeapToSharedFunction : public AAHeapToShared {
2681  AAHeapToSharedFunction(const IRPosition &IRP, Attributor &A)
2682  : AAHeapToShared(IRP, A) {}
2683 
2684  const std::string getAsStr() const override {
2685  return "[AAHeapToShared] " + std::to_string(MallocCalls.size()) +
2686  " malloc calls eligible.";
2687  }
2688 
2689  /// See AbstractAttribute::trackStatistics().
2690  void trackStatistics() const override {}
2691 
2692  /// This function finds free calls that will be removed by the
2693  /// HeapToShared transformation.
2694  void findPotentialRemovedFreeCalls(Attributor &A) {
2695  auto &OMPInfoCache = static_cast<OMPInformationCache &>(A.getInfoCache());
2696  auto &FreeRFI = OMPInfoCache.RFIs[OMPRTL___kmpc_free_shared];
2697 
2698  PotentialRemovedFreeCalls.clear();
2699  // Update free call users of found malloc calls.
2700  for (CallBase *CB : MallocCalls) {
2701  SmallVector<CallBase *, 4> FreeCalls;
2702  for (auto *U : CB->users()) {
2703  CallBase *C = dyn_cast<CallBase>(U);
2704  if (C && C->getCalledFunction() == FreeRFI.Declaration)
2705  FreeCalls.push_back(C);
2706  }
2707 
2708  if (FreeCalls.size() != 1)
2709  continue;
2710 
2711  PotentialRemovedFreeCalls.insert(FreeCalls.front());
2712  }
2713  }
2714 
2715  void initialize(Attributor &A) override {
2716  auto &OMPInfoCache = static_cast<OMPInformationCache &>(A.getInfoCache());
2717  auto &RFI = OMPInfoCache.RFIs[OMPRTL___kmpc_alloc_shared];
2718 
2719  for (User *U : RFI.Declaration->users())
2720  if (CallBase *CB = dyn_cast<CallBase>(U))
2721  MallocCalls.insert(CB);
2722 
2723  findPotentialRemovedFreeCalls(A);
2724  }
2725 
2726  bool isAssumedHeapToShared(CallBase &CB) const override {
2727  return isValidState() && MallocCalls.count(&CB);
2728  }
2729 
2730  bool isAssumedHeapToSharedRemovedFree(CallBase &CB) const override {
2731  return isValidState() && PotentialRemovedFreeCalls.count(&CB);
2732  }
2733 
2734  ChangeStatus manifest(Attributor &A) override {
2735  if (MallocCalls.empty())
2736  return ChangeStatus::UNCHANGED;
2737 
2738  auto &OMPInfoCache = static_cast<OMPInformationCache &>(A.getInfoCache());
2739  auto &FreeCall = OMPInfoCache.RFIs[OMPRTL___kmpc_free_shared];
2740 
2741  Function *F = getAnchorScope();
2742  auto *HS = A.lookupAAFor<AAHeapToStack>(IRPosition::function(*F), this,
2743  DepClassTy::OPTIONAL);
2744 
2745  ChangeStatus Changed = ChangeStatus::UNCHANGED;
2746  for (CallBase *CB : MallocCalls) {
2747  // Skip replacing this if HeapToStack has already claimed it.
2748  if (HS && HS->isAssumedHeapToStack(*CB))
2749  continue;
2750 
2751  // Find the unique free call to remove it.
2752  SmallVector<CallBase *, 4> FreeCalls;
2753  for (auto *U : CB->users()) {
2754  CallBase *C = dyn_cast<CallBase>(U);
2755  if (C && C->getCalledFunction() == FreeCall.Declaration)
2756  FreeCalls.push_back(C);
2757  }
2758  if (FreeCalls.size() != 1)
2759  continue;
2760 
2761  ConstantInt *AllocSize = dyn_cast<ConstantInt>(CB->getArgOperand(0));
2762 
2763  LLVM_DEBUG(dbgs() << TAG << "Replace globalization call " << *CB
2764  << " with " << AllocSize->getZExtValue()
2765  << " bytes of shared memory\n");
2766 
2767  // Create a new shared memory buffer of the same size as the allocation
2768  // and replace all the uses of the original allocation with it.
2769  Module *M = CB->getModule();
2770  Type *Int8Ty = Type::getInt8Ty(M->getContext());
2771  Type *Int8ArrTy = ArrayType::get(Int8Ty, AllocSize->getZExtValue());
2772  auto *SharedMem = new GlobalVariable(
2773  *M, Int8ArrTy, /* IsConstant */ false, GlobalValue::InternalLinkage,
2774  UndefValue::get(Int8ArrTy), CB->getName(), nullptr,
2775  GlobalValue::NotThreadLocal,
2776  static_cast<unsigned>(AddressSpace::Shared));
2777  auto *NewBuffer =
2778  ConstantExpr::getPointerCast(SharedMem, Int8Ty->getPointerTo());
2779 
2780  auto Remark = [&](OptimizationRemark OR) {
2781  return OR << "Replaced globalized variable with "
2782  << ore::NV("SharedMemory", AllocSize->getZExtValue())
2783  << ((AllocSize->getZExtValue() != 1) ? " bytes " : " byte ")
2784  << "of shared memory.";
2785  };
2786  A.emitRemark<OptimizationRemark>(CB, "OMP111", Remark);
2787 
2788  SharedMem->setAlignment(MaybeAlign(32));
2789 
2790  A.changeValueAfterManifest(*CB, *NewBuffer);
2791  A.deleteAfterManifest(*CB);
2792  A.deleteAfterManifest(*FreeCalls.front());
2793 
2794  NumBytesMovedToSharedMemory += AllocSize->getZExtValue();
2795  Changed = ChangeStatus::CHANGED;
2796  }
2797 
2798  return Changed;
2799  }
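  // Illustrative sketch (hypothetical IR): a globalized allocation such as
  //   %p = call i8* @__kmpc_alloc_shared(i64 16)
  //   ...
  //   call void @__kmpc_free_shared(i8* %p, i64 16)
  // is replaced by a shared-memory buffer along the lines of
  //   @p_shared = internal addrspace(3) global [16 x i8] undef, align 32
  // with %p rewritten to a pointer cast of @p_shared and both runtime calls
  // deleted.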
2800 
2801  ChangeStatus updateImpl(Attributor &A) override {
2802  auto &OMPInfoCache = static_cast<OMPInformationCache &>(A.getInfoCache());
2803  auto &RFI = OMPInfoCache.RFIs[OMPRTL___kmpc_alloc_shared];
2804  Function *F = getAnchorScope();
2805 
2806  auto NumMallocCalls = MallocCalls.size();
2807 
2808  // Only consider malloc calls executed by a single thread with a constant size.
2809  for (User *U : RFI.Declaration->users()) {
2810  const auto &ED = A.getAAFor<AAExecutionDomain>(
2811  *this, IRPosition::function(*F), DepClassTy::REQUIRED);
2812  if (CallBase *CB = dyn_cast<CallBase>(U))
2813  if (!isa<ConstantInt>(CB->getArgOperand(0)) ||
2814  !ED.isExecutedByInitialThreadOnly(*CB))
2815  MallocCalls.erase(CB);
2816  }
2817 
2818  findPotentialRemovedFreeCalls(A);
2819 
2820  if (NumMallocCalls != MallocCalls.size())
2821  return ChangeStatus::CHANGED;
2822 
2823  return ChangeStatus::UNCHANGED;
2824  }
2825 
2826  /// Collection of all malloc calls in a function.
2827  SmallPtrSet<CallBase *, 4> MallocCalls;
2828  /// Collection of potentially removed free calls in a function.
2829  SmallPtrSet<CallBase *, 4> PotentialRemovedFreeCalls;
2830 };
2831 
2832 struct AAKernelInfo : public StateWrapper<KernelInfoState, AbstractAttribute> {
2833  using Base = StateWrapper<KernelInfoState, AbstractAttribute>;
2834  AAKernelInfo(const IRPosition &IRP, Attributor &A) : Base(IRP) {}
2835 
2836  /// Statistics are tracked as part of manifest for now.
2837  void trackStatistics() const override {}
2838 
2839  /// See AbstractAttribute::getAsStr()
2840  const std::string getAsStr() const override {
2841  if (!isValidState())
2842  return "<invalid>";
2843  return std::string(SPMDCompatibilityTracker.isAssumed() ? "SPMD"
2844  : "generic") +
2845  std::string(SPMDCompatibilityTracker.isAtFixpoint() ? " [FIX]"
2846  : "") +
2847  std::string(" #PRs: ") +
2848  (ReachedKnownParallelRegions.isValidState()
2849  ? std::to_string(ReachedKnownParallelRegions.size())
2850  : "<invalid>") +
2851  ", #Unknown PRs: " +
2852  (ReachedUnknownParallelRegions.isValidState()
2853  ? std::to_string(ReachedUnknownParallelRegions.size())
2854  : "<invalid>") +
2855  ", #Reaching Kernels: " +
2856  (ReachingKernelEntries.isValidState()
2857  ? std::to_string(ReachingKernelEntries.size())
2858  : "<invalid>");
2859  }
2860 
2861  /// Create an abstract attribute view for the position \p IRP.
2862  static AAKernelInfo &createForPosition(const IRPosition &IRP, Attributor &A);
2863 
2864  /// See AbstractAttribute::getName()
2865  const std::string getName() const override { return "AAKernelInfo"; }
2866 
2867  /// See AbstractAttribute::getIdAddr()
2868  const char *getIdAddr() const override { return &ID; }
2869 
2870  /// This function should return true if the type of the \p AA is AAKernelInfo
2871  static bool classof(const AbstractAttribute *AA) {
2872  return (AA->getIdAddr() == &ID);
2873  }
2874 
2875  static const char ID;
2876 };
2877 
2878 /// The function kernel info abstract attribute, basically, what can we say
2879 /// about a function with regard to the KernelInfoState.
2880 struct AAKernelInfoFunction : AAKernelInfo {
2881  AAKernelInfoFunction(const IRPosition &IRP, Attributor &A)
2882  : AAKernelInfo(IRP, A) {}
2883 
2884  SmallPtrSet<Instruction *, 4> GuardedInstructions;
2885 
2886  SmallPtrSetImpl<Instruction *> &getGuardedInstructions() {
2887  return GuardedInstructions;
2888  }
2889 
2890  /// See AbstractAttribute::initialize(...).
2891  void initialize(Attributor &A) override {
2892  // This is a high-level transform that might change the constant arguments
2893  // of the init and deinit calls. We need to tell the Attributor about this
2894  // to avoid other parts using the current constant value for simplification.
2895  auto &OMPInfoCache = static_cast<OMPInformationCache &>(A.getInfoCache());
2896 
2897  Function *Fn = getAnchorScope();
2898  if (!OMPInfoCache.Kernels.count(Fn))
2899  return;
2900 
2901  // Add itself to the reaching kernel and set IsKernelEntry.
2902  ReachingKernelEntries.insert(Fn);
2903  IsKernelEntry = true;
2904 
2905  OMPInformationCache::RuntimeFunctionInfo &InitRFI =
2906  OMPInfoCache.RFIs[OMPRTL___kmpc_target_init];
2907  OMPInformationCache::RuntimeFunctionInfo &DeinitRFI =
2908  OMPInfoCache.RFIs[OMPRTL___kmpc_target_deinit];
2909 
2910  // For kernels we perform more initialization work; first we find the init
2911  // and deinit calls.
2912  auto StoreCallBase = [](Use &U,
2913  OMPInformationCache::RuntimeFunctionInfo &RFI,
2914  CallBase *&Storage) {
2915  CallBase *CB = OpenMPOpt::getCallIfRegularCall(U, &RFI);
2916  assert(CB &&
2917  "Unexpected use of __kmpc_target_init or __kmpc_target_deinit!");
2918  assert(!Storage &&
2919  "Multiple uses of __kmpc_target_init or __kmpc_target_deinit!");
2920  Storage = CB;
2921  return false;
2922  };
2923  InitRFI.foreachUse(
2924  [&](Use &U, Function &) {
2925  StoreCallBase(U, InitRFI, KernelInitCB);
2926  return false;
2927  },
2928  Fn);
2929  DeinitRFI.foreachUse(
2930  [&](Use &U, Function &) {
2931  StoreCallBase(U, DeinitRFI, KernelDeinitCB);
2932  return false;
2933  },
2934  Fn);
2935 
2936  // Ignore kernels without initializers such as global constructors.
2937  if (!KernelInitCB || !KernelDeinitCB) {
2938  indicateOptimisticFixpoint();
2939  return;
2940  }
2941 
2942  // For kernels we might need to initialize/finalize the IsSPMD state and
2943  // we need to register a simplification callback so that the Attributor
2944  // knows the constant arguments to __kmpc_target_init and
2945  // __kmpc_target_deinit might actually change.
2946 
2947  Attributor::SimplifictionCallbackTy StateMachineSimplifyCB =
2948  [&](const IRPosition &IRP, const AbstractAttribute *AA,
2949  bool &UsedAssumedInformation) -> Optional<Value *> {
2950  // IRP represents the "use generic state machine" argument of an
2951  // __kmpc_target_init call. We will answer this one with the internal
2952  // state. As long as we are not in an invalid state, we will create a
2953  // custom state machine so the value should be a `i1 false`. If we are
2954  // in an invalid state, we won't change the value that is in the IR.
2955  if (!ReachedKnownParallelRegions.isValidState())
2956  return nullptr;
2957  // If we have disabled state machine rewrites, don't make a custom one.
2958  if (DisableOpenMPOptStateMachineRewrite)
2959  return nullptr;
2960  if (AA)
2961  A.recordDependence(*this, *AA, DepClassTy::OPTIONAL);
2962  UsedAssumedInformation = !isAtFixpoint();
2963  auto *FalseVal =
2964  ConstantInt::getBool(IRP.getAnchorValue().getContext(), false);
2965  return FalseVal;
2966  };
2967 
2968  Attributor::SimplifictionCallbackTy ModeSimplifyCB =
2969  [&](const IRPosition &IRP, const AbstractAttribute *AA,
2970  bool &UsedAssumedInformation) -> Optional<Value *> {
2971  // IRP represents the "SPMDCompatibilityTracker" argument of an
2972  // __kmpc_target_init or
2973  // __kmpc_target_deinit call. We will answer this one with the internal
2974  // state.
2975  if (!SPMDCompatibilityTracker.isValidState())
2976  return nullptr;
2977  if (!SPMDCompatibilityTracker.isAtFixpoint()) {
2978  if (AA)
2979  A.recordDependence(*this, *AA, DepClassTy::OPTIONAL);
2980  UsedAssumedInformation = true;
2981  } else {
2982  UsedAssumedInformation = false;
2983  }
2984  auto *Val = ConstantInt::getSigned(
2985  IntegerType::getInt8Ty(IRP.getAnchorValue().getContext()),
2986  SPMDCompatibilityTracker.isAssumed() ? OMP_TGT_EXEC_MODE_SPMD
2987  : OMP_TGT_EXEC_MODE_GENERIC);
2988  return Val;
2989  };
2990 
2991  Attributor::SimplifictionCallbackTy IsGenericModeSimplifyCB =
2992  [&](const IRPosition &IRP, const AbstractAttribute *AA,
2993  bool &UsedAssumedInformation) -> Optional<Value *> {
2994  // IRP represents the "RequiresFullRuntime" argument of an
2995  // __kmpc_target_init or __kmpc_target_deinit call. We will answer this
2996  // one with the internal state of the SPMDCompatibilityTracker, so if
2997  // generic then true, if SPMD then false.
2998  if (!SPMDCompatibilityTracker.isValidState())
2999  return nullptr;
3000  if (!SPMDCompatibilityTracker.isAtFixpoint()) {
3001  if (AA)
3002  A.recordDependence(*this, *AA, DepClassTy::OPTIONAL);
3003  UsedAssumedInformation = true;
3004  } else {
3005  UsedAssumedInformation = false;
3006  }
3007  auto *Val = ConstantInt::getBool(IRP.getAnchorValue().getContext(),
3008  !SPMDCompatibilityTracker.isAssumed());
3009  return Val;
3010  };
3011 
3012  constexpr const int InitModeArgNo = 1;
3013  constexpr const int DeinitModeArgNo = 1;
3014  constexpr const int InitUseStateMachineArgNo = 2;
3015  constexpr const int InitRequiresFullRuntimeArgNo = 3;
3016  constexpr const int DeinitRequiresFullRuntimeArgNo = 2;
3017  A.registerSimplificationCallback(
3018  IRPosition::callsite_argument(*KernelInitCB, InitUseStateMachineArgNo),
3019  StateMachineSimplifyCB);
3020  A.registerSimplificationCallback(
3021  IRPosition::callsite_argument(*KernelInitCB, InitModeArgNo),
3022  ModeSimplifyCB);
3023  A.registerSimplificationCallback(
3024  IRPosition::callsite_argument(*KernelDeinitCB, DeinitModeArgNo),
3025  ModeSimplifyCB);
3026  A.registerSimplificationCallback(
3027  IRPosition::callsite_argument(*KernelInitCB,
3028  InitRequiresFullRuntimeArgNo),
3029  IsGenericModeSimplifyCB);
3030  A.registerSimplificationCallback(
3031  IRPosition::callsite_argument(*KernelDeinitCB,
3032  DeinitRequiresFullRuntimeArgNo),
3033  IsGenericModeSimplifyCB);
3034 
3035  // Check if we know we are in SPMD-mode already.
3036  ConstantInt *ModeArg =
3037  dyn_cast<ConstantInt>(KernelInitCB->getArgOperand(InitModeArgNo));
3038  if (ModeArg && (ModeArg->getSExtValue() & OMP_TGT_EXEC_MODE_SPMD))
3039  SPMDCompatibilityTracker.indicateOptimisticFixpoint();
3040  // This is a generic region but SPMDization is disabled so stop tracking.
3041  else if (DisableOpenMPOptSPMDization)
3042  SPMDCompatibilityTracker.indicatePessimisticFixpoint();
3043  }
3044 
3045  /// Sanitize the string \p S such that it is a suitable global symbol name.
3046  static std::string sanitizeForGlobalName(std::string S) {
3047  std::replace_if(
3048  S.begin(), S.end(),
3049  [](const char C) {
3050  return !((C >= 'a' && C <= 'z') || (C >= 'A' && C <= 'Z') ||
3051  (C >= '0' && C <= '9') || C == '_');
3052  },
3053  '.');
3054  return S;
3055  }
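  // For example, sanitizeForGlobalName("x<y>.out") yields "x.y..out": every
  // character outside [a-zA-Z0-9_] is replaced by '.'.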
3056 
3057  /// Modify the IR based on the KernelInfoState as the fixpoint iteration is
3058  /// finished now.
3059  ChangeStatus manifest(Attributor &A) override {
3060  // If we are not looking at a kernel with __kmpc_target_init and
3061  // __kmpc_target_deinit call we cannot actually manifest the information.
3062  if (!KernelInitCB || !KernelDeinitCB)
3063  return ChangeStatus::UNCHANGED;
3064 
3065  // If we can we change the execution mode to SPMD-mode otherwise we build a
3066  // custom state machine.
3067  ChangeStatus Changed = ChangeStatus::UNCHANGED;
3068  if (!changeToSPMDMode(A, Changed))
3069  return buildCustomStateMachine(A);
3070 
3071  return Changed;
3072  }
3073 
3074  bool changeToSPMDMode(Attributor &A, ChangeStatus &Changed) {
3075  auto &OMPInfoCache = static_cast<OMPInformationCache &>(A.getInfoCache());
3076 
3077  if (!SPMDCompatibilityTracker.isAssumed()) {
3078  for (Instruction *NonCompatibleI : SPMDCompatibilityTracker) {
3079  if (!NonCompatibleI)
3080  continue;
3081 
3082  // Skip diagnostics on calls to known OpenMP runtime functions for now.
3083  if (auto *CB = dyn_cast<CallBase>(NonCompatibleI))
3084  if (OMPInfoCache.RTLFunctions.contains(CB->getCalledFunction()))
3085  continue;
3086 
3087  auto Remark = [&](OptimizationRemarkAnalysis ORA) {
3088  ORA << "Value has potential side effects preventing SPMD-mode "
3089  "execution";
3090  if (isa<CallBase>(NonCompatibleI)) {
3091  ORA << ". Add `__attribute__((assume(\"ompx_spmd_amenable\")))` to "
3092  "the called function to override";
3093  }
3094  return ORA << ".";
3095  };
3096  A.emitRemark<OptimizationRemarkAnalysis>(NonCompatibleI, "OMP121",
3097  Remark);
3098 
3099  LLVM_DEBUG(dbgs() << TAG << "SPMD-incompatible side-effect: "
3100  << *NonCompatibleI << "\n");
3101  }
3102 
3103  return false;
3104  }
3105 
3106  // Check if the kernel is already in SPMD mode; if so, return success.
3107  Function *Kernel = getAnchorScope();
3108  GlobalVariable *ExecMode = Kernel->getParent()->getGlobalVariable(
3109  (Kernel->getName() + "_exec_mode").str());
3110  assert(ExecMode && "Kernel without exec mode?");
3111  assert(ExecMode->getInitializer() && "ExecMode doesn't have initializer!");
3112 
3113  // Set the global exec mode flag to indicate SPMD-Generic mode.
3114  assert(isa<ConstantInt>(ExecMode->getInitializer()) &&
3115  "ExecMode is not an integer!");
3116  const int8_t ExecModeVal =
3117  cast<ConstantInt>(ExecMode->getInitializer())->getSExtValue();
3118  if (ExecModeVal != OMP_TGT_EXEC_MODE_GENERIC)
3119  return true;
3120 
3121  // We will now unconditionally modify the IR, indicate a change.
3122  Changed = ChangeStatus::CHANGED;
3123 
3124  auto CreateGuardedRegion = [&](Instruction *RegionStartI,
3125  Instruction *RegionEndI) {
3126  LoopInfo *LI = nullptr;
3127  DominatorTree *DT = nullptr;
3128  MemorySSAUpdater *MSU = nullptr;
3129  using InsertPointTy = OpenMPIRBuilder::InsertPointTy;
3130 
3131  BasicBlock *ParentBB = RegionStartI->getParent();
3132  Function *Fn = ParentBB->getParent();
3133  Module &M = *Fn->getParent();
3134 
3135  // Create all the blocks and logic.
3136  // ParentBB:
3137  // goto RegionCheckTidBB
3138  // RegionCheckTidBB:
3139  // Tid = __kmpc_hardware_thread_id()
3140  // if (Tid != 0)
3141  // goto RegionBarrierBB
3142  // RegionStartBB:
3143  // <execute instructions guarded>
3144  // goto RegionEndBB
3145  // RegionEndBB:
3146  // <store escaping values to shared mem>
3147  // goto RegionBarrierBB
3148  // RegionBarrierBB:
3149  // __kmpc_simple_barrier_spmd()
3150  // // second barrier is omitted if lacking escaping values.
3151  // <load escaping values from shared mem>
3152  // __kmpc_simple_barrier_spmd()
3153  // goto RegionExitBB
3154  // RegionExitBB:
3155  // <execute rest of instructions>
3156 
3157  BasicBlock *RegionEndBB = SplitBlock(ParentBB, RegionEndI->getNextNode(),
3158  DT, LI, MSU, "region.guarded.end");
3159  BasicBlock *RegionBarrierBB =
3160  SplitBlock(RegionEndBB, &*RegionEndBB->getFirstInsertionPt(), DT, LI,
3161  MSU, "region.barrier");
3162  BasicBlock *RegionExitBB =
3163  SplitBlock(RegionBarrierBB, &*RegionBarrierBB->getFirstInsertionPt(),
3164  DT, LI, MSU, "region.exit");
3165  BasicBlock *RegionStartBB =
3166  SplitBlock(ParentBB, RegionStartI, DT, LI, MSU, "region.guarded");
3167 
3168  assert(ParentBB->getUniqueSuccessor() == RegionStartBB &&
3169  "Expected a different CFG");
3170 
3171  BasicBlock *RegionCheckTidBB = SplitBlock(
3172  ParentBB, ParentBB->getTerminator(), DT, LI, MSU, "region.check.tid");
3173 
3174  // Register basic blocks with the Attributor.
3175  A.registerManifestAddedBasicBlock(*RegionEndBB);
3176  A.registerManifestAddedBasicBlock(*RegionBarrierBB);
3177  A.registerManifestAddedBasicBlock(*RegionExitBB);
3178  A.registerManifestAddedBasicBlock(*RegionStartBB);
3179  A.registerManifestAddedBasicBlock(*RegionCheckTidBB);
3180 
3181  bool HasBroadcastValues = false;
3182  // Find escaping outputs from the guarded region to outside users and
3183  // broadcast their values to them.
3184  for (Instruction &I : *RegionStartBB) {
3185  SmallPtrSet<Instruction *, 4> OutsideUsers;
3186  for (User *Usr : I.users()) {
3187  Instruction &UsrI = *cast<Instruction>(Usr);
3188  if (UsrI.getParent() != RegionStartBB)
3189  OutsideUsers.insert(&UsrI);
3190  }
3191 
3192  if (OutsideUsers.empty())
3193  continue;
3194 
3195  HasBroadcastValues = true;
3196 
3197  // Emit a global variable in shared memory to store the broadcasted
3198  // value.
3199  auto *SharedMem = new GlobalVariable(
3200  M, I.getType(), /* IsConstant */ false,
3201  GlobalValue::InternalLinkage, UndefValue::get(I.getType()),
3202  sanitizeForGlobalName(
3203  (I.getName() + ".guarded.output.alloc").str()),
3204  nullptr, GlobalValue::NotThreadLocal,
3205  static_cast<unsigned>(AddressSpace::Shared));
3206 
3207  // Emit a store instruction to update the value.
3208  new StoreInst(&I, SharedMem, RegionEndBB->getTerminator());
3209 
3210  LoadInst *LoadI = new LoadInst(I.getType(), SharedMem,
3211  I.getName() + ".guarded.output.load",
3212  RegionBarrierBB->getTerminator());
3213 
3214  // Emit a load instruction and replace uses of the output value.
3215  for (Instruction *UsrI : OutsideUsers)
3216  UsrI->replaceUsesOfWith(&I, LoadI);
3217  }
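 // Rough sketch of the IR this produces (illustrative names, assuming an
 // escaping value %v of type i32 in the guarded region):
 //
 //   @v.guarded.output.alloc = internal addrspace(3) global i32 undef
 //   region.guarded.end:
 //     store i32 %v, ptr addrspace(3) @v.guarded.output.alloc
 //   region.barrier:
 //     %v.guarded.output.load = load i32, ptr addrspace(3) @v.guarded.output.alloc
 //     ; uses of %v outside the guarded block are rewritten to the load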
3218 
3219  auto &OMPInfoCache = static_cast<OMPInformationCache &>(A.getInfoCache());
3220 
3221  // Go to tid check BB in ParentBB.
3222  const DebugLoc DL = ParentBB->getTerminator()->getDebugLoc();
3223  ParentBB->getTerminator()->eraseFromParent();
3224  OpenMPIRBuilder::LocationDescription Loc(
3225  InsertPointTy(ParentBB, ParentBB->end()), DL);
3226  OMPInfoCache.OMPBuilder.updateToLocation(Loc);
3227  auto *SrcLocStr = OMPInfoCache.OMPBuilder.getOrCreateSrcLocStr(Loc);
3228  Value *Ident = OMPInfoCache.OMPBuilder.getOrCreateIdent(SrcLocStr);
3229  BranchInst::Create(RegionCheckTidBB, ParentBB)->setDebugLoc(DL);
3230 
3231  // Add check for Tid in RegionCheckTidBB
3232  RegionCheckTidBB->getTerminator()->eraseFromParent();
3233  OpenMPIRBuilder::LocationDescription LocRegionCheckTid(
3234  InsertPointTy(RegionCheckTidBB, RegionCheckTidBB->end()), DL);
3235  OMPInfoCache.OMPBuilder.updateToLocation(LocRegionCheckTid);
3236  FunctionCallee HardwareTidFn =
3237  OMPInfoCache.OMPBuilder.getOrCreateRuntimeFunction(
3238  M, OMPRTL___kmpc_get_hardware_thread_id_in_block);
3239  Value *Tid =
3240  OMPInfoCache.OMPBuilder.Builder.CreateCall(HardwareTidFn, {});
3241  Value *TidCheck = OMPInfoCache.OMPBuilder.Builder.CreateIsNull(Tid);
3242  OMPInfoCache.OMPBuilder.Builder
3243  .CreateCondBr(TidCheck, RegionStartBB, RegionBarrierBB)
3244  ->setDebugLoc(DL);
3245 
3246  // First barrier for synchronization, ensures main thread has updated
3247  // values.
3248  FunctionCallee BarrierFn =
3249  OMPInfoCache.OMPBuilder.getOrCreateRuntimeFunction(
3250  M, OMPRTL___kmpc_barrier_simple_spmd);
3251  OMPInfoCache.OMPBuilder.updateToLocation(InsertPointTy(
3252  RegionBarrierBB, RegionBarrierBB->getFirstInsertionPt()));
3253  OMPInfoCache.OMPBuilder.Builder.CreateCall(BarrierFn, {Ident, Tid})
3254  ->setDebugLoc(DL);
3255 
3256  // Second barrier ensures workers have read broadcast values.
3257  if (HasBroadcastValues)
3258  CallInst::Create(BarrierFn, {Ident, Tid}, "",
3259  RegionBarrierBB->getTerminator())
3260  ->setDebugLoc(DL);
3261  };
3262 
3263  auto &AllocSharedRFI = OMPInfoCache.RFIs[OMPRTL___kmpc_alloc_shared];
3264  SmallPtrSet<BasicBlock *, 8> Visited;
3265  for (Instruction *GuardedI : SPMDCompatibilityTracker) {
3266  BasicBlock *BB = GuardedI->getParent();
3267  if (!Visited.insert(BB).second)
3268  continue;
3269 
3270  SmallVector<std::pair<Instruction *, Instruction *>> Reorders;
3271  Instruction *LastEffect = nullptr;
3272  BasicBlock::reverse_iterator IP = BB->rbegin(), IPEnd = BB->rend();
3273  while (++IP != IPEnd) {
3274  if (!IP->mayHaveSideEffects() && !IP->mayReadFromMemory())
3275  continue;
3276  Instruction *I = &*IP;
3277  if (OpenMPOpt::getCallIfRegularCall(*I, &AllocSharedRFI))
3278  continue;
3279  if (!I->user_empty() || !SPMDCompatibilityTracker.contains(I)) {
3280  LastEffect = nullptr;
3281  continue;
3282  }
3283  if (LastEffect)
3284  Reorders.push_back({I, LastEffect});
3285  LastEffect = &*IP;
3286  }
3287  for (auto &Reorder : Reorders)
3288  Reorder.first->moveBefore(Reorder.second);
3289  }
3290 
3291  SmallVector<std::pair<Instruction *, Instruction *>, 4> GuardedRegions;
3292 
3293  for (Instruction *GuardedI : SPMDCompatibilityTracker) {
3294  BasicBlock *BB = GuardedI->getParent();
3295  auto *CalleeAA = A.lookupAAFor<AAKernelInfo>(
3296  IRPosition::function(*GuardedI->getFunction()), nullptr,
3297  DepClassTy::NONE);
3298  assert(CalleeAA != nullptr && "Expected Callee AAKernelInfo");
3299  auto &CalleeAAFunction = *cast<AAKernelInfoFunction>(CalleeAA);
3300  // Continue if instruction is already guarded.
3301  if (CalleeAAFunction.getGuardedInstructions().contains(GuardedI))
3302  continue;
3303 
3304  Instruction *GuardedRegionStart = nullptr, *GuardedRegionEnd = nullptr;
3305  for (Instruction &I : *BB) {
3306  // If instruction I needs to be guarded update the guarded region
3307  // bounds.
3308  if (SPMDCompatibilityTracker.contains(&I)) {
3309  CalleeAAFunction.getGuardedInstructions().insert(&I);
3310  if (GuardedRegionStart)
3311  GuardedRegionEnd = &I;
3312  else
3313  GuardedRegionStart = GuardedRegionEnd = &I;
3314 
3315  continue;
3316  }
3317 
3318  // Instruction I does not need guarding, store
3319  // any region found and reset bounds.
3320  if (GuardedRegionStart) {
3321  GuardedRegions.push_back(
3322  std::make_pair(GuardedRegionStart, GuardedRegionEnd));
3323  GuardedRegionStart = nullptr;
3324  GuardedRegionEnd = nullptr;
3325  }
3326  }
3327  }
3328 
3329  for (auto &GR : GuardedRegions)
3330  CreateGuardedRegion(GR.first, GR.second);
3331 
3332  // Adjust the global exec mode flag that tells the runtime what mode this
3333  // kernel is executed in.
3334  assert(ExecModeVal == OMP_TGT_EXEC_MODE_GENERIC &&
3335  "Initially non-SPMD kernel has SPMD exec mode!");
3336  ExecMode->setInitializer(
3337  ConstantInt::get(ExecMode->getInitializer()->getType(),
3338  ExecModeVal | OMP_TGT_EXEC_MODE_GENERIC_SPMD));
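 // Note (assuming the usual encoding OMP_TGT_EXEC_MODE_GENERIC = 1 and
 // OMP_TGT_EXEC_MODE_GENERIC_SPMD = GENERIC | SPMD = 3): the global
 // <kernel>_exec_mode flag changes from 1 to 3 here, telling the device
 // runtime to launch this formerly generic kernel in SPMD fashion.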
3339 
3340  // Next rewrite the init and deinit calls to indicate we use SPMD-mode now.
3341  const int InitModeArgNo = 1;
3342  const int DeinitModeArgNo = 1;
3343  const int InitUseStateMachineArgNo = 2;
3344  const int InitRequiresFullRuntimeArgNo = 3;
3345  const int DeinitRequiresFullRuntimeArgNo = 2;
3346 
3347  auto &Ctx = getAnchorValue().getContext();
3348  A.changeUseAfterManifest(
3349  KernelInitCB->getArgOperandUse(InitModeArgNo),
3350  *ConstantInt::getSigned(IntegerType::getInt8Ty(Ctx),
3351  OMP_TGT_EXEC_MODE_SPMD));
3352  A.changeUseAfterManifest(
3353  KernelInitCB->getArgOperandUse(InitUseStateMachineArgNo),
3354  *ConstantInt::getBool(Ctx, 0));
3355  A.changeUseAfterManifest(
3356  KernelDeinitCB->getArgOperandUse(DeinitModeArgNo),
3357  *ConstantInt::getSigned(IntegerType::getInt8Ty(Ctx),
3358  OMP_TGT_EXEC_MODE_SPMD));
3359  A.changeUseAfterManifest(
3360  KernelInitCB->getArgOperandUse(InitRequiresFullRuntimeArgNo),
3361  *ConstantInt::getBool(Ctx, 0));
3362  A.changeUseAfterManifest(
3363  KernelDeinitCB->getArgOperandUse(DeinitRequiresFullRuntimeArgNo),
3364  *ConstantInt::getBool(Ctx, 0));
3365 
3366  ++NumOpenMPTargetRegionKernelsSPMD;
3367 
3368  auto Remark = [&](OptimizationRemark OR) {
3369  return OR << "Transformed generic-mode kernel to SPMD-mode.";
3370  };
3371  A.emitRemark<OptimizationRemark>(KernelInitCB, "OMP120", Remark);
3372  return true;
3373  };
3374 
3375  ChangeStatus buildCustomStateMachine(Attributor &A) {
3376  // If we have disabled state machine rewrites, don't make a custom one
3377  if (DisableOpenMPOptStateMachineRewrite)
3378  return ChangeStatus::UNCHANGED;
3379 
3380  // Don't rewrite the state machine if we are not in a valid state.
3381  if (!ReachedKnownParallelRegions.isValidState())
3382  return ChangeStatus::UNCHANGED;
3383 
3384  const int InitModeArgNo = 1;
3385  const int InitUseStateMachineArgNo = 2;
3386 
3387  // Check if the current configuration is non-SPMD and generic state machine.
3388  // If we already have SPMD mode or a custom state machine we do not need to
3389  // go any further. If it is anything but a constant something is weird and
3390  // we give up.
3391  ConstantInt *UseStateMachine = dyn_cast<ConstantInt>(
3392  KernelInitCB->getArgOperand(InitUseStateMachineArgNo));
3393  ConstantInt *Mode =
3394  dyn_cast<ConstantInt>(KernelInitCB->getArgOperand(InitModeArgNo));
3395 
3396  // If we are stuck with generic mode, try to create a custom device (=GPU)
3397  // state machine which is specialized for the parallel regions that are
3398  // reachable by the kernel.
3399  if (!UseStateMachine || UseStateMachine->isZero() || !Mode ||
3400  (Mode->getSExtValue() & OMP_TGT_EXEC_MODE_SPMD))
3401  return ChangeStatus::UNCHANGED;
3402 
3403  // If not SPMD mode, indicate we use a custom state machine now.
3404  auto &Ctx = getAnchorValue().getContext();
3405  auto *FalseVal = ConstantInt::getBool(Ctx, 0);
3406  A.changeUseAfterManifest(
3407  KernelInitCB->getArgOperandUse(InitUseStateMachineArgNo), *FalseVal);
3408 
3409  // If we don't actually need a state machine we are done here. This can
3410  // happen if there simply are no parallel regions. In the resulting kernel
3411  // all worker threads will simply exit right away, leaving the main thread
3412  // to do the work alone.
3413  if (!mayContainParallelRegion()) {
3414  ++NumOpenMPTargetRegionKernelsWithoutStateMachine;
3415 
3416  auto Remark = [&](OptimizationRemark OR) {
3417  return OR << "Removing unused state machine from generic-mode kernel.";
3418  };
3419  A.emitRemark<OptimizationRemark>(KernelInitCB, "OMP130", Remark);
3420 
3421  return ChangeStatus::CHANGED;
3422  }
3423 
3424  // Keep track in the statistics of our new shiny custom state machine.
3425  if (ReachedUnknownParallelRegions.empty()) {
3426  ++NumOpenMPTargetRegionKernelsCustomStateMachineWithoutFallback;
3427 
3428  auto Remark = [&](OptimizationRemark OR) {
3429  return OR << "Rewriting generic-mode kernel with a customized state "
3430  "machine.";
3431  };
3432  A.emitRemark<OptimizationRemark>(KernelInitCB, "OMP131", Remark);
3433  } else {
3434  ++NumOpenMPTargetRegionKernelsCustomStateMachineWithFallback;
3435 
3436  auto Remark = [&](OptimizationRemarkAnalysis OR) {
3437  return OR << "Generic-mode kernel is executed with a customized state "
3438  "machine that requires a fallback.";
3439  };
3440  A.emitRemark<OptimizationRemarkAnalysis>(KernelInitCB, "OMP132", Remark);
3441 
3442  // Tell the user why we ended up with a fallback.
3443  for (CallBase *UnknownParallelRegionCB : ReachedUnknownParallelRegions) {
3444  if (!UnknownParallelRegionCB)
3445  continue;
3446  auto Remark = [&](OptimizationRemarkAnalysis ORA) {
3447  return ORA << "Call may contain unknown parallel regions. Use "
3448  << "`__attribute__((assume(\"omp_no_parallelism\")))` to "
3449  "override.";
3450  };
3451  A.emitRemark<OptimizationRemarkAnalysis>(UnknownParallelRegionCB,
3452  "OMP133", Remark);
3453  }
3454  }
3455 
3456  // Create all the blocks:
3457  //
3458  // InitCB = __kmpc_target_init(...)
3459  // BlockHwSize =
3460  // __kmpc_get_hardware_num_threads_in_block();
3461  // WarpSize = __kmpc_get_warp_size();
3462  // BlockSize = BlockHwSize - WarpSize;
3463  // if (InitCB >= BlockSize) return;
3464  // IsWorkerCheckBB: bool IsWorker = InitCB >= 0;
3465  // if (IsWorker) {
3466  // SMBeginBB: __kmpc_barrier_simple_generic(...);
3467  // void *WorkFn;
3468  // bool Active = __kmpc_kernel_parallel(&WorkFn);
3469  // if (!WorkFn) return;
3470  // SMIsActiveCheckBB: if (Active) {
3471  // SMIfCascadeCurrentBB: if (WorkFn == <ParFn0>)
3472  // ParFn0(...);
3473  // SMIfCascadeCurrentBB: else if (WorkFn == <ParFn1>)
3474  // ParFn1(...);
3475  // ...
3476  // SMIfCascadeCurrentBB: else
3477  // ((WorkFnTy*)WorkFn)(...);
3478  // SMEndParallelBB: __kmpc_kernel_end_parallel(...);
3479  // }
3480  // SMDoneBB: __kmpc_barrier_simple_generic(...);
3481  // goto SMBeginBB;
3482  // }
3483  // UserCodeEntryBB: // user code
3484  // __kmpc_target_deinit(...)
3485  //
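 // To make the if-cascade concrete: if the kernel can reach exactly two known
 // parallel regions ParFn0 and ParFn1, each worker compares the WorkFn pointer
 // it received from __kmpc_kernel_parallel against ParFn0, then ParFn1, and
 // only falls through to the indirect call when unknown parallel regions are
 // possible; otherwise the last comparison is folded away (see the
 // `IsPR = ConstantInt::getTrue(Ctx)` case below).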
3486  Function *Kernel = getAssociatedFunction();
3487  assert(Kernel && "Expected an associated function!");
3488 
3489  BasicBlock *InitBB = KernelInitCB->getParent();
3490  BasicBlock *UserCodeEntryBB = InitBB->splitBasicBlock(
3491  KernelInitCB->getNextNode(), "thread.user_code.check");
3492  BasicBlock *IsWorkerCheckBB =
3493  BasicBlock::Create(Ctx, "is_worker_check", Kernel, UserCodeEntryBB);
3494  BasicBlock *StateMachineBeginBB = BasicBlock::Create(
3495  Ctx, "worker_state_machine.begin", Kernel, UserCodeEntryBB);
3496  BasicBlock *StateMachineFinishedBB = BasicBlock::Create(
3497  Ctx, "worker_state_machine.finished", Kernel, UserCodeEntryBB);
3498  BasicBlock *StateMachineIsActiveCheckBB = BasicBlock::Create(
3499  Ctx, "worker_state_machine.is_active.check", Kernel, UserCodeEntryBB);
3500  BasicBlock *StateMachineIfCascadeCurrentBB =
3501  BasicBlock::Create(Ctx, "worker_state_machine.parallel_region.check",
3502  Kernel, UserCodeEntryBB);
3503  BasicBlock *StateMachineEndParallelBB =
3504  BasicBlock::Create(Ctx, "worker_state_machine.parallel_region.end",
3505  Kernel, UserCodeEntryBB);
3506  BasicBlock *StateMachineDoneBarrierBB = BasicBlock::Create(
3507  Ctx, "worker_state_machine.done.barrier", Kernel, UserCodeEntryBB);
3508  A.registerManifestAddedBasicBlock(*InitBB);
3509  A.registerManifestAddedBasicBlock(*UserCodeEntryBB);
3510  A.registerManifestAddedBasicBlock(*IsWorkerCheckBB);
3511  A.registerManifestAddedBasicBlock(*StateMachineBeginBB);
3512  A.registerManifestAddedBasicBlock(*StateMachineFinishedBB);
3513  A.registerManifestAddedBasicBlock(*StateMachineIsActiveCheckBB);
3514  A.registerManifestAddedBasicBlock(*StateMachineIfCascadeCurrentBB);
3515  A.registerManifestAddedBasicBlock(*StateMachineEndParallelBB);
3516  A.registerManifestAddedBasicBlock(*StateMachineDoneBarrierBB);
3517 
3518  const DebugLoc &DLoc = KernelInitCB->getDebugLoc();
3519  ReturnInst::Create(Ctx, StateMachineFinishedBB)->setDebugLoc(DLoc);
3520  InitBB->getTerminator()->eraseFromParent();
3521 
3522  Module &M = *Kernel->getParent();
3523  auto &OMPInfoCache = static_cast<OMPInformationCache &>(A.getInfoCache());
3524  FunctionCallee BlockHwSizeFn =
3525  OMPInfoCache.OMPBuilder.getOrCreateRuntimeFunction(
3526  M, OMPRTL___kmpc_get_hardware_num_threads_in_block);
3527  FunctionCallee WarpSizeFn =
3528  OMPInfoCache.OMPBuilder.getOrCreateRuntimeFunction(
3529  M, OMPRTL___kmpc_get_warp_size);
3530  Instruction *BlockHwSize =
3531  CallInst::Create(BlockHwSizeFn, "block.hw_size", InitBB);
3532  BlockHwSize->setDebugLoc(DLoc);
3533  Instruction *WarpSize = CallInst::Create(WarpSizeFn, "warp.size", InitBB);
3534  WarpSize->setDebugLoc(DLoc);
3535  Instruction *BlockSize =
3536  BinaryOperator::CreateSub(BlockHwSize, WarpSize, "block.size", InitBB);
3537  BlockSize->setDebugLoc(DLoc);
3538  Instruction *IsMainOrWorker =
3539  ICmpInst::Create(ICmpInst::ICmp, llvm::CmpInst::ICMP_SLT, KernelInitCB,
3540  BlockSize, "thread.is_main_or_worker", InitBB);
3541  IsMainOrWorker->setDebugLoc(DLoc);
3542  BranchInst::Create(IsWorkerCheckBB, StateMachineFinishedBB, IsMainOrWorker,
3543  InitBB);
3544 
3545  Instruction *IsWorker =
3546  ICmpInst::Create(ICmpInst::ICmp, llvm::CmpInst::ICMP_NE, KernelInitCB,
3547  ConstantInt::get(KernelInitCB->getType(), -1),
3548  "thread.is_worker", IsWorkerCheckBB);
3549  IsWorker->setDebugLoc(DLoc);
3550  BranchInst::Create(StateMachineBeginBB, UserCodeEntryBB, IsWorker,
3551  IsWorkerCheckBB);
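 // Worked example (illustrative, assuming a 128-thread block and a warp size
 // of 32): BlockSize is 96. __kmpc_target_init returns the thread id for
 // workers and -1 for the main thread, so the surplus threads of the main
 // thread's warp (ids >= 96) exit via the SLT check above, the main thread
 // (value -1) continues into the user code, and threads 0..95 enter the
 // worker state machine.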
3552 
3553  // Create local storage for the work function pointer.
3554  const DataLayout &DL = M.getDataLayout();
3555  Type *VoidPtrTy = Type::getInt8PtrTy(Ctx);
3556  Instruction *WorkFnAI =
3557  new AllocaInst(VoidPtrTy, DL.getAllocaAddrSpace(), nullptr,
3558  "worker.work_fn.addr", &Kernel->getEntryBlock().front());
3559  WorkFnAI->setDebugLoc(DLoc);
3560 
3561  OMPInfoCache.OMPBuilder.updateToLocation(
3562  OpenMPIRBuilder::LocationDescription(
3563  IRBuilder<>::InsertPoint(StateMachineBeginBB,
3564  StateMachineBeginBB->end()),
3565  DLoc));
3566 
3567  Value *Ident = KernelInitCB->getArgOperand(0);
3568  Value *GTid = KernelInitCB;
3569 
3570  FunctionCallee BarrierFn =
3571  OMPInfoCache.OMPBuilder.getOrCreateRuntimeFunction(
3572  M, OMPRTL___kmpc_barrier_simple_generic);
3573  CallInst::Create(BarrierFn, {Ident, GTid}, "", StateMachineBeginBB)
3574  ->setDebugLoc(DLoc);
3575 
3576  if (WorkFnAI->getType()->getPointerAddressSpace() !=
3577  (unsigned int)AddressSpace::Generic) {
3578  WorkFnAI = new AddrSpaceCastInst(
3579  WorkFnAI,
3580  PointerType::getWithSamePointeeType(
3581  cast<PointerType>(WorkFnAI->getType()),
3582  (unsigned int)AddressSpace::Generic),
3583  WorkFnAI->getName() + ".generic", StateMachineBeginBB);
3584  WorkFnAI->setDebugLoc(DLoc);
3585  }
3586 
3587  FunctionCallee KernelParallelFn =
3588  OMPInfoCache.OMPBuilder.getOrCreateRuntimeFunction(
3589  M, OMPRTL___kmpc_kernel_parallel);
3590  Instruction *IsActiveWorker = CallInst::Create(
3591  KernelParallelFn, {WorkFnAI}, "worker.is_active", StateMachineBeginBB);
3592  IsActiveWorker->setDebugLoc(DLoc);
3593  Instruction *WorkFn = new LoadInst(VoidPtrTy, WorkFnAI, "worker.work_fn",
3594  StateMachineBeginBB);
3595  WorkFn->setDebugLoc(DLoc);
3596 
3597  FunctionType *ParallelRegionFnTy = FunctionType::get(
3598  Type::getVoidTy(Ctx), {Type::getInt16Ty(Ctx), Type::getInt32Ty(Ctx)},
3599  false);
3600  Value *WorkFnCast = BitCastInst::CreatePointerBitCastOrAddrSpaceCast(
3601  WorkFn, ParallelRegionFnTy->getPointerTo(), "worker.work_fn.addr_cast",
3602  StateMachineBeginBB);
3603 
3604  Instruction *IsDone =
3605  ICmpInst::Create(ICmpInst::ICmp, llvm::CmpInst::ICMP_EQ, WorkFn,
3606  Constant::getNullValue(VoidPtrTy), "worker.is_done",
3607  StateMachineBeginBB);
3608  IsDone->setDebugLoc(DLoc);
3609  BranchInst::Create(StateMachineFinishedBB, StateMachineIsActiveCheckBB,
3610  IsDone, StateMachineBeginBB)
3611  ->setDebugLoc(DLoc);
3612 
3613  BranchInst::Create(StateMachineIfCascadeCurrentBB,
3614  StateMachineDoneBarrierBB, IsActiveWorker,
3615  StateMachineIsActiveCheckBB)
3616  ->setDebugLoc(DLoc);
3617 
3618  Value *ZeroArg =
3619  Constant::getNullValue(ParallelRegionFnTy->getParamType(0));
3620 
3621  // Now that we have most of the CFG skeleton it is time for the if-cascade
3622  // that checks the function pointer we got from the runtime against the
3623  // parallel regions we expect, if there are any.
3624  for (int I = 0, E = ReachedKnownParallelRegions.size(); I < E; ++I) {
3625  auto *ParallelRegion = ReachedKnownParallelRegions[I];
3626  BasicBlock *PRExecuteBB = BasicBlock::Create(
3627  Ctx, "worker_state_machine.parallel_region.execute", Kernel,
3628  StateMachineEndParallelBB);
3629  CallInst::Create(ParallelRegion, {ZeroArg, GTid}, "", PRExecuteBB)
3630  ->setDebugLoc(DLoc);
3631  BranchInst::Create(StateMachineEndParallelBB, PRExecuteBB)
3632  ->setDebugLoc(DLoc);
3633 
3634  BasicBlock *PRNextBB =
3635  BasicBlock::Create(Ctx, "worker_state_machine.parallel_region.check",
3636  Kernel, StateMachineEndParallelBB);
3637 
3638  // Check if we need to compare the pointer at all or if we can just
3639  // call the parallel region function.
3640  Value *IsPR;
3641  if (I + 1 < E || !ReachedUnknownParallelRegions.empty()) {
3642  Instruction *CmpI = ICmpInst::Create(
3643  ICmpInst::ICmp, llvm::CmpInst::ICMP_EQ, WorkFnCast, ParallelRegion,
3644  "worker.check_parallel_region", StateMachineIfCascadeCurrentBB);
3645  CmpI->setDebugLoc(DLoc);
3646  IsPR = CmpI;
3647  } else {
3648  IsPR = ConstantInt::getTrue(Ctx);
3649  }
3650 
3651  BranchInst::Create(PRExecuteBB, PRNextBB, IsPR,
3652  StateMachineIfCascadeCurrentBB)
3653  ->setDebugLoc(DLoc);
3654  StateMachineIfCascadeCurrentBB = PRNextBB;
3655  }
3656 
3657  // At the end of the if-cascade we place the indirect function pointer call
3658  // in case we might need it, that is if there can be parallel regions we
3659  // have not handled in the if-cascade above.
3660  if (!ReachedUnknownParallelRegions.empty()) {
3661  StateMachineIfCascadeCurrentBB->setName(
3662  "worker_state_machine.parallel_region.fallback.execute");
3663  CallInst::Create(ParallelRegionFnTy, WorkFnCast, {ZeroArg, GTid}, "",
3664  StateMachineIfCascadeCurrentBB)
3665  ->setDebugLoc(DLoc);
3666  }
3667  BranchInst::Create(StateMachineEndParallelBB,
3668  StateMachineIfCascadeCurrentBB)
3669  ->setDebugLoc(DLoc);
3670 
3671  CallInst::Create(OMPInfoCache.OMPBuilder.getOrCreateRuntimeFunction(
3672  M, OMPRTL___kmpc_kernel_end_parallel),
3673  {}, "", StateMachineEndParallelBB)
3674  ->setDebugLoc(DLoc);
3675  BranchInst::Create(StateMachineDoneBarrierBB, StateMachineEndParallelBB)
3676  ->setDebugLoc(DLoc);
3677 
3678  CallInst::Create(BarrierFn, {Ident, GTid}, "", StateMachineDoneBarrierBB)
3679  ->setDebugLoc(DLoc);
3680  BranchInst::Create(StateMachineBeginBB, StateMachineDoneBarrierBB)
3681  ->setDebugLoc(DLoc);
3682 
3683  return ChangeStatus::CHANGED;
3684  }
3685 
3686  /// Fixpoint iteration update function. Will be called every time a dependence
3687  /// changed its state (and in the beginning).
3688  ChangeStatus updateImpl(Attributor &A) override {
3689  KernelInfoState StateBefore = getState();
3690 
3691  // Callback to check a read/write instruction.
3692  auto CheckRWInst = [&](Instruction &I) {
3693  // We handle calls later.
3694  if (isa<CallBase>(I))
3695  return true;
3696  // We only care about write effects.
3697  if (!I.mayWriteToMemory())
3698  return true;
3699  if (auto *SI = dyn_cast<StoreInst>(&I)) {
3700  SmallVector<const Value *> Objects;
3701  getUnderlyingObjects(SI->getPointerOperand(), Objects);
3702  if (llvm::all_of(Objects,
3703  [](const Value *Obj) { return isa<AllocaInst>(Obj); }))
3704  return true;
3705  // Check for AAHeapToStack moved objects which must not be guarded.
3706  auto &HS = A.getAAFor<AAHeapToStack>(
3707  *this, IRPosition::function(*I.getFunction()),
3708  DepClassTy::OPTIONAL);
3709  if (llvm::all_of(Objects, [&HS](const Value *Obj) {
3710  auto *CB = dyn_cast<CallBase>(Obj);
3711  if (!CB)
3712  return false;
3713  return HS.isAssumedHeapToStack(*CB);
3714  })) {
3715  return true;
3716  }
3717  }
3718 
3719  // Insert instruction that needs guarding.
3720  SPMDCompatibilityTracker.insert(&I);
3721  return true;
3722  };
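 // For intuition (illustrative): a store to a stack alloca is thread-private
 // and needs no guard, and a __kmpc_alloc_shared allocation that AAHeapToStack
 // is assumed to turn into an alloca is treated the same way; a store to
 // anything else (e.g. a global or an escaped pointer) lands in
 // SPMDCompatibilityTracker and will be wrapped in a main-thread guard during
 // manifest.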
3723 
3724  bool UsedAssumedInformationInCheckRWInst = false;
3725  if (!SPMDCompatibilityTracker.isAtFixpoint())
3726  if (!A.checkForAllReadWriteInstructions(
3727  CheckRWInst, *this, UsedAssumedInformationInCheckRWInst))
3728  SPMDCompatibilityTracker.indicatePessimisticFixpoint();
3729 
3730  if (!IsKernelEntry) {
3731  updateReachingKernelEntries(A);
3732  updateParallelLevels(A);
3733 
3734  if (!ParallelLevels.isValidState())
3735  SPMDCompatibilityTracker.indicatePessimisticFixpoint();
3736  }
3737 
3738  // Callback to check a call instruction.
3739  bool AllParallelRegionStatesWereFixed = true;
3740  bool AllSPMDStatesWereFixed = true;
3741  auto CheckCallInst = [&](Instruction &I) {
3742  auto &CB = cast<CallBase>(I);
3743  auto &CBAA = A.getAAFor<AAKernelInfo>(
3744  *this, IRPosition::callsite_function(CB), DepClassTy::OPTIONAL);
3745  getState() ^= CBAA.getState();
3746  AllSPMDStatesWereFixed &= CBAA.SPMDCompatibilityTracker.isAtFixpoint();
3747  AllParallelRegionStatesWereFixed &=
3748  CBAA.ReachedKnownParallelRegions.isAtFixpoint();
3749  AllParallelRegionStatesWereFixed &=
3750  CBAA.ReachedUnknownParallelRegions.isAtFixpoint();
3751  return true;
3752  };
3753 
3754  bool UsedAssumedInformationInCheckCallInst = false;
3755  if (!A.checkForAllCallLikeInstructions(
3756  CheckCallInst, *this, UsedAssumedInformationInCheckCallInst)) {
3757  LLVM_DEBUG(dbgs() << TAG
3758  << "Failed to visit all call-like instructions!\n";);
3759  return indicatePessimisticFixpoint();
3760  }
3761 
3762  // If we haven't used any assumed information for the reached parallel
3763  // region states we can fix it.
3764  if (!UsedAssumedInformationInCheckCallInst &&
3765  AllParallelRegionStatesWereFixed) {
3766  ReachedKnownParallelRegions.indicateOptimisticFixpoint();
3767  ReachedUnknownParallelRegions.indicateOptimisticFixpoint();
3768  }
3769 
3770  // If we are sure there are no parallel regions in the kernel we do not
3771  // want SPMD mode.
3772  if (IsKernelEntry && ReachedUnknownParallelRegions.isAtFixpoint() &&
3773  ReachedKnownParallelRegions.isAtFixpoint() &&
3774  ReachedUnknownParallelRegions.isValidState() &&
3775  ReachedKnownParallelRegions.isValidState() &&
3776  !mayContainParallelRegion())
3777  SPMDCompatibilityTracker.indicatePessimisticFixpoint();
3778 
3779  // If we haven't used any assumed information for the SPMD state we can fix
3780  // it.
3781  if (!UsedAssumedInformationInCheckRWInst &&
3782  !UsedAssumedInformationInCheckCallInst && AllSPMDStatesWereFixed)
3783  SPMDCompatibilityTracker.indicateOptimisticFixpoint();
3784 
3785  return StateBefore == getState() ? ChangeStatus::UNCHANGED
3786  : ChangeStatus::CHANGED;
3787  }
3788 
3789 private:
3790  /// Update info regarding reaching kernels.
3791  void updateReachingKernelEntries(Attributor &A) {
3792  auto PredCallSite = [&](AbstractCallSite ACS) {
3793  Function *Caller = ACS.getInstruction()->getFunction();
3794 
3795  assert(Caller && "Caller is nullptr");
3796 
3797  auto &CAA = A.getOrCreateAAFor<AAKernelInfo>(
3798  IRPosition::function(*Caller), this, DepClassTy::REQUIRED);
3799  if (CAA.ReachingKernelEntries.isValidState()) {
3800  ReachingKernelEntries ^= CAA.ReachingKernelEntries;
3801  return true;
3802  }
3803 
3804  // We lost track of the caller of the associated function, any kernel
3805  // could reach now.
3806  ReachingKernelEntries.indicatePessimisticFixpoint();
3807 
3808  return true;
3809  };
3810 
3811  bool AllCallSitesKnown;
3812  if (!A.checkForAllCallSites(PredCallSite, *this,
3813  true /* RequireAllCallSites */,
3814  AllCallSitesKnown))
3815  ReachingKernelEntries.indicatePessimisticFixpoint();
3816  }
3817 
3818  /// Update info regarding parallel levels.
3819  void updateParallelLevels(Attributor &A) {
3820  auto &OMPInfoCache = static_cast<OMPInformationCache &>(A.getInfoCache());
3821  OMPInformationCache::RuntimeFunctionInfo &Parallel51RFI =
3822  OMPInfoCache.RFIs[OMPRTL___kmpc_parallel_51];
3823 
3824  auto PredCallSite = [&](AbstractCallSite ACS) {
3825  Function *Caller = ACS.getInstruction()->getFunction();
3826 
3827  assert(Caller && "Caller is nullptr");
3828 
3829  auto &CAA =
3830  A.getOrCreateAAFor<AAKernelInfo>(IRPosition::function(*Caller));
3831  if (CAA.ParallelLevels.isValidState()) {
3832  // Any function that is called by `__kmpc_parallel_51` will not be
3833  // folded as the parallel level in the function is updated. In order to
3834  // get it right, the analysis would have to depend on the implementation.
3835  // That said, any future change to the implementation could then make the
3836  // analysis wrong. As a consequence, we are just conservative here.
3837  if (Caller == Parallel51RFI.Declaration) {
3838  ParallelLevels.indicatePessimisticFixpoint();
3839  return true;
3840  }
3841 
3842  ParallelLevels ^= CAA.ParallelLevels;
3843 
3844  return true;
3845  }
3846 
3847  // We lost track of the caller of the associated function, any kernel
3848  // could reach now.
3849  ParallelLevels.indicatePessimisticFixpoint();
3850 
3851  return true;
3852  };
3853 
3854  bool AllCallSitesKnown = true;
3855  if (!A.checkForAllCallSites(PredCallSite, *this,
3856  true /* RequireAllCallSites */,
3857  AllCallSitesKnown))
3858  ParallelLevels.indicatePessimisticFixpoint();
3859  }
3860 };
3861 
3862 /// The call site kernel info abstract attribute, basically, what can we say
3863 /// about a call site with regards to the KernelInfoState. For now this simply
3864 /// forwards the information from the callee.
3865 struct AAKernelInfoCallSite : AAKernelInfo {
3866  AAKernelInfoCallSite(const IRPosition &IRP, Attributor &A)
3867  : AAKernelInfo(IRP, A) {}
3868 
3869  /// See AbstractAttribute::initialize(...).
3870  void initialize(Attributor &A) override {
3871  AAKernelInfo::initialize(A);
3872 
3873  CallBase &CB = cast<CallBase>(getAssociatedValue());
3874  Function *Callee = getAssociatedFunction();
3875 
3876  auto &AssumptionAA = A.getAAFor<AAAssumptionInfo>(
3877  *this, IRPosition::callsite_function(CB), DepClassTy::OPTIONAL);
3878 
3879  // Check for SPMD-mode assumptions.
3880  if (AssumptionAA.hasAssumption("ompx_spmd_amenable")) {
3881  SPMDCompatibilityTracker.indicateOptimisticFixpoint();
3882  indicateOptimisticFixpoint();
3883  }
3884 
3885  // First weed out calls we do not care about, that is readonly/readnone
3886  // calls, intrinsics, and "no_openmp" calls. None of these can reach a
3887  // parallel region or anything else we are looking for.
3888  if (!CB.mayWriteToMemory() || isa<IntrinsicInst>(CB)) {
3889  indicateOptimisticFixpoint();
3890  return;
3891  }
3892 
3893  // Next we check if we know the callee. If it is a known OpenMP function
3894  // we will handle them explicitly in the switch below. If it is not, we
3895  // will use an AAKernelInfo object on the callee to gather information and
3896  // merge that into the current state. The latter happens in the updateImpl.
3897  auto &OMPInfoCache = static_cast<OMPInformationCache &>(A.getInfoCache());
3898  const auto &It = OMPInfoCache.RuntimeFunctionIDMap.find(Callee);
3899  if (It == OMPInfoCache.RuntimeFunctionIDMap.end()) {
3900  // Unknown caller or declarations are not analyzable, we give up.
3901  if (!Callee || !A.isFunctionIPOAmendable(*Callee)) {
3902 
3903  // Unknown callees might contain parallel regions, except if they have
3904  // an appropriate assumption attached.
3905  if (!(AssumptionAA.hasAssumption("omp_no_openmp") ||
3906  AssumptionAA.hasAssumption("omp_no_parallelism")))
3907  ReachedUnknownParallelRegions.insert(&CB);
3908 
3909  // If SPMDCompatibilityTracker is not fixed, we need to give up on the
3910  // idea we can run something unknown in SPMD-mode.
3911  if (!SPMDCompatibilityTracker.isAtFixpoint()) {
3912  SPMDCompatibilityTracker.indicatePessimisticFixpoint();
3913  SPMDCompatibilityTracker.insert(&CB);
3914  }
3915 
3916  // We have updated the state for this unknown call properly, there won't
3917  // be any change so we indicate a fixpoint.
3918  indicateOptimisticFixpoint();
3919  }
3920  // If the callee is known and can be used in IPO, we will update the state
3921  // based on the callee state in updateImpl.
3922  return;
3923  }
3924 
3925  const unsigned int WrapperFunctionArgNo = 6;
3926  RuntimeFunction RF = It->getSecond();
3927  switch (RF) {
3928  // All the functions we know are compatible with SPMD mode.
3929  case OMPRTL___kmpc_is_spmd_exec_mode:
3930  case OMPRTL___kmpc_distribute_static_fini:
3931  case OMPRTL___kmpc_for_static_fini:
3932  case OMPRTL___kmpc_global_thread_num:
3933  case OMPRTL___kmpc_get_hardware_num_threads_in_block:
3934  case OMPRTL___kmpc_get_hardware_num_blocks:
3935  case OMPRTL___kmpc_single:
3936  case OMPRTL___kmpc_end_single:
3937  case OMPRTL___kmpc_master:
3938  case OMPRTL___kmpc_end_master:
3939  case OMPRTL___kmpc_barrier:
3940  break;
3941  case OMPRTL___kmpc_distribute_static_init_4:
3942  case OMPRTL___kmpc_distribute_static_init_4u:
3943  case OMPRTL___kmpc_distribute_static_init_8:
3944  case OMPRTL___kmpc_distribute_static_init_8u:
3945  case OMPRTL___kmpc_for_static_init_4:
3946  case OMPRTL___kmpc_for_static_init_4u:
3947  case OMPRTL___kmpc_for_static_init_8:
3948  case OMPRTL___kmpc_for_static_init_8u: {
3949  // Check the schedule and allow static schedule in SPMD mode.
3950  unsigned ScheduleArgOpNo = 2;
3951  auto *ScheduleTypeCI =
3952  dyn_cast<ConstantInt>(CB.getArgOperand(ScheduleArgOpNo));
3953  unsigned ScheduleTypeVal =
3954  ScheduleTypeCI ? ScheduleTypeCI->getZExtValue() : 0;
3955  switch (OMPScheduleType(ScheduleTypeVal)) {
3956  case OMPScheduleType::Static:
3957  case OMPScheduleType::StaticChunked:
3958  case OMPScheduleType::Distribute:
3959  case OMPScheduleType::DistributeChunked:
3960  break;
3961  default:
3962  SPMDCompatibilityTracker.indicatePessimisticFixpoint();
3963  SPMDCompatibilityTracker.insert(&CB);
3964  break;
3965  };
3966  } break;
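 // Example (illustrative): `#pragma omp for schedule(static)` lowers to one of
 // the static schedule kinds accepted above and stays SPMD-compatible, whereas
 // a dynamic or guided schedule makes this call site an SPMD-incompatible,
 // guarded instruction instead.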
3967  case OMPRTL___kmpc_target_init:
3968  KernelInitCB = &CB;
3969  break;
3970  case OMPRTL___kmpc_target_deinit:
3971  KernelDeinitCB = &CB;
3972  break;
3973  case OMPRTL___kmpc_parallel_51:
3974  if (auto *ParallelRegion = dyn_cast<Function>(
3975  CB.getArgOperand(WrapperFunctionArgNo)->stripPointerCasts())) {
3976  ReachedKnownParallelRegions.insert(ParallelRegion);
3977  break;
3978  }
3979  // The condition above should usually get the parallel region function
3980  // pointer and record it. In the off chance it doesn't we assume the
3981  // worst.
3982  ReachedUnknownParallelRegions.insert(&CB);
3983  break;
3984  case OMPRTL___kmpc_omp_task:
3985  // We do not look into tasks right now, just give up.
3986  SPMDCompatibilityTracker.insert(&CB);
3987  ReachedUnknownParallelRegions.insert(&CB);
3988  break;
3989  case OMPRTL___kmpc_alloc_shared:
3990  case OMPRTL___kmpc_free_shared:
3991  // Return without setting a fixpoint, to be resolved in updateImpl.
3992  return;
3993  default:
3994  // Unknown OpenMP runtime calls cannot be executed in SPMD-mode,
3995  // generally. However, they do not hide parallel regions.
3996  SPMDCompatibilityTracker.insert(&CB);
3997  break;
3998  }
3999  // All other OpenMP runtime calls will not reach parallel regions so they
4000  // can be safely ignored for now. Since it is a known OpenMP runtime call we
4001  // have now modeled all effects and there is no need for any update.
4002  indicateOptimisticFixpoint();
4003  }
4004 
4005  ChangeStatus updateImpl(Attributor &A) override {
4006  // TODO: Once we have call site specific value information we can provide
4007  // call site specific liveness information and then it makes
4008  // sense to specialize attributes for call sites arguments instead of
4009  // redirecting requests to the callee argument.
4010  Function *F = getAssociatedFunction();
4011 
4012  auto &OMPInfoCache = static_cast<OMPInformationCache &>(A.getInfoCache());
4013  const auto &It = OMPInfoCache.RuntimeFunctionIDMap.find(F);
4014 
4015  // If F is not a runtime function, propagate the AAKernelInfo of the callee.
4016  if (It == OMPInfoCache.RuntimeFunctionIDMap.end()) {
4017  const IRPosition &FnPos = IRPosition::function(*F);
4018  auto &FnAA = A.getAAFor<AAKernelInfo>(*this, FnPos, DepClassTy::REQUIRED);
4019  if (getState() == FnAA.getState())
4020  return ChangeStatus::UNCHANGED;
4021  getState() = FnAA.getState();
4022  return ChangeStatus::CHANGED;
4023  }
4024 
4025  // F is a runtime function that allocates or frees memory, check
4026  // AAHeapToStack and AAHeapToShared.
4027  KernelInfoState StateBefore = getState();
4028  assert((It->getSecond() == OMPRTL___kmpc_alloc_shared ||
4029  It->getSecond() == OMPRTL___kmpc_free_shared) &&
4030  "Expected a __kmpc_alloc_shared or __kmpc_free_shared runtime call");
4031 
4032  CallBase &CB = cast<CallBase>(getAssociatedValue());
4033 
4034  auto &HeapToStackAA = A.getAAFor<AAHeapToStack>(
4035  *this, IRPosition::function(*CB.getCaller()), DepClassTy::OPTIONAL);
4036  auto &HeapToSharedAA = A.getAAFor<AAHeapToShared>(
4037  *this, IRPosition::function(*CB.getCaller()), DepClassTy::OPTIONAL);
4038 
4039  RuntimeFunction RF = It->getSecond();
4040 
4041  switch (RF) {
4042  // If neither HeapToStack nor HeapToShared assume the call is removed,
4043  // assume SPMD incompatibility.
4044  case OMPRTL___kmpc_alloc_shared:
4045  if (!HeapToStackAA.isAssumedHeapToStack(CB) &&
4046  !HeapToSharedAA.isAssumedHeapToShared(CB))
4047  SPMDCompatibilityTracker.insert(&CB);
4048  break;
4049  case OMPRTL___kmpc_free_shared:
4050  if (!HeapToStackAA.isAssumedHeapToStackRemovedFree(CB) &&
4051  !HeapToSharedAA.isAssumedHeapToSharedRemovedFree(CB))
4052  SPMDCompatibilityTracker.insert(&CB);
4053  break;
4054  default:
4055  SPMDCompatibilityTracker.insert(&CB);
4056  }
4057 
4058  return StateBefore == getState() ? ChangeStatus::UNCHANGED
4059  : ChangeStatus::CHANGED;
4060  }
4061 };
4062 
4063 struct AAFoldRuntimeCall
4064  : public StateWrapper<BooleanState, AbstractAttribute> {
4065  using Base = StateWrapper<BooleanState, AbstractAttribute>;
4066 
4067  AAFoldRuntimeCall(const IRPosition &IRP, Attributor &A) : Base(IRP) {}
4068 
4069  /// Statistics are tracked as part of manifest for now.
4070  void trackStatistics() const override {}
4071 
4072  /// Create an abstract attribute view for the position \p IRP.
4073  static AAFoldRuntimeCall &createForPosition(const IRPosition &IRP,
4074  Attributor &A);
4075 
4076  /// See AbstractAttribute::getName()
4077  const std::string getName() const override { return "AAFoldRuntimeCall"; }
4078 
4079  /// See AbstractAttribute::getIdAddr()
4080  const char *getIdAddr() const override { return &ID; }
4081 
4082  /// This function should return true if the type of the \p AA is
4083  /// AAFoldRuntimeCall
4084  static bool classof(const AbstractAttribute *AA) {
4085  return (AA->getIdAddr() == &ID);
4086  }
4087 
4088  static const char ID;
4089 };
4090 
4091 struct AAFoldRuntimeCallCallSiteReturned : AAFoldRuntimeCall {
4092  AAFoldRuntimeCallCallSiteReturned(const IRPosition &IRP, Attributor &A)
4093  : AAFoldRuntimeCall(IRP, A) {}
4094 
4095  /// See AbstractAttribute::getAsStr()
4096  const std::string getAsStr() const override {
4097  if (!isValidState())
4098  return "<invalid>";
4099 
4100  std::string Str("simplified value: ");
4101 
4102  if (!SimplifiedValue.hasValue())
4103  return Str + std::string("none");
4104 
4105  if (!SimplifiedValue.getValue())
4106  return Str + std::string("nullptr");
4107 
4108  if (ConstantInt *CI = dyn_cast<ConstantInt>(SimplifiedValue.getValue()))
4109  return Str + std::to_string(CI->getSExtValue());
4110 
4111  return Str + std::string("unknown");
4112  }
4113 
4114  void initialize(Attributor &A) override {
4115  if (DisableOpenMPOptFolding)
4116  indicatePessimisticFixpoint();
4117 
4118  Function *Callee = getAssociatedFunction();
4119 
4120  auto &OMPInfoCache = static_cast<OMPInformationCache &>(A.getInfoCache());
4121  const auto &It = OMPInfoCache.RuntimeFunctionIDMap.find(Callee);
4122  assert(It != OMPInfoCache.RuntimeFunctionIDMap.end() &&
4123  "Expected a known OpenMP runtime function");
4124 
4125  RFKind = It->getSecond();
4126 
4127  CallBase &CB = cast<CallBase>(getAssociatedValue());
4128  A.registerSimplificationCallback(
4129  IRPosition::callsite_returned(CB),
4130  [&](const IRPosition &IRP, const AbstractAttribute *AA,
4131  bool &UsedAssumedInformation) -> Optional<Value *> {
4132  assert((isValidState() || (SimplifiedValue.hasValue() &&
4133  SimplifiedValue.getValue() == nullptr)) &&
4134  "Unexpected invalid state!");
4135 
4136  if (!isAtFixpoint()) {
4137  UsedAssumedInformation = true;
4138  if (AA)
4139  A.recordDependence(*this, *AA, DepClassTy::OPTIONAL);
4140  }
4141  return SimplifiedValue;
4142  });
4143  }
4144 
4145  ChangeStatus updateImpl(Attributor &A) override {
4146  ChangeStatus Changed = ChangeStatus::UNCHANGED;
4147  switch (RFKind) {
4148  case OMPRTL___kmpc_is_spmd_exec_mode:
4149  Changed |= foldIsSPMDExecMode(A);
4150  break;
4151  case OMPRTL___kmpc_is_generic_main_thread_id:
4152  Changed |= foldIsGenericMainThread(A);
4153  break;
4154  case OMPRTL___kmpc_parallel_level:
4155  Changed |= foldParallelLevel(A);
4156  break;
4157  case OMPRTL___kmpc_get_hardware_num_threads_in_block:
4158  Changed = Changed | foldKernelFnAttribute(A, "omp_target_thread_limit");
4159  break;
4160  case OMPRTL___kmpc_get_hardware_num_blocks:
4161  Changed = Changed | foldKernelFnAttribute(A, "omp_target_num_teams");
4162  break;
4163  default:
4164  llvm_unreachable("Unhandled OpenMP runtime function!");
4165  }
4166 
4167  return Changed;
4168  }
4169 
4170  ChangeStatus manifest(Attributor &A) override {
4171  ChangeStatus Changed = ChangeStatus::UNCHANGED;
4172 
4173  if (SimplifiedValue.hasValue() && SimplifiedValue.getValue()) {
4174  Instruction &I = *getCtxI();
4175  A.changeValueAfterManifest(I, **SimplifiedValue);
4176  A.deleteAfterManifest(I);
4177 
4178  CallBase *CB = dyn_cast<CallBase>(&I);
4179  auto Remark = [&](OptimizationRemark OR) {
4180  if (auto *C = dyn_cast<ConstantInt>(*SimplifiedValue))
4181  return OR << "Replacing OpenMP runtime call "
4182  << CB->getCalledFunction()->getName() << " with "
4183  << ore::NV("FoldedValue", C->getZExtValue()) << ".";
4184  return OR << "Replacing OpenMP runtime call "
4185  << CB->getCalledFunction()->getName() << ".";
4186  };
4187 
4188  if (CB && EnableVerboseRemarks)
4189  A.emitRemark<OptimizationRemark>(CB, "OMP180", Remark);
4190 
4191  LLVM_DEBUG(dbgs() << TAG << "Replacing runtime call: " << I << " with "
4192  << **SimplifiedValue << "\n");
4193 
4194  Changed = ChangeStatus::CHANGED;
4195  }
4196 
4197  return Changed;
4198  }
4199 
4200  ChangeStatus indicatePessimisticFixpoint() override {
4201  SimplifiedValue = nullptr;
4202  return AAFoldRuntimeCall::indicatePessimisticFixpoint();
4203  }
4204 
4205 private:
4206  /// Fold __kmpc_is_spmd_exec_mode into a constant if possible.
4207  ChangeStatus foldIsSPMDExecMode(Attributor &A) {
4208  Optional<Value *> SimplifiedValueBefore = SimplifiedValue;
4209 
4210  unsigned AssumedSPMDCount = 0, KnownSPMDCount = 0;
4211  unsigned AssumedNonSPMDCount = 0, KnownNonSPMDCount = 0;
4212  auto &CallerKernelInfoAA = A.getAAFor<AAKernelInfo>(
4213  *this, IRPosition::function(*getAnchorScope()), DepClassTy::REQUIRED);
4214 
4215  if (!CallerKernelInfoAA.ReachingKernelEntries.isValidState())
4216  return indicatePessimisticFixpoint();
4217 
4218  for (Kernel K : CallerKernelInfoAA.ReachingKernelEntries) {
4219  auto &AA = A.getAAFor<AAKernelInfo>(*this, IRPosition::function(*K),
4220  DepClassTy::REQUIRED);
4221 
4222  if (!AA.isValidState()) {
4223  SimplifiedValue = nullptr;
4224  return indicatePessimisticFixpoint();
4225  }
4226 
4227  if (AA.SPMDCompatibilityTracker.isAssumed()) {
4228  if (AA.SPMDCompatibilityTracker.isAtFixpoint())
4229  ++KnownSPMDCount;
4230  else
4231  ++AssumedSPMDCount;
4232  } else {
4233  if (AA.SPMDCompatibilityTracker.isAtFixpoint())
4234  ++KnownNonSPMDCount;
4235  else
4236  ++AssumedNonSPMDCount;
4237  }
4238  }
4239 
4240  if ((AssumedSPMDCount + KnownSPMDCount) &&
4241  (AssumedNonSPMDCount + KnownNonSPMDCount))
4242  return indicatePessimisticFixpoint();
4243 
4244  auto &Ctx = getAnchorValue().getContext();
4245  if (KnownSPMDCount || AssumedSPMDCount) {
4246  assert(KnownNonSPMDCount == 0 && AssumedNonSPMDCount == 0 &&
4247  "Expected only SPMD kernels!");
4248  // All reaching kernels are in SPMD mode. Update all function calls to
4249  // __kmpc_is_spmd_exec_mode to 1.
4250  SimplifiedValue = ConstantInt::get(Type::getInt8Ty(Ctx), true);
4251  } else if (KnownNonSPMDCount || AssumedNonSPMDCount) {
4252  assert(KnownSPMDCount == 0 && AssumedSPMDCount == 0 &&
4253  "Expected only non-SPMD kernels!");
4254  // All reaching kernels are in non-SPMD mode. Update all function
4255  // calls to __kmpc_is_spmd_exec_mode to 0.
4256  SimplifiedValue = ConstantInt::get(Type::getInt8Ty(Ctx), false);
4257  } else {
4258  // We have empty reaching kernels, therefore we cannot tell if the
4259  // associated call site can be folded. At this moment, SimplifiedValue
4260  // must be none.
4261  assert(!SimplifiedValue.hasValue() && "SimplifiedValue should be none");
4262  }
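 // Illustrative outcome: if this helper is reached only from kernels that are
 // (assumed) SPMD, the call folds to 1; if only generic-mode kernels reach it,
 // it folds to 0; if both kinds can reach it, the mixed-count check above
 // already bailed out with a pessimistic fixpoint and no folding happens.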
4263 
4264  return SimplifiedValue == SimplifiedValueBefore ? ChangeStatus::UNCHANGED
4265  : ChangeStatus::CHANGED;
4266  }
4267 
4268  /// Fold __kmpc_is_generic_main_thread_id into a constant if possible.
4269  ChangeStatus foldIsGenericMainThread(Attributor &A) {
4270  Optional<Value *> SimplifiedValueBefore = SimplifiedValue;
4271 
4272  CallBase &CB = cast<CallBase>(getAssociatedValue());
4273  Function *F = CB.getFunction();
4274  const auto &ExecutionDomainAA = A.getAAFor<AAExecutionDomain>(
4275  *this, IRPosition::function(*F), DepClassTy::REQUIRED);
4276 
4277  if (!ExecutionDomainAA.isValidState())
4278  return indicatePessimisticFixpoint();
4279 
4280  auto &Ctx = getAnchorValue().getContext();
4281  if (ExecutionDomainAA.isExecutedByInitialThreadOnly(CB))
4282  SimplifiedValue = ConstantInt::get(Type::getInt8Ty(Ctx), true);
4283  else
4284  return indicatePessimisticFixpoint();
4285 
4286  return SimplifiedValue == SimplifiedValueBefore ? ChangeStatus::UNCHANGED
4287  : ChangeStatus::CHANGED;
4288  }
4289 
4290  /// Fold __kmpc_parallel_level into a constant if possible.
4291  ChangeStatus foldParallelLevel(Attributor &A) {
4292  Optional<Value *> SimplifiedValueBefore = SimplifiedValue;
4293 
4294  auto &CallerKernelInfoAA = A.getAAFor<AAKernelInfo>(
4295  *this, IRPosition::function(*getAnchorScope()), DepClassTy::REQUIRED);
4296 
4297  if (!CallerKernelInfoAA.ParallelLevels.isValidState())
4298  return indicatePessimisticFixpoint();
4299 
4300  if (!CallerKernelInfoAA.ReachingKernelEntries.isValidState())
4301  return indicatePessimisticFixpoint();
4302 
4303  if (CallerKernelInfoAA.ReachingKernelEntries.empty()) {
4304  assert(!SimplifiedValue.hasValue() &&
4305  "SimplifiedValue should keep none at this point");
4306  return ChangeStatus::UNCHANGED;
4307  }
4308 
4309  unsigned AssumedSPMDCount = 0, KnownSPMDCount = 0;
4310  unsigned AssumedNonSPMDCount = 0, KnownNonSPMDCount = 0;
4311  for (Kernel K : CallerKernelInfoAA.ReachingKernelEntries) {
4312  auto &AA = A.getAAFor<AAKernelInfo>(*this, IRPosition::function(*K),
4313  DepClassTy::REQUIRED);
4314  if (!AA.SPMDCompatibilityTracker.isValidState())
4315  return indicatePessimisticFixpoint();
4316 
4317  if (AA.SPMDCompatibilityTracker.isAssumed()) {
4318  if (AA.SPMDCompatibilityTracker.isAtFixpoint())
4319  ++KnownSPMDCount;
4320  else
4321  ++AssumedSPMDCount;
4322  } else {
4323  if (AA.SPMDCompatibilityTracker.isAtFixpoint())
4324  ++KnownNonSPMDCount;
4325  else
4326  ++AssumedNonSPMDCount;
4327  }
4328  }
4329 
4330  if ((AssumedSPMDCount + KnownSPMDCount) &&
4331  (AssumedNonSPMDCount + KnownNonSPMDCount))
4332  return indicatePessimisticFixpoint();
4333 
4334  auto &Ctx = getAnchorValue().getContext();
4335  // If the caller can only be reached by SPMD kernel entries, the parallel
4336  // level is 1. Similarly, if the caller can only be reached by non-SPMD
4337  // kernel entries, it is 0.
4338  if (AssumedSPMDCount || KnownSPMDCount) {
4339  assert(KnownNonSPMDCount == 0 && AssumedNonSPMDCount == 0 &&
4340  "Expected only SPMD kernels!");
4341  SimplifiedValue = ConstantInt::get(Type::getInt8Ty(Ctx), 1);
4342  } else {
4343  assert(KnownSPMDCount == 0 && AssumedSPMDCount == 0 &&
4344  "Expected only non-SPMD kernels!");
4345  SimplifiedValue = ConstantInt::get(Type::getInt8Ty(Ctx), 0);
4346  }
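 // Rationale (as assumed here): an SPMD kernel starts with all threads inside
 // the implicit outer parallel region, so __kmpc_parallel_level folds to 1,
 // while the call sites reachable only from generic-mode kernels execute on
 // the main thread outside any parallel region, folding the level to 0.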
4347  return SimplifiedValue == SimplifiedValueBefore ? ChangeStatus::UNCHANGED
4348  : ChangeStatus::CHANGED;
4349  }
4350 
4351  ChangeStatus foldKernelFnAttribute(Attributor &A, llvm::StringRef Attr) {
4352  // Specialize only if all the calls agree with the attribute constant value
4353  int32_t CurrentAttrValue = -1;
4354  Optional<Value *> SimplifiedValueBefore = SimplifiedValue;
4355 
4356  auto &CallerKernelInfoAA = A.getAAFor<AAKernelInfo>(
4357  *this, IRPosition::function(*getAnchorScope()), DepClassTy::REQUIRED);
4358 
4359  if (!CallerKernelInfoAA.ReachingKernelEntries.isValidState())
4360  return indicatePessimisticFixpoint();
4361 
4362  // Iterate over the kernels that reach this function
4363  for (Kernel K : CallerKernelInfoAA.ReachingKernelEntries) {
4364  int32_t NextAttrVal = -1;
4365  if (K->hasFnAttribute(Attr))
4366  NextAttrVal =
4367  std::stoi(K->getFnAttribute(Attr).getValueAsString().str());
4368 
4369  if (NextAttrVal == -1 ||
4370  (CurrentAttrValue != -1 && CurrentAttrValue != NextAttrVal))
4371  return indicatePessimisticFixpoint();
4372  CurrentAttrValue = NextAttrVal;
4373  }
4374 
4375  if (CurrentAttrValue != -1) {
4376  auto &Ctx = getAnchorValue().getContext();
4377  SimplifiedValue =
4378  ConstantInt::get(Type::getInt32Ty(Ctx), CurrentAttrValue);
4379  }
4380  return SimplifiedValue == SimplifiedValueBefore ? ChangeStatus::UNCHANGED
4381  : ChangeStatus::CHANGED;
4382  }
4383 
4384  /// An optional value the associated value is assumed to fold to. That is, we
4385  /// assume the associated value (which is a call) can be replaced by this
4386  /// simplified value.
4387  Optional<Value *> SimplifiedValue;
4388 
4389  /// The runtime function kind of the callee of the associated call site.
4390  RuntimeFunction RFKind;
4391 };
4392 
4393 } // namespace
4394 
4395 /// Register folding callsite
4396 void OpenMPOpt::registerFoldRuntimeCall(RuntimeFunction RF) {
4397  auto &RFI = OMPInfoCache.RFIs[RF];
4398  RFI.foreachUse(SCC, [&](Use &U, Function &F) {
4399  CallInst *CI = OpenMPOpt::getCallIfRegularCall(U, &RFI);
4400  if (!CI)
4401  return false;
4402  A.getOrCreateAAFor<AAFoldRuntimeCall>(
4403  IRPosition::callsite_returned(*CI), /* QueryingAA */ nullptr,
4404  DepClassTy::NONE, /* ForceUpdate */ false,
4405  /* UpdateAfterInit */ false);
4406  return false;
4407  });
4408 }
4409 
4410 void OpenMPOpt::registerAAs(bool IsModulePass) {
4411  if (SCC.empty())
4412 
4413  return;
4414  if (IsModulePass) {
4415  // Ensure we create the AAKernelInfo AAs first and without triggering an
4416  // update. This will make sure we register all value simplification
4417  // callbacks before any other AA has the chance to create an AAValueSimplify
4418  // or similar.
4419  for (Function *Kernel : OMPInfoCache.Kernels)
4420  A.getOrCreateAAFor<AAKernelInfo>(
4421  IRPosition::function(*Kernel), /* QueryingAA */ nullptr,
4422  DepClassTy::NONE, /* ForceUpdate */ false,
4423  /* UpdateAfterInit */ false);
4424 
4425  registerFoldRuntimeCall(OMPRTL___kmpc_is_generic_main_thread_id);
4426  registerFoldRuntimeCall(OMPRTL___kmpc_is_spmd_exec_mode);
4427  registerFoldRuntimeCall(OMPRTL___kmpc_parallel_level);
4428  registerFoldRuntimeCall(OMPRTL___kmpc_get_hardware_num_threads_in_block);
4429  registerFoldRuntimeCall(OMPRTL___kmpc_get_hardware_num_blocks);
4430  }
4431 
4432  // Create CallSite AA for all Getters.
4433  for (int Idx = 0; Idx < OMPInfoCache.ICVs.size() - 1; ++Idx) {
4434  auto ICVInfo = OMPInfoCache.ICVs[static_cast<InternalControlVar>(Idx)];
4435 
4436  auto &GetterRFI = OMPInfoCache.RFIs[ICVInfo.Getter];
4437 
4438  auto CreateAA = [&](Use &U, Function &Caller) {
4439  CallInst *CI = OpenMPOpt::getCallIfRegularCall(U, &GetterRFI);
4440  if (!CI)
4441  return false;
4442 
4443  auto &CB = cast<CallBase>(*CI);
4444 
4445  IRPosition CBPos = IRPosition::callsite_function(CB);
4446  A.getOrCreateAAFor<AAICVTracker>(CBPos);
4447  return false;
4448  };
4449 
4450  GetterRFI.foreachUse(SCC, CreateAA);
4451  }
4452  auto &GlobalizationRFI = OMPInfoCache.RFIs[OMPRTL___kmpc_alloc_shared];
4453  auto CreateAA = [&](Use &U, Function &F) {
4454  A.getOrCreateAAFor<AAHeapToShared>(IRPosition::function(F));
4455  return false;
4456  };
4458  GlobalizationRFI.foreachUse(SCC, CreateAA);
4459 
4460  // Create an ExecutionDomain AA for every function and a HeapToStack AA for
4461  // every function if there is a device kernel.
4462  if (!isOpenMPDevice(M))
4463  return;
4464 
4465  for (auto *F : SCC) {
4466  if (F->isDeclaration())
4467  continue;
4468 
4469  A.getOrCreateAAFor<AAExecutionDomain>(IRPosition::function(*F));
4470  if (!DisableOpenMPOptDeglobalization)
4471  A.getOrCreateAAFor<AAHeapToStack>(IRPosition::function(*F));
4472 
4473  for (auto &I : instructions(*F)) {
4474  if (auto *LI = dyn_cast<LoadInst>(&I)) {
4475  bool UsedAssumedInformation = false;
4476  A.getAssumedSimplified(IRPosition::value(*LI), /* AA */ nullptr,
4477  UsedAssumedInformation);
4478  }
4479  }
4480  }
4481 }
4482 
4483 const char AAICVTracker::ID = 0;
4484 const char AAKernelInfo::ID = 0;
4485 const char AAExecutionDomain::ID = 0;
4486 const char AAHeapToShared::ID = 0;
4487 const char AAFoldRuntimeCall::ID = 0;
4488 
4489 AAICVTracker &AAICVTracker::createForPosition(const IRPosition &IRP,
4490  Attributor &A) {
4491  AAICVTracker *AA = nullptr;
4492  switch (IRP.getPositionKind()) {
4493  case IRPosition::IRP_INVALID:
4494  case IRPosition::IRP_FLOAT:
4495  case IRPosition::IRP_ARGUMENT:
4496  case IRPosition::IRP_CALL_SITE_ARGUMENT:
4497  llvm_unreachable("ICVTracker can only be created for function position!");
4498  case IRPosition::IRP_RETURNED:
4499  AA = new (A.Allocator) AAICVTrackerFunctionReturned(IRP, A);
4500  break;
4501  case IRPosition::IRP_CALL_SITE_RETURNED:
4502  AA = new (A.Allocator) AAICVTrackerCallSiteReturned(IRP, A);
4503  break;
4504  case IRPosition::IRP_CALL_SITE:
4505  AA = new (A.Allocator) AAICVTrackerCallSite(IRP, A);
4506  break;
4507  case IRPosition::IRP_FUNCTION:
4508  AA = new (A.Allocator) AAICVTrackerFunction(IRP, A);
4509  break;
4510  }
4511 
4512  return *AA;
4513 }
4514 
4515 AAExecutionDomain &AAExecutionDomain::createForPosition(const IRPosition &IRP,
4516  Attributor &A) {
4517  AAExecutionDomainFunction *AA = nullptr;
4518  switch (IRP.getPositionKind()) {
4519  case IRPosition::IRP_INVALID:
4520  case IRPosition::IRP_FLOAT:
4521  case IRPosition::IRP_ARGUMENT:
4522  case IRPosition::IRP_CALL_SITE_ARGUMENT:
4523  case IRPosition::IRP_RETURNED:
4524  case IRPosition::IRP_CALL_SITE_RETURNED:
4525  case IRPosition::IRP_CALL_SITE:
4526  llvm_unreachable(
4527  "AAExecutionDomain can only be created for function position!");
4528  case IRPosition::IRP_FUNCTION:
4529  AA = new (A.Allocator) AAExecutionDomainFunction(IRP, A);
4530  break;
4531  }
4532 
4533  return *AA;
4534 }
4535 
4536 AAHeapToShared &AAHeapToShared::createForPosition(const IRPosition &IRP,
4537  Attributor &A) {
4538  AAHeapToSharedFunction *AA = nullptr;
4539  switch (IRP.getPositionKind()) {
4540  case IRPosition::IRP_INVALID:
4541  case IRPosition::IRP_FLOAT:
4542  case IRPosition::IRP_ARGUMENT:
4543  case IRPosition::IRP_CALL_SITE_ARGUMENT:
4544  case IRPosition::IRP_RETURNED:
4545  case IRPosition::IRP_CALL_SITE_RETURNED:
4546  case IRPosition::IRP_CALL_SITE:
4547  llvm_unreachable(
4548  "AAHeapToShared can only be created for function position!");
4549  case IRPosition::IRP_FUNCTION:
4550  AA = new (A.Allocator) AAHeapToSharedFunction(IRP, A);
4551  break;
4552  }
4553 
4554  return *AA;
4555 }
4556 
4557 AAKernelInfo &AAKernelInfo::createForPosition(const IRPosition &IRP,
4558  Attributor &A) {
4559  AAKernelInfo *AA = nullptr;
4560  switch (IRP.getPositionKind()) {
4561  case IRPosition::IRP_INVALID:
4562  case IRPosition::IRP_FLOAT:
4563  case IRPosition::IRP_ARGUMENT:
4564  case IRPosition::IRP_RETURNED:
4565  case IRPosition::IRP_CALL_SITE_RETURNED:
4566  case IRPosition::IRP_CALL_SITE_ARGUMENT:
4567  llvm_unreachable("KernelInfo can only be created for function position!");
4568  case IRPosition::IRP_CALL_SITE:
4569  AA = new (A.Allocator) AAKernelInfoCallSite(IRP, A);
4570  break;
4571  case IRPosition::IRP_FUNCTION:
4572  AA = new (A.Allocator) AAKernelInfoFunction(IRP, A);
4573  break;
4574  }
4575 
4576  return *AA;
4577 }
4578 
4579 AAFoldRuntimeCall &AAFoldRuntimeCall::createForPosition(const IRPosition &IRP,
4580  Attributor &A) {
4581  AAFoldRuntimeCall *AA = nullptr;
4582  switch (IRP.getPositionKind()) {
4583  case IRPosition::IRP_INVALID:
4584  case IRPosition::IRP_FLOAT:
4585  case IRPosition::IRP_ARGUMENT:
4586  case IRPosition::IRP_RETURNED:
4587  case IRPosition::IRP_FUNCTION:
4588  case IRPosition::IRP_CALL_SITE:
4589  case IRPosition::IRP_CALL_SITE_ARGUMENT:
4590  llvm_unreachable("KernelInfo can only be created for call site position!");
4591  case IRPosition::IRP_CALL_SITE_RETURNED:
4592  AA = new (A.Allocator) AAFoldRuntimeCallCallSiteReturned(IRP, A);
4593  break;
4594  }
4595 
4596  return *AA;
4597 }
4598 
4599 PreservedAnalyses OpenMPOptPass::run(Module &M, ModuleAnalysisManager &AM) {
4600  if (!containsOpenMP(M))
4601  return PreservedAnalyses::all();
4602  if (DisableOpenMPOptimizations)
4603  return PreservedAnalyses::all();
4604 
4605  FunctionAnalysisManager &FAM =
4606  AM.getResult<FunctionAnalysisManagerModuleProxy>(M).getManager();
4607  KernelSet Kernels = getDeviceKernels(M);
4608 
4609  auto IsCalled = [&](Function &F) {
4610  if (Kernels.contains(&F))
4611  return true;
4612  for (const User *U : F.users())
4613  if (!isa<BlockAddress>(U))
4614  return true;
4615  return false;
4616  };
4617 
4618  auto EmitRemark = [&](Function &F) {
4619  auto &ORE = FAM.getResult<OptimizationRemarkEmitterAnalysis>(F);
4620  ORE.emit([&]() {
4621  OptimizationRemarkAnalysis ORA(DEBUG_TYPE, "OMP140", &F);
4622  return ORA << "Could not internalize function. "
4623  << "Some optimizations may not be possible. [OMP140]";
4624  });
4625  };
4626 
4627  // Create internal copies of each function if this is a kernel Module. This
4628  // allows interprocedural passes to see every call edge.
4629  DenseMap<Function *, Function *> InternalizedMap;
4630  if (isOpenMPDevice(M)) {
4631  SmallPtrSet<Function *, 16> InternalizeFns;
4632  for (Function &F : M)
4633  if (!F.isDeclaration() && !Kernels.contains(&F) && IsCalled(F) &&
4634  !DisableInternalization) {
4635  if (Attributor::isInternalizable(F)) {
4636  InternalizeFns.insert(&F);
4637  } else if (!F.hasLocalLinkage() && !F.hasFnAttribute(Attribute::Cold)) {
4638  EmitRemark(F);
4639  }
4640  }
4641 
4642  Attributor::internalizeFunctions(InternalizeFns, InternalizedMap);
4643  }
4644 
4645  // Look at every function in the Module unless it was internalized.
4646  SmallVector<Function *, 16> SCC;
4647  for (Function &F : M)
4648  if (!F.isDeclaration() && !InternalizedMap.lookup(&F))
4649  SCC.push_back(&F);
4650 
4651  if (SCC.empty())
4652  return PreservedAnalyses::all();
4653 
4654  AnalysisGetter AG(FAM);
4655 
4656  auto OREGetter = [&FAM](Function *F) -> OptimizationRemarkEmitter & {
4657  return FAM.getResult<OptimizationRemarkEmitterAnalysis>(*F);
4658  };
4659 
4660  BumpPtrAllocator Allocator;
4661  CallGraphUpdater CGUpdater;
4662 
4663  SetVector<Function *> Functions(SCC.begin(), SCC.end());
4664  OMPInformationCache InfoCache(M, AG, Allocator, /*CGSCC*/ Functions, Kernels);
4665 
4666  unsigned MaxFixpointIterations =
4667  (isOpenMPDevice(M)) ? SetFixpointIterations : 32;
4668  Attributor A(Functions, InfoCache, CGUpdater, nullptr, true, false,
4669  MaxFixpointIterations, OREGetter, DEBUG_TYPE);
4670 
4671  OpenMPOpt OMPOpt(SCC, CGUpdater, OREGetter, InfoCache, A);
4672  bool Changed = OMPOpt.run(true);
4673 
4674  // Optionally inline device functions for potentially better performance.
4675  if (AlwaysInlineDeviceFunctions && isOpenMPDevice(M))
4676  for (Function &F : M)
4677  if (!F.isDeclaration() && !Kernels.contains(&F) &&
4678  !F.hasFnAttribute(Attribute::NoInline))
4679  F.addFnAttr(Attribute::AlwaysInline);
4680 
4681  if (PrintModuleAfterOptimizations)
4682  LLVM_DEBUG(dbgs() << TAG << "Module after OpenMPOpt Module Pass:\n" << M);
4683 
4684  if (Changed)
4685  return PreservedAnalyses::none();
4686 
4687  return PreservedAnalyses::all();
4688 }
4689 
4690 PreservedAnalyses OpenMPOptCGSCCPass::run(LazyCallGraph::SCC &C,
4691  CGSCCAnalysisManager &AM,
4692  LazyCallGraph &CG,
4693  CGSCCUpdateResult &UR) {
4694  if (!containsOpenMP(*C.begin()->getFunction().getParent()))
4695  return PreservedAnalyses::all();
4696  if (DisableOpenMPOptimizations)
4697  return PreservedAnalyses::all();
4698 
4699  SmallVector<Function *, 16> SCC;
4700  // If there are kernels in the module, we have to run on all SCC's.
4701  for (LazyCallGraph::Node &N : C) {
4702  Function *Fn = &N.getFunction();
4703  SCC.push_back(Fn);
4704  }
4705 
4706  if (SCC.empty())
4707  return PreservedAnalyses::all();
4708 
4709  Module &M = *C.begin()->getFunction().getParent();
4710 
4711  KernelSet Kernels = getDeviceKernels(M);
4712 
4713  FunctionAnalysisManager &FAM =
4714  AM.getResult<FunctionAnalysisManagerCGSCCProxy>(C, CG).getManager();
4715 
4716  AnalysisGetter AG(FAM);
4717 
4718  auto OREGetter = [&FAM](Function *F) -> OptimizationRemarkEmitter & {
4719  return FAM.getResult<OptimizationRemarkEmitterAnalysis>(*F);
4720  };
4721 
4722  BumpPtrAllocator Allocator;
4723  CallGraphUpdater CGUpdater;
4724  CGUpdater.initialize(CG, C, AM, UR);
4725 
4726  SetVector<Function *> Functions(SCC.begin(), SCC.end());
4727  OMPInformationCache InfoCache(*(Functions.back()->getParent()), AG, Allocator,
4728  /*CGSCC*/ Functions, Kernels);
4729 
4730  unsigned MaxFixpointIterations =
4731  (isOpenMPDevice(M)) ? SetFixpointIterations : 32;
4732  Attributor A(Functions, InfoCache, CGUpdater, nullptr, false, true,
4733  MaxFixpointIterations, OREGetter, DEBUG_TYPE);
4734 
4735  OpenMPOpt OMPOpt(SCC, CGUpdater, OREGetter, InfoCache, A);
4736  bool Changed = OMPOpt.run(false);
4737 
4738  if (PrintModuleAfterOptimizations)
4739  LLVM_DEBUG(dbgs() << TAG << "Module after OpenMPOpt CGSCC Pass:\n" << M);
4740 
4741  if (Changed)
4742  return PreservedAnalyses::none();
4743 
4744  return PreservedAnalyses::all();
4745 }
4746 
4747 namespace {
4748 
4749 struct OpenMPOptCGSCCLegacyPass : public CallGraphSCCPass {
4750  CallGraphUpdater CGUpdater;
4751  static char ID;
4752 
4753  OpenMPOptCGSCCLegacyPass() : CallGraphSCCPass(ID) {
4754  initializeOpenMPOptCGSCCLegacyPassPass(*PassRegistry::getPassRegistry());
4755  }
4756 
4757  void getAnalysisUsage(AnalysisUsage &AU) const override {
4758  CallGraphSCCPass::getAnalysisUsage(AU);
4759  }
4760 
4761  bool runOnSCC(CallGraphSCC &CGSCC) override {
4762  if (!containsOpenMP(CGSCC.getCallGraph().getModule()))
4763  return false;
4764  if (DisableOpenMPOptimizations || skipSCC(CGSCC))
4765  return false;
4766 
4767  SmallVector<Function *, 16> SCC;
4768  // If there are kernels in the module, we have to run on all SCC's.
4769  for (CallGraphNode *CGN : CGSCC) {
4770  Function *Fn = CGN->getFunction();
4771  if (!Fn || Fn->isDeclaration())
4772  continue;
4773  SCC.push_back(Fn);
4774  }
4775 
4776  if (SCC.empty())
4777  return false;
4778 
4779  Module &M = CGSCC.getCallGraph().getModule();
4780  KernelSet Kernels = getDeviceKernels(M);
4781 
4782  CallGraph &CG = getAnalysis<CallGraphWrapperPass>().getCallGraph();
4783  CGUpdater.initialize(CG, CGSCC);
4784 
4785  // Maintain a map of functions to avoid rebuilding the ORE
4786  DenseMap<Function *, std::unique_ptr<OptimizationRemarkEmitter>> OREMap;
4787  auto OREGetter = [&OREMap](Function *F) -> OptimizationRemarkEmitter & {
4788  std::unique_ptr<OptimizationRemarkEmitter> &ORE = OREMap[F];
4789  if (!ORE)
4790  ORE = std::make_unique<OptimizationRemarkEmitter>(F);
4791  return *ORE;
4792  };
4793 
4794  AnalysisGetter AG;
4795  SetVector<Function *> Functions(SCC.begin(), SCC.end());
4796  BumpPtrAllocator Allocator;
4797  OMPInformationCache InfoCache(*(Functions.back()->getParent()), AG,
4798  Allocator,
4799  /*CGSCC*/ Functions, Kernels);
4800 
4801  unsigned MaxFixpointIterations =
4802  (isOpenMPDevice(M)) ? SetFixpointIterations : 32;
4803  Attributor A(Functions, InfoCache, CGUpdater, nullptr, false, true,
4804  MaxFixpointIterations, OREGetter, DEBUG_TYPE);
4805 
4806  OpenMPOpt OMPOpt(SCC, CGUpdater, OREGetter, InfoCache, A);
4807  bool Result = OMPOpt.run(false);
4808 
4809  if (PrintModuleAfterOptimizations)
4810  LLVM_DEBUG(dbgs() << TAG << "Module after OpenMPOpt CGSCC Pass:\n" << M);
4811 
4812  return Result;
4813  }
4814 
4815  bool doFinalization(CallGraph &CG) override { return CGUpdater.finalize(); }
4816 };
4817 
4818 } // end anonymous namespace
4819 
4820 KernelSet llvm::omp::getDeviceKernels(Module &M) {
4821  // TODO: Create a more cross-platform way of determining device kernels.
4822  NamedMDNode *MD = M.getOrInsertNamedMetadata("nvvm.annotations");
4823  KernelSet Kernels;
4824 
4825  if (!MD)
4826  return Kernels;
4827 
4828  for (auto *Op : MD->operands()) {
4829  if (Op->getNumOperands() < 2)
4830  continue;
4831  MDString *KindID = dyn_cast<MDString>(Op->getOperand(1));
4832  if (!KindID || KindID->getString() != "kernel")
4833  continue;
4834 
4835  Function *KernelFn =
4836  mdconst::dyn_extract_or_null<Function>(Op->getOperand(0));
4837  if (!KernelFn)
4838  continue;
4839 
4840  ++NumOpenMPTargetRegionKernels;
4841 
4842  Kernels.insert(KernelFn);
4843  }
4844 
4845  return Kernels;
4846 }
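// For reference, the nvvm.annotations entries walked above typically look
// like this in a device module (the kernel name is illustrative):
//   !nvvm.annotations = !{!0}
//   !0 = !{void ()* @__omp_offloading_fd00_1234_foo_l10, !"kernel", i32 1}
// Only operand 0 (the function) and operand 1 (the "kernel" string) are
// inspected by the loop; other annotation entries are skipped.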
4847 
4848 bool llvm::omp::containsOpenMP(Module &M) {
4849  Metadata *MD = M.getModuleFlag("openmp");
4850  if (!MD)
4851  return false;
4852 
4853  return true;
4854 }
4855 
4856 bool llvm::omp::isOpenMPDevice(Module &M) {
4857  Metadata *MD = M.getModuleFlag("openmp-device");
4858  if (!MD)
4859  return false;
4860 
4861  return true;
4862 }
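// Both helpers above key off module flags emitted by the frontend; an OpenMP
// device module typically carries something like (the version value 50 is
// illustrative):
//   !llvm.module.flags = !{!0, !1}
//   !0 = !{i32 7, !"openmp", i32 50}
//   !1 = !{i32 7, !"openmp-device", i32 50}
// A host module carries only the "openmp" flag, so containsOpenMP(M) is true
// for it while isOpenMPDevice(M) remains false.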
4863 
4864 char OpenMPOptCGSCCLegacyPass::ID = 0;
4865 
4866 INITIALIZE_PASS_BEGIN(OpenMPOptCGSCCLegacyPass, "openmp-opt-cgscc",
4867  "OpenMP specific optimizations", false, false)
4868 INITIALIZE_PASS_DEPENDENCY(CallGraphWrapperPass)
4869 INITIALIZE_PASS_END(OpenMPOptCGSCCLegacyPass, "openmp-opt-cgscc",
4870  "OpenMP specific optimizations", false, false)
4871 
4872 Pass *llvm::createOpenMPOptCGSCCLegacyPass() {
4873  return new OpenMPOptCGSCCLegacyPass();
4874 }