clang 7.0.0
CGOpenMPRuntimeNVPTX.cpp
1 //===---- CGOpenMPRuntimeNVPTX.cpp - Interface to OpenMP NVPTX Runtimes ---===//
2 //
3 // The LLVM Compiler Infrastructure
4 //
5 // This file is distributed under the University of Illinois Open Source
6 // License. See LICENSE.TXT for details.
7 //
8 //===----------------------------------------------------------------------===//
9 //
10 // This provides a class for OpenMP runtime code generation specialized to NVPTX
11 // targets.
12 //
13 //===----------------------------------------------------------------------===//
14 
15 #include "CGOpenMPRuntimeNVPTX.h"
16 #include "CodeGenFunction.h"
17 #include "clang/AST/DeclOpenMP.h"
18 #include "clang/AST/StmtOpenMP.h"
19 #include "clang/AST/StmtVisitor.h"
20 #include "llvm/ADT/SmallPtrSet.h"
21 
22 using namespace clang;
23 using namespace CodeGen;
24 
25 namespace {
26 enum OpenMPRTLFunctionNVPTX {
27  /// Call to void __kmpc_kernel_init(kmp_int32 thread_limit,
28  /// int16_t RequiresOMPRuntime);
29  OMPRTL_NVPTX__kmpc_kernel_init,
30  /// Call to void __kmpc_kernel_deinit(int16_t IsOMPRuntimeInitialized);
31  OMPRTL_NVPTX__kmpc_kernel_deinit,
32  /// Call to void __kmpc_spmd_kernel_init(kmp_int32 thread_limit,
33  /// int16_t RequiresOMPRuntime, int16_t RequiresDataSharing);
34  OMPRTL_NVPTX__kmpc_spmd_kernel_init,
35  /// Call to void __kmpc_spmd_kernel_deinit();
36  OMPRTL_NVPTX__kmpc_spmd_kernel_deinit,
37  /// Call to void __kmpc_kernel_prepare_parallel(void
38  /// *outlined_function, int16_t
39  /// IsOMPRuntimeInitialized);
40  OMPRTL_NVPTX__kmpc_kernel_prepare_parallel,
41  /// Call to bool __kmpc_kernel_parallel(void **outlined_function,
42  /// int16_t IsOMPRuntimeInitialized);
43  OMPRTL_NVPTX__kmpc_kernel_parallel,
44  /// Call to void __kmpc_kernel_end_parallel();
45  OMPRTL_NVPTX__kmpc_kernel_end_parallel,
46  /// Call to void __kmpc_serialized_parallel(ident_t *loc, kmp_int32
47  /// global_tid);
48  OMPRTL_NVPTX__kmpc_serialized_parallel,
49  /// Call to void __kmpc_end_serialized_parallel(ident_t *loc, kmp_int32
50  /// global_tid);
51  OMPRTL_NVPTX__kmpc_end_serialized_parallel,
52  /// Call to int32_t __kmpc_shuffle_int32(int32_t element,
53  /// int16_t lane_offset, int16_t warp_size);
54  OMPRTL_NVPTX__kmpc_shuffle_int32,
55  /// Call to int64_t __kmpc_shuffle_int64(int64_t element,
56  /// int16_t lane_offset, int16_t warp_size);
57  OMPRTL_NVPTX__kmpc_shuffle_int64,
58  /// Call to __kmpc_nvptx_parallel_reduce_nowait(kmp_int32
59  /// global_tid, kmp_int32 num_vars, size_t reduce_size, void* reduce_data,
60  /// void (*kmp_ShuffleReductFctPtr)(void *rhsData, int16_t lane_id, int16_t
61  /// lane_offset, int16_t shortCircuit),
62  /// void (*kmp_InterWarpCopyFctPtr)(void* src, int32_t warp_num));
63  OMPRTL_NVPTX__kmpc_parallel_reduce_nowait,
64  /// Call to __kmpc_nvptx_simd_reduce_nowait(kmp_int32
65  /// global_tid, kmp_int32 num_vars, size_t reduce_size, void* reduce_data,
66  /// void (*kmp_ShuffleReductFctPtr)(void *rhsData, int16_t lane_id, int16_t
67  /// lane_offset, int16_t shortCircuit),
68  /// void (*kmp_InterWarpCopyFctPtr)(void* src, int32_t warp_num));
69  OMPRTL_NVPTX__kmpc_simd_reduce_nowait,
70  /// Call to __kmpc_nvptx_teams_reduce_nowait(int32_t global_tid,
71  /// int32_t num_vars, size_t reduce_size, void *reduce_data,
72  /// void (*kmp_ShuffleReductFctPtr)(void *rhs, int16_t lane_id, int16_t
73  /// lane_offset, int16_t shortCircuit),
74  /// void (*kmp_InterWarpCopyFctPtr)(void* src, int32_t warp_num),
75  /// void (*kmp_CopyToScratchpadFctPtr)(void *reduce_data, void * scratchpad,
76  /// int32_t index, int32_t width),
77  /// void (*kmp_LoadReduceFctPtr)(void *reduce_data, void * scratchpad, int32_t
78  /// index, int32_t width, int32_t reduce))
79  OMPRTL_NVPTX__kmpc_teams_reduce_nowait,
80  /// Call to __kmpc_nvptx_end_reduce_nowait(int32_t global_tid);
81  OMPRTL_NVPTX__kmpc_end_reduce_nowait,
82  /// Call to void __kmpc_data_sharing_init_stack();
83  OMPRTL_NVPTX__kmpc_data_sharing_init_stack,
84  /// Call to void __kmpc_data_sharing_init_stack_spmd();
85  OMPRTL_NVPTX__kmpc_data_sharing_init_stack_spmd,
86  /// Call to void* __kmpc_data_sharing_push_stack(size_t size,
87  /// int16_t UseSharedMemory);
88  OMPRTL_NVPTX__kmpc_data_sharing_push_stack,
89  /// Call to void __kmpc_data_sharing_pop_stack(void *a);
90  OMPRTL_NVPTX__kmpc_data_sharing_pop_stack,
91  /// Call to void __kmpc_begin_sharing_variables(void ***args,
92  /// size_t n_args);
93  OMPRTL_NVPTX__kmpc_begin_sharing_variables,
94  /// Call to void __kmpc_end_sharing_variables();
95  OMPRTL_NVPTX__kmpc_end_sharing_variables,
96  /// Call to void __kmpc_get_shared_variables(void ***GlobalArgs)
97  OMPRTL_NVPTX__kmpc_get_shared_variables,
98  /// Call to uint16_t __kmpc_parallel_level(ident_t *loc, kmp_int32
99  /// global_tid);
100  OMPRTL_NVPTX__kmpc_parallel_level,
101  /// Call to int8_t __kmpc_is_spmd_exec_mode();
102  OMPRTL_NVPTX__kmpc_is_spmd_exec_mode,
103 };
104 
105 /// Pre(post)-action for different OpenMP constructs specialized for NVPTX.
106 class NVPTXActionTy final : public PrePostActionTy {
107  llvm::Value *EnterCallee = nullptr;
108  ArrayRef<llvm::Value *> EnterArgs;
109  llvm::Value *ExitCallee = nullptr;
110  ArrayRef<llvm::Value *> ExitArgs;
111  bool Conditional = false;
112  llvm::BasicBlock *ContBlock = nullptr;
113 
114 public:
115  NVPTXActionTy(llvm::Value *EnterCallee, ArrayRef<llvm::Value *> EnterArgs,
116  llvm::Value *ExitCallee, ArrayRef<llvm::Value *> ExitArgs,
117  bool Conditional = false)
118  : EnterCallee(EnterCallee), EnterArgs(EnterArgs), ExitCallee(ExitCallee),
119  ExitArgs(ExitArgs), Conditional(Conditional) {}
120  void Enter(CodeGenFunction &CGF) override {
121  llvm::Value *EnterRes = CGF.EmitRuntimeCall(EnterCallee, EnterArgs);
122  if (Conditional) {
123  llvm::Value *CallBool = CGF.Builder.CreateIsNotNull(EnterRes);
124  auto *ThenBlock = CGF.createBasicBlock("omp_if.then");
125  ContBlock = CGF.createBasicBlock("omp_if.end");
126  // Generate the branch (If-stmt)
127  CGF.Builder.CreateCondBr(CallBool, ThenBlock, ContBlock);
128  CGF.EmitBlock(ThenBlock);
129  }
130  }
131  void Done(CodeGenFunction &CGF) {
132  // Emit the rest of blocks/branches
133  CGF.EmitBranch(ContBlock);
134  CGF.EmitBlock(ContBlock, true);
135  }
136  void Exit(CodeGenFunction &CGF) override {
137  CGF.EmitRuntimeCall(ExitCallee, ExitArgs);
138  }
139 };
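// Illustrative usage of NVPTXActionTy (a sketch; the callees and the
// {RTLoc, ThreadID} arguments are assumptions chosen to mirror how actions
// bracket serialized parallel regions later in this file):
//
//   llvm::Value *Args[] = {RTLoc, ThreadID};
//   NVPTXActionTy Action(
//       createNVPTXRuntimeFunction(OMPRTL_NVPTX__kmpc_serialized_parallel),
//       Args,
//       createNVPTXRuntimeFunction(OMPRTL_NVPTX__kmpc_end_serialized_parallel),
//       Args);
//   CodeGen.setAction(Action); // Enter() fires before the body, Exit() after.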
140 
141 /// A class to track the execution mode when codegening directives within
142 /// a target region. The appropriate mode (SPMD|NON-SPMD) is set on entry
143 /// to the target region and used by containing directives such as 'parallel'
144 /// to emit optimized code.
145 class ExecutionModeRAII {
146 private:
147  CGOpenMPRuntimeNVPTX::ExecutionMode SavedMode;
148  CGOpenMPRuntimeNVPTX::ExecutionMode &Mode;
149 
150 public:
151  ExecutionModeRAII(CGOpenMPRuntimeNVPTX::ExecutionMode &Mode, bool IsSPMD)
152  : Mode(Mode) {
153  SavedMode = Mode;
154  Mode = IsSPMD ? CGOpenMPRuntimeNVPTX::EM_SPMD
155  : CGOpenMPRuntimeNVPTX::EM_NonSPMD;
156  }
157  ~ExecutionModeRAII() { Mode = SavedMode; }
158 };
159 
160 /// GPU Configuration: This information can be derived from cuda registers,
161 /// however, providing compile time constants helps generate more efficient
162 /// code. For all practical purposes this is fine because the configuration
163 /// is the same for all known NVPTX architectures.
164 enum MachineConfiguration : unsigned {
165  WarpSize = 32,
166  /// Number of bits required to represent a lane identifier, which is
167  /// computed as log_2(WarpSize).
168  LaneIDBits = 5,
169  LaneIDMask = WarpSize - 1,
170 
171  /// Global memory alignment for performance.
172  GlobalMemoryAlignment = 256,
173 };
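// A minimal worked example of how these constants decompose a thread id
// (the concrete value 77 is illustrative):
//
//   unsigned tid  = 77;
//   unsigned warp = tid >> LaneIDBits; // 77 >> 5 == 2
//   unsigned lane = tid & LaneIDMask;  // 77 & 31 == 13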
174 
175 enum NamedBarrier : unsigned {
176  /// Synchronize on this barrier #ID using a named barrier primitive.
177  /// Only the subset of active threads in a parallel region arrive at the
178  /// barrier.
179  NB_Parallel = 1,
180 };
181 
182 /// Get the list of variables that can escape their declaration context.
183 class CheckVarsEscapingDeclContext final
184  : public ConstStmtVisitor<CheckVarsEscapingDeclContext> {
185  CodeGenFunction &CGF;
186  llvm::SetVector<const ValueDecl *> EscapedDecls;
187  llvm::SetVector<const ValueDecl *> EscapedVariableLengthDecls;
188  llvm::SmallPtrSet<const Decl *, 4> EscapedParameters;
189  RecordDecl *GlobalizedRD = nullptr;
190  llvm::SmallDenseMap<const ValueDecl *, const FieldDecl *> MappedDeclsFields;
191  bool AllEscaped = false;
192  bool IsForCombinedParallelRegion = false;
193 
194  void markAsEscaped(const ValueDecl *VD) {
195  // Do not globalize declare target variables.
196  if (!isa<VarDecl>(VD) ||
197  OMPDeclareTargetDeclAttr::isDeclareTargetDeclaration(VD))
198  return;
199  VD = cast<ValueDecl>(VD->getCanonicalDecl());
200  // Variables captured by value must be globalized.
201  if (auto *CSI = CGF.CapturedStmtInfo) {
202  if (const FieldDecl *FD = CSI->lookup(cast<VarDecl>(VD))) {
203  // Check if we need to capture the variable that was already captured by
204  // value in the outer region.
205  if (!IsForCombinedParallelRegion) {
206  if (!FD->hasAttrs())
207  return;
208  const auto *Attr = FD->getAttr<OMPCaptureKindAttr>();
209  if (!Attr)
210  return;
211  if (!isOpenMPPrivate(
212  static_cast<OpenMPClauseKind>(Attr->getCaptureKind())) ||
213  Attr->getCaptureKind() == OMPC_map)
214  return;
215  }
216  if (!FD->getType()->isReferenceType()) {
217  assert(!VD->getType()->isVariablyModifiedType() &&
218  "Parameter captured by value with variably modified type");
219  EscapedParameters.insert(VD);
220  } else if (!IsForCombinedParallelRegion) {
221  return;
222  }
223  }
224  }
225  if ((!CGF.CapturedStmtInfo ||
226  (IsForCombinedParallelRegion && CGF.CapturedStmtInfo)) &&
227  VD->getType()->isReferenceType())
228  // Do not globalize variables with reference type.
229  return;
230  if (VD->getType()->isVariablyModifiedType())
231  EscapedVariableLengthDecls.insert(VD);
232  else
233  EscapedDecls.insert(VD);
234  }
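// Example of a variable this analysis flags (an illustrative sketch):
//
//   #pragma omp target
//   {
//     int x = 0;        // local to the master thread
//     #pragma omp parallel
//     { x++; }          // captured by reference -> 'x' escapes and must be
//   }                   // globalized so worker threads can access it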
235 
236  void VisitValueDecl(const ValueDecl *VD) {
237  if (VD->getType()->isLValueReferenceType())
238  markAsEscaped(VD);
239  if (const auto *VarD = dyn_cast<VarDecl>(VD)) {
240  if (!isa<ParmVarDecl>(VarD) && VarD->hasInit()) {
241  const bool SavedAllEscaped = AllEscaped;
242  AllEscaped = VD->getType()->isLValueReferenceType();
243  Visit(VarD->getInit());
244  AllEscaped = SavedAllEscaped;
245  }
246  }
247  }
248  void VisitOpenMPCapturedStmt(const CapturedStmt *S,
249  ArrayRef<OMPClause *> Clauses,
250  bool IsCombinedParallelRegion) {
251  if (!S)
252  return;
253  for (const CapturedStmt::Capture &C : S->captures()) {
254  if (C.capturesVariable() && !C.capturesVariableByCopy()) {
255  const ValueDecl *VD = C.getCapturedVar();
256  bool SavedIsForCombinedParallelRegion = IsForCombinedParallelRegion;
257  if (IsCombinedParallelRegion) {
258  // Check if the variable is privatized in the combined construct and
259  // those private copies must be shared in the inner parallel
260  // directive.
261  IsForCombinedParallelRegion = false;
262  for (const OMPClause *C : Clauses) {
263  if (!isOpenMPPrivate(C->getClauseKind()) ||
264  C->getClauseKind() == OMPC_reduction ||
265  C->getClauseKind() == OMPC_linear ||
266  C->getClauseKind() == OMPC_private)
267  continue;
268  ArrayRef<const Expr *> Vars;
269  if (const auto *PC = dyn_cast<OMPFirstprivateClause>(C))
270  Vars = PC->getVarRefs();
271  else if (const auto *PC = dyn_cast<OMPLastprivateClause>(C))
272  Vars = PC->getVarRefs();
273  else
274  llvm_unreachable("Unexpected clause.");
275  for (const auto *E : Vars) {
276  const Decl *D =
277  cast<DeclRefExpr>(E)->getDecl()->getCanonicalDecl();
278  if (D == VD->getCanonicalDecl()) {
279  IsForCombinedParallelRegion = true;
280  break;
281  }
282  }
283  if (IsForCombinedParallelRegion)
284  break;
285  }
286  }
287  markAsEscaped(VD);
288  if (isa<OMPCapturedExprDecl>(VD))
289  VisitValueDecl(VD);
290  IsForCombinedParallelRegion = SavedIsForCombinedParallelRegion;
291  }
292  }
293  }
294 
295  typedef std::pair<CharUnits /*Align*/, const ValueDecl *> VarsDataTy;
296  static bool stable_sort_comparator(const VarsDataTy P1, const VarsDataTy P2) {
297  return P1.first > P2.first;
298  }
299 
300  void buildRecordForGlobalizedVars() {
301  assert(!GlobalizedRD &&
302  "Record for globalized variables is built already.");
303  if (EscapedDecls.empty())
304  return;
305  ASTContext &C = CGF.getContext();
306  SmallVector<VarsDataTy, 4> GlobalizedVars;
307  for (const ValueDecl *D : EscapedDecls)
308  GlobalizedVars.emplace_back(C.getDeclAlign(D), D);
309  std::stable_sort(GlobalizedVars.begin(), GlobalizedVars.end(),
310  stable_sort_comparator);
311  // Build struct _globalized_locals_ty {
312  // /* globalized vars */
313  // };
314  GlobalizedRD = C.buildImplicitRecord("_globalized_locals_ty");
315  GlobalizedRD->startDefinition();
316  for (const auto &Pair : GlobalizedVars) {
317  const ValueDecl *VD = Pair.second;
318  QualType Type = VD->getType();
319  if (Type->isLValueReferenceType())
320  Type = C.getPointerType(Type.getNonReferenceType());
321  else
322  Type = Type.getNonReferenceType();
323  SourceLocation Loc = VD->getLocation();
324  auto *Field = FieldDecl::Create(
325  C, GlobalizedRD, Loc, Loc, VD->getIdentifier(), Type,
326  C.getTrivialTypeSourceInfo(Type, SourceLocation()),
327  /*BW=*/nullptr, /*Mutable=*/false,
328  /*InitStyle=*/ICIS_NoInit);
329  Field->setAccess(AS_public);
330  GlobalizedRD->addDecl(Field);
331  if (VD->hasAttrs()) {
332  for (specific_attr_iterator<AlignedAttr> I(VD->getAttrs().begin()),
333  E(VD->getAttrs().end());
334  I != E; ++I)
335  Field->addAttr(*I);
336  }
337  MappedDeclsFields.try_emplace(VD, Field);
338  }
339  GlobalizedRD->completeDefinition();
340  }
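// Example (illustrative): if 'double d' (align 8) and 'int i' (align 4)
// escape, the stable sort by decreasing alignment yields roughly:
//
//   struct _globalized_locals_ty {
//     double d; // most-aligned field first, minimizing padding
//     int i;
//   };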
341 
342 public:
343  CheckVarsEscapingDeclContext(CodeGenFunction &CGF) : CGF(CGF) {}
344  virtual ~CheckVarsEscapingDeclContext() = default;
345  void VisitDeclStmt(const DeclStmt *S) {
346  if (!S)
347  return;
348  for (const Decl *D : S->decls())
349  if (const auto *VD = dyn_cast_or_null<ValueDecl>(D))
350  VisitValueDecl(VD);
351  }
352  void VisitOMPExecutableDirective(const OMPExecutableDirective *D) {
353  if (!D)
354  return;
355  if (!D->hasAssociatedStmt())
356  return;
357  if (const auto *S =
358  dyn_cast_or_null<CapturedStmt>(D->getAssociatedStmt())) {
359  // Do not analyze directives that do not actually require capturing,
360  // like `omp for` or `omp simd` directives.
361  llvm::SmallVector<OpenMPDirectiveKind, 4> CaptureRegions;
362  getOpenMPCaptureRegions(CaptureRegions, D->getDirectiveKind());
363  if (CaptureRegions.size() == 1 && CaptureRegions.back() == OMPD_unknown) {
364  VisitStmt(S->getCapturedStmt());
365  return;
366  }
367  VisitOpenMPCapturedStmt(
368  S, D->clauses(),
369  CaptureRegions.back() == OMPD_parallel &&
370  isOpenMPDistributeDirective(D->getDirectiveKind()));
371  }
372  }
373  void VisitCapturedStmt(const CapturedStmt *S) {
374  if (!S)
375  return;
376  for (const CapturedStmt::Capture &C : S->captures()) {
377  if (C.capturesVariable() && !C.capturesVariableByCopy()) {
378  const ValueDecl *VD = C.getCapturedVar();
379  markAsEscaped(VD);
380  if (isa<OMPCapturedExprDecl>(VD))
381  VisitValueDecl(VD);
382  }
383  }
384  }
385  void VisitLambdaExpr(const LambdaExpr *E) {
386  if (!E)
387  return;
388  for (const LambdaCapture &C : E->captures()) {
389  if (C.capturesVariable()) {
390  if (C.getCaptureKind() == LCK_ByRef) {
391  const ValueDecl *VD = C.getCapturedVar();
392  markAsEscaped(VD);
393  if (E->isInitCapture(&C) || isa<OMPCapturedExprDecl>(VD))
394  VisitValueDecl(VD);
395  }
396  }
397  }
398  }
399  void VisitBlockExpr(const BlockExpr *E) {
400  if (!E)
401  return;
402  for (const BlockDecl::Capture &C : E->getBlockDecl()->captures()) {
403  if (C.isByRef()) {
404  const VarDecl *VD = C.getVariable();
405  markAsEscaped(VD);
406  if (isa<OMPCapturedExprDecl>(VD) || VD->isInitCapture())
407  VisitValueDecl(VD);
408  }
409  }
410  }
411  void VisitCallExpr(const CallExpr *E) {
412  if (!E)
413  return;
414  for (const Expr *Arg : E->arguments()) {
415  if (!Arg)
416  continue;
417  if (Arg->isLValue()) {
418  const bool SavedAllEscaped = AllEscaped;
419  AllEscaped = true;
420  Visit(Arg);
421  AllEscaped = SavedAllEscaped;
422  } else {
423  Visit(Arg);
424  }
425  }
426  Visit(E->getCallee());
427  }
428  void VisitDeclRefExpr(const DeclRefExpr *E) {
429  if (!E)
430  return;
431  const ValueDecl *VD = E->getDecl();
432  if (AllEscaped)
433  markAsEscaped(VD);
434  if (isa<OMPCapturedExprDecl>(VD))
435  VisitValueDecl(VD);
436  else if (const auto *VarD = dyn_cast<VarDecl>(VD))
437  if (VarD->isInitCapture())
438  VisitValueDecl(VD);
439  }
440  void VisitUnaryOperator(const UnaryOperator *E) {
441  if (!E)
442  return;
443  if (E->getOpcode() == UO_AddrOf) {
444  const bool SavedAllEscaped = AllEscaped;
445  AllEscaped = true;
446  Visit(E->getSubExpr());
447  AllEscaped = SavedAllEscaped;
448  } else {
449  Visit(E->getSubExpr());
450  }
451  }
452  void VisitImplicitCastExpr(const ImplicitCastExpr *E) {
453  if (!E)
454  return;
455  if (E->getCastKind() == CK_ArrayToPointerDecay) {
456  const bool SavedAllEscaped = AllEscaped;
457  AllEscaped = true;
458  Visit(E->getSubExpr());
459  AllEscaped = SavedAllEscaped;
460  } else {
461  Visit(E->getSubExpr());
462  }
463  }
464  void VisitExpr(const Expr *E) {
465  if (!E)
466  return;
467  bool SavedAllEscaped = AllEscaped;
468  if (!E->isLValue())
469  AllEscaped = false;
470  for (const Stmt *Child : E->children())
471  if (Child)
472  Visit(Child);
473  AllEscaped = SavedAllEscaped;
474  }
475  void VisitStmt(const Stmt *S) {
476  if (!S)
477  return;
478  for (const Stmt *Child : S->children())
479  if (Child)
480  Visit(Child);
481  }
482 
483  /// Returns the record that handles all the escaped local variables and is
484  /// used instead of their original storage.
485  const RecordDecl *getGlobalizedRecord() {
486  if (!GlobalizedRD)
487  buildRecordForGlobalizedVars();
488  return GlobalizedRD;
489  }
490 
491  /// Returns the field in the globalized record for the escaped variable.
492  const FieldDecl *getFieldForGlobalizedVar(const ValueDecl *VD) const {
493  assert(GlobalizedRD &&
494  "Record for globalized variables must be generated already.");
495  auto I = MappedDeclsFields.find(VD);
496  if (I == MappedDeclsFields.end())
497  return nullptr;
498  return I->getSecond();
499  }
500 
501  /// Returns the list of the escaped local variables/parameters.
502  ArrayRef<const ValueDecl *> getEscapedDecls() const {
503  return EscapedDecls.getArrayRef();
504  }
505 
506  /// Checks if the escaped local variable is actually a parameter passed by
507  /// value.
508  const llvm::SmallPtrSetImpl<const Decl *> &getEscapedParameters() const {
509  return EscapedParameters;
510  }
511 
512  /// Returns the list of the escaped variables with the variably modified
513  /// types.
514  ArrayRef<const ValueDecl *> getEscapedVariableLengthDecls() const {
515  return EscapedVariableLengthDecls.getArrayRef();
516  }
517 };
518 } // anonymous namespace
519 
520 /// Get the GPU warp size.
521 static llvm::Value *getNVPTXWarpSize(CodeGenFunction &CGF) {
522  return CGF.EmitRuntimeCall(
523  llvm::Intrinsic::getDeclaration(
524  &CGF.CGM.getModule(), llvm::Intrinsic::nvvm_read_ptx_sreg_warpsize),
525  "nvptx_warp_size");
526 }
527 
528 /// Get the id of the current thread on the GPU.
529 static llvm::Value *getNVPTXThreadID(CodeGenFunction &CGF) {
530  return CGF.EmitRuntimeCall(
531  llvm::Intrinsic::getDeclaration(
532  &CGF.CGM.getModule(), llvm::Intrinsic::nvvm_read_ptx_sreg_tid_x),
533  "nvptx_tid");
534 }
535 
536 /// Get the id of the warp in the block.
537 /// We assume that the warp size is 32, which is always the case
538 /// on the NVPTX device, to generate more efficient code.
539 static llvm::Value *getNVPTXWarpID(CodeGenFunction &CGF) {
540  CGBuilderTy &Bld = CGF.Builder;
541  return Bld.CreateAShr(getNVPTXThreadID(CGF), LaneIDBits, "nvptx_warp_id");
542 }
543 
544 /// Get the id of the current lane in the Warp.
545 /// We assume that the warp size is 32, which is always the case
546 /// on the NVPTX device, to generate more efficient code.
547 static llvm::Value *getNVPTXLaneID(CodeGenFunction &CGF) {
548  CGBuilderTy &Bld = CGF.Builder;
549  return Bld.CreateAnd(getNVPTXThreadID(CGF), Bld.getInt32(LaneIDMask),
550  "nvptx_lane_id");
551 }
552 
553 /// Get the maximum number of threads in a block of the GPU.
554 static llvm::Value *getNVPTXNumThreads(CodeGenFunction &CGF) {
555  return CGF.EmitRuntimeCall(
556  llvm::Intrinsic::getDeclaration(
557  &CGF.CGM.getModule(), llvm::Intrinsic::nvvm_read_ptx_sreg_ntid_x),
558  "nvptx_num_threads");
559 }
560 
561 /// Get barrier to synchronize all threads in a block.
562 static void getNVPTXCTABarrier(CodeGenFunction &CGF) {
563  CGF.EmitRuntimeCall(llvm::Intrinsic::getDeclaration(
564  &CGF.CGM.getModule(), llvm::Intrinsic::nvvm_barrier0));
565 }
566 
567 /// Get barrier #ID to synchronize selected (multiple of warp size) threads in
568 /// a CTA.
569 static void getNVPTXBarrier(CodeGenFunction &CGF, int ID,
570  llvm::Value *NumThreads) {
571  CGBuilderTy &Bld = CGF.Builder;
572  llvm::Value *Args[] = {Bld.getInt32(ID), NumThreads};
573  CGF.EmitRuntimeCall(llvm::Intrinsic::getDeclaration(
574  &CGF.CGM.getModule(), llvm::Intrinsic::nvvm_barrier),
575  Args);
576 }
577 
578 /// Synchronize all GPU threads in a block.
579 static void syncCTAThreads(CodeGenFunction &CGF) { getNVPTXCTABarrier(CGF); }
580 
581 /// Synchronize worker threads in a parallel region.
582 static void syncParallelThreads(CodeGenFunction &CGF, llvm::Value *NumThreads) {
583  return getNVPTXBarrier(CGF, NB_Parallel, NumThreads);
584 }
585 
586 /// Get the value of the thread_limit clause in the teams directive.
587 /// For the 'generic' execution mode, the runtime encodes thread_limit in
588 /// the launch parameters, always starting thread_limit+warpSize threads per
589 /// CTA. The threads in the last warp are reserved for master execution.
590 /// For the 'spmd' execution mode, all threads in a CTA are part of the team.
591 static llvm::Value *getThreadLimit(CodeGenFunction &CGF,
592  bool IsInSPMDExecutionMode = false) {
593  CGBuilderTy &Bld = CGF.Builder;
594  return IsInSPMDExecutionMode
595  ? getNVPTXNumThreads(CGF)
596  : Bld.CreateNUWSub(getNVPTXNumThreads(CGF), getNVPTXWarpSize(CGF),
597  "thread_limit");
598 }
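// Worked example (generic mode; the CTA size of 128 is illustrative): with
// 128 threads per CTA and WarpSize == 32, getThreadLimit returns
// 128 - 32 = 96, so threads 0..95 are workers and the last warp (96..127)
// is reserved for the master.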
599 
600 /// Get the thread id of the OMP master thread.
601 /// The master thread id is the first thread (lane) of the last warp in the
602 /// GPU block. Warp size is assumed to be some power of 2.
603 /// Thread id is 0 indexed.
604 /// E.g: If NumThreads is 33, master id is 32.
605 /// If NumThreads is 64, master id is 32.
606 /// If NumThreads is 1024, master id is 992.
607 static llvm::Value *getMasterThreadID(CodeGenFunction &CGF) {
608  CGBuilderTy &Bld = CGF.Builder;
609  llvm::Value *NumThreads = getNVPTXNumThreads(CGF);
610 
611  // We assume that the warp size is a power of 2.
612  llvm::Value *Mask = Bld.CreateNUWSub(getNVPTXWarpSize(CGF), Bld.getInt32(1));
613 
614  return Bld.CreateAnd(Bld.CreateNUWSub(NumThreads, Bld.getInt32(1)),
615  Bld.CreateNot(Mask), "master_tid");
616 }
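// The computation rounds (NumThreads - 1) down to a multiple of the warp
// size. A small check of the documented cases (a sketch; masterTID is a
// hypothetical helper, not part of this file):
//
//   unsigned masterTID(unsigned NumThreads, unsigned WarpSz = 32) {
//     unsigned Mask = WarpSz - 1;       // 0x1f
//     return (NumThreads - 1) & ~Mask;  // clear the lane-id bits
//   }
//   // masterTID(33) == 32, masterTID(64) == 32, masterTID(1024) == 992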
617 
618 CGOpenMPRuntimeNVPTX::WorkerFunctionState::WorkerFunctionState(
619  CodeGenModule &CGM, SourceLocation Loc)
620  : WorkerFn(nullptr), CGFI(CGM.getTypes().arrangeNullaryFunction()),
621  Loc(Loc) {
622  createWorkerFunction(CGM);
623 }
624 
625 void CGOpenMPRuntimeNVPTX::WorkerFunctionState::createWorkerFunction(
626  CodeGenModule &CGM) {
627  // Create a worker function with no arguments.
628 
629  WorkerFn = llvm::Function::Create(
630  CGM.getTypes().GetFunctionType(CGFI), llvm::GlobalValue::InternalLinkage,
631  /*placeholder=*/"_worker", &CGM.getModule());
632  CGM.SetInternalFunctionAttributes(GlobalDecl(), WorkerFn, CGFI);
633  WorkerFn->setDoesNotRecurse();
634 }
635 
636 CGOpenMPRuntimeNVPTX::ExecutionMode
637 CGOpenMPRuntimeNVPTX::getExecutionMode() const {
638  return CurrentExecutionMode;
639 }
640 
641 static CGOpenMPRuntimeNVPTX::DataSharingMode
642 getDataSharingMode(CodeGenModule &CGM) {
643  return CGM.getLangOpts().OpenMPCUDAMode ? CGOpenMPRuntimeNVPTX::CUDA
644  : CGOpenMPRuntimeNVPTX::Generic;
645 }
646 
647 /// Checks if the \p Body is a \a CompoundStmt and returns its child statement
648 /// iff there is only one.
649 static const Stmt *getSingleCompoundChild(const Stmt *Body) {
650  if (const auto *C = dyn_cast<CompoundStmt>(Body))
651  if (C->size() == 1)
652  return C->body_front();
653  return Body;
654 }
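// Example: for a body `{ S; }` the single child S is returned, so a target
// region written with an extra brace level, e.g.
//   #pragma omp target
//   { #pragma omp parallel ... }
// is analyzed the same as one without the braces.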
655 
656 /// Check if the parallel directive has an 'if' clause with non-constant or
657 /// false condition. Also, check if the number of threads is strictly specified
658 /// and run those directives in non-SPMD mode.
659 static bool hasParallelIfNumThreadsClause(ASTContext &Ctx,
660  const OMPExecutableDirective &D) {
661  if (D.hasClausesOfKind<OMPNumThreadsClause>())
662  return true;
663  for (const auto *C : D.getClausesOfKind<OMPIfClause>()) {
664  OpenMPDirectiveKind NameModifier = C->getNameModifier();
665  if (NameModifier != OMPD_parallel && NameModifier != OMPD_unknown)
666  continue;
667  const Expr *Cond = C->getCondition();
668  bool Result;
669  if (!Cond->EvaluateAsBooleanCondition(Result, Ctx) || !Result)
670  return true;
671  }
672  return false;
673 }
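// Examples of directives this check rejects for SPMD codegen (illustrative):
//
//   #pragma omp target parallel num_threads(4)   // strict thread count
//   #pragma omp target parallel if(parallel: c)  // non-constant condition
//
// whereas `if(parallel: 1)` folds to true and the directive stays eligible.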
674 
675 /// Check for inner (nested) SPMD construct, if any
676 static bool hasNestedSPMDDirective(ASTContext &Ctx,
677  const OMPExecutableDirective &D) {
678  const auto *CS = D.getInnermostCapturedStmt();
679  const auto *Body = CS->getCapturedStmt()->IgnoreContainers();
680  const Stmt *ChildStmt = getSingleCompoundChild(Body);
681 
682  if (const auto *NestedDir = dyn_cast<OMPExecutableDirective>(ChildStmt)) {
683  OpenMPDirectiveKind DKind = NestedDir->getDirectiveKind();
684  switch (D.getDirectiveKind()) {
685  case OMPD_target:
686  if (isOpenMPParallelDirective(DKind) &&
687  !hasParallelIfNumThreadsClause(Ctx, *NestedDir))
688  return true;
689  if (DKind == OMPD_teams || DKind == OMPD_teams_distribute) {
690  Body = NestedDir->getInnermostCapturedStmt()->IgnoreContainers();
691  if (!Body)
692  return false;
693  ChildStmt = getSingleCompoundChild(Body);
694  if (const auto *NND = dyn_cast<OMPExecutableDirective>(ChildStmt)) {
695  DKind = NND->getDirectiveKind();
696  if (isOpenMPParallelDirective(DKind) &&
697  !hasParallelIfNumThreadsClause(Ctx, *NND))
698  return true;
699  if (DKind == OMPD_distribute) {
700  Body = NestedDir->getInnermostCapturedStmt()->IgnoreContainers();
701  if (!Body)
702  return false;
703  ChildStmt = getSingleCompoundChild(Body);
704  if (!ChildStmt)
705  return false;
706  if (const auto *NND = dyn_cast<OMPExecutableDirective>(ChildStmt)) {
707  DKind = NND->getDirectiveKind();
708  return isOpenMPParallelDirective(DKind) &&
709  !hasParallelIfNumThreadsClause(Ctx, *NND);
710  }
711  }
712  }
713  }
714  return false;
715  case OMPD_target_teams:
716  if (isOpenMPParallelDirective(DKind) &&
717  !hasParallelIfNumThreadsClause(Ctx, *NestedDir))
718  return true;
719  if (DKind == OMPD_distribute) {
720  Body = NestedDir->getInnermostCapturedStmt()->IgnoreContainers();
721  if (!Body)
722  return false;
723  ChildStmt = getSingleCompoundChild(Body);
724  if (const auto *NND = dyn_cast<OMPExecutableDirective>(ChildStmt)) {
725  DKind = NND->getDirectiveKind();
726  return isOpenMPParallelDirective(DKind) &&
727  !hasParallelIfNumThreadsClause(Ctx, *NND);
728  }
729  }
730  return false;
731  case OMPD_target_teams_distribute:
732  return isOpenMPParallelDirective(DKind) &&
733  !hasParallelIfNumThreadsClause(Ctx, *NestedDir);
734  case OMPD_target_simd:
735  case OMPD_target_parallel:
736  case OMPD_target_parallel_for:
737  case OMPD_target_parallel_for_simd:
738  case OMPD_target_teams_distribute_simd:
739  case OMPD_target_teams_distribute_parallel_for:
740  case OMPD_target_teams_distribute_parallel_for_simd:
741  case OMPD_parallel:
742  case OMPD_for:
743  case OMPD_parallel_for:
744  case OMPD_parallel_sections:
745  case OMPD_for_simd:
746  case OMPD_parallel_for_simd:
747  case OMPD_cancel:
748  case OMPD_cancellation_point:
749  case OMPD_ordered:
750  case OMPD_threadprivate:
751  case OMPD_task:
752  case OMPD_simd:
753  case OMPD_sections:
754  case OMPD_section:
755  case OMPD_single:
756  case OMPD_master:
757  case OMPD_critical:
758  case OMPD_taskyield:
759  case OMPD_barrier:
760  case OMPD_taskwait:
761  case OMPD_taskgroup:
762  case OMPD_atomic:
763  case OMPD_flush:
764  case OMPD_teams:
765  case OMPD_target_data:
766  case OMPD_target_exit_data:
767  case OMPD_target_enter_data:
768  case OMPD_distribute:
769  case OMPD_distribute_simd:
770  case OMPD_distribute_parallel_for:
771  case OMPD_distribute_parallel_for_simd:
772  case OMPD_teams_distribute:
773  case OMPD_teams_distribute_simd:
774  case OMPD_teams_distribute_parallel_for:
775  case OMPD_teams_distribute_parallel_for_simd:
776  case OMPD_target_update:
777  case OMPD_declare_simd:
778  case OMPD_declare_target:
779  case OMPD_end_declare_target:
780  case OMPD_declare_reduction:
781  case OMPD_taskloop:
782  case OMPD_taskloop_simd:
783  case OMPD_unknown:
784  llvm_unreachable("Unexpected directive.");
785  }
786  }
787 
788  return false;
789 }
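// Example of a nesting this detects (illustrative): the innermost parallel
// region below lets the whole `target` be emitted in SPMD mode:
//
//   #pragma omp target
//   #pragma omp teams
//   #pragma omp distribute
//   #pragma omp parallel for
//   for (int i = 0; i < N; ++i) ...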
790 
791 static bool supportsSPMDExecutionMode(ASTContext &Ctx,
792  const OMPExecutableDirective &D) {
793  OpenMPDirectiveKind DirectiveKind = D.getDirectiveKind();
794  switch (DirectiveKind) {
795  case OMPD_target:
796  case OMPD_target_teams:
797  case OMPD_target_teams_distribute:
798  return hasNestedSPMDDirective(Ctx, D);
799  case OMPD_target_parallel:
800  case OMPD_target_parallel_for:
801  case OMPD_target_parallel_for_simd:
802  case OMPD_target_teams_distribute_parallel_for:
803  case OMPD_target_teams_distribute_parallel_for_simd:
804  return !hasParallelIfNumThreadsClause(Ctx, D);
805  case OMPD_target_simd:
806  case OMPD_target_teams_distribute_simd:
807  return false;
808  case OMPD_parallel:
809  case OMPD_for:
810  case OMPD_parallel_for:
811  case OMPD_parallel_sections:
812  case OMPD_for_simd:
813  case OMPD_parallel_for_simd:
814  case OMPD_cancel:
815  case OMPD_cancellation_point:
816  case OMPD_ordered:
817  case OMPD_threadprivate:
818  case OMPD_task:
819  case OMPD_simd:
820  case OMPD_sections:
821  case OMPD_section:
822  case OMPD_single:
823  case OMPD_master:
824  case OMPD_critical:
825  case OMPD_taskyield:
826  case OMPD_barrier:
827  case OMPD_taskwait:
828  case OMPD_taskgroup:
829  case OMPD_atomic:
830  case OMPD_flush:
831  case OMPD_teams:
832  case OMPD_target_data:
833  case OMPD_target_exit_data:
834  case OMPD_target_enter_data:
835  case OMPD_distribute:
836  case OMPD_distribute_simd:
837  case OMPD_distribute_parallel_for:
838  case OMPD_distribute_parallel_for_simd:
839  case OMPD_teams_distribute:
840  case OMPD_teams_distribute_simd:
841  case OMPD_teams_distribute_parallel_for:
842  case OMPD_teams_distribute_parallel_for_simd:
843  case OMPD_target_update:
844  case OMPD_declare_simd:
845  case OMPD_declare_target:
846  case OMPD_end_declare_target:
847  case OMPD_declare_reduction:
848  case OMPD_taskloop:
849  case OMPD_taskloop_simd:
850  case OMPD_unknown:
851  break;
852  }
853  llvm_unreachable(
854  "Unknown programming model for OpenMP directive on NVPTX target.");
855 }
856 
857 void CGOpenMPRuntimeNVPTX::emitNonSPMDKernel(const OMPExecutableDirective &D,
858  StringRef ParentName,
859  llvm::Function *&OutlinedFn,
860  llvm::Constant *&OutlinedFnID,
861  bool IsOffloadEntry,
862  const RegionCodeGenTy &CodeGen) {
863  ExecutionModeRAII ModeRAII(CurrentExecutionMode, /*IsSPMD=*/false);
864  EntryFunctionState EST;
865  WorkerFunctionState WST(CGM, D.getLocStart());
866  Work.clear();
867  WrapperFunctionsMap.clear();
868 
869  // Emit target region as a standalone region.
870  class NVPTXPrePostActionTy : public PrePostActionTy {
871  CGOpenMPRuntimeNVPTX::EntryFunctionState &EST;
872  CGOpenMPRuntimeNVPTX::WorkerFunctionState &WST;
873 
874  public:
875  NVPTXPrePostActionTy(CGOpenMPRuntimeNVPTX::EntryFunctionState &EST,
876  CGOpenMPRuntimeNVPTX::WorkerFunctionState &WST)
877  : EST(EST), WST(WST) {}
878  void Enter(CodeGenFunction &CGF) override {
879  static_cast<CGOpenMPRuntimeNVPTX &>(CGF.CGM.getOpenMPRuntime())
880  .emitNonSPMDEntryHeader(CGF, EST, WST);
881  }
882  void Exit(CodeGenFunction &CGF) override {
883  static_cast<CGOpenMPRuntimeNVPTX &>(CGF.CGM.getOpenMPRuntime())
884  .emitNonSPMDEntryFooter(CGF, EST);
885  }
886  } Action(EST, WST);
887  CodeGen.setAction(Action);
888  emitTargetOutlinedFunctionHelper(D, ParentName, OutlinedFn, OutlinedFnID,
889  IsOffloadEntry, CodeGen);
890 
891  // Now change the name of the worker function to correspond to this target
892  // region's entry function.
893  WST.WorkerFn->setName(Twine(OutlinedFn->getName(), "_worker"));
894 
895  // Create the worker function
896  emitWorkerFunction(WST);
897 }
898 
899 // Set up NVPTX threads for the master-worker OpenMP scheme.
900 void CGOpenMPRuntimeNVPTX::emitNonSPMDEntryHeader(CodeGenFunction &CGF,
901  EntryFunctionState &EST,
902  WorkerFunctionState &WST) {
903  CGBuilderTy &Bld = CGF.Builder;
904 
905  llvm::BasicBlock *WorkerBB = CGF.createBasicBlock(".worker");
906  llvm::BasicBlock *MasterCheckBB = CGF.createBasicBlock(".mastercheck");
907  llvm::BasicBlock *MasterBB = CGF.createBasicBlock(".master");
908  EST.ExitBB = CGF.createBasicBlock(".exit");
909 
910  llvm::Value *IsWorker =
911  Bld.CreateICmpULT(getNVPTXThreadID(CGF), getThreadLimit(CGF));
912  Bld.CreateCondBr(IsWorker, WorkerBB, MasterCheckBB);
913 
914  CGF.EmitBlock(WorkerBB);
915  emitCall(CGF, WST.Loc, WST.WorkerFn);
916  CGF.EmitBranch(EST.ExitBB);
917 
918  CGF.EmitBlock(MasterCheckBB);
919  llvm::Value *IsMaster =
920  Bld.CreateICmpEQ(getNVPTXThreadID(CGF), getMasterThreadID(CGF));
921  Bld.CreateCondBr(IsMaster, MasterBB, EST.ExitBB);
922 
923  CGF.EmitBlock(MasterBB);
924  IsInTargetMasterThreadRegion = true;
925  // SEQUENTIAL (MASTER) REGION START
926  // First action in sequential region:
927  // Initialize the state of the OpenMP runtime library on the GPU.
928  // TODO: Optimize runtime initialization and pass in correct value.
929  llvm::Value *Args[] = {getThreadLimit(CGF),
930  Bld.getInt16(/*RequiresOMPRuntime=*/1)};
931  CGF.EmitRuntimeCall(
932  createNVPTXRuntimeFunction(OMPRTL_NVPTX__kmpc_kernel_init), Args);
933 
934  // For data sharing, we need to initialize the stack.
935  CGF.EmitRuntimeCall(
936  createNVPTXRuntimeFunction(
937  OMPRTL_NVPTX__kmpc_data_sharing_init_stack));
938 
939  emitGenericVarsProlog(CGF, WST.Loc);
940 }
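// Shape of the emitted entry code (a sketch of the CFG built above):
//
//   entry:        tid < thread_limit ? .worker : .mastercheck
//   .worker:      call <kernel>_worker(); br .exit
//   .mastercheck: tid == master_tid ? .master : .exit
//   .master:      __kmpc_kernel_init(thread_limit, 1);
//                 __kmpc_data_sharing_init_stack();
//                 ... sequential (master) region ...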
941 
942 void CGOpenMPRuntimeNVPTX::emitNonSPMDEntryFooter(CodeGenFunction &CGF,
943  EntryFunctionState &EST) {
944  IsInTargetMasterThreadRegion = false;
945  if (!CGF.HaveInsertPoint())
946  return;
947 
948  emitGenericVarsEpilog(CGF);
949 
950  if (!EST.ExitBB)
951  EST.ExitBB = CGF.createBasicBlock(".exit");
952 
953  llvm::BasicBlock *TerminateBB = CGF.createBasicBlock(".termination.notifier");
954  CGF.EmitBranch(TerminateBB);
955 
956  CGF.EmitBlock(TerminateBB);
957  // Signal termination condition.
958  // TODO: Optimize runtime initialization and pass in correct value.
959  llvm::Value *Args[] = {CGF.Builder.getInt16(/*IsOMPRuntimeInitialized=*/1)};
960  CGF.EmitRuntimeCall(
961  createNVPTXRuntimeFunction(OMPRTL_NVPTX__kmpc_kernel_deinit), Args);
962  // Barrier to terminate worker threads.
963  syncCTAThreads(CGF);
964  // Master thread jumps to exit point.
965  CGF.EmitBranch(EST.ExitBB);
966 
967  CGF.EmitBlock(EST.ExitBB);
968  EST.ExitBB = nullptr;
969 }
970 
971 void CGOpenMPRuntimeNVPTX::emitSPMDKernel(const OMPExecutableDirective &D,
972  StringRef ParentName,
973  llvm::Function *&OutlinedFn,
974  llvm::Constant *&OutlinedFnID,
975  bool IsOffloadEntry,
976  const RegionCodeGenTy &CodeGen) {
977  ExecutionModeRAII ModeRAII(CurrentExecutionMode, /*IsSPMD=*/true);
978  EntryFunctionState EST;
979 
980  // Emit target region as a standalone region.
981  class NVPTXPrePostActionTy : public PrePostActionTy {
982  CGOpenMPRuntimeNVPTX &RT;
983  CGOpenMPRuntimeNVPTX::EntryFunctionState &EST;
984  const OMPExecutableDirective &D;
985 
986  public:
987  NVPTXPrePostActionTy(CGOpenMPRuntimeNVPTX &RT,
988  CGOpenMPRuntimeNVPTX::EntryFunctionState &EST,
989  const OMPExecutableDirective &D)
990  : RT(RT), EST(EST), D(D) {}
991  void Enter(CodeGenFunction &CGF) override {
992  RT.emitSPMDEntryHeader(CGF, EST, D);
993  }
994  void Exit(CodeGenFunction &CGF) override {
995  RT.emitSPMDEntryFooter(CGF, EST);
996  }
997  } Action(*this, EST, D);
998  CodeGen.setAction(Action);
999  emitTargetOutlinedFunctionHelper(D, ParentName, OutlinedFn, OutlinedFnID,
1000  IsOffloadEntry, CodeGen);
1001 }
1002 
1003 void CGOpenMPRuntimeNVPTX::emitSPMDEntryHeader(
1004  CodeGenFunction &CGF, EntryFunctionState &EST,
1005  const OMPExecutableDirective &D) {
1006  CGBuilderTy &Bld = CGF.Builder;
1007 
1008  // Setup BBs in entry function.
1009  llvm::BasicBlock *ExecuteBB = CGF.createBasicBlock(".execute");
1010  EST.ExitBB = CGF.createBasicBlock(".exit");
1011 
1012  // Initialize the OMP state in the runtime; called by all active threads.
1013  // TODO: Set RequiresOMPRuntime and RequiresDataSharing parameters
1014  // based on code analysis of the target region.
1015  llvm::Value *Args[] = {getThreadLimit(CGF, /*IsInSPMDExecutionMode=*/true),
1016  /*RequiresOMPRuntime=*/Bld.getInt16(1),
1017  /*RequiresDataSharing=*/Bld.getInt16(1)};
1018  CGF.EmitRuntimeCall(
1019  createNVPTXRuntimeFunction(OMPRTL_NVPTX__kmpc_spmd_kernel_init), Args);
1020 
1021  // For data sharing, we need to initialize the stack.
1022  CGF.EmitRuntimeCall(
1023  createNVPTXRuntimeFunction(
1024  OMPRTL_NVPTX__kmpc_data_sharing_init_stack_spmd));
1025 
1026  CGF.EmitBranch(ExecuteBB);
1027 
1028  CGF.EmitBlock(ExecuteBB);
1029 
1030  IsInTargetMasterThreadRegion = true;
1031 }
1032 
1033 void CGOpenMPRuntimeNVPTX::emitSPMDEntryFooter(CodeGenFunction &CGF,
1034  EntryFunctionState &EST) {
1035  IsInTargetMasterThreadRegion = false;
1036  if (!CGF.HaveInsertPoint())
1037  return;
1038 
1039  if (!EST.ExitBB)
1040  EST.ExitBB = CGF.createBasicBlock(".exit");
1041 
1042  llvm::BasicBlock *OMPDeInitBB = CGF.createBasicBlock(".omp.deinit");
1043  CGF.EmitBranch(OMPDeInitBB);
1044 
1045  CGF.EmitBlock(OMPDeInitBB);
1046  // DeInitialize the OMP state in the runtime; called by all active threads.
1047  CGF.EmitRuntimeCall(
1048  createNVPTXRuntimeFunction(OMPRTL_NVPTX__kmpc_spmd_kernel_deinit), None);
1049  CGF.EmitBranch(EST.ExitBB);
1050 
1051  CGF.EmitBlock(EST.ExitBB);
1052  EST.ExitBB = nullptr;
1053 }
1054 
1055 // Create a unique global variable to indicate the execution mode of this target
1056 // region. The execution mode is either 'generic', or 'spmd' depending on the
1057 // target directive. This variable is picked up by the offload library to setup
1058 // the device appropriately before kernel launch. If the execution mode is
1059 // 'generic', the runtime reserves one warp for the master, otherwise, all
1060 // warps participate in parallel work.
1061 static void setPropertyExecutionMode(CodeGenModule &CGM, StringRef Name,
1062  bool Mode) {
1063  auto *GVMode =
1064  new llvm::GlobalVariable(CGM.getModule(), CGM.Int8Ty, /*isConstant=*/true,
1065  llvm::GlobalValue::WeakAnyLinkage,
1066  llvm::ConstantInt::get(CGM.Int8Ty, Mode ? 0 : 1),
1067  Twine(Name, "_exec_mode"));
1068  CGM.addCompilerUsedGlobal(GVMode);
1069 }
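// Illustrative result (not verbatim compiler output): a target region
// outlined as `__omp_offloading_<...>_foo_l1` in generic mode yields
//
//   @__omp_offloading_<...>_foo_l1_exec_mode = weak constant i8 1
//
// and the global is kept alive via llvm.compiler.used so the offload
// library can read it before launch (0 = spmd, 1 = generic).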
1070 
1071 void CGOpenMPRuntimeNVPTX::emitWorkerFunction(WorkerFunctionState &WST) {
1072  ASTContext &Ctx = CGM.getContext();
1073 
1074  CodeGenFunction CGF(CGM, /*suppressNewContext=*/true);
1075  CGF.StartFunction(GlobalDecl(), Ctx.VoidTy, WST.WorkerFn, WST.CGFI, {},
1076  WST.Loc, WST.Loc);
1077  emitWorkerLoop(CGF, WST);
1078  CGF.FinishFunction();
1079 }
1080 
1081 void CGOpenMPRuntimeNVPTX::emitWorkerLoop(CodeGenFunction &CGF,
1082  WorkerFunctionState &WST) {
1083  //
1084  // The workers enter this loop and wait for parallel work from the master.
1085  // When the master encounters a parallel region it sets up the work + variable
1086  // arguments, and wakes up the workers. The workers first check to see if
1087  // they are required for the parallel region, i.e., within the # of requested
1088  // parallel threads. The activated workers load the variable arguments and
1089  // execute the parallel work.
1090  //
1091 
1092  CGBuilderTy &Bld = CGF.Builder;
1093 
1094  llvm::BasicBlock *AwaitBB = CGF.createBasicBlock(".await.work");
1095  llvm::BasicBlock *SelectWorkersBB = CGF.createBasicBlock(".select.workers");
1096  llvm::BasicBlock *ExecuteBB = CGF.createBasicBlock(".execute.parallel");
1097  llvm::BasicBlock *TerminateBB = CGF.createBasicBlock(".terminate.parallel");
1098  llvm::BasicBlock *BarrierBB = CGF.createBasicBlock(".barrier.parallel");
1099  llvm::BasicBlock *ExitBB = CGF.createBasicBlock(".exit");
1100 
1101  CGF.EmitBranch(AwaitBB);
1102 
1103  // Workers wait for work from master.
1104  CGF.EmitBlock(AwaitBB);
1105  // Wait for parallel work
1106  syncCTAThreads(CGF);
1107 
1108  Address WorkFn =
1109  CGF.CreateDefaultAlignTempAlloca(CGF.Int8PtrTy, /*Name=*/"work_fn");
1110  Address ExecStatus =
1111  CGF.CreateDefaultAlignTempAlloca(CGF.Int8Ty, /*Name=*/"exec_status");
1112  CGF.InitTempAlloca(ExecStatus, Bld.getInt8(/*C=*/0));
1113  CGF.InitTempAlloca(WorkFn, llvm::Constant::getNullValue(CGF.Int8PtrTy));
1114 
1115  // TODO: Optimize runtime initialization and pass in correct value.
1116  llvm::Value *Args[] = {WorkFn.getPointer(),
1117  /*RequiresOMPRuntime=*/Bld.getInt16(1)};
1118  llvm::Value *Ret = CGF.EmitRuntimeCall(
1119  createNVPTXRuntimeFunction(OMPRTL_NVPTX__kmpc_kernel_parallel), Args);
1120  Bld.CreateStore(Bld.CreateZExt(Ret, CGF.Int8Ty), ExecStatus);
1121 
1122  // On termination condition (workid == 0), exit loop.
1123  llvm::Value *WorkID = Bld.CreateLoad(WorkFn);
1124  llvm::Value *ShouldTerminate = Bld.CreateIsNull(WorkID, "should_terminate");
1125  Bld.CreateCondBr(ShouldTerminate, ExitBB, SelectWorkersBB);
1126 
1127  // Activate requested workers.
1128  CGF.EmitBlock(SelectWorkersBB);
1129  llvm::Value *IsActive =
1130  Bld.CreateIsNotNull(Bld.CreateLoad(ExecStatus), "is_active");
1131  Bld.CreateCondBr(IsActive, ExecuteBB, BarrierBB);
1132 
1133  // Signal start of parallel region.
1134  CGF.EmitBlock(ExecuteBB);
1135 
1136  // Process work items: outlined parallel functions.
1137  for (llvm::Function *W : Work) {
1138  // Try to match this outlined function.
1139  llvm::Value *ID = Bld.CreateBitOrPointerCast(W, CGM.Int8PtrTy);
1140 
1141  llvm::Value *WorkFnMatch =
1142  Bld.CreateICmpEQ(Bld.CreateLoad(WorkFn), ID, "work_match");
1143 
1144  llvm::BasicBlock *ExecuteFNBB = CGF.createBasicBlock(".execute.fn");
1145  llvm::BasicBlock *CheckNextBB = CGF.createBasicBlock(".check.next");
1146  Bld.CreateCondBr(WorkFnMatch, ExecuteFNBB, CheckNextBB);
1147 
1148  // Execute this outlined function.
1149  CGF.EmitBlock(ExecuteFNBB);
1150 
1151  // Insert call to work function via shared wrapper. The shared
1152  // wrapper takes two arguments:
1153  // - the parallelism level;
1154  // - the thread ID;
1155  emitCall(CGF, WST.Loc, W,
1156  {Bld.getInt16(/*ParallelLevel=*/0), getThreadID(CGF, WST.Loc)});
1157 
1158  // Go to end of parallel region.
1159  CGF.EmitBranch(TerminateBB);
1160 
1161  CGF.EmitBlock(CheckNextBB);
1162  }
1163  // Default case: call to outlined function through pointer if the target
1164  // region makes a declare target call that may contain an orphaned parallel
1165  // directive.
1166  auto *ParallelFnTy =
1167  llvm::FunctionType::get(CGM.VoidTy, {CGM.Int16Ty, CGM.Int32Ty},
1168  /*isVarArg=*/false)
1169  ->getPointerTo();
1170  llvm::Value *WorkFnCast = Bld.CreateBitCast(WorkID, ParallelFnTy);
1171  // Insert call to work function via shared wrapper. The shared
1172  // wrapper takes two arguments:
1173  // - the parallelism level;
1174  // - the thread ID;
1175  emitCall(CGF, WST.Loc, WorkFnCast,
1176  {Bld.getInt16(/*ParallelLevel=*/0), getThreadID(CGF, WST.Loc)});
1177  // Go to end of parallel region.
1178  CGF.EmitBranch(TerminateBB);
1179 
1180  // Signal end of parallel region.
1181  CGF.EmitBlock(TerminateBB);
1182  CGF.EmitRuntimeCall(
1183  createNVPTXRuntimeFunction(OMPRTL_NVPTX__kmpc_kernel_end_parallel),
1184  llvm::None);
1185  CGF.EmitBranch(BarrierBB);
1186 
1187  // All active and inactive workers wait at a barrier after parallel region.
1188  CGF.EmitBlock(BarrierBB);
1189  // Barrier after parallel region.
1190  syncCTAThreads(CGF);
1191  CGF.EmitBranch(AwaitBB);
1192 
1193  // Exit target region.
1194  CGF.EmitBlock(ExitBB);
1195 }
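// The loop above, restated as plain C++ pseudocode (a sketch; sync_cta and
// thread_id are assumed helpers, not real runtime entry points):
//
//   void worker() {
//     for (;;) {
//       sync_cta();                                  // .await.work
//       void *work_fn = nullptr;
//       bool active =
//           __kmpc_kernel_parallel(&work_fn, /*RequiresOMPRuntime=*/1);
//       if (!work_fn) return;                        // null work fn -> .exit
//       if (active) {                                // .execute.parallel
//         ((void (*)(int16_t, int32_t))work_fn)(0, thread_id());
//         __kmpc_kernel_end_parallel();              // .terminate.parallel
//       }
//       sync_cta();                                  // .barrier.parallel
//     }
//   }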
1196 
1197 /// Returns specified OpenMP runtime function for the current OpenMP
1198 /// implementation. Specialized for the NVPTX device.
1199 /// \param Function OpenMP runtime function.
1200 /// \return Specified function.
1201 llvm::Constant *
1202 CGOpenMPRuntimeNVPTX::createNVPTXRuntimeFunction(unsigned Function) {
1203  llvm::Constant *RTLFn = nullptr;
1204  switch (static_cast<OpenMPRTLFunctionNVPTX>(Function)) {
1205  case OMPRTL_NVPTX__kmpc_kernel_init: {
1206  // Build void __kmpc_kernel_init(kmp_int32 thread_limit, int16_t
1207  // RequiresOMPRuntime);
1208  llvm::Type *TypeParams[] = {CGM.Int32Ty, CGM.Int16Ty};
1209  auto *FnTy =
1210  llvm::FunctionType::get(CGM.VoidTy, TypeParams, /*isVarArg*/ false);
1211  RTLFn = CGM.CreateRuntimeFunction(FnTy, "__kmpc_kernel_init");
1212  break;
1213  }
1214  case OMPRTL_NVPTX__kmpc_kernel_deinit: {
1215  // Build void __kmpc_kernel_deinit(int16_t IsOMPRuntimeInitialized);
1216  llvm::Type *TypeParams[] = {CGM.Int16Ty};
1217  auto *FnTy =
1218  llvm::FunctionType::get(CGM.VoidTy, TypeParams, /*isVarArg*/ false);
1219  RTLFn = CGM.CreateRuntimeFunction(FnTy, "__kmpc_kernel_deinit");
1220  break;
1221  }
1222  case OMPRTL_NVPTX__kmpc_spmd_kernel_init: {
1223  // Build void __kmpc_spmd_kernel_init(kmp_int32 thread_limit,
1224  // int16_t RequiresOMPRuntime, int16_t RequiresDataSharing);
1225  llvm::Type *TypeParams[] = {CGM.Int32Ty, CGM.Int16Ty, CGM.Int16Ty};
1226  auto *FnTy =
1227  llvm::FunctionType::get(CGM.VoidTy, TypeParams, /*isVarArg*/ false);
1228  RTLFn = CGM.CreateRuntimeFunction(FnTy, "__kmpc_spmd_kernel_init");
1229  break;
1230  }
1231  case OMPRTL_NVPTX__kmpc_spmd_kernel_deinit: {
1232  // Build void __kmpc_spmd_kernel_deinit();
1233  auto *FnTy =
1234  llvm::FunctionType::get(CGM.VoidTy, llvm::None, /*isVarArg*/ false);
1235  RTLFn = CGM.CreateRuntimeFunction(FnTy, "__kmpc_spmd_kernel_deinit");
1236  break;
1237  }
1238  case OMPRTL_NVPTX__kmpc_kernel_prepare_parallel: {
1239  /// Build void __kmpc_kernel_prepare_parallel(
1240  /// void *outlined_function, int16_t IsOMPRuntimeInitialized);
1241  llvm::Type *TypeParams[] = {CGM.Int8PtrTy, CGM.Int16Ty};
1242  auto *FnTy =
1243  llvm::FunctionType::get(CGM.VoidTy, TypeParams, /*isVarArg*/ false);
1244  RTLFn = CGM.CreateRuntimeFunction(FnTy, "__kmpc_kernel_prepare_parallel");
1245  break;
1246  }
1247  case OMPRTL_NVPTX__kmpc_kernel_parallel: {
1248  /// Build bool __kmpc_kernel_parallel(void **outlined_function,
1249  /// int16_t IsOMPRuntimeInitialized);
1250  llvm::Type *TypeParams[] = {CGM.Int8PtrPtrTy, CGM.Int16Ty};
1251  llvm::Type *RetTy = CGM.getTypes().ConvertType(CGM.getContext().BoolTy);
1252  auto *FnTy =
1253  llvm::FunctionType::get(RetTy, TypeParams, /*isVarArg*/ false);
1254  RTLFn = CGM.CreateRuntimeFunction(FnTy, "__kmpc_kernel_parallel");
1255  break;
1256  }
1257  case OMPRTL_NVPTX__kmpc_kernel_end_parallel: {
1258  /// Build void __kmpc_kernel_end_parallel();
1259  auto *FnTy =
1260  llvm::FunctionType::get(CGM.VoidTy, llvm::None, /*isVarArg*/ false);
1261  RTLFn = CGM.CreateRuntimeFunction(FnTy, "__kmpc_kernel_end_parallel");
1262  break;
1263  }
1264  case OMPRTL_NVPTX__kmpc_serialized_parallel: {
1265  // Build void __kmpc_serialized_parallel(ident_t *loc, kmp_int32
1266  // global_tid);
1267  llvm::Type *TypeParams[] = {getIdentTyPointerTy(), CGM.Int32Ty};
1268  auto *FnTy =
1269  llvm::FunctionType::get(CGM.VoidTy, TypeParams, /*isVarArg*/ false);
1270  RTLFn = CGM.CreateRuntimeFunction(FnTy, "__kmpc_serialized_parallel");
1271  break;
1272  }
1273  case OMPRTL_NVPTX__kmpc_end_serialized_parallel: {
1274  // Build void __kmpc_end_serialized_parallel(ident_t *loc, kmp_int32
1275  // global_tid);
1276  llvm::Type *TypeParams[] = {getIdentTyPointerTy(), CGM.Int32Ty};
1277  auto *FnTy =
1278  llvm::FunctionType::get(CGM.VoidTy, TypeParams, /*isVarArg*/ false);
1279  RTLFn = CGM.CreateRuntimeFunction(FnTy, "__kmpc_end_serialized_parallel");
1280  break;
1281  }
1282  case OMPRTL_NVPTX__kmpc_shuffle_int32: {
1283  // Build int32_t __kmpc_shuffle_int32(int32_t element,
1284  // int16_t lane_offset, int16_t warp_size);
1285  llvm::Type *TypeParams[] = {CGM.Int32Ty, CGM.Int16Ty, CGM.Int16Ty};
1286  auto *FnTy =
1287  llvm::FunctionType::get(CGM.Int32Ty, TypeParams, /*isVarArg*/ false);
1288  RTLFn = CGM.CreateRuntimeFunction(FnTy, "__kmpc_shuffle_int32");
1289  break;
1290  }
1291  case OMPRTL_NVPTX__kmpc_shuffle_int64: {
1292  // Build int64_t __kmpc_shuffle_int64(int64_t element,
1293  // int16_t lane_offset, int16_t warp_size);
1294  llvm::Type *TypeParams[] = {CGM.Int64Ty, CGM.Int16Ty, CGM.Int16Ty};
1295  auto *FnTy =
1296  llvm::FunctionType::get(CGM.Int64Ty, TypeParams, /*isVarArg*/ false);
1297  RTLFn = CGM.CreateRuntimeFunction(FnTy, "__kmpc_shuffle_int64");
1298  break;
1299  }
1300  case OMPRTL_NVPTX__kmpc_parallel_reduce_nowait: {
1301  // Build int32_t kmpc_nvptx_parallel_reduce_nowait(kmp_int32 global_tid,
1302  // kmp_int32 num_vars, size_t reduce_size, void* reduce_data,
1303  // void (*kmp_ShuffleReductFctPtr)(void *rhsData, int16_t lane_id, int16_t
1304  // lane_offset, int16_t AlgorithmVersion),
1305  // void (*kmp_InterWarpCopyFctPtr)(void* src, int warp_num));
1306  llvm::Type *ShuffleReduceTypeParams[] = {CGM.VoidPtrTy, CGM.Int16Ty,
1307  CGM.Int16Ty, CGM.Int16Ty};
1308  auto *ShuffleReduceFnTy =
1309  llvm::FunctionType::get(CGM.VoidTy, ShuffleReduceTypeParams,
1310  /*isVarArg=*/false);
1311  llvm::Type *InterWarpCopyTypeParams[] = {CGM.VoidPtrTy, CGM.Int32Ty};
1312  auto *InterWarpCopyFnTy =
1313  llvm::FunctionType::get(CGM.VoidTy, InterWarpCopyTypeParams,
1314  /*isVarArg=*/false);
1315  llvm::Type *TypeParams[] = {CGM.Int32Ty,
1316  CGM.Int32Ty,
1317  CGM.SizeTy,
1318  CGM.VoidPtrTy,
1319  ShuffleReduceFnTy->getPointerTo(),
1320  InterWarpCopyFnTy->getPointerTo()};
1321  auto *FnTy =
1322  llvm::FunctionType::get(CGM.Int32Ty, TypeParams, /*isVarArg=*/false);
1323  RTLFn = CGM.CreateRuntimeFunction(
1324  FnTy, /*Name=*/"__kmpc_nvptx_parallel_reduce_nowait");
1325  break;
1326  }
1327  case OMPRTL_NVPTX__kmpc_simd_reduce_nowait: {
1328  // Build int32_t kmpc_nvptx_simd_reduce_nowait(kmp_int32 global_tid,
1329  // kmp_int32 num_vars, size_t reduce_size, void* reduce_data,
1330  // void (*kmp_ShuffleReductFctPtr)(void *rhsData, int16_t lane_id, int16_t
1331  // lane_offset, int16_t AlgorithmVersion),
1332  // void (*kmp_InterWarpCopyFctPtr)(void* src, int warp_num));
1333  llvm::Type *ShuffleReduceTypeParams[] = {CGM.VoidPtrTy, CGM.Int16Ty,
1334  CGM.Int16Ty, CGM.Int16Ty};
1335  auto *ShuffleReduceFnTy =
1336  llvm::FunctionType::get(CGM.VoidTy, ShuffleReduceTypeParams,
1337  /*isVarArg=*/false);
1338  llvm::Type *InterWarpCopyTypeParams[] = {CGM.VoidPtrTy, CGM.Int32Ty};
1339  auto *InterWarpCopyFnTy =
1340  llvm::FunctionType::get(CGM.VoidTy, InterWarpCopyTypeParams,
1341  /*isVarArg=*/false);
1342  llvm::Type *TypeParams[] = {CGM.Int32Ty,
1343  CGM.Int32Ty,
1344  CGM.SizeTy,
1345  CGM.VoidPtrTy,
1346  ShuffleReduceFnTy->getPointerTo(),
1347  InterWarpCopyFnTy->getPointerTo()};
1348  auto *FnTy =
1349  llvm::FunctionType::get(CGM.Int32Ty, TypeParams, /*isVarArg=*/false);
1350  RTLFn = CGM.CreateRuntimeFunction(
1351  FnTy, /*Name=*/"__kmpc_nvptx_simd_reduce_nowait");
1352  break;
1353  }
1354  case OMPRTL_NVPTX__kmpc_teams_reduce_nowait: {
1355  // Build int32_t __kmpc_nvptx_teams_reduce_nowait(int32_t global_tid,
1356  // int32_t num_vars, size_t reduce_size, void *reduce_data,
1357  // void (*kmp_ShuffleReductFctPtr)(void *rhsData, int16_t lane_id, int16_t
1358  // lane_offset, int16_t shortCircuit),
1359  // void (*kmp_InterWarpCopyFctPtr)(void* src, int32_t warp_num),
1360  // void (*kmp_CopyToScratchpadFctPtr)(void *reduce_data, void * scratchpad,
1361  // int32_t index, int32_t width),
1362  // void (*kmp_LoadReduceFctPtr)(void *reduce_data, void * scratchpad,
1363  // int32_t index, int32_t width, int32_t reduce))
1364  llvm::Type *ShuffleReduceTypeParams[] = {CGM.VoidPtrTy, CGM.Int16Ty,
1365  CGM.Int16Ty, CGM.Int16Ty};
1366  auto *ShuffleReduceFnTy =
1367  llvm::FunctionType::get(CGM.VoidTy, ShuffleReduceTypeParams,
1368  /*isVarArg=*/false);
1369  llvm::Type *InterWarpCopyTypeParams[] = {CGM.VoidPtrTy, CGM.Int32Ty};
1370  auto *InterWarpCopyFnTy =
1371  llvm::FunctionType::get(CGM.VoidTy, InterWarpCopyTypeParams,
1372  /*isVarArg=*/false);
1373  llvm::Type *CopyToScratchpadTypeParams[] = {CGM.VoidPtrTy, CGM.VoidPtrTy,
1374  CGM.Int32Ty, CGM.Int32Ty};
1375  auto *CopyToScratchpadFnTy =
1376  llvm::FunctionType::get(CGM.VoidTy, CopyToScratchpadTypeParams,
1377  /*isVarArg=*/false);
1378  llvm::Type *LoadReduceTypeParams[] = {
1379  CGM.VoidPtrTy, CGM.VoidPtrTy, CGM.Int32Ty, CGM.Int32Ty, CGM.Int32Ty};
1380  auto *LoadReduceFnTy =
1381  llvm::FunctionType::get(CGM.VoidTy, LoadReduceTypeParams,
1382  /*isVarArg=*/false);
1383  llvm::Type *TypeParams[] = {CGM.Int32Ty,
1384  CGM.Int32Ty,
1385  CGM.SizeTy,
1386  CGM.VoidPtrTy,
1387  ShuffleReduceFnTy->getPointerTo(),
1388  InterWarpCopyFnTy->getPointerTo(),
1389  CopyToScratchpadFnTy->getPointerTo(),
1390  LoadReduceFnTy->getPointerTo()};
1391  auto *FnTy =
1392  llvm::FunctionType::get(CGM.Int32Ty, TypeParams, /*isVarArg=*/false);
1393  RTLFn = CGM.CreateRuntimeFunction(
1394  FnTy, /*Name=*/"__kmpc_nvptx_teams_reduce_nowait");
1395  break;
1396  }
1397  case OMPRTL_NVPTX__kmpc_end_reduce_nowait: {
1398  // Build void __kmpc_nvptx_end_reduce_nowait(kmp_int32 global_tid);
1399  llvm::Type *TypeParams[] = {CGM.Int32Ty};
1400  auto *FnTy =
1401  llvm::FunctionType::get(CGM.VoidTy, TypeParams, /*isVarArg=*/false);
1402  RTLFn = CGM.CreateRuntimeFunction(
1403  FnTy, /*Name=*/"__kmpc_nvptx_end_reduce_nowait");
1404  break;
1405  }
1406  case OMPRTL_NVPTX__kmpc_data_sharing_init_stack: {
1407  /// Build void __kmpc_data_sharing_init_stack();
1408  auto *FnTy =
1409  llvm::FunctionType::get(CGM.VoidTy, llvm::None, /*isVarArg*/ false);
1410  RTLFn = CGM.CreateRuntimeFunction(FnTy, "__kmpc_data_sharing_init_stack");
1411  break;
1412  }
1413  case OMPRTL_NVPTX__kmpc_data_sharing_init_stack_spmd: {
1414  /// Build void __kmpc_data_sharing_init_stack_spmd();
1415  auto *FnTy =
1416  llvm::FunctionType::get(CGM.VoidTy, llvm::None, /*isVarArg*/ false);
1417  RTLFn = CGM.CreateRuntimeFunction(FnTy, "__kmpc_data_sharing_init_stack_spmd");
1418  break;
1419  }
1420  case OMPRTL_NVPTX__kmpc_data_sharing_push_stack: {
1421  // Build void *__kmpc_data_sharing_push_stack(size_t size,
1422  // int16_t UseSharedMemory);
1423  llvm::Type *TypeParams[] = {CGM.SizeTy, CGM.Int16Ty};
1424  auto *FnTy =
1425  llvm::FunctionType::get(CGM.VoidPtrTy, TypeParams, /*isVarArg=*/false);
1426  RTLFn = CGM.CreateRuntimeFunction(
1427  FnTy, /*Name=*/"__kmpc_data_sharing_push_stack");
1428  break;
1429  }
1430  case OMPRTL_NVPTX__kmpc_data_sharing_pop_stack: {
1431  // Build void __kmpc_data_sharing_pop_stack(void *a);
1432  llvm::Type *TypeParams[] = {CGM.VoidPtrTy};
1433  auto *FnTy =
1434  llvm::FunctionType::get(CGM.VoidTy, TypeParams, /*isVarArg=*/false);
1435  RTLFn = CGM.CreateRuntimeFunction(FnTy,
1436  /*Name=*/"__kmpc_data_sharing_pop_stack");
1437  break;
1438  }
1439  case OMPRTL_NVPTX__kmpc_begin_sharing_variables: {
1440  /// Build void __kmpc_begin_sharing_variables(void ***args,
1441  /// size_t n_args);
1442  llvm::Type *TypeParams[] = {CGM.Int8PtrPtrTy->getPointerTo(), CGM.SizeTy};
1443  auto *FnTy =
1444  llvm::FunctionType::get(CGM.VoidTy, TypeParams, /*isVarArg*/ false);
1445  RTLFn = CGM.CreateRuntimeFunction(FnTy, "__kmpc_begin_sharing_variables");
1446  break;
1447  }
1448  case OMPRTL_NVPTX__kmpc_end_sharing_variables: {
1449  /// Build void __kmpc_end_sharing_variables();
1450  auto *FnTy =
1451  llvm::FunctionType::get(CGM.VoidTy, llvm::None, /*isVarArg*/ false);
1452  RTLFn = CGM.CreateRuntimeFunction(FnTy, "__kmpc_end_sharing_variables");
1453  break;
1454  }
1455  case OMPRTL_NVPTX__kmpc_get_shared_variables: {
1456  /// Build void __kmpc_get_shared_variables(void ***GlobalArgs);
1457  llvm::Type *TypeParams[] = {CGM.Int8PtrPtrTy->getPointerTo()};
1458  auto *FnTy =
1459  llvm::FunctionType::get(CGM.VoidTy, TypeParams, /*isVarArg*/ false);
1460  RTLFn = CGM.CreateRuntimeFunction(FnTy, "__kmpc_get_shared_variables");
1461  break;
1462  }
1463  case OMPRTL_NVPTX__kmpc_parallel_level: {
1464  // Build uint16_t __kmpc_parallel_level(ident_t *loc, kmp_int32 global_tid);
1465  llvm::Type *TypeParams[] = {getIdentTyPointerTy(), CGM.Int32Ty};
1466  auto *FnTy =
1467  llvm::FunctionType::get(CGM.Int16Ty, TypeParams, /*isVarArg*/ false);
1468  RTLFn = CGM.CreateRuntimeFunction(FnTy, "__kmpc_parallel_level");
1469  break;
1470  }
1471  case OMPRTL_NVPTX__kmpc_is_spmd_exec_mode: {
1472  // Build int8_t __kmpc_is_spmd_exec_mode();
1473  auto *FnTy = llvm::FunctionType::get(CGM.Int8Ty, /*isVarArg=*/false);
1474  RTLFn = CGM.CreateRuntimeFunction(FnTy, "__kmpc_is_spmd_exec_mode");
1475  break;
1476  }
1477  }
1478  return RTLFn;
1479 }
1480 
1481 void CGOpenMPRuntimeNVPTX::createOffloadEntry(llvm::Constant *ID,
1482  llvm::Constant *Addr,
1483  uint64_t Size, int32_t,
1484  llvm::GlobalValue::LinkageTypes) {
1485  // TODO: Add support for global variables on the device after declare target
1486  // support.
1487  if (!isa<llvm::Function>(Addr))
1488  return;
1489  llvm::Module &M = CGM.getModule();
1490  llvm::LLVMContext &Ctx = CGM.getLLVMContext();
1491 
1492  // Get "nvvm.annotations" metadata node
1493  llvm::NamedMDNode *MD = M.getOrInsertNamedMetadata("nvvm.annotations");
1494 
1495  llvm::Metadata *MDVals[] = {
1496  llvm::ConstantAsMetadata::get(Addr), llvm::MDString::get(Ctx, "kernel"),
1497  llvm::ConstantAsMetadata::get(
1498  llvm::ConstantInt::get(llvm::Type::getInt32Ty(Ctx), 1))};
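 // For reference, the resulting module-level annotation looks roughly like
 // this at the IR level (a sketch; the function name is hypothetical):
 //   !nvvm.annotations = !{!0}
 //   !0 = !{void ()* @__omp_offloading_fn, !"kernel", i32 1}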
1499  // Append metadata to nvvm.annotations
1500  MD->addOperand(llvm::MDNode::get(Ctx, MDVals));
1501 }
1502 
1503 void CGOpenMPRuntimeNVPTX::emitTargetOutlinedFunction(
1504  const OMPExecutableDirective &D, StringRef ParentName,
1505  llvm::Function *&OutlinedFn, llvm::Constant *&OutlinedFnID,
1506  bool IsOffloadEntry, const RegionCodeGenTy &CodeGen) {
1507  if (!IsOffloadEntry) // Nothing to do.
1508  return;
1509 
1510  assert(!ParentName.empty() && "Invalid target region parent name!");
1511 
1512  bool Mode = supportsSPMDExecutionMode(CGM.getContext(), D);
1513  if (Mode)
1514  emitSPMDKernel(D, ParentName, OutlinedFn, OutlinedFnID, IsOffloadEntry,
1515  CodeGen);
1516  else
1517  emitNonSPMDKernel(D, ParentName, OutlinedFn, OutlinedFnID, IsOffloadEntry,
1518  CodeGen);
1519 
1520  setPropertyExecutionMode(CGM, OutlinedFn->getName(), Mode);
1521 }
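// As an illustration (hypothetical user code, not part of this file): a
// combined construct such as
//   #pragma omp target teams distribute parallel for
// typically satisfies supportsSPMDExecutionMode and takes the SPMD path,
// while a plain
//   #pragma omp target
// region with arbitrary nested code takes the non-SPMD (generic) path.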
1522 
1523 CGOpenMPRuntimeNVPTX::CGOpenMPRuntimeNVPTX(CodeGenModule &CGM)
1524  : CGOpenMPRuntime(CGM, "_", "$") {
1525  if (!CGM.getLangOpts().OpenMPIsDevice)
1526  llvm_unreachable("OpenMP NVPTX can only handle device code.");
1527 }
1528 
1529 void CGOpenMPRuntimeNVPTX::emitProcBindClause(CodeGenFunction &CGF,
1530  OpenMPProcBindClauseKind ProcBind,
1531  SourceLocation Loc) {
1532  // Do nothing in case of SPMD mode and L0 parallel.
1533  if (getExecutionMode() == CGOpenMPRuntimeNVPTX::EM_SPMD)
1534  return;
1535 
1536  CGOpenMPRuntime::emitProcBindClause(CGF, ProcBind, Loc);
1537 }
1538 
1539 void CGOpenMPRuntimeNVPTX::emitNumThreadsClause(CodeGenFunction &CGF,
1540  llvm::Value *NumThreads,
1541  SourceLocation Loc) {
1542  // Do nothing in case of SPMD mode and L0 parallel.
1543  if (getExecutionMode() == CGOpenMPRuntimeNVPTX::EM_SPMD)
1544  return;
1545 
1546  CGOpenMPRuntime::emitNumThreadsClause(CGF, NumThreads, Loc);
1547 }
1548 
1549 void CGOpenMPRuntimeNVPTX::emitNumTeamsClause(CodeGenFunction &CGF,
1550  const Expr *NumTeams,
1551  const Expr *ThreadLimit,
1552  SourceLocation Loc) {}
1553 
1554 llvm::Value *CGOpenMPRuntimeNVPTX::emitParallelOutlinedFunction(
1555  const OMPExecutableDirective &D, const VarDecl *ThreadIDVar,
1556  OpenMPDirectiveKind InnermostKind, const RegionCodeGenTy &CodeGen) {
1557  // Emit target region as a standalone region.
1558  class NVPTXPrePostActionTy : public PrePostActionTy {
1559  bool &IsInParallelRegion;
1560  bool PrevIsInParallelRegion;
1561 
1562  public:
1563  NVPTXPrePostActionTy(bool &IsInParallelRegion)
1564  : IsInParallelRegion(IsInParallelRegion) {}
1565  void Enter(CodeGenFunction &CGF) override {
1566  PrevIsInParallelRegion = IsInParallelRegion;
1567  IsInParallelRegion = true;
1568  }
1569  void Exit(CodeGenFunction &CGF) override {
1570  IsInParallelRegion = PrevIsInParallelRegion;
1571  }
1572  } Action(IsInParallelRegion);
1573  CodeGen.setAction(Action);
1574  bool PrevIsInTargetMasterThreadRegion = IsInTargetMasterThreadRegion;
1575  IsInTargetMasterThreadRegion = false;
1576  auto *OutlinedFun =
1577  cast<llvm::Function>(CGOpenMPRuntime::emitParallelOutlinedFunction(
1578  D, ThreadIDVar, InnermostKind, CodeGen));
1579  IsInTargetMasterThreadRegion = PrevIsInTargetMasterThreadRegion;
1580  if (getExecutionMode() != CGOpenMPRuntimeNVPTX::EM_SPMD &&
1581  !IsInParallelRegion) {
1582  llvm::Function *WrapperFun =
1583  createParallelDataSharingWrapper(OutlinedFun, D);
1584  WrapperFunctionsMap[OutlinedFun] = WrapperFun;
1585  }
1586 
1587  return OutlinedFun;
1588 }
1589 
1590 llvm::Value *CGOpenMPRuntimeNVPTX::emitTeamsOutlinedFunction(
1591  const OMPExecutableDirective &D, const VarDecl *ThreadIDVar,
1592  OpenMPDirectiveKind InnermostKind, const RegionCodeGenTy &CodeGen) {
1593  SourceLocation Loc = D.getLocStart();
1594 
1595  // Emit target region as a standalone region.
1596  class NVPTXPrePostActionTy : public PrePostActionTy {
1597  SourceLocation &Loc;
1598 
1599  public:
1600  NVPTXPrePostActionTy(SourceLocation &Loc) : Loc(Loc) {}
1601  void Enter(CodeGenFunction &CGF) override {
1602  static_cast<CGOpenMPRuntimeNVPTX &>(CGF.CGM.getOpenMPRuntime())
1603  .emitGenericVarsProlog(CGF, Loc);
1604  }
1605  void Exit(CodeGenFunction &CGF) override {
1606  static_cast<CGOpenMPRuntimeNVPTX &>(CGF.CGM.getOpenMPRuntime())
1607  .emitGenericVarsEpilog(CGF);
1608  }
1609  } Action(Loc);
1610  CodeGen.setAction(Action);
1611  llvm::Value *OutlinedFunVal = CGOpenMPRuntime::emitTeamsOutlinedFunction(
1612  D, ThreadIDVar, InnermostKind, CodeGen);
1613  llvm::Function *OutlinedFun = cast<llvm::Function>(OutlinedFunVal);
1614  OutlinedFun->removeFnAttr(llvm::Attribute::NoInline);
1615  OutlinedFun->removeFnAttr(llvm::Attribute::OptimizeNone);
1616  OutlinedFun->addFnAttr(llvm::Attribute::AlwaysInline);
1617 
1618  return OutlinedFun;
1619 }
1620 
1621 void CGOpenMPRuntimeNVPTX::emitGenericVarsProlog(CodeGenFunction &CGF,
1622  SourceLocation Loc) {
1623  if (getDataSharingMode(CGM) != CGOpenMPRuntimeNVPTX::Generic)
1624  return;
1625 
1626  CGBuilderTy &Bld = CGF.Builder;
1627 
1628  const auto I = FunctionGlobalizedDecls.find(CGF.CurFn);
1629  if (I == FunctionGlobalizedDecls.end())
1630  return;
1631  if (const RecordDecl *GlobalizedVarsRecord = I->getSecond().GlobalRecord) {
1632  QualType RecTy = CGM.getContext().getRecordType(GlobalizedVarsRecord);
1633 
1634  // Recover pointer to this function's global record. The runtime will
1635  // handle the specifics of the allocation of the memory.
1636  // Use actual memory size of the record including the padding
1637  // for alignment purposes.
1638  unsigned Alignment =
1639  CGM.getContext().getTypeAlignInChars(RecTy).getQuantity();
1640  unsigned GlobalRecordSize =
1641  CGM.getContext().getTypeSizeInChars(RecTy).getQuantity();
1642  GlobalRecordSize = llvm::alignTo(GlobalRecordSize, Alignment);
1643  // TODO: allow the usage of shared memory to be controlled by
1644  // the user, for now, default to global.
1645  llvm::Value *GlobalRecordSizeArg[] = {
1646  llvm::ConstantInt::get(CGM.SizeTy, GlobalRecordSize),
1647  CGF.Builder.getInt16(/*UseSharedMemory=*/0)};
1648  llvm::Value *GlobalRecValue = CGF.EmitRuntimeCall(
1649  createNVPTXRuntimeFunction(OMPRTL_NVPTX__kmpc_data_sharing_push_stack),
1650  GlobalRecordSizeArg);
1651  llvm::Value *GlobalRecCastAddr = Bld.CreatePointerBitCastOrAddrSpaceCast(
1652  GlobalRecValue, CGF.ConvertTypeForMem(RecTy)->getPointerTo());
1653  LValue Base =
1654  CGF.MakeNaturalAlignPointeeAddrLValue(GlobalRecCastAddr, RecTy);
1655  I->getSecond().GlobalRecordAddr = GlobalRecValue;
1656 
1657  // Emit the "global alloca" which is a GEP from the global declaration
1658  // record using the pointer returned by the runtime.
1659  for (auto &Rec : I->getSecond().LocalVarData) {
1660  bool EscapedParam = I->getSecond().EscapedParameters.count(Rec.first);
1661  llvm::Value *ParValue;
1662  if (EscapedParam) {
1663  const auto *VD = cast<VarDecl>(Rec.first);
1664  LValue ParLVal =
1665  CGF.MakeAddrLValue(CGF.GetAddrOfLocalVar(VD), VD->getType());
1666  ParValue = CGF.EmitLoadOfScalar(ParLVal, Loc);
1667  }
1668  const FieldDecl *FD = Rec.second.first;
1669  LValue VarAddr = CGF.EmitLValueForField(Base, FD);
1670  Rec.second.second = VarAddr.getAddress();
1671  if (EscapedParam) {
1672  const auto *VD = cast<VarDecl>(Rec.first);
1673  CGF.EmitStoreOfScalar(ParValue, VarAddr);
1674  I->getSecond().MappedParams->setVarAddr(CGF, VD, VarAddr.getAddress());
1675  }
1676  }
1677  }
1678  for (const ValueDecl *VD : I->getSecond().EscapedVariableLengthDecls) {
1679  // Recover pointer to this function's global record. The runtime will
1680  // handle the specifics of the allocation of the memory.
1681  // Use actual memory size of the record including the padding
1682  // for alignment purposes.
1683  CGBuilderTy &Bld = CGF.Builder;
1684  llvm::Value *Size = CGF.getTypeSize(VD->getType());
1685  CharUnits Align = CGM.getContext().getDeclAlign(VD);
1686  Size = Bld.CreateNUWAdd(
1687  Size, llvm::ConstantInt::get(CGF.SizeTy, Align.getQuantity() - 1));
1688  llvm::Value *AlignVal =
1689  llvm::ConstantInt::get(CGF.SizeTy, Align.getQuantity());
1690  Size = Bld.CreateUDiv(Size, AlignVal);
1691  Size = Bld.CreateNUWMul(Size, AlignVal);
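 // The three statements above round Size up to a multiple of the declared
 // alignment, i.e. Size = ((Size + Align - 1) / Align) * Align.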
1692  // TODO: allow the usage of shared memory to be controlled by
1693  // the user, for now, default to global.
1694  llvm::Value *GlobalRecordSizeArg[] = {
1695  Size, CGF.Builder.getInt16(/*UseSharedMemory=*/0)};
1696  llvm::Value *GlobalRecValue = CGF.EmitRuntimeCall(
1697  createNVPTXRuntimeFunction(OMPRTL_NVPTX__kmpc_data_sharing_push_stack),
1698  GlobalRecordSizeArg);
1699  llvm::Value *GlobalRecCastAddr = Bld.CreatePointerBitCastOrAddrSpaceCast(
1700  GlobalRecValue, CGF.ConvertTypeForMem(VD->getType())->getPointerTo());
1701  LValue Base = CGF.MakeAddrLValue(GlobalRecCastAddr, VD->getType(),
1702  CGM.getContext().getDeclAlign(VD),
1703  AlignmentSource::Decl);
1704  I->getSecond().MappedParams->setVarAddr(CGF, cast<VarDecl>(VD),
1705  Base.getAddress());
1706  I->getSecond().EscapedVariableLengthDeclsAddrs.emplace_back(GlobalRecValue);
1707  }
1708  I->getSecond().MappedParams->apply(CGF);
1709 }
1710 
1711 void CGOpenMPRuntimeNVPTX::emitGenericVarsEpilog(CodeGenFunction &CGF) {
1712  if (getDataSharingMode(CGM) != CGOpenMPRuntimeNVPTX::Generic)
1713  return;
1714 
1715  const auto I = FunctionGlobalizedDecls.find(CGF.CurFn);
1716  if (I != FunctionGlobalizedDecls.end()) {
1717  I->getSecond().MappedParams->restore(CGF);
1718  if (!CGF.HaveInsertPoint())
1719  return;
1720  for (llvm::Value *Addr :
1721  llvm::reverse(I->getSecond().EscapedVariableLengthDeclsAddrs)) {
1722  CGF.EmitRuntimeCall(
1723  createNVPTXRuntimeFunction(OMPRTL_NVPTX__kmpc_data_sharing_pop_stack),
1724  Addr);
1725  }
1726  if (I->getSecond().GlobalRecordAddr) {
1727  CGF.EmitRuntimeCall(
1728  createNVPTXRuntimeFunction(OMPRTL_NVPTX__kmpc_data_sharing_pop_stack),
1729  I->getSecond().GlobalRecordAddr);
1730  }
1731  }
1732 }
1733 
1734 void CGOpenMPRuntimeNVPTX::emitTeamsCall(CodeGenFunction &CGF,
1735  const OMPExecutableDirective &D,
1736  SourceLocation Loc,
1737  llvm::Value *OutlinedFn,
1738  ArrayRef<llvm::Value *> CapturedVars) {
1739  if (!CGF.HaveInsertPoint())
1740  return;
1741 
1742  Address ZeroAddr = CGF.CreateMemTemp(
1743  CGF.getContext().getIntTypeForBitwidth(/*DestWidth=*/32, /*Signed=*/1),
1744  /*Name*/ ".zero.addr");
1745  CGF.InitTempAlloca(ZeroAddr, CGF.Builder.getInt32(/*C*/ 0));
1746  llvm::SmallVector<llvm::Value *, 16> OutlinedFnArgs;
1747  OutlinedFnArgs.push_back(emitThreadIDAddress(CGF, Loc).getPointer());
1748  OutlinedFnArgs.push_back(ZeroAddr.getPointer());
1749  OutlinedFnArgs.append(CapturedVars.begin(), CapturedVars.end());
1750  emitOutlinedFunctionCall(CGF, Loc, OutlinedFn, OutlinedFnArgs);
1751 }
1752 
1753 void CGOpenMPRuntimeNVPTX::emitParallelCall(
1754  CodeGenFunction &CGF, SourceLocation Loc, llvm::Value *OutlinedFn,
1755  ArrayRef<llvm::Value *> CapturedVars, const Expr *IfCond) {
1756  if (!CGF.HaveInsertPoint())
1757  return;
1758 
1759  if (getExecutionMode() == CGOpenMPRuntimeNVPTX::EM_SPMD)
1760  emitSPMDParallelCall(CGF, Loc, OutlinedFn, CapturedVars, IfCond);
1761  else
1762  emitNonSPMDParallelCall(CGF, Loc, OutlinedFn, CapturedVars, IfCond);
1763 }
1764 
1765 void CGOpenMPRuntimeNVPTX::emitNonSPMDParallelCall(
1766  CodeGenFunction &CGF, SourceLocation Loc, llvm::Value *OutlinedFn,
1767  ArrayRef<llvm::Value *> CapturedVars, const Expr *IfCond) {
1768  llvm::Function *Fn = cast<llvm::Function>(OutlinedFn);
1769 
1770  // Force inline this outlined function at its call site.
1771  Fn->setLinkage(llvm::GlobalValue::InternalLinkage);
1772 
1773  Address ZeroAddr = CGF.CreateMemTemp(CGF.getContext().getIntTypeForBitwidth(
1774  /*DestWidth=*/32, /*Signed=*/1),
1775  ".zero.addr");
1776  CGF.InitTempAlloca(ZeroAddr, CGF.Builder.getInt32(/*C*/ 0));
1777  // ThreadId for serialized parallels is 0.
1778  Address ThreadIDAddr = ZeroAddr;
1779  auto &&CodeGen = [this, Fn, CapturedVars, Loc, ZeroAddr, &ThreadIDAddr](
1780  CodeGenFunction &CGF, PrePostActionTy &Action) {
1781  Action.Enter(CGF);
1782 
1783  llvm::SmallVector<llvm::Value *, 16> OutlinedFnArgs;
1784  OutlinedFnArgs.push_back(ThreadIDAddr.getPointer());
1785  OutlinedFnArgs.push_back(ZeroAddr.getPointer());
1786  OutlinedFnArgs.append(CapturedVars.begin(), CapturedVars.end());
1787  emitOutlinedFunctionCall(CGF, Loc, Fn, OutlinedFnArgs);
1788  };
1789  auto &&SeqGen = [this, &CodeGen, Loc](CodeGenFunction &CGF,
1790  PrePostActionTy &) {
1791 
1792  RegionCodeGenTy RCG(CodeGen);
1793  llvm::Value *RTLoc = emitUpdateLocation(CGF, Loc);
1794  llvm::Value *ThreadID = getThreadID(CGF, Loc);
1795  llvm::Value *Args[] = {RTLoc, ThreadID};
1796 
1797  NVPTXActionTy Action(
1798  createNVPTXRuntimeFunction(OMPRTL_NVPTX__kmpc_serialized_parallel),
1799  Args,
1800  createNVPTXRuntimeFunction(OMPRTL_NVPTX__kmpc_end_serialized_parallel),
1801  Args);
1802  RCG.setAction(Action);
1803  RCG(CGF);
1804  };
1805 
1806  auto &&L0ParallelGen = [this, CapturedVars, Fn](CodeGenFunction &CGF,
1807  PrePostActionTy &Action) {
1808  CGBuilderTy &Bld = CGF.Builder;
1809  llvm::Function *WFn = WrapperFunctionsMap[Fn];
1810  assert(WFn && "Wrapper function does not exist!");
1811  llvm::Value *ID = Bld.CreateBitOrPointerCast(WFn, CGM.Int8PtrTy);
1812 
1813  // Prepare for parallel region. Indicate the outlined function.
1814  llvm::Value *Args[] = {ID, /*RequiresOMPRuntime=*/Bld.getInt16(1)};
1815  CGF.EmitRuntimeCall(
1816  createNVPTXRuntimeFunction(OMPRTL_NVPTX__kmpc_kernel_prepare_parallel),
1817  Args);
1818 
1819  // Create a private scope that will globalize the arguments
1820  // passed from the outside of the target region.
1821  CodeGenFunction::OMPPrivateScope PrivateArgScope(CGF);
1822 
1823  // There's something to share.
1824  if (!CapturedVars.empty()) {
1825  // Prepare for parallel region. Indicate the outlined function.
1826  Address SharedArgs =
1827  CGF.CreateDefaultAlignTempAlloca(CGF.VoidPtrPtrTy, "shared_arg_refs");
1828  llvm::Value *SharedArgsPtr = SharedArgs.getPointer();
1829 
1830  llvm::Value *DataSharingArgs[] = {
1831  SharedArgsPtr,
1832  llvm::ConstantInt::get(CGM.SizeTy, CapturedVars.size())};
1833  CGF.EmitRuntimeCall(createNVPTXRuntimeFunction(
1834  OMPRTL_NVPTX__kmpc_begin_sharing_variables),
1835  DataSharingArgs);
1836 
1837  // Store variable address in a list of references to pass to workers.
1838  unsigned Idx = 0;
1839  ASTContext &Ctx = CGF.getContext();
1840  Address SharedArgListAddress = CGF.EmitLoadOfPointer(
1841  SharedArgs, Ctx.getPointerType(Ctx.getPointerType(Ctx.VoidPtrTy))
1842  .castAs<PointerType>());
1843  for (llvm::Value *V : CapturedVars) {
1844  Address Dst = Bld.CreateConstInBoundsGEP(SharedArgListAddress, Idx,
1845  CGF.getPointerSize());
1846  llvm::Value *PtrV;
1847  if (V->getType()->isIntegerTy())
1848  PtrV = Bld.CreateIntToPtr(V, CGF.VoidPtrTy);
1849  else
1850  PtrV = Bld.CreatePointerBitCastOrAddrSpaceCast(V, CGF.VoidPtrTy);
1851  CGF.EmitStoreOfScalar(PtrV, Dst, /*Volatile=*/false,
1852  Ctx.getPointerType(Ctx.VoidPtrTy));
1853  ++Idx;
1854  }
1855  }
1856 
1857  // Activate workers. This barrier is used by the master to signal
1858  // work for the workers.
1859  syncCTAThreads(CGF);
1860 
1861  // OpenMP [2.5, Parallel Construct, p.49]
1862  // There is an implied barrier at the end of a parallel region. After the
1863  // end of a parallel region, only the master thread of the team resumes
1864  // execution of the enclosing task region.
1865  //
1866  // The master waits at this barrier until all workers are done.
1867  syncCTAThreads(CGF);
1868 
1869  if (!CapturedVars.empty())
1870  CGF.EmitRuntimeCall(
1871  createNVPTXRuntimeFunction(OMPRTL_NVPTX__kmpc_end_sharing_variables));
1872 
1873  // Remember for post-processing in worker loop.
1874  Work.emplace_back(WFn);
1875  };
1876 
1877  auto &&LNParallelGen = [this, Loc, &SeqGen, &L0ParallelGen, &CodeGen,
1878  &ThreadIDAddr](CodeGenFunction &CGF,
1879  PrePostActionTy &Action) {
1880  RegionCodeGenTy RCG(CodeGen);
1881  if (IsInParallelRegion) {
1882  SeqGen(CGF, Action);
1883  } else if (IsInTargetMasterThreadRegion) {
1884  L0ParallelGen(CGF, Action);
1885  } else if (getExecutionMode() == CGOpenMPRuntimeNVPTX::EM_NonSPMD) {
1886  RCG(CGF);
1887  } else {
1888  // Check for master and then parallelism:
1889  // if (__kmpc_is_spmd_exec_mode() || __kmpc_parallel_level(loc, gtid)) {
1890  // Serialized execution.
1891  // } else if (master) {
1892  // Worker call.
1893  // } else {
1894  // Outlined function call.
1895  // }
1896  CGBuilderTy &Bld = CGF.Builder;
1897  llvm::BasicBlock *ExitBB = CGF.createBasicBlock(".exit");
1898  llvm::BasicBlock *SeqBB = CGF.createBasicBlock(".sequential");
1899  llvm::BasicBlock *ParallelCheckBB = CGF.createBasicBlock(".parcheck");
1900  llvm::BasicBlock *MasterCheckBB = CGF.createBasicBlock(".mastercheck");
1901  llvm::Value *IsSPMD = Bld.CreateIsNotNull(CGF.EmitNounwindRuntimeCall(
1902  createNVPTXRuntimeFunction(OMPRTL_NVPTX__kmpc_is_spmd_exec_mode)));
1903  Bld.CreateCondBr(IsSPMD, SeqBB, ParallelCheckBB);
1904  // There is no need to emit line number for unconditional branch.
1905  (void)ApplyDebugLocation::CreateEmpty(CGF);
1906  CGF.EmitBlock(ParallelCheckBB);
1907  llvm::Value *RTLoc = emitUpdateLocation(CGF, Loc);
1908  llvm::Value *ThreadID = getThreadID(CGF, Loc);
1909  llvm::Value *PL = CGF.EmitRuntimeCall(
1910  createNVPTXRuntimeFunction(OMPRTL_NVPTX__kmpc_parallel_level),
1911  {RTLoc, ThreadID});
1912  llvm::Value *Res = Bld.CreateIsNotNull(PL);
1913  Bld.CreateCondBr(Res, SeqBB, MasterCheckBB);
1914  CGF.EmitBlock(SeqBB);
1915  SeqGen(CGF, Action);
1916  CGF.EmitBranch(ExitBB);
1917  // There is no need to emit line number for unconditional branch.
1918  (void)ApplyDebugLocation::CreateEmpty(CGF);
1919  CGF.EmitBlock(MasterCheckBB);
1920  llvm::BasicBlock *MasterThenBB = CGF.createBasicBlock("master.then");
1921  llvm::BasicBlock *ElseBlock = CGF.createBasicBlock("omp_if.else");
1922  llvm::Value *IsMaster =
1923  Bld.CreateICmpEQ(getNVPTXThreadID(CGF), getMasterThreadID(CGF));
1924  Bld.CreateCondBr(IsMaster, MasterThenBB, ElseBlock);
1925  CGF.EmitBlock(MasterThenBB);
1926  L0ParallelGen(CGF, Action);
1927  CGF.EmitBranch(ExitBB);
1928  // There is no need to emit line number for unconditional branch.
1929  (void)ApplyDebugLocation::CreateEmpty(CGF);
1930  CGF.EmitBlock(ElseBlock);
1931  // In the worker we need to use the real thread id.
1932  ThreadIDAddr = emitThreadIDAddress(CGF, Loc);
1933  RCG(CGF);
1934  // There is no need to emit line number for unconditional branch.
1935  (void)ApplyDebugLocation::CreateEmpty(CGF);
1936  // Emit the continuation block for code after the if.
1937  CGF.EmitBlock(ExitBB, /*IsFinished=*/true);
1938  }
1939  };
1940 
1941  if (IfCond) {
1942  emitOMPIfClause(CGF, IfCond, LNParallelGen, SeqGen);
1943  } else {
1944  CodeGenFunction::RunCleanupsScope Cleanups(CGF);
1945  RegionCodeGenTy ThenRCG(LNParallelGen);
1946  ThenRCG(CGF);
1947  }
1948 }
1949 
1950 void CGOpenMPRuntimeNVPTX::emitSPMDParallelCall(
1951  CodeGenFunction &CGF, SourceLocation Loc, llvm::Value *OutlinedFn,
1952  ArrayRef<llvm::Value *> CapturedVars, const Expr *IfCond) {
1953  // Just call the outlined function to execute the parallel region.
1954  // OutlinedFn(&GTid, &zero, CapturedStruct);
1955  //
1956  llvm::SmallVector<llvm::Value *, 16> OutlinedFnArgs;
1957 
1958  Address ZeroAddr = CGF.CreateMemTemp(CGF.getContext().getIntTypeForBitwidth(
1959  /*DestWidth=*/32, /*Signed=*/1),
1960  ".zero.addr");
1961  CGF.InitTempAlloca(ZeroAddr, CGF.Builder.getInt32(/*C*/ 0));
1962  // ThreadId for serialized parallels is 0.
1963  Address ThreadIDAddr = ZeroAddr;
1964  auto &&CodeGen = [this, OutlinedFn, CapturedVars, Loc, ZeroAddr,
1965  &ThreadIDAddr](CodeGenFunction &CGF,
1966  PrePostActionTy &Action) {
1967  Action.Enter(CGF);
1968 
1969  llvm::SmallVector<llvm::Value *, 16> OutlinedFnArgs;
1970  OutlinedFnArgs.push_back(ThreadIDAddr.getPointer());
1971  OutlinedFnArgs.push_back(ZeroAddr.getPointer());
1972  OutlinedFnArgs.append(CapturedVars.begin(), CapturedVars.end());
1973  emitOutlinedFunctionCall(CGF, Loc, OutlinedFn, OutlinedFnArgs);
1974  };
1975  auto &&SeqGen = [this, &CodeGen, Loc](CodeGenFunction &CGF,
1976  PrePostActionTy &) {
1977 
1978  RegionCodeGenTy RCG(CodeGen);
1979  llvm::Value *RTLoc = emitUpdateLocation(CGF, Loc);
1980  llvm::Value *ThreadID = getThreadID(CGF, Loc);
1981  llvm::Value *Args[] = {RTLoc, ThreadID};
1982 
1983  NVPTXActionTy Action(
1984  createNVPTXRuntimeFunction(OMPRTL_NVPTX__kmpc_serialized_parallel),
1985  Args,
1986  createNVPTXRuntimeFunction(OMPRTL_NVPTX__kmpc_end_serialized_parallel),
1987  Args);
1988  RCG.setAction(Action);
1989  RCG(CGF);
1990  };
1991 
1992  if (IsInTargetMasterThreadRegion) {
1993  // In the worker we need to use the real thread id.
1994  ThreadIDAddr = emitThreadIDAddress(CGF, Loc);
1995  RegionCodeGenTy RCG(CodeGen);
1996  RCG(CGF);
1997  } else {
1998  // If we are not in the target region, it is definitely L2 parallelism or
1999  // more, because for SPMD mode we always have an L1 parallel level, so we
2000  // don't need to check for orphaned directives.
2001  RegionCodeGenTy RCG(SeqGen);
2002  RCG(CGF);
2003  }
2004 }
2005 
2006 void CGOpenMPRuntimeNVPTX::emitCriticalRegion(
2007  CodeGenFunction &CGF, StringRef CriticalName,
2008  const RegionCodeGenTy &CriticalOpGen, SourceLocation Loc,
2009  const Expr *Hint) {
2010  llvm::BasicBlock *LoopBB = CGF.createBasicBlock("omp.critical.loop");
2011  llvm::BasicBlock *TestBB = CGF.createBasicBlock("omp.critical.test");
2012  llvm::BasicBlock *SyncBB = CGF.createBasicBlock("omp.critical.sync");
2013  llvm::BasicBlock *BodyBB = CGF.createBasicBlock("omp.critical.body");
2014  llvm::BasicBlock *ExitBB = CGF.createBasicBlock("omp.critical.exit");
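 // Taken together, these blocks lower the critical region to, in effect
 // (a sketch in pseudocode, not the exact IR):
 //   for (counter = 0; counter < team_width; ++counter) {
 //     if (thread_id == counter)
 //       <critical region body>;
 //     barrier(); // getNVPTXCTABarrier
 //   }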
2015 
2016  // Fetch team-local id of the thread.
2017  llvm::Value *ThreadID = getNVPTXThreadID(CGF);
2018 
2019  // Get the width of the team.
2020  llvm::Value *TeamWidth = getNVPTXNumThreads(CGF);
2021 
2022  // Initialize the counter variable for the loop.
2023  QualType Int32Ty =
2024  CGF.getContext().getIntTypeForBitwidth(/*DestWidth=*/32, /*Signed=*/0);
2025  Address Counter = CGF.CreateMemTemp(Int32Ty, "critical_counter");
2026  LValue CounterLVal = CGF.MakeAddrLValue(Counter, Int32Ty);
2027  CGF.EmitStoreOfScalar(llvm::Constant::getNullValue(CGM.Int32Ty), CounterLVal,
2028  /*isInit=*/true);
2029 
2030  // Block checks if loop counter exceeds upper bound.
2031  CGF.EmitBlock(LoopBB);
2032  llvm::Value *CounterVal = CGF.EmitLoadOfScalar(CounterLVal, Loc);
2033  llvm::Value *CmpLoopBound = CGF.Builder.CreateICmpSLT(CounterVal, TeamWidth);
2034  CGF.Builder.CreateCondBr(CmpLoopBound, TestBB, ExitBB);
2035 
2036  // Block tests which single thread should execute the region, and which
2037  // threads should go straight to the synchronisation point.
2038  CGF.EmitBlock(TestBB);
2039  CounterVal = CGF.EmitLoadOfScalar(CounterLVal, Loc);
2040  llvm::Value *CmpThreadToCounter =
2041  CGF.Builder.CreateICmpEQ(ThreadID, CounterVal);
2042  CGF.Builder.CreateCondBr(CmpThreadToCounter, BodyBB, SyncBB);
2043 
2044  // Block emits the body of the critical region.
2045  CGF.EmitBlock(BodyBB);
2046 
2047  // Output the critical statement.
2048  CriticalOpGen(CGF);
2049 
2050  // After executing the body of the critical region, the single executing
2051  // thread will jump to the synchronisation point.
2052  // Block waits for all threads in the current team to finish, then
2053  // increments the counter variable and returns to the loop.
2054  CGF.EmitBlock(SyncBB);
2055  getNVPTXCTABarrier(CGF);
2056 
2057  llvm::Value *IncCounterVal =
2058  CGF.Builder.CreateNSWAdd(CounterVal, CGF.Builder.getInt32(1));
2059  CGF.EmitStoreOfScalar(IncCounterVal, CounterLVal);
2060  CGF.EmitBranch(LoopBB);
2061 
2062  // Block that is reached when all threads in the team complete the region.
2063  CGF.EmitBlock(ExitBB, /*IsFinished=*/true);
2064 }
2065 
2066 /// Cast value to the specified type.
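/// For example (a sketch of the intended behavior): a float reinterpreted as
/// a same-width int32 becomes a bitcast, an int8 widened to int32 uses
/// CreateIntCast, and any other combination is stored to a temporary and
/// reloaded as the cast type.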
2067 static llvm::Value *castValueToType(CodeGenFunction &CGF, llvm::Value *Val,
2068  QualType ValTy, QualType CastTy,
2069  SourceLocation Loc) {
2070  assert(!CGF.getContext().getTypeSizeInChars(CastTy).isZero() &&
2071  "Cast type must sized.");
2072  assert(!CGF.getContext().getTypeSizeInChars(ValTy).isZero() &&
2073  "Val type must sized.");
2074  llvm::Type *LLVMCastTy = CGF.ConvertTypeForMem(CastTy);
2075  if (ValTy == CastTy)
2076  return Val;
2077  if (CGF.getContext().getTypeSizeInChars(ValTy) ==
2078  CGF.getContext().getTypeSizeInChars(CastTy))
2079  return CGF.Builder.CreateBitCast(Val, LLVMCastTy);
2080  if (CastTy->isIntegerType() && ValTy->isIntegerType())
2081  return CGF.Builder.CreateIntCast(Val, LLVMCastTy,
2082  CastTy->hasSignedIntegerRepresentation());
2083  Address CastItem = CGF.CreateMemTemp(CastTy);
2084  Address ValCastItem = Bld.CreatePointerBitCastOrAddrSpaceCast(
2085  CastItem, Val->getType()->getPointerTo(CastItem.getAddressSpace()));
2086  CGF.EmitStoreOfScalar(Val, ValCastItem, /*Volatile=*/false, ValTy);
2087  return CGF.EmitLoadOfScalar(CastItem, /*Volatile=*/false, CastTy, Loc);
2088 }
2089 
2090 /// This function creates calls to one of two shuffle functions to copy
2091 /// variables between lanes in a warp.
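/// Conceptually (a sketch): for an element of at most 4 bytes this emits
///   shuffled = __kmpc_shuffle_int32((int32_t)elem, lane_offset, warp_size);
/// elements of 5 to 8 bytes use the int64 variant, and the result is cast
/// back to the element's original type.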
2092 static llvm::Value *createRuntimeShuffleFunction(CodeGenFunction &CGF,
2093  llvm::Value *Elem,
2094  QualType ElemType,
2095  llvm::Value *Offset,
2096  SourceLocation Loc) {
2097  CodeGenModule &CGM = CGF.CGM;
2098  CGBuilderTy &Bld = CGF.Builder;
2099  CGOpenMPRuntimeNVPTX &RT =
2100  *(static_cast<CGOpenMPRuntimeNVPTX *>(&CGM.getOpenMPRuntime()));
2101 
2102  CharUnits Size = CGF.getContext().getTypeSizeInChars(ElemType);
2103  assert(Size.getQuantity() <= 8 &&
2104  "Unsupported bitwidth in shuffle instruction.");
2105 
2106  OpenMPRTLFunctionNVPTX ShuffleFn = Size.getQuantity() <= 4
2107  ? OMPRTL_NVPTX__kmpc_shuffle_int32
2108  : OMPRTL_NVPTX__kmpc_shuffle_int64;
2109 
2110  // Cast all types to 32- or 64-bit values before calling shuffle routines.
2111  QualType CastTy = CGF.getContext().getIntTypeForBitwidth(
2112  Size.getQuantity() <= 4 ? 32 : 64, /*Signed=*/1);
2113  llvm::Value *ElemCast = castValueToType(CGF, Elem, ElemType, CastTy, Loc);
2114  llvm::Value *WarpSize =
2115  Bld.CreateIntCast(getNVPTXWarpSize(CGF), CGM.Int16Ty, /*isSigned=*/true);
2116 
2117  llvm::Value *ShuffledVal = CGF.EmitRuntimeCall(
2118  RT.createNVPTXRuntimeFunction(ShuffleFn), {ElemCast, Offset, WarpSize});
2119 
2120  return castValueToType(CGF, ShuffledVal, CastTy, ElemType, Loc);
2121 }
2122 
2123 static void shuffleAndStore(CodeGenFunction &CGF, Address SrcAddr,
2124  Address DestAddr, QualType ElemType,
2125  llvm::Value *Offset, SourceLocation Loc) {
2126  CGBuilderTy &Bld = CGF.Builder;
2127 
2128  CharUnits Size = CGF.getContext().getTypeSizeInChars(ElemType);
2129  // Create the loop over the big sized data.
2130  // ptr = (void*)Elem;
2131  // ptrEnd = (void*) Elem + 1;
2132  // Step = 8;
2133  // while (ptr + Step < ptrEnd)
2134  // shuffle((int64_t)*ptr);
2135  // Step = 4;
2136  // while (ptr + Step < ptrEnd)
2137  // shuffle((int32_t)*ptr);
2138  // ...
2139  Address ElemPtr = DestAddr;
2140  Address Ptr = SrcAddr;
2141  Address PtrEnd = Bld.CreatePointerBitCastOrAddrSpaceCast(
2142  Bld.CreateConstGEP(SrcAddr, 1, Size), CGF.VoidPtrTy);
2143  for (int IntSize = 8; IntSize >= 1; IntSize /= 2) {
2144  if (Size < CharUnits::fromQuantity(IntSize))
2145  continue;
2146  QualType IntType = CGF.getContext().getIntTypeForBitwidth(
2147  CGF.getContext().toBits(CharUnits::fromQuantity(IntSize)),
2148  /*Signed=*/1);
2149  llvm::Type *IntTy = CGF.ConvertTypeForMem(IntType);
2150  Ptr = Bld.CreatePointerBitCastOrAddrSpaceCast(Ptr, IntTy->getPointerTo());
2151  ElemPtr =
2152  Bld.CreatePointerBitCastOrAddrSpaceCast(ElemPtr, IntTy->getPointerTo());
2153  if (Size.getQuantity() / IntSize > 1) {
2154  llvm::BasicBlock *PreCondBB = CGF.createBasicBlock(".shuffle.pre_cond");
2155  llvm::BasicBlock *ThenBB = CGF.createBasicBlock(".shuffle.then");
2156  llvm::BasicBlock *ExitBB = CGF.createBasicBlock(".shuffle.exit");
2157  llvm::BasicBlock *CurrentBB = Bld.GetInsertBlock();
2158  CGF.EmitBlock(PreCondBB);
2159  llvm::PHINode *PhiSrc =
2160  Bld.CreatePHI(Ptr.getType(), /*NumReservedValues=*/2);
2161  PhiSrc->addIncoming(Ptr.getPointer(), CurrentBB);
2162  llvm::PHINode *PhiDest =
2163  Bld.CreatePHI(ElemPtr.getType(), /*NumReservedValues=*/2);
2164  PhiDest->addIncoming(ElemPtr.getPointer(), CurrentBB);
2165  Ptr = Address(PhiSrc, Ptr.getAlignment());
2166  ElemPtr = Address(PhiDest, ElemPtr.getAlignment());
2167  llvm::Value *PtrDiff = Bld.CreatePtrDiff(
2168  PtrEnd.getPointer(), Bld.CreatePointerBitCastOrAddrSpaceCast(
2169  Ptr.getPointer(), CGF.VoidPtrTy));
2170  Bld.CreateCondBr(Bld.CreateICmpSGT(PtrDiff, Bld.getInt64(IntSize - 1)),
2171  ThenBB, ExitBB);
2172  CGF.EmitBlock(ThenBB);
2173  llvm::Value *Res = createRuntimeShuffleFunction(
2174  CGF, CGF.EmitLoadOfScalar(Ptr, /*Volatile=*/false, IntType, Loc),
2175  IntType, Offset, Loc);
2176  CGF.EmitStoreOfScalar(Res, ElemPtr, /*Volatile=*/false, IntType);
2177  Ptr = Bld.CreateConstGEP(Ptr, 1, CharUnits::fromQuantity(IntSize));
2178  ElemPtr =
2179  Bld.CreateConstGEP(ElemPtr, 1, CharUnits::fromQuantity(IntSize));
2180  PhiSrc->addIncoming(Ptr.getPointer(), ThenBB);
2181  PhiDest->addIncoming(ElemPtr.getPointer(), ThenBB);
2182  CGF.EmitBranch(PreCondBB);
2183  CGF.EmitBlock(ExitBB);
2184  } else {
2185  llvm::Value *Res = createRuntimeShuffleFunction(
2186  CGF, CGF.EmitLoadOfScalar(Ptr, /*Volatile=*/false, IntType, Loc),
2187  IntType, Offset, Loc);
2188  CGF.EmitStoreOfScalar(Res, ElemPtr, /*Volatile=*/false, IntType);
2189  Ptr = Bld.CreateConstGEP(Ptr, 1, CharUnits::fromQuantity(IntSize));
2190  ElemPtr =
2191  Bld.CreateConstGEP(ElemPtr, 1, CharUnits::fromQuantity(IntSize));
2192  }
2193  Size = Size % IntSize;
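 // Only the remainder (Size % IntSize) is left for the next, smaller
 // integer width.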
2194  }
2195 }
2196 
2197 namespace {
2198 enum CopyAction : unsigned {
2199  // RemoteLaneToThread: Copy over a Reduce list from a remote lane in
2200  // the warp using shuffle instructions.
2201  RemoteLaneToThread,
2202  // ThreadCopy: Make a copy of a Reduce list on the thread's stack.
2203  ThreadCopy,
2204  // ThreadToScratchpad: Copy a team-reduced array to the scratchpad.
2205  ThreadToScratchpad,
2206  // ScratchpadToThread: Copy from a scratchpad array in global memory
2207  // containing team-reduced data to a thread's stack.
2208  ScratchpadToThread,
2209 };
2210 } // namespace
2211 
2212 struct CopyOptionsTy {
2213  llvm::Value *RemoteLaneOffset;
2214  llvm::Value *ScratchpadIndex;
2215  llvm::Value *ScratchpadWidth;
2216 };
2217 
2218 /// Emit instructions to copy a Reduce list, which contains partially
2219 /// aggregated values, in the specified direction.
2220 static void emitReductionListCopy(
2221  CopyAction Action, CodeGenFunction &CGF, QualType ReductionArrayTy,
2222  ArrayRef<const Expr *> Privates, Address SrcBase, Address DestBase,
2223  CopyOptionsTy CopyOptions = {nullptr, nullptr, nullptr}) {
2224 
2225  CodeGenModule &CGM = CGF.CGM;
2226  ASTContext &C = CGM.getContext();
2227  CGBuilderTy &Bld = CGF.Builder;
2228 
2229  llvm::Value *RemoteLaneOffset = CopyOptions.RemoteLaneOffset;
2230  llvm::Value *ScratchpadIndex = CopyOptions.ScratchpadIndex;
2231  llvm::Value *ScratchpadWidth = CopyOptions.ScratchpadWidth;
2232 
2233  // Iterate, element-by-element, through the source Reduce list and
2234  // make a copy.
2235  unsigned Idx = 0;
2236  unsigned Size = Privates.size();
2237  for (const Expr *Private : Privates) {
2238  Address SrcElementAddr = Address::invalid();
2239  Address DestElementAddr = Address::invalid();
2240  Address DestElementPtrAddr = Address::invalid();
2241  // Should we shuffle in an element from a remote lane?
2242  bool ShuffleInElement = false;
2243  // Set to true to update the pointer in the dest Reduce list to a
2244  // newly created element.
2245  bool UpdateDestListPtr = false;
2246  // Increment the src or dest pointer to the scratchpad, for each
2247  // new element.
2248  bool IncrScratchpadSrc = false;
2249  bool IncrScratchpadDest = false;
2250 
2251  switch (Action) {
2252  case RemoteLaneToThread: {
2253  // Step 1.1: Get the address for the src element in the Reduce list.
2254  Address SrcElementPtrAddr =
2255  Bld.CreateConstArrayGEP(SrcBase, Idx, CGF.getPointerSize());
2256  SrcElementAddr = CGF.EmitLoadOfPointer(
2257  SrcElementPtrAddr,
2258  C.getPointerType(Private->getType())->castAs<PointerType>());
2259 
2260  // Step 1.2: Create a temporary to store the element in the destination
2261  // Reduce list.
2262  DestElementPtrAddr =
2263  Bld.CreateConstArrayGEP(DestBase, Idx, CGF.getPointerSize());
2264  DestElementAddr =
2265  CGF.CreateMemTemp(Private->getType(), ".omp.reduction.element");
2266  ShuffleInElement = true;
2267  UpdateDestListPtr = true;
2268  break;
2269  }
2270  case ThreadCopy: {
2271  // Step 1.1: Get the address for the src element in the Reduce list.
2272  Address SrcElementPtrAddr =
2273  Bld.CreateConstArrayGEP(SrcBase, Idx, CGF.getPointerSize());
2274  SrcElementAddr = CGF.EmitLoadOfPointer(
2275  SrcElementPtrAddr,
2276  C.getPointerType(Private->getType())->castAs<PointerType>());
2277 
2278  // Step 1.2: Get the address for dest element. The destination
2279  // element has already been created on the thread's stack.
2280  DestElementPtrAddr =
2281  Bld.CreateConstArrayGEP(DestBase, Idx, CGF.getPointerSize());
2282  DestElementAddr = CGF.EmitLoadOfPointer(
2283  DestElementPtrAddr,
2284  C.getPointerType(Private->getType())->castAs<PointerType>());
2285  break;
2286  }
2287  case ThreadToScratchpad: {
2288  // Step 1.1: Get the address for the src element in the Reduce list.
2289  Address SrcElementPtrAddr =
2290  Bld.CreateConstArrayGEP(SrcBase, Idx, CGF.getPointerSize());
2291  SrcElementAddr = CGF.EmitLoadOfPointer(
2292  SrcElementPtrAddr,
2293  C.getPointerType(Private->getType())->castAs<PointerType>());
2294 
2295  // Step 1.2: Get the address for dest element:
2296  // address = base + index * ElementSizeInChars.
2297  llvm::Value *ElementSizeInChars = CGF.getTypeSize(Private->getType());
2298  llvm::Value *CurrentOffset =
2299  Bld.CreateNUWMul(ElementSizeInChars, ScratchpadIndex);
2300  llvm::Value *ScratchPadElemAbsolutePtrVal =
2301  Bld.CreateNUWAdd(DestBase.getPointer(), CurrentOffset);
2302  ScratchPadElemAbsolutePtrVal =
2303  Bld.CreateIntToPtr(ScratchPadElemAbsolutePtrVal, CGF.VoidPtrTy);
2304  DestElementAddr = Address(ScratchPadElemAbsolutePtrVal,
2305  C.getTypeAlignInChars(Private->getType()));
2306  IncrScratchpadDest = true;
2307  break;
2308  }
2309  case ScratchpadToThread: {
2310  // Step 1.1: Get the address for the src element in the scratchpad.
2311  // address = base + index * ElementSizeInChars.
2312  llvm::Value *ElementSizeInChars = CGF.getTypeSize(Private->getType());
2313  llvm::Value *CurrentOffset =
2314  Bld.CreateNUWMul(ElementSizeInChars, ScratchpadIndex);
2315  llvm::Value *ScratchPadElemAbsolutePtrVal =
2316  Bld.CreateNUWAdd(SrcBase.getPointer(), CurrentOffset);
2317  ScratchPadElemAbsolutePtrVal =
2318  Bld.CreateIntToPtr(ScratchPadElemAbsolutePtrVal, CGF.VoidPtrTy);
2319  SrcElementAddr = Address(ScratchPadElemAbsolutePtrVal,
2320  C.getTypeAlignInChars(Private->getType()));
2321  IncrScratchpadSrc = true;
2322 
2323  // Step 1.2: Create a temporary to store the element in the destination
2324  // Reduce list.
2325  DestElementPtrAddr =
2326  Bld.CreateConstArrayGEP(DestBase, Idx, CGF.getPointerSize());
2327  DestElementAddr =
2328  CGF.CreateMemTemp(Private->getType(), ".omp.reduction.element");
2329  UpdateDestListPtr = true;
2330  break;
2331  }
2332  }
2333 
2334  // Regardless of the source and destination of the copy, we emit the load
2335  // of the source element, as it is required in all directions.
2336  SrcElementAddr = Bld.CreateElementBitCast(
2337  SrcElementAddr, CGF.ConvertTypeForMem(Private->getType()));
2338  DestElementAddr = Bld.CreateElementBitCast(DestElementAddr,
2339  SrcElementAddr.getElementType());
2340 
2341  // Now that all active lanes have read the element in the
2342  // Reduce list, shuffle over the value from the remote lane.
2343  if (ShuffleInElement) {
2344  shuffleAndStore(CGF, SrcElementAddr, DestElementAddr, Private->getType(),
2345  RemoteLaneOffset, Private->getExprLoc());
2346  } else {
2347  if (Private->getType()->isScalarType()) {
2348  llvm::Value *Elem =
2349  CGF.EmitLoadOfScalar(SrcElementAddr, /*Volatile=*/false,
2350  Private->getType(), Private->getExprLoc());
2351  // Store the source element value to the dest element address.
2352  CGF.EmitStoreOfScalar(Elem, DestElementAddr, /*Volatile=*/false,
2353  Private->getType());
2354  } else {
2355  CGF.EmitAggregateCopy(
2356  CGF.MakeAddrLValue(DestElementAddr, Private->getType()),
2357  CGF.MakeAddrLValue(SrcElementAddr, Private->getType()),
2358  Private->getType(), AggValueSlot::DoesNotOverlap);
2359  }
2360  }
2361 
2362  // Step 3.1: Modify reference in dest Reduce list as needed.
2363  // Modifying the reference in Reduce list to point to the newly
2364  // created element. The element is live in the current function
2365  // scope and that of functions it invokes (i.e., reduce_function).
2366  // RemoteReduceData[i] = (void*)&RemoteElem
2367  if (UpdateDestListPtr) {
2368  CGF.EmitStoreOfScalar(Bld.CreatePointerBitCastOrAddrSpaceCast(
2369  DestElementAddr.getPointer(), CGF.VoidPtrTy),
2370  DestElementPtrAddr, /*Volatile=*/false,
2371  C.VoidPtrTy);
2372  }
2373 
2374  // Step 4.1: Increment SrcBase/DestBase so that it points to the starting
2375  // address of the next element in scratchpad memory, unless we're currently
2376  // processing the last one. Memory alignment is also taken care of here.
2377  if ((IncrScratchpadDest || IncrScratchpadSrc) && (Idx + 1 < Size)) {
2378  llvm::Value *ScratchpadBasePtr =
2379  IncrScratchpadDest ? DestBase.getPointer() : SrcBase.getPointer();
2380  llvm::Value *ElementSizeInChars = CGF.getTypeSize(Private->getType());
2381  ScratchpadBasePtr = Bld.CreateNUWAdd(
2382  ScratchpadBasePtr,
2383  Bld.CreateNUWMul(ScratchpadWidth, ElementSizeInChars));
2384 
2385  // Take care of global memory alignment for performance
2386  ScratchpadBasePtr = Bld.CreateNUWSub(
2387  ScratchpadBasePtr, llvm::ConstantInt::get(CGM.SizeTy, 1));
2388  ScratchpadBasePtr = Bld.CreateUDiv(
2389  ScratchpadBasePtr,
2390  llvm::ConstantInt::get(CGM.SizeTy, GlobalMemoryAlignment));
2391  ScratchpadBasePtr = Bld.CreateNUWAdd(
2392  ScratchpadBasePtr, llvm::ConstantInt::get(CGM.SizeTy, 1));
2393  ScratchpadBasePtr = Bld.CreateNUWMul(
2394  ScratchpadBasePtr,
2395  llvm::ConstantInt::get(CGM.SizeTy, GlobalMemoryAlignment));
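 // In scalar terms, the four statements above compute (a sketch):
 //   base = ((base - 1) / GlobalMemoryAlignment + 1) * GlobalMemoryAlignment;
 // i.e. they round the scratchpad base up to the next alignment boundary.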
2396 
2397  if (IncrScratchpadDest)
2398  DestBase = Address(ScratchpadBasePtr, CGF.getPointerAlign());
2399  else /* IncrScratchpadSrc = true */
2400  SrcBase = Address(ScratchpadBasePtr, CGF.getPointerAlign());
2401  }
2402 
2403  ++Idx;
2404  }
2405 }
2406 
2407 /// This function emits a helper that loads data from the scratchpad array
2408 /// and (optionally) reduces it with the input operand.
2409 ///
2410 /// load_and_reduce(local, scratchpad, index, width, should_reduce)
2411 /// reduce_data remote;
2412 /// for elem in remote:
2413 /// remote.elem = Scratchpad[elem_id][index]
2414 /// if (should_reduce)
2415 /// local = local @ remote
2416 /// else
2417 /// local = remote
2418 static llvm::Value *emitReduceScratchpadFunction(
2419  CodeGenModule &CGM, ArrayRef<const Expr *> Privates,
2420  QualType ReductionArrayTy, llvm::Value *ReduceFn, SourceLocation Loc) {
2421  ASTContext &C = CGM.getContext();
2422  QualType Int32Ty = C.getIntTypeForBitwidth(32, /*Signed=*/1);
2423 
2424  // Destination of the copy.
2425  ImplicitParamDecl ReduceListArg(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr,
2426  C.VoidPtrTy, ImplicitParamDecl::Other);
2427  // Base address of the scratchpad array, with each element storing a
2428  // Reduce list per team.
2429  ImplicitParamDecl ScratchPadArg(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr,
2430  C.VoidPtrTy, ImplicitParamDecl::Other);
2431  // A source index into the scratchpad array.
2432  ImplicitParamDecl IndexArg(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr, Int32Ty,
2433  ImplicitParamDecl::Other);
2434  // Row width of an element in the scratchpad array, typically
2435  // the number of teams.
2436  ImplicitParamDecl WidthArg(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr, Int32Ty,
2437  ImplicitParamDecl::Other);
2438  // If should_reduce == 1, then it's load AND reduce,
2439  // If should_reduce == 0 (or otherwise), then it only loads (+ copy).
2440  // The latter case is used for initialization.
2441  ImplicitParamDecl ShouldReduceArg(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr,
2442  Int32Ty, ImplicitParamDecl::Other);
2443 
2444  FunctionArgList Args;
2445  Args.push_back(&ReduceListArg);
2446  Args.push_back(&ScratchPadArg);
2447  Args.push_back(&IndexArg);
2448  Args.push_back(&WidthArg);
2449  Args.push_back(&ShouldReduceArg);
2450 
2451  const CGFunctionInfo &CGFI =
2452  CGM.getTypes().arrangeBuiltinFunctionDeclaration(C.VoidTy, Args);
2453  auto *Fn = llvm::Function::Create(
2455  "_omp_reduction_load_and_reduce", &CGM.getModule());
2456  CGM.SetInternalFunctionAttributes(GlobalDecl(), Fn, CGFI);
2457  Fn->setDoesNotRecurse();
2458  CodeGenFunction CGF(CGM);
2459  CGF.StartFunction(GlobalDecl(), C.VoidTy, Fn, CGFI, Args, Loc, Loc);
2460 
2461  CGBuilderTy &Bld = CGF.Builder;
2462 
2463  // Get local Reduce list pointer.
2464  Address AddrReduceListArg = CGF.GetAddrOfLocalVar(&ReduceListArg);
2465  Address ReduceListAddr(
2466  Bld.CreatePointerBitCastOrAddrSpaceCast(
2467  CGF.EmitLoadOfScalar(AddrReduceListArg, /*Volatile=*/false,
2468  C.VoidPtrTy, Loc),
2469  CGF.ConvertTypeForMem(ReductionArrayTy)->getPointerTo()),
2470  CGF.getPointerAlign());
2471 
2472  Address AddrScratchPadArg = CGF.GetAddrOfLocalVar(&ScratchPadArg);
2473  llvm::Value *ScratchPadBase = CGF.EmitLoadOfScalar(
2474  AddrScratchPadArg, /*Volatile=*/false, C.VoidPtrTy, Loc);
2475 
2476  Address AddrIndexArg = CGF.GetAddrOfLocalVar(&IndexArg);
2477  llvm::Value *IndexVal = Bld.CreateIntCast(
2478  CGF.EmitLoadOfScalar(AddrIndexArg, /*Volatile=*/false, Int32Ty, Loc),
2479  CGM.SizeTy, /*isSigned=*/true);
2480 
2481  Address AddrWidthArg = CGF.GetAddrOfLocalVar(&WidthArg);
2482  llvm::Value *WidthVal = Bld.CreateIntCast(
2483  CGF.EmitLoadOfScalar(AddrWidthArg, /*Volatile=*/false, Int32Ty, Loc),
2484  CGM.SizeTy, /*isSigned=*/true);
2485 
2486  Address AddrShouldReduceArg = CGF.GetAddrOfLocalVar(&ShouldReduceArg);
2487  llvm::Value *ShouldReduceVal = CGF.EmitLoadOfScalar(
2488  AddrShouldReduceArg, /*Volatile=*/false, Int32Ty, Loc);
2489 
2490  // The absolute ptr address to the base addr of the next element to copy.
2491  llvm::Value *CumulativeElemBasePtr =
2492  Bld.CreatePtrToInt(ScratchPadBase, CGM.SizeTy);
2493  Address SrcDataAddr(CumulativeElemBasePtr, CGF.getPointerAlign());
2494 
2495  // Create a Remote Reduce list to store the elements read from the
2496  // scratchpad array.
2497  Address RemoteReduceList =
2498  CGF.CreateMemTemp(ReductionArrayTy, ".omp.reduction.remote_red_list");
2499 
2500  // Assemble remote Reduce list from scratchpad array.
2501  emitReductionListCopy(ScratchpadToThread, CGF, ReductionArrayTy, Privates,
2502  SrcDataAddr, RemoteReduceList,
2503  {/*RemoteLaneOffset=*/nullptr,
2504  /*ScratchpadIndex=*/IndexVal,
2505  /*ScratchpadWidth=*/WidthVal});
2506 
2507  llvm::BasicBlock *ThenBB = CGF.createBasicBlock("then");
2508  llvm::BasicBlock *ElseBB = CGF.createBasicBlock("else");
2509  llvm::BasicBlock *MergeBB = CGF.createBasicBlock("ifcont");
2510 
2511  llvm::Value *CondReduce = Bld.CreateIsNotNull(ShouldReduceVal);
2512  Bld.CreateCondBr(CondReduce, ThenBB, ElseBB);
2513 
2514  CGF.EmitBlock(ThenBB);
2515  // We should reduce with the local Reduce list.
2516  // reduce_function(LocalReduceList, RemoteReduceList)
2517  llvm::Value *LocalDataPtr = Bld.CreatePointerBitCastOrAddrSpaceCast(
2518  ReduceListAddr.getPointer(), CGF.VoidPtrTy);
2519  llvm::Value *RemoteDataPtr = Bld.CreatePointerBitCastOrAddrSpaceCast(
2520  RemoteReduceList.getPointer(), CGF.VoidPtrTy);
2521  CGM.getOpenMPRuntime().emitOutlinedFunctionCall(
2522  CGF, Loc, ReduceFn, {LocalDataPtr, RemoteDataPtr});
2523  Bld.CreateBr(MergeBB);
2524 
2525  CGF.EmitBlock(ElseBB);
2526  // No reduction; just copy:
2527  // Local Reduce list = Remote Reduce list.
2528  emitReductionListCopy(ThreadCopy, CGF, ReductionArrayTy, Privates,
2529  RemoteReduceList, ReduceListAddr);
2530  Bld.CreateBr(MergeBB);
2531 
2532  CGF.EmitBlock(MergeBB);
2533 
2534  CGF.FinishFunction();
2535  return Fn;
2536 }
2537 
2538 /// This function emits a helper that stores reduced data from the team
2539 /// master to a scratchpad array in global memory.
2540 ///
2541 /// for elem in Reduce List:
2542 /// scratchpad[elem_id][index] = elem
2543 ///
2544 static llvm::Value *emitCopyToScratchpad(CodeGenModule &CGM,
2545  ArrayRef<const Expr *> Privates,
2546  QualType ReductionArrayTy,
2547  SourceLocation Loc) {
2548 
2549  ASTContext &C = CGM.getContext();
2550  QualType Int32Ty = C.getIntTypeForBitwidth(32, /*Signed=*/1);
2551 
2552  // Source of the copy.
2553  ImplicitParamDecl ReduceListArg(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr,
2554  C.VoidPtrTy, ImplicitParamDecl::Other);
2555  // Base address of the scratchpad array, with each element storing a
2556  // Reduce list per team.
2557  ImplicitParamDecl ScratchPadArg(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr,
2558  C.VoidPtrTy, ImplicitParamDecl::Other);
2559  // A destination index into the scratchpad array, typically the team
2560  // identifier.
2561  ImplicitParamDecl IndexArg(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr, Int32Ty,
2562  ImplicitParamDecl::Other);
2563  // Row width of an element in the scratchpad array, typically
2564  // the number of teams.
2565  ImplicitParamDecl WidthArg(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr, Int32Ty,
2566  ImplicitParamDecl::Other);
2567 
2568  FunctionArgList Args;
2569  Args.push_back(&ReduceListArg);
2570  Args.push_back(&ScratchPadArg);
2571  Args.push_back(&IndexArg);
2572  Args.push_back(&WidthArg);
2573 
2574  const CGFunctionInfo &CGFI =
2575  CGM.getTypes().arrangeBuiltinFunctionDeclaration(C.VoidTy, Args);
2576  auto *Fn = llvm::Function::Create(
2578  "_omp_reduction_copy_to_scratchpad", &CGM.getModule());
2579  CGM.SetInternalFunctionAttributes(GlobalDecl(), Fn, CGFI);
2580  Fn->setDoesNotRecurse();
2581  CodeGenFunction CGF(CGM);
2582  CGF.StartFunction(GlobalDecl(), C.VoidTy, Fn, CGFI, Args, Loc, Loc);
2583 
2584  CGBuilderTy &Bld = CGF.Builder;
2585 
2586  Address AddrReduceListArg = CGF.GetAddrOfLocalVar(&ReduceListArg);
2587  Address SrcDataAddr(
2588  Bld.CreatePointerBitCastOrAddrSpaceCast(
2589  CGF.EmitLoadOfScalar(AddrReduceListArg, /*Volatile=*/false,
2590  C.VoidPtrTy, Loc),
2591  CGF.ConvertTypeForMem(ReductionArrayTy)->getPointerTo()),
2592  CGF.getPointerAlign());
2593 
2594  Address AddrScratchPadArg = CGF.GetAddrOfLocalVar(&ScratchPadArg);
2595  llvm::Value *ScratchPadBase = CGF.EmitLoadOfScalar(
2596  AddrScratchPadArg, /*Volatile=*/false, C.VoidPtrTy, Loc);
2597 
2598  Address AddrIndexArg = CGF.GetAddrOfLocalVar(&IndexArg);
2599  llvm::Value *IndexVal = Bld.CreateIntCast(
2600  CGF.EmitLoadOfScalar(AddrIndexArg, /*Volatile=*/false, Int32Ty, Loc),
2601  CGF.SizeTy, /*isSigned=*/true);
2602 
2603  Address AddrWidthArg = CGF.GetAddrOfLocalVar(&WidthArg);
2604  llvm::Value *WidthVal =
2605  Bld.CreateIntCast(CGF.EmitLoadOfScalar(AddrWidthArg, /*Volatile=*/false,
2606  Int32Ty, SourceLocation()),
2607  CGF.SizeTy, /*isSigned=*/true);
2608 
2609  // The absolute ptr address to the base addr of the next element to copy.
2610  llvm::Value *CumulativeElemBasePtr =
2611  Bld.CreatePtrToInt(ScratchPadBase, CGM.SizeTy);
2612  Address DestDataAddr(CumulativeElemBasePtr, CGF.getPointerAlign());
2613 
2614  emitReductionListCopy(ThreadToScratchpad, CGF, ReductionArrayTy, Privates,
2615  SrcDataAddr, DestDataAddr,
2616  {/*RemoteLaneOffset=*/nullptr,
2617  /*ScratchpadIndex=*/IndexVal,
2618  /*ScratchpadWidth=*/WidthVal});
2619 
2620  CGF.FinishFunction();
2621  return Fn;
2622 }
2623 
2624 /// This function emits a helper that gathers Reduce lists from the first
2625 /// lane of every active warp to lanes in the first warp.
2626 ///
2627 /// void inter_warp_copy_func(void* reduce_data, num_warps)
2628 /// shared smem[warp_size];
2629 /// For all data entries D in reduce_data:
2630 /// If (I am the first lane in each warp)
2631 /// Copy my local D to smem[warp_id]
2632 /// sync
2633 /// if (I am the first warp)
2634 /// Copy smem[thread_id] to my local D
2635 /// sync
2636 static llvm::Value *emitInterWarpCopyFunction(CodeGenModule &CGM,
2637  ArrayRef<const Expr *> Privates,
2638  QualType ReductionArrayTy,
2639  SourceLocation Loc) {
2640  ASTContext &C = CGM.getContext();
2641  llvm::Module &M = CGM.getModule();
2642 
2643  // ReduceList: thread local Reduce list.
2644  // At the stage of the computation when this function is called, partially
2645  // aggregated values reside in the first lane of every active warp.
2646  ImplicitParamDecl ReduceListArg(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr,
2647  C.VoidPtrTy, ImplicitParamDecl::Other);
2648  // NumWarps: number of warps active in the parallel region. This could
2649  // be smaller than 32 (max warps in a CTA) for partial block reduction.
2650  ImplicitParamDecl NumWarpsArg(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr,
2651  C.getIntTypeForBitwidth(32, /* Signed */ true),
2652  ImplicitParamDecl::Other);
2653  FunctionArgList Args;
2654  Args.push_back(&ReduceListArg);
2655  Args.push_back(&NumWarpsArg);
2656 
2657  const CGFunctionInfo &CGFI =
2658  CGM.getTypes().arrangeBuiltinFunctionDeclaration(C.VoidTy, Args);
2659  auto *Fn = llvm::Function::Create(
2661  "_omp_reduction_inter_warp_copy_func", &CGM.getModule());
2662  CGM.SetInternalFunctionAttributes(GlobalDecl(), Fn, CGFI);
2663  Fn->setDoesNotRecurse();
2664  CodeGenFunction CGF(CGM);
2665  CGF.StartFunction(GlobalDecl(), C.VoidTy, Fn, CGFI, Args, Loc, Loc);
2666 
2667  CGBuilderTy &Bld = CGF.Builder;
2668 
2669  // This array is used as a medium to transfer, one reduce element at a time,
2670  // the data from the first lane of every warp to lanes in the first warp
2671  // in order to perform the final step of a reduction in a parallel region
2672  // (reduction across warps). The array is placed in NVPTX __shared__ memory
2673  // for reduced latency, as well as to have a distinct copy for concurrently
2674  // executing target regions. The array is declared with common linkage so
2675  // as to be shared across compilation units.
2676  StringRef TransferMediumName =
2677  "__openmp_nvptx_data_transfer_temporary_storage";
2678  llvm::GlobalVariable *TransferMedium =
2679  M.getGlobalVariable(TransferMediumName);
2680  if (!TransferMedium) {
2681  auto *Ty = llvm::ArrayType::get(CGM.Int64Ty, WarpSize);
2682  unsigned SharedAddressSpace = C.getTargetAddressSpace(LangAS::cuda_shared);
2683  TransferMedium = new llvm::GlobalVariable(
2684  M, Ty,
2685  /*isConstant=*/false, llvm::GlobalVariable::CommonLinkage,
2686  llvm::Constant::getNullValue(Ty), TransferMediumName,
2687  /*InsertBefore=*/nullptr, llvm::GlobalVariable::NotThreadLocal,
2688  SharedAddressSpace);
2689  CGM.addCompilerUsedGlobal(TransferMedium);
2690  }
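 // In CUDA terms the variable above is roughly equivalent to (a sketch):
 //   __shared__ int64_t
 //       __openmp_nvptx_data_transfer_temporary_storage[WARP_SIZE];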
2691 
2692  // Get the CUDA thread id of the current OpenMP thread on the GPU.
2693  llvm::Value *ThreadID = getNVPTXThreadID(CGF);
2694  // nvptx_lane_id = nvptx_id % warpsize
2695  llvm::Value *LaneID = getNVPTXLaneID(CGF);
2696  // nvptx_warp_id = nvptx_id / warpsize
2697  llvm::Value *WarpID = getNVPTXWarpID(CGF);
2698 
2699  Address AddrReduceListArg = CGF.GetAddrOfLocalVar(&ReduceListArg);
2700  Address LocalReduceList(
2701  Bld.CreatePointerBitCastOrAddrSpaceCast(
2702  CGF.EmitLoadOfScalar(AddrReduceListArg, /*Volatile=*/false,
2703  C.VoidPtrTy, SourceLocation()),
2704  CGF.ConvertTypeForMem(ReductionArrayTy)->getPointerTo()),
2705  CGF.getPointerAlign());
2706 
2707  unsigned Idx = 0;
2708  for (const Expr *Private : Privates) {
2709  //
2710  // Warp master copies reduce element to transfer medium in __shared__
2711  // memory.
2712  //
2713  llvm::BasicBlock *ThenBB = CGF.createBasicBlock("then");
2714  llvm::BasicBlock *ElseBB = CGF.createBasicBlock("else");
2715  llvm::BasicBlock *MergeBB = CGF.createBasicBlock("ifcont");
2716 
2717  // if (lane_id == 0)
2718  llvm::Value *IsWarpMaster = Bld.CreateIsNull(LaneID, "warp_master");
2719  Bld.CreateCondBr(IsWarpMaster, ThenBB, ElseBB);
2720  CGF.EmitBlock(ThenBB);
2721 
2722  // Reduce element = LocalReduceList[i]
2723  Address ElemPtrPtrAddr =
2724  Bld.CreateConstArrayGEP(LocalReduceList, Idx, CGF.getPointerSize());
2725  llvm::Value *ElemPtrPtr = CGF.EmitLoadOfScalar(
2726  ElemPtrPtrAddr, /*Volatile=*/false, C.VoidPtrTy, SourceLocation());
2727  // elemptr = (type[i]*)(elemptrptr)
2728  Address ElemPtr =
2729  Address(ElemPtrPtr, C.getTypeAlignInChars(Private->getType()));
2730  ElemPtr = Bld.CreateElementBitCast(
2731  ElemPtr, CGF.ConvertTypeForMem(Private->getType()));
2732 
2733  // Get pointer to location in transfer medium.
2734  // MediumPtr = &medium[warp_id]
2735  llvm::Value *MediumPtrVal = Bld.CreateInBoundsGEP(
2736  TransferMedium, {llvm::Constant::getNullValue(CGM.Int64Ty), WarpID});
2737  Address MediumPtr(MediumPtrVal, C.getTypeAlignInChars(Private->getType()));
2738  // Casting to actual data type.
2739  // MediumPtr = (type[i]*)MediumPtrAddr;
2740  MediumPtr = Bld.CreateElementBitCast(
2741  MediumPtr, CGF.ConvertTypeForMem(Private->getType()));
2742 
2743  // elem = *elemptr
2744  //*MediumPtr = elem
2745  if (Private->getType()->isScalarType()) {
2746  llvm::Value *Elem = CGF.EmitLoadOfScalar(ElemPtr, /*Volatile=*/false,
2747  Private->getType(), Loc);
2748  // Store the source element value to the dest element address.
2749  CGF.EmitStoreOfScalar(Elem, MediumPtr, /*Volatile=*/false,
2750  Private->getType());
2751  } else {
2752  CGF.EmitAggregateCopy(CGF.MakeAddrLValue(ElemPtr, Private->getType()),
2753  CGF.MakeAddrLValue(MediumPtr, Private->getType()),
2754  Private->getType(), AggValueSlot::DoesNotOverlap);
2755  }
2756 
2757  Bld.CreateBr(MergeBB);
2758 
2759  CGF.EmitBlock(ElseBB);
2760  Bld.CreateBr(MergeBB);
2761 
2762  CGF.EmitBlock(MergeBB);
2763 
2764  Address AddrNumWarpsArg = CGF.GetAddrOfLocalVar(&NumWarpsArg);
2765  llvm::Value *NumWarpsVal = CGF.EmitLoadOfScalar(
2766  AddrNumWarpsArg, /*Volatile=*/false, C.IntTy, SourceLocation());
2767 
2768  llvm::Value *NumActiveThreads = Bld.CreateNSWMul(
2769  NumWarpsVal, getNVPTXWarpSize(CGF), "num_active_threads");
2770  // named_barrier_sync(ParallelBarrierID, num_active_threads)
2771  syncParallelThreads(CGF, NumActiveThreads);
2772 
2773  //
2774  // Warp 0 copies reduce element from transfer medium.
2775  //
2776  llvm::BasicBlock *W0ThenBB = CGF.createBasicBlock("then");
2777  llvm::BasicBlock *W0ElseBB = CGF.createBasicBlock("else");
2778  llvm::BasicBlock *W0MergeBB = CGF.createBasicBlock("ifcont");
2779 
2780  // Up to 32 threads in warp 0 are active.
2781  llvm::Value *IsActiveThread =
2782  Bld.CreateICmpULT(ThreadID, NumWarpsVal, "is_active_thread");
2783  Bld.CreateCondBr(IsActiveThread, W0ThenBB, W0ElseBB);
2784 
2785  CGF.EmitBlock(W0ThenBB);
2786 
2787  // SrcMediumPtr = &medium[tid]
2788  llvm::Value *SrcMediumPtrVal = Bld.CreateInBoundsGEP(
2789  TransferMedium, {llvm::Constant::getNullValue(CGM.Int64Ty), ThreadID});
2790  Address SrcMediumPtr(SrcMediumPtrVal,
2791  C.getTypeAlignInChars(Private->getType()));
2792  // SrcMediumVal = *SrcMediumPtr;
2793  SrcMediumPtr = Bld.CreateElementBitCast(
2794  SrcMediumPtr, CGF.ConvertTypeForMem(Private->getType()));
2795 
2796  // TargetElemPtr = (type[i]*)(SrcDataAddr[i])
2797  Address TargetElemPtrPtr =
2798  Bld.CreateConstArrayGEP(LocalReduceList, Idx, CGF.getPointerSize());
2799  llvm::Value *TargetElemPtrVal = CGF.EmitLoadOfScalar(
2800  TargetElemPtrPtr, /*Volatile=*/false, C.VoidPtrTy, SourceLocation());
2801  Address TargetElemPtr =
2802  Address(TargetElemPtrVal, C.getTypeAlignInChars(Private->getType()));
2803  TargetElemPtr = Bld.CreateElementBitCast(
2804  TargetElemPtr, CGF.ConvertTypeForMem(Private->getType()));
2805 
2806  // *TargetElemPtr = SrcMediumVal;
2807  if (Private->getType()->isScalarType()) {
2808  llvm::Value *SrcMediumValue = CGF.EmitLoadOfScalar(
2809  SrcMediumPtr, /*Volatile=*/false, Private->getType(), Loc);
2810  CGF.EmitStoreOfScalar(SrcMediumValue, TargetElemPtr, /*Volatile=*/false,
2811  Private->getType());
2812  } else {
2813  CGF.EmitAggregateCopy(
2814  CGF.MakeAddrLValue(SrcMediumPtr, Private->getType()),
2815  CGF.MakeAddrLValue(TargetElemPtr, Private->getType()),
2816  Private->getType(), AggValueSlot::DoesNotOverlap);
2817  }
2818  Bld.CreateBr(W0MergeBB);
2819 
2820  CGF.EmitBlock(W0ElseBB);
2821  Bld.CreateBr(W0MergeBB);
2822 
2823  CGF.EmitBlock(W0MergeBB);
2824 
2825  // While warp 0 copies values from transfer medium, all other warps must
2826  // wait.
2827  syncParallelThreads(CGF, NumActiveThreads);
2828  ++Idx;
2829  }
2830 
2831  CGF.FinishFunction();
2832  return Fn;
2833 }
2834 
2835 /// Emit a helper that reduces data across two OpenMP threads (lanes)
2836 /// in the same warp. It uses shuffle instructions to copy over data from
2837 /// a remote lane's stack. The reduction algorithm performed is specified
2838 /// by the fourth parameter.
2839 ///
2840 /// Algorithm Versions.
2841 /// Full Warp Reduce (argument value 0):
2842 /// This algorithm assumes that all 32 lanes are active and gathers
2843 /// data from these 32 lanes, producing a single resultant value.
2844 /// Contiguous Partial Warp Reduce (argument value 1):
2845 /// This algorithm assumes that only a *contiguous* subset of lanes
2846 /// are active. This happens for the last warp in a parallel region
2847 /// when the user specified num_threads is not an integer multiple of
2848 /// 32. This contiguous subset always starts with the zeroth lane.
2849 /// Partial Warp Reduce (argument value 2):
2850 /// This algorithm gathers data from any number of lanes at any position.
2851 /// All reduced values are stored in the lowest possible lane. The set
2852 /// of problems every algorithm addresses is a super set of those
2853 /// addressable by algorithms with a lower version number. Overhead
2854 /// increases as algorithm version increases.
2855 ///
2856 /// Terminology
2857 /// Reduce element:
2858 /// Reduce element refers to the individual data field with primitive
2859 /// data types to be combined and reduced across threads.
2860 /// Reduce list:
2861 /// Reduce list refers to a collection of local, thread-private
2862 /// reduce elements.
2863 /// Remote Reduce list:
2864 /// Remote Reduce list refers to a collection of remote (relative to
2865 /// the current thread) reduce elements.
2866 ///
2867 /// We distinguish between three states of threads that are important to
2868 /// the implementation of this function.
2869 /// Alive threads:
2870 /// Threads in a warp executing the SIMT instruction, as distinguished from
2871 /// threads that are inactive due to divergent control flow.
2872 /// Active threads:
2873 /// The minimal set of threads that has to be alive upon entry to this
2874 /// function. The computation is correct iff active threads are alive.
2875 /// Some threads are alive but they are not active because they do not
2876 /// contribute to the computation in any useful manner. Turning them off
2877 /// may introduce control flow overheads without any tangible benefits.
2878 /// Effective threads:
2879 /// In order to comply with the argument requirements of the shuffle
2880 /// function, we must keep all lanes holding data alive. But at most
2881 /// half of them perform value aggregation; we refer to this half of
2882  /// threads as effective. The other half simply hands off its
2883  /// data.
2884 ///
2885 /// Procedure
2886 /// Value shuffle:
2887 /// In this step active threads transfer data from higher lane positions
2888 /// in the warp to lower lane positions, creating Remote Reduce list.
2889 /// Value aggregation:
2890 /// In this step, effective threads combine their thread local Reduce list
2891 /// with Remote Reduce list and store the result in the thread local
2892 /// Reduce list.
2893 /// Value copy:
2894 /// In this step, we deal with the assumption made by algorithm 2
2895 /// (i.e. contiguity assumption). When we have an odd number of lanes
2896 /// active, say 2k+1, only k threads will be effective and therefore k
2897 /// new values will be produced. However, the Reduce list owned by the
2898 /// (2k+1)th thread is ignored in the value aggregation. Therefore
2899  /// we copy the Reduce list from the (2k+1)th lane to the (k+1)th lane so
2900 /// that the contiguity assumption still holds.
2901 static llvm::Value *emitShuffleAndReduceFunction(
2902  CodeGenModule &CGM, ArrayRef<const Expr *> Privates,
2903  QualType ReductionArrayTy, llvm::Value *ReduceFn, SourceLocation Loc) {
2904  ASTContext &C = CGM.getContext();
2905 
2906  // Thread local Reduce list used to host the values of data to be reduced.
2907  ImplicitParamDecl ReduceListArg(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr,
2908  C.VoidPtrTy, ImplicitParamDecl::Other);
2909  // Current lane id; could be logical.
2910  ImplicitParamDecl LaneIDArg(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr, C.ShortTy,
2911  ImplicitParamDecl::Other);
2912  // Offset of the remote source lane relative to the current lane.
2913  ImplicitParamDecl RemoteLaneOffsetArg(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr,
2914  C.ShortTy, ImplicitParamDecl::Other);
2915  // Algorithm version. This is expected to be known at compile time.
2916  ImplicitParamDecl AlgoVerArg(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr,
2917  C.ShortTy, ImplicitParamDecl::Other);
2918  FunctionArgList Args;
2919  Args.push_back(&ReduceListArg);
2920  Args.push_back(&LaneIDArg);
2921  Args.push_back(&RemoteLaneOffsetArg);
2922  Args.push_back(&AlgoVerArg);
2923 
2924  const CGFunctionInfo &CGFI =
2925  CGM.getTypes().arrangeBuiltinFunctionDeclaration(C.VoidTy, Args);
2926  auto *Fn = llvm::Function::Create(
2927  CGM.getTypes().GetFunctionType(CGFI), llvm::GlobalValue::InternalLinkage,
2928  "_omp_reduction_shuffle_and_reduce_func", &CGM.getModule());
2929  CGM.SetInternalFunctionAttributes(GlobalDecl(), Fn, CGFI);
2930  Fn->setDoesNotRecurse();
2931  CodeGenFunction CGF(CGM);
2932  CGF.StartFunction(GlobalDecl(), C.VoidTy, Fn, CGFI, Args, Loc, Loc);
2933 
2934  CGBuilderTy &Bld = CGF.Builder;
2935 
2936  Address AddrReduceListArg = CGF.GetAddrOfLocalVar(&ReduceListArg);
2937  Address LocalReduceList(
2938  Bld.CreatePointerBitCastOrAddrSpaceCast(
2939  CGF.EmitLoadOfScalar(AddrReduceListArg, /*Volatile=*/false,
2940  C.VoidPtrTy, SourceLocation()),
2941  CGF.ConvertTypeForMem(ReductionArrayTy)->getPointerTo()),
2942  CGF.getPointerAlign());
2943 
2944  Address AddrLaneIDArg = CGF.GetAddrOfLocalVar(&LaneIDArg);
2945  llvm::Value *LaneIDArgVal = CGF.EmitLoadOfScalar(
2946  AddrLaneIDArg, /*Volatile=*/false, C.ShortTy, SourceLocation());
2947 
2948  Address AddrRemoteLaneOffsetArg = CGF.GetAddrOfLocalVar(&RemoteLaneOffsetArg);
2949  llvm::Value *RemoteLaneOffsetArgVal = CGF.EmitLoadOfScalar(
2950  AddrRemoteLaneOffsetArg, /*Volatile=*/false, C.ShortTy, SourceLocation());
2951 
2952  Address AddrAlgoVerArg = CGF.GetAddrOfLocalVar(&AlgoVerArg);
2953  llvm::Value *AlgoVerArgVal = CGF.EmitLoadOfScalar(
2954  AddrAlgoVerArg, /*Volatile=*/false, C.ShortTy, SourceLocation());
2955 
2956  // Create a local thread-private variable to host the Reduce list
2957  // from a remote lane.
2958  Address RemoteReduceList =
2959  CGF.CreateMemTemp(ReductionArrayTy, ".omp.reduction.remote_reduce_list");
2960 
2961  // This loop iterates through the list of reduce elements and copies,
2962  // element by element, from a remote lane in the warp to RemoteReduceList,
2963  // hosted on the thread's stack.
2964  emitReductionListCopy(RemoteLaneToThread, CGF, ReductionArrayTy, Privates,
2965  LocalReduceList, RemoteReduceList,
2966  {/*RemoteLaneOffset=*/RemoteLaneOffsetArgVal,
2967  /*ScratchpadIndex=*/nullptr,
2968  /*ScratchpadWidth=*/nullptr});
2969 
2970  // The actions to be performed on the Remote Reduce list depend
2971  // on the algorithm version.
2972  //
2973  // if (AlgoVer==0) || (AlgoVer==1 && (LaneId < Offset)) || (AlgoVer==2 &&
2974  // LaneId % 2 == 0 && Offset > 0):
2975  // do the reduction value aggregation
2976  //
2977  // The thread local variable Reduce list is mutated in place to host the
2978  // reduced data, which is the aggregated value produced from local and
2979  // remote lanes.
2980  //
2981  // Note that AlgoVer is expected to be a constant integer known at compile
2982  // time.
2983  // When AlgoVer==0, the first conjunction evaluates to true, making
2984  // the entire predicate true at compile time.
2985  // When AlgoVer==1, only the second part of the second conjunction needs
2986  // to be evaluated at run time; the other conjunctions fold to false
2987  // at compile time.
2988  // When AlgoVer==2, only the second part of the third conjunction needs
2989  // to be evaluated at run time; the other conjunctions fold to false
2990  // at compile time.
2991  llvm::Value *CondAlgo0 = Bld.CreateIsNull(AlgoVerArgVal);
2992 
2993  llvm::Value *Algo1 = Bld.CreateICmpEQ(AlgoVerArgVal, Bld.getInt16(1));
2994  llvm::Value *CondAlgo1 = Bld.CreateAnd(
2995  Algo1, Bld.CreateICmpULT(LaneIDArgVal, RemoteLaneOffsetArgVal));
2996 
2997  llvm::Value *Algo2 = Bld.CreateICmpEQ(AlgoVerArgVal, Bld.getInt16(2));
2998  llvm::Value *CondAlgo2 = Bld.CreateAnd(
2999  Algo2, Bld.CreateIsNull(Bld.CreateAnd(LaneIDArgVal, Bld.getInt16(1))));
3000  CondAlgo2 = Bld.CreateAnd(
3001  CondAlgo2, Bld.CreateICmpSGT(RemoteLaneOffsetArgVal, Bld.getInt16(0)));
3002 
3003  llvm::Value *CondReduce = Bld.CreateOr(CondAlgo0, CondAlgo1);
3004  CondReduce = Bld.CreateOr(CondReduce, CondAlgo2);
3005 
3006  llvm::BasicBlock *ThenBB = CGF.createBasicBlock("then");
3007  llvm::BasicBlock *ElseBB = CGF.createBasicBlock("else");
3008  llvm::BasicBlock *MergeBB = CGF.createBasicBlock("ifcont");
3009  Bld.CreateCondBr(CondReduce, ThenBB, ElseBB);
3010 
3011  CGF.EmitBlock(ThenBB);
3012  // reduce_function(LocalReduceList, RemoteReduceList)
3013  llvm::Value *LocalReduceListPtr = Bld.CreatePointerBitCastOrAddrSpaceCast(
3014  LocalReduceList.getPointer(), CGF.VoidPtrTy);
3015  llvm::Value *RemoteReduceListPtr = Bld.CreatePointerBitCastOrAddrSpaceCast(
3016  RemoteReduceList.getPointer(), CGF.VoidPtrTy);
3017  CGM.getOpenMPRuntime().emitOutlinedFunctionCall(
3018  CGF, Loc, ReduceFn, {LocalReduceListPtr, RemoteReduceListPtr});
3019  Bld.CreateBr(MergeBB);
3020 
3021  CGF.EmitBlock(ElseBB);
3022  Bld.CreateBr(MergeBB);
3023 
3024  CGF.EmitBlock(MergeBB);
3025 
3026  // if (AlgoVer==1 && (LaneId >= Offset)) copy Remote Reduce list to local
3027  // Reduce list.
3028  Algo1 = Bld.CreateICmpEQ(AlgoVerArgVal, Bld.getInt16(1));
3029  llvm::Value *CondCopy = Bld.CreateAnd(
3030  Algo1, Bld.CreateICmpUGE(LaneIDArgVal, RemoteLaneOffsetArgVal));
3031 
3032  llvm::BasicBlock *CpyThenBB = CGF.createBasicBlock("then");
3033  llvm::BasicBlock *CpyElseBB = CGF.createBasicBlock("else");
3034  llvm::BasicBlock *CpyMergeBB = CGF.createBasicBlock("ifcont");
3035  Bld.CreateCondBr(CondCopy, CpyThenBB, CpyElseBB);
3036 
3037  CGF.EmitBlock(CpyThenBB);
3038  emitReductionListCopy(ThreadCopy, CGF, ReductionArrayTy, Privates,
3039  RemoteReduceList, LocalReduceList);
3040  Bld.CreateBr(CpyMergeBB);
3041 
3042  CGF.EmitBlock(CpyElseBB);
3043  Bld.CreateBr(CpyMergeBB);
3044 
3045  CGF.EmitBlock(CpyMergeBB);
3046 
3047  CGF.FinishFunction();
3048  return Fn;
3049 }
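// Illustrative sketch: a single-float CUDA analogue of the function emitted
// above. The real helper is element-wise over an opaque Reduce list, is
// emitted as LLVM IR, and calls the outlined combiner ReduceFn; '+' stands
// in for that combiner here, and all names are assumptions.
__device__ void shuffle_and_reduce_sketch(float *elem, short lane_id,
                                          short offset, short algo_ver) {
  // Value shuffle: fetch the remote lane's element (RemoteLaneToThread).
  float remote = __shfl_down_sync(__activemask(), *elem, offset);
  // Value aggregation, gated by the same predicate as CondAlgo0/1/2 above.
  if (algo_ver == 0 ||                                     // full warp
      (algo_ver == 1 && lane_id < offset) ||               // contiguous
      (algo_ver == 2 && (lane_id & 1) == 0 && offset > 0)) // dispersed
    *elem += remote;
  // Value copy: preserve algorithm 1's contiguity assumption (CondCopy).
  if (algo_ver == 1 && lane_id >= offset)
    *elem = remote;
}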
3050 
3051 ///
3052 /// Design of OpenMP reductions on the GPU
3053 ///
3054 /// Consider a typical OpenMP program with one or more reduction
3055 /// clauses:
3056 ///
3057 /// float foo;
3058 /// double bar;
3059 /// #pragma omp target teams distribute parallel for \
3060 /// reduction(+:foo) reduction(*:bar)
3061 /// for (int i = 0; i < N; i++) {
3062 /// foo += A[i]; bar *= B[i];
3063 /// }
3064 ///
3065 /// where 'foo' and 'bar' are reduced across all OpenMP threads in
3066 /// all teams. In our OpenMP implementation on the NVPTX device an
3067 /// OpenMP team is mapped to a CUDA threadblock and OpenMP threads
3068 /// within a team are mapped to CUDA threads within a threadblock.
3069 /// Our goal is to efficiently aggregate values across all OpenMP
3070 /// threads such that:
3071 ///
3072 /// - the compiler and runtime are logically concise, and
3073 /// - the reduction is performed efficiently in a hierarchical
3074 /// manner as follows: within OpenMP threads in the same warp,
3075 /// across warps in a threadblock, and finally across teams on
3076 /// the NVPTX device.
3077 ///
3078 /// Introduction to Decoupling
3079 ///
3080 /// We would like to decouple the compiler and the runtime so that the
3081 /// latter is ignorant of the reduction variables (number, data types)
3082 /// and the reduction operators. This allows a simpler interface
3083 /// and implementation while still attaining good performance.
3084 ///
3085 /// Pseudocode for the aforementioned OpenMP program generated by the
3086 /// compiler is as follows:
3087 ///
3088 /// 1. Create private copies of reduction variables on each OpenMP
3089 /// thread: 'foo_private', 'bar_private'
3090 /// 2. Each OpenMP thread reduces the chunk of 'A' and 'B' assigned
3091 /// to it and writes the result in 'foo_private' and 'bar_private'
3092 /// respectively.
3093 /// 3. Call the OpenMP runtime on the GPU to reduce within a team
3094 /// and store the result on the team master:
3095 ///
3096 /// __kmpc_nvptx_parallel_reduce_nowait(...,
3097 /// reduceData, shuffleReduceFn, interWarpCpyFn)
3098 ///
3099 /// where:
3100 /// struct ReduceData {
3101 /// float *foo;
3102 /// double *bar;
3103 /// } reduceData
3104 /// reduceData.foo = &foo_private
3105 /// reduceData.bar = &bar_private
3106 ///
3107 /// 'shuffleReduceFn' and 'interWarpCpyFn' are pointers to two
3108 /// auxiliary functions generated by the compiler that operate on
3109 /// variables of type 'ReduceData'. They help the runtime perform the
3110 /// algorithmic steps in a data-agnostic manner.
3111 ///
3112 /// 'shuffleReduceFn' is a pointer to a function that reduces data
3113 /// of type 'ReduceData' across two OpenMP threads (lanes) in the
3114 /// same warp. It takes the following arguments as input:
3115 ///
3116 /// a. variable of type 'ReduceData' on the calling lane,
3117 /// b. its lane_id,
3118 /// c. an offset relative to the current lane_id to generate a
3119 /// remote_lane_id. The remote lane contains the second
3120 /// variable of type 'ReduceData' that is to be reduced.
3121 /// d. an algorithm version parameter determining which reduction
3122 /// algorithm to use.
3123 ///
3124 /// 'shuffleReduceFn' retrieves data from the remote lane using
3125 /// efficient GPU shuffle intrinsics and reduces, using the
3126 /// algorithm specified by the 4th parameter, the two operands
3127 /// element-wise. The result is written to the first operand.
3128 ///
3129 /// Different reduction algorithms are implemented in different
3130 /// runtime functions, all calling 'shuffleReduceFn' to perform
3131 /// the essential reduction step. Therefore, based on the 4th
3132 /// parameter, this function behaves slightly differently to
3133 /// cooperate with the runtime to ensure correctness under
3134 /// different circumstances.
3135 ///
3136 /// 'InterWarpCpyFn' is a pointer to a function that transfers
3137 /// reduced variables across warps. It tunnels, through CUDA
3138 /// shared memory, the thread-private data of type 'ReduceData'
3139 /// from lane 0 of each warp to a lane in the first warp.
3140 /// 4. Call the OpenMP runtime on the GPU to reduce across teams.
3141 /// The last team writes the global reduced value to memory.
3142 ///
3143 /// ret = __kmpc_nvptx_teams_reduce_nowait(...,
3144 /// reduceData, shuffleReduceFn, interWarpCpyFn,
3145 /// scratchpadCopyFn, loadAndReduceFn)
3146 ///
3147 /// 'scratchpadCopyFn' is a helper that stores reduced
3148 /// data from the team master to a scratchpad array in
3149 /// global memory.
3150 ///
3151 /// 'loadAndReduceFn' is a helper that loads data from
3152 /// the scratchpad array and reduces it with the input
3153 /// operand.
3154 ///
3155 /// These compiler-generated functions hide address
3156 /// calculation and alignment information from the runtime.
3157 /// 5. if ret == 1:
3158 /// The team master of the last team stores the reduced
3159 /// result to the globals in memory.
3160 /// foo += reduceData.foo; bar *= reduceData.bar
3161 ///
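/// Editorial, illustrative sketch (assumed code, not lifted from the
/// runtime): for this example, the compiler-generated combiner behind
/// 'reduceData' merges two instances element-wise with each clause's
/// operator:
///
///     void reduce_function(ReduceData *lhs, ReduceData *rhs) {
///       *lhs->foo += *rhs->foo;   // reduction(+:foo)
///       *lhs->bar *= *rhs->bar;   // reduction(*:bar)
///     }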
3162 ///
3163 /// Warp Reduction Algorithms
3164 ///
3165 /// On the warp level, we have three algorithms implemented in the
3166 /// OpenMP runtime depending on the number of active lanes:
3167 ///
3168 /// Full Warp Reduction
3169 ///
3170 /// The reduce algorithm within a warp where all lanes are active
3171 /// is implemented in the runtime as follows:
3172 ///
3173 /// full_warp_reduce(void *reduce_data,
3174 /// kmp_ShuffleReductFctPtr ShuffleReduceFn) {
3175 /// for (int offset = WARPSIZE/2; offset > 0; offset /= 2)
3176 /// ShuffleReduceFn(reduce_data, 0, offset, 0);
3177 /// }
3178 ///
3179 /// The algorithm completes in log(2, WARPSIZE) steps.
3180 ///
3181 /// 'ShuffleReduceFn' is used here with lane_id set to 0 because lane_id is
3182 /// not used by this algorithm, which saves the instructions that would
3183 /// otherwise retrieve lane_id from its special register. The 4th parameter, which
3184 /// represents the version of the algorithm being used, is set to 0 to
3185 /// signify full warp reduction.
3186 ///
3187 /// In this version, 'ShuffleReduceFn' behaves, per element, as follows:
3188 ///
3189 /// #reduce_elem refers to an element in the local lane's data structure
3190 /// #remote_elem is retrieved from a remote lane
3191 /// remote_elem = shuffle_down(reduce_elem, offset, WARPSIZE);
3192 /// reduce_elem = reduce_elem REDUCE_OP remote_elem;
3193 ///
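/// As a concrete, editor-added CUDA analogue for one float element reduced
/// with '+' (the runtime instead works on opaque ReduceData through
/// 'ShuffleReduceFn'):
///
///     __device__ float full_warp_sum(float v) {
///       for (int offset = 16; offset > 0; offset /= 2)
///         v += __shfl_down_sync(0xffffffff, v, offset);
///       return v; // lane 0 holds the warp-wide sum
///     }
///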
3194 /// Contiguous Partial Warp Reduction
3195 ///
3196 /// This reduce algorithm is used within a warp where only the first
3197 /// 'n' (n <= WARPSIZE) lanes are active. It is typically used when the
3198 /// number of OpenMP threads in a parallel region is not a multiple of
3199 /// WARPSIZE. The algorithm is implemented in the runtime as follows:
3200 ///
3201 /// void
3202 /// contiguous_partial_reduce(void *reduce_data,
3203 /// kmp_ShuffleReductFctPtr ShuffleReduceFn,
3204 /// int size, int lane_id) {
3205 /// int curr_size;
3206 /// int offset;
3207 /// curr_size = size;
3208 /// offset = curr_size/2;
3209 /// while (offset>0) {
3210 /// ShuffleReduceFn(reduce_data, lane_id, offset, 1);
3211 /// curr_size = (curr_size+1)/2;
3212 /// offset = curr_size/2;
3213 /// }
3214 /// }
3215 ///
3216 /// In this version, 'ShuffleReduceFn' behaves, per element, as follows:
3217 ///
3218 /// remote_elem = shuffle_down(reduce_elem, offset, WARPSIZE);
3219 /// if (lane_id < offset)
3220 /// reduce_elem = reduce_elem REDUCE_OP remote_elem
3221 /// else
3222 /// reduce_elem = remote_elem
3223 ///
3224 /// This algorithm assumes that the data to be reduced are located in a
3225 /// contiguous subset of lanes starting from the first. When there is
3226 /// an odd number of active lanes, the data in the last lane is not
3227 /// aggregated with any other lane's data but is instead copied over.
3228 ///
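/// A worked trace (editor-added): with size = 5 active lanes (0..4) and
/// offset initialized to curr_size/2, as in the loop body:
///
///     pass 1: offset = 2; lanes 0,1 aggregate lanes 2,3; lanes >= 2 copy
///             their remote element, so lane 2 now holds lane 4's value;
///             curr_size becomes 3 and offset becomes 1
///     pass 2: offset = 1; lane 0 aggregates lane 1; lane 1 now holds
///             lane 2's value (originally lane 4's); curr_size becomes 2
///     pass 3: offset = 1; lane 0 aggregates lane 1; curr_size becomes 1,
///             offset becomes 0, and lane 0 holds the final result
///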
3229 /// Dispersed Partial Warp Reduction
3230 ///
3231 /// This algorithm is used within a warp when any discontiguous subset of
3232 /// lanes is active. It is used to implement the reduction operation
3233 /// across lanes in an OpenMP simd region or in a nested parallel region.
3234 ///
3235 /// void
3236 /// dispersed_partial_reduce(void *reduce_data,
3237 /// kmp_ShuffleReductFctPtr ShuffleReduceFn) {
3238 /// int size, remote_id;
3239 /// int logical_lane_id = number_of_active_lanes_before_me() * 2;
3240 /// do {
3241 /// remote_id = next_active_lane_id_right_after_me();
3242 /// # the above function returns 0 if no active lane
3243 /// # is present right after the current lane.
3244 /// size = number_of_active_lanes_in_this_warp();
3245 /// logical_lane_id /= 2;
3246 /// ShuffleReduceFn(reduce_data, logical_lane_id,
3247 /// remote_id-1-threadIdx.x, 2);
3248 /// } while (logical_lane_id % 2 == 0 && size > 1);
3249 /// }
3250 ///
3251 /// There is no assumption made about the initial state of the reduction.
3252 /// Any number of lanes (>=1) could be active at any position. The reduction
3253 /// result is returned in the first active lane.
3254 ///
3255 /// In this version, 'ShuffleReduceFn' behaves, per element, as follows:
3256 ///
3257 /// remote_elem = shuffle_down(reduce_elem, offset, WARPSIZE);
3258 /// if (lane_id % 2 == 0 && offset > 0)
3259 /// reduce_elem = reduce_elem REDUCE_OP remote_elem
3260 /// else
3261 /// reduce_elem = remote_elem
3262 ///
3263 ///
3264 /// Intra-Team Reduction
3265 ///
3266 /// This function, as implemented in the runtime call
3267 /// '__kmpc_nvptx_parallel_reduce_nowait', aggregates data across OpenMP
3268 /// threads in a team. It first reduces within a warp using the
3269 /// aforementioned algorithms. We then proceed to gather all such
3270 /// reduced values at the first warp.
3271 ///
3272 /// The runtime makes use of the function 'InterWarpCpyFn', which copies
3273 /// data from each warp master (the zeroth lane of each warp, where the
3274 /// warp-reduced data is held) to the zeroth warp. This step reduces (in
3275 /// a mathematical sense) the problem of reduction across warp masters in
3276 /// a block to the problem of warp reduction.
3277 ///
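/// A hedged control-flow sketch of this call (illustrative only; the
/// runtime's actual bookkeeping differs):
///
///     warp-reduce my values;            // one of the algorithms above
///     if (lane_id == 0)
///       publish my warp's result;       // InterWarpCpyFn, shared memory
///     barrier;
///     if (warp_id == 0)
///       gather one value per warp and   // InterWarpCpyFn
///       warp-reduce the gathered values;
///     return 1 on the team master, 0 elsewhere;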
3278 ///
3279 /// Inter-Team Reduction
3280 ///
3281 /// Once a team has reduced its data to a single value, it is stored in
3282 /// a global scratchpad array. Since each team has a distinct slot, this
3283 /// can be done without locking.
3284 ///
3285 /// The last team to write to the scratchpad array proceeds to reduce the
3286 /// scratchpad array. One or more workers in the last team use the helper
3287 /// 'loadAndReduceDataFn' to load and reduce values from the array, i.e.,
3288 /// the k'th worker reduces every k'th element.
3289 ///
3290 /// Finally, a call is made to '__kmpc_nvptx_parallel_reduce_nowait' to
3291 /// reduce across workers and compute a globally reduced value.
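/// A hedged sketch of this flow; electing the last team via an atomic
/// counter is an assumption for illustration, not a statement about the
/// runtime's actual mechanism:
///
///     scratchpad[team_id] = team_reduced_value;    // scratchpadCopyFn
///     if (I am the last team to finish writing) {  // e.g. atomic counter
///       // worker k loads and reduces elements k, k+P, k+2P, ...
///       partial_k = load_and_reduce(scratchpad);   // loadAndReduceFn
///       __kmpc_nvptx_parallel_reduce_nowait(..., partial_k, ...);
///     }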
3292 ///
3293 void CGOpenMPRuntimeNVPTX::emitReduction(
3294  CodeGenFunction &CGF, SourceLocation Loc, ArrayRef<const Expr *> Privates,
3295  ArrayRef<const Expr *> LHSExprs, ArrayRef<const Expr *> RHSExprs,
3296  ArrayRef<const Expr *> ReductionOps, ReductionOptionsTy Options) {
3297  if (!CGF.HaveInsertPoint())
3298  return;
3299 
3300  bool ParallelReduction = isOpenMPParallelDirective(Options.ReductionKind);
3301  bool TeamsReduction = isOpenMPTeamsDirective(Options.ReductionKind);
3302  bool SimdReduction = isOpenMPSimdDirective(Options.ReductionKind);
3303  assert((TeamsReduction || ParallelReduction || SimdReduction) &&
3304  "Invalid reduction selection in emitReduction.");
3305 
3306  if (Options.SimpleReduction) {
3307  CGOpenMPRuntime::emitReduction(CGF, Loc, Privates, LHSExprs, RHSExprs,
3308  ReductionOps, Options);
3309  return;
3310  }
3311 
3312  ASTContext &C = CGM.getContext();
3313 
3314  // 1. Build a list of reduction variables.
3315  // void *RedList[<n>] = {<ReductionVars>[0], ..., <ReductionVars>[<n>-1]};
3316  auto Size = RHSExprs.size();
3317  for (const Expr *E : Privates) {
3318  if (E->getType()->isVariablyModifiedType())
3319  // Reserve place for array size.
3320  ++Size;
3321  }
3322  llvm::APInt ArraySize(/*unsigned int numBits=*/32, Size);
3323  QualType ReductionArrayTy =
3324  C.getConstantArrayType(C.VoidPtrTy, ArraySize, ArrayType::Normal,
3325  /*IndexTypeQuals=*/0);
3326  Address ReductionList =
3327  CGF.CreateMemTemp(ReductionArrayTy, ".omp.reduction.red_list");
3328  auto IPriv = Privates.begin();
3329  unsigned Idx = 0;
3330  for (unsigned I = 0, E = RHSExprs.size(); I < E; ++I, ++IPriv, ++Idx) {
3331  Address Elem = CGF.Builder.CreateConstArrayGEP(ReductionList, Idx,
3332  CGF.getPointerSize());
3333  CGF.Builder.CreateStore(
3334  CGF.Builder.CreatePointerBitCastOrAddrSpaceCast(
3335  CGF.EmitLValue(RHSExprs[I]).getPointer(), CGF.VoidPtrTy),
3336  Elem);
3337  if ((*IPriv)->getType()->isVariablyModifiedType()) {
3338  // Store array size.
3339  ++Idx;
3340  Elem = CGF.Builder.CreateConstArrayGEP(ReductionList, Idx,
3341  CGF.getPointerSize());
3342  llvm::Value *Size = CGF.Builder.CreateIntCast(
3343  CGF.getVLASize(
3344  CGF.getContext().getAsVariableArrayType((*IPriv)->getType()))
3345  .NumElts,
3346  CGF.SizeTy, /*isSigned=*/false);
3347  CGF.Builder.CreateStore(CGF.Builder.CreateIntToPtr(Size, CGF.VoidPtrTy),
3348  Elem);
3349  }
3350  }
3351 
3352  // 2. Emit reduce_func().
3353  llvm::Value *ReductionFn = emitReductionFunction(
3354  CGM, Loc, CGF.ConvertTypeForMem(ReductionArrayTy)->getPointerTo(),
3355  Privates, LHSExprs, RHSExprs, ReductionOps);
3356 
3357  // 3. Build res = __kmpc_reduce{_nowait}(<gtid>, <n>, sizeof(RedList),
3358  // RedList, shuffle_reduce_func, interwarp_copy_func);
3359  llvm::Value *ThreadId = getThreadID(CGF, Loc);
3360  llvm::Value *ReductionArrayTySize = CGF.getTypeSize(ReductionArrayTy);
3361  llvm::Value *RL = CGF.Builder.CreatePointerBitCastOrAddrSpaceCast(
3362  ReductionList.getPointer(), CGF.VoidPtrTy);
3363 
3364  llvm::Value *ShuffleAndReduceFn = emitShuffleAndReduceFunction(
3365  CGM, Privates, ReductionArrayTy, ReductionFn, Loc);
3366  llvm::Value *InterWarpCopyFn =
3367  emitInterWarpCopyFunction(CGM, Privates, ReductionArrayTy, Loc);
3368 
3369  llvm::Value *Args[] = {ThreadId,
3370  CGF.Builder.getInt32(RHSExprs.size()),
3371  ReductionArrayTySize,
3372  RL,
3373  ShuffleAndReduceFn,
3374  InterWarpCopyFn};
3375 
3376  llvm::Value *Res = nullptr;
3377  if (ParallelReduction)
3378  Res = CGF.EmitRuntimeCall(
3379  createNVPTXRuntimeFunction(OMPRTL_NVPTX__kmpc_parallel_reduce_nowait),
3380  Args);
3381  else if (SimdReduction)
3382  Res = CGF.EmitRuntimeCall(
3383  createNVPTXRuntimeFunction(OMPRTL_NVPTX__kmpc_simd_reduce_nowait),
3384  Args);
3385 
3386  if (TeamsReduction) {
3387  llvm::Value *ScratchPadCopyFn =
3388  emitCopyToScratchpad(CGM, Privates, ReductionArrayTy, Loc);
3389  llvm::Value *LoadAndReduceFn = emitReduceScratchpadFunction(
3390  CGM, Privates, ReductionArrayTy, ReductionFn, Loc);
3391 
3392  llvm::Value *Args[] = {ThreadId,
3393  CGF.Builder.getInt32(RHSExprs.size()),
3394  ReductionArrayTySize,
3395  RL,
3396  ShuffleAndReduceFn,
3397  InterWarpCopyFn,
3398  ScratchPadCopyFn,
3399  LoadAndReduceFn};
3400  Res = CGF.EmitRuntimeCall(
3401  createNVPTXRuntimeFunction(OMPRTL_NVPTX__kmpc_teams_reduce_nowait),
3402  Args);
3403  }
3404 
3405  // 4. Build switch(res)
3406  llvm::BasicBlock *DefaultBB = CGF.createBasicBlock(".omp.reduction.default");
3407  llvm::SwitchInst *SwInst =
3408  CGF.Builder.CreateSwitch(Res, DefaultBB, /*NumCases=*/1);
3409 
3410  // 5. Build case 1: where we have reduced values in the master
3411  // thread in each team.
3412  // __kmpc_end_reduce{_nowait}(<gtid>);
3413  // break;
3414  llvm::BasicBlock *Case1BB = CGF.createBasicBlock(".omp.reduction.case1");
3415  SwInst->addCase(CGF.Builder.getInt32(1), Case1BB);
3416  CGF.EmitBlock(Case1BB);
3417 
3418  // Add emission of __kmpc_end_reduce{_nowait}(<gtid>);
3419  llvm::Value *EndArgs[] = {ThreadId};
3420  auto &&CodeGen = [Privates, LHSExprs, RHSExprs, ReductionOps,
3421  this](CodeGenFunction &CGF, PrePostActionTy &Action) {
3422  auto IPriv = Privates.begin();
3423  auto ILHS = LHSExprs.begin();
3424  auto IRHS = RHSExprs.begin();
3425  for (const Expr *E : ReductionOps) {
3426  emitSingleReductionCombiner(CGF, E, *IPriv, cast<DeclRefExpr>(*ILHS),
3427  cast<DeclRefExpr>(*IRHS));
3428  ++IPriv;
3429  ++ILHS;
3430  ++IRHS;
3431  }
3432  };
3433  RegionCodeGenTy RCG(CodeGen);
3434  NVPTXActionTy Action(
3435  nullptr, llvm::None,
3436  createNVPTXRuntimeFunction(OMPRTL_NVPTX__kmpc_end_reduce_nowait),
3437  EndArgs);
3438  RCG.setAction(Action);
3439  RCG(CGF);
3440  CGF.EmitBranch(DefaultBB);
3441  CGF.EmitBlock(DefaultBB, /*IsFinished=*/true);
3442 }
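// Illustrative sketch: for the foo/bar example in the design comment above,
// the code emitted by emitReduction corresponds roughly to the following
// C-like pseudocode (names are assumptions; the real output is LLVM IR):
//
//   void *red_list[2] = {&foo_private, &bar_private};
//   int32_t res = __kmpc_nvptx_parallel_reduce_nowait(
//       gtid, /*num_vars=*/2, sizeof red_list, red_list,
//       shuffle_and_reduce_func, inter_warp_copy_func);
//   switch (res) {
//   case 1: // this thread holds the team-reduced values
//     foo += foo_private;
//     bar *= bar_private;
//     __kmpc_nvptx_end_reduce_nowait(gtid);
//     break;
//   default:
//     break;
//   }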
3443 
3444 const VarDecl *
3445 CGOpenMPRuntimeNVPTX::translateParameter(const FieldDecl *FD,
3446  const VarDecl *NativeParam) const {
3447  if (!NativeParam->getType()->isReferenceType())
3448  return NativeParam;
3449  QualType ArgType = NativeParam->getType();
3450  QualifierCollector QC;
3451  const Type *NonQualTy = QC.strip(ArgType);
3452  QualType PointeeTy = cast<ReferenceType>(NonQualTy)->getPointeeType();
3453  if (const auto *Attr = FD->getAttr<OMPCaptureKindAttr>()) {
3454  if (Attr->getCaptureKind() == OMPC_map) {
3455  PointeeTy = CGM.getContext().getAddrSpaceQualType(PointeeTy,
3456  LangAS::opencl_global);
3457  }
3458  }
3459  ArgType = CGM.getContext().getPointerType(PointeeTy);
3460  QC.addRestrict();
3461  enum { NVPTX_local_addr = 5 };
3462  QC.addAddressSpace(getLangASFromTargetAS(NVPTX_local_addr));
3463  ArgType = QC.apply(CGM.getContext(), ArgType);
3464  if (isa<ImplicitParamDecl>(NativeParam))
3465  return ImplicitParamDecl::Create(
3466  CGM.getContext(), /*DC=*/nullptr, NativeParam->getLocation(),
3467  NativeParam->getIdentifier(), ArgType, ImplicitParamDecl::Other);
3468  return ParmVarDecl::Create(
3469  CGM.getContext(),
3470  const_cast<DeclContext *>(NativeParam->getDeclContext()),
3471  NativeParam->getLocStart(), NativeParam->getLocation(),
3472  NativeParam->getIdentifier(), ArgType,
3473  /*TInfo=*/nullptr, SC_None, /*DefArg=*/nullptr);
3474 }
3475 
3476 Address
3477 CGOpenMPRuntimeNVPTX::getParameterAddress(CodeGenFunction &CGF,
3478  const VarDecl *NativeParam,
3479  const VarDecl *TargetParam) const {
3480  assert(NativeParam != TargetParam &&
3481  NativeParam->getType()->isReferenceType() &&
3482  "Native arg must not be the same as target arg.");
3483  Address LocalAddr = CGF.GetAddrOfLocalVar(TargetParam);
3484  QualType NativeParamType = NativeParam->getType();
3485  QualifierCollector QC;
3486  const Type *NonQualTy = QC.strip(NativeParamType);
3487  QualType NativePointeeTy = cast<ReferenceType>(NonQualTy)->getPointeeType();
3488  unsigned NativePointeeAddrSpace =
3489  CGF.getContext().getTargetAddressSpace(NativePointeeTy);
3490  QualType TargetTy = TargetParam->getType();
3491  llvm::Value *TargetAddr = CGF.EmitLoadOfScalar(
3492  LocalAddr, /*Volatile=*/false, TargetTy, SourceLocation());
3493  // First cast to generic.
3494  TargetAddr = CGF.Builder.CreatePointerBitCastOrAddrSpaceCast(
3495  TargetAddr, TargetAddr->getType()->getPointerElementType()->getPointerTo(
3496  /*AddrSpace=*/0));
3497  // Cast from generic to native address space.
3498  TargetAddr = CGF.Builder.CreatePointerBitCastOrAddrSpaceCast(
3499  TargetAddr, TargetAddr->getType()->getPointerElementType()->getPointerTo(
3500  NativePointeeAddrSpace));
3501  Address NativeParamAddr = CGF.CreateMemTemp(NativeParamType);
3502  CGF.EmitStoreOfScalar(TargetAddr, NativeParamAddr, /*Volatile=*/false,
3503  NativeParamType);
3504  return NativeParamAddr;
3505 }
3506 
3507 void CGOpenMPRuntimeNVPTX::emitOutlinedFunctionCall(
3508  CodeGenFunction &CGF, SourceLocation Loc, llvm::Value *OutlinedFn,
3509  ArrayRef<llvm::Value *> Args) const {
3510  SmallVector<llvm::Value *, 4> TargetArgs;
3511  TargetArgs.reserve(Args.size());
3512  auto *FnType =
3513  cast<llvm::FunctionType>(OutlinedFn->getType()->getPointerElementType());
3514  for (unsigned I = 0, E = Args.size(); I < E; ++I) {
3515  if (FnType->isVarArg() && FnType->getNumParams() <= I) {
3516  TargetArgs.append(std::next(Args.begin(), I), Args.end());
3517  break;
3518  }
3519  llvm::Type *TargetType = FnType->getParamType(I);
3520  llvm::Value *NativeArg = Args[I];
3521  if (!TargetType->isPointerTy()) {
3522  TargetArgs.emplace_back(NativeArg);
3523  continue;
3524  }
3525  llvm::Value *TargetArg = CGF.Builder.CreatePointerBitCastOrAddrSpaceCast(
3526  NativeArg,
3527  NativeArg->getType()->getPointerElementType()->getPointerTo());
3528  TargetArgs.emplace_back(
3529  CGF.Builder.CreatePointerBitCastOrAddrSpaceCast(TargetArg, TargetType));
3530  }
3531  CGOpenMPRuntime::emitOutlinedFunctionCall(CGF, Loc, OutlinedFn, TargetArgs);
3532 }
3533 
3534 /// Emit function which wraps the outline parallel region
3535 /// and controls the arguments which are passed to this function.
3536 /// The wrapper ensures that the outlined function is called
3537 /// with the correct arguments when data is shared.
3538 llvm::Function *CGOpenMPRuntimeNVPTX::createParallelDataSharingWrapper(
3539  llvm::Function *OutlinedParallelFn, const OMPExecutableDirective &D) {
3540  ASTContext &Ctx = CGM.getContext();
3541  const auto &CS = *D.getCapturedStmt(OMPD_parallel);
3542 
3543  // Create a function that takes as argument the source thread.
3544  FunctionArgList WrapperArgs;
3545  QualType Int16QTy =
3546  Ctx.getIntTypeForBitwidth(/*DestWidth=*/16, /*Signed=*/false);
3547  QualType Int32QTy =
3548  Ctx.getIntTypeForBitwidth(/*DestWidth=*/32, /*Signed=*/false);
3549  ImplicitParamDecl ParallelLevelArg(Ctx, /*DC=*/nullptr, D.getLocStart(),
3550  /*Id=*/nullptr, Int16QTy,
3551  ImplicitParamDecl::Other);
3552  ImplicitParamDecl WrapperArg(Ctx, /*DC=*/nullptr, D.getLocStart(),
3553  /*Id=*/nullptr, Int32QTy,
3554  ImplicitParamDecl::Other);
3555  WrapperArgs.emplace_back(&ParallelLevelArg);
3556  WrapperArgs.emplace_back(&WrapperArg);
3557 
3558  const CGFunctionInfo &CGFI =
3559  CGM.getTypes().arrangeBuiltinFunctionDeclaration(Ctx.VoidTy, WrapperArgs);
3560 
3561  auto *Fn = llvm::Function::Create(
3562  CGM.getTypes().GetFunctionType(CGFI), llvm::GlobalValue::InternalLinkage,
3563  Twine(OutlinedParallelFn->getName(), "_wrapper"), &CGM.getModule());
3564  CGM.SetInternalFunctionAttributes(GlobalDecl(), Fn, CGFI);
3565  Fn->setLinkage(llvm::GlobalValue::InternalLinkage);
3566  Fn->setDoesNotRecurse();
3567 
3568  CodeGenFunction CGF(CGM, /*suppressNewContext=*/true);
3569  CGF.StartFunction(GlobalDecl(), Ctx.VoidTy, Fn, CGFI, WrapperArgs,
3570  D.getLocStart(), D.getLocStart());
3571 
3572  const auto *RD = CS.getCapturedRecordDecl();
3573  auto CurField = RD->field_begin();
3574 
3575  Address ZeroAddr = CGF.CreateMemTemp(
3576  CGF.getContext().getIntTypeForBitwidth(/*DestWidth=*/32, /*Signed=*/1),
3577  /*Name*/ ".zero.addr");
3578  CGF.InitTempAlloca(ZeroAddr, CGF.Builder.getInt32(/*C*/ 0));
3579  // Get the array of arguments.
3580  SmallVector<llvm::Value *, 8> Args;
3581 
3582  Args.emplace_back(CGF.GetAddrOfLocalVar(&WrapperArg).getPointer());
3583  Args.emplace_back(ZeroAddr.getPointer());
3584 
3585  CGBuilderTy &Bld = CGF.Builder;
3586  auto CI = CS.capture_begin();
3587 
3588  // Use global memory for data sharing.
3589  // Handle passing of global args to workers.
3590  Address GlobalArgs =
3591  CGF.CreateDefaultAlignTempAlloca(CGF.VoidPtrPtrTy, "global_args");
3592  llvm::Value *GlobalArgsPtr = GlobalArgs.getPointer();
3593  llvm::Value *DataSharingArgs[] = {GlobalArgsPtr};
3594  CGF.EmitRuntimeCall(
3595  createNVPTXRuntimeFunction(OMPRTL_NVPTX__kmpc_get_shared_variables),
3596  DataSharingArgs);
3597 
3598  // Retrieve the shared variables from the list of references returned
3599  // by the runtime. Pass the variables to the outlined function.
3600  Address SharedArgListAddress = Address::invalid();
3601  if (CS.capture_size() > 0 ||
3602  isOpenMPLoopBoundSharingDirective(D.getDirectiveKind())) {
3603  SharedArgListAddress = CGF.EmitLoadOfPointer(
3604  GlobalArgs, CGF.getContext()
3605  .getPointerType(CGF.getContext().getPointerType(
3606  CGF.getContext().VoidPtrTy))
3607  .castAs<PointerType>());
3608  }
3609  unsigned Idx = 0;
3610  if (isOpenMPLoopBoundSharingDirective(D.getDirectiveKind())) {
3611  Address Src = Bld.CreateConstInBoundsGEP(SharedArgListAddress, Idx,
3612  CGF.getPointerSize());
3613  Address TypedAddress = Bld.CreatePointerBitCastOrAddrSpaceCast(
3614  Src, CGF.SizeTy->getPointerTo());
3615  llvm::Value *LB = CGF.EmitLoadOfScalar(
3616  TypedAddress,
3617  /*Volatile=*/false,
3618  CGF.getContext().getSizeType(),
3619  cast<OMPLoopDirective>(D).getLowerBoundVariable()->getExprLoc());
3620  Args.emplace_back(LB);
3621  ++Idx;
3622  Src = Bld.CreateConstInBoundsGEP(SharedArgListAddress, Idx,
3623  CGF.getPointerSize());
3624  TypedAddress = Bld.CreatePointerBitCastOrAddrSpaceCast(
3625  Src, CGF.SizeTy->getPointerTo());
3626  llvm::Value *UB = CGF.EmitLoadOfScalar(
3627  TypedAddress,
3628  /*Volatile=*/false,
3629  CGF.getContext().getSizeType(),
3630  cast<OMPLoopDirective>(D).getUpperBoundVariable()->getExprLoc());
3631  Args.emplace_back(UB);
3632  ++Idx;
3633  }
3634  if (CS.capture_size() > 0) {
3635  ASTContext &CGFContext = CGF.getContext();
3636  for (unsigned I = 0, E = CS.capture_size(); I < E; ++I, ++CI, ++CurField) {
3637  QualType ElemTy = CurField->getType();
3638  Address Src = Bld.CreateConstInBoundsGEP(SharedArgListAddress, I + Idx,
3639  CGF.getPointerSize());
3640  Address TypedAddress = Bld.CreatePointerBitCastOrAddrSpaceCast(
3641  Src, CGF.ConvertTypeForMem(CGFContext.getPointerType(ElemTy)));
3642  llvm::Value *Arg = CGF.EmitLoadOfScalar(TypedAddress,
3643  /*Volatile=*/false,
3644  CGFContext.getPointerType(ElemTy),
3645  CI->getLocation());
3646  if (CI->capturesVariableByCopy() &&
3647  !CI->getCapturedVar()->getType()->isAnyPointerType()) {
3648  Arg = castValueToType(CGF, Arg, ElemTy, CGFContext.getUIntPtrType(),
3649  CI->getLocation());
3650  }
3651  Args.emplace_back(Arg);
3652  }
3653  }
3654 
3655  emitOutlinedFunctionCall(CGF, D.getLocStart(), OutlinedParallelFn, Args);
3656  CGF.FinishFunction();
3657  return Fn;
3658 }
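// Illustrative sketch: a CUDA analogue of the wrapper generated above for an
// outlined region with one captured int and no shared loop bounds. The
// declarations and names are assumptions for illustration only.
extern "C" __device__ void __kmpc_get_shared_variables(void ***args);
__device__ void outlined_parallel_fn(unsigned *gtid, int *btid, int *captured);

__device__ void outlined_parallel_fn_wrapper(unsigned short parallel_level,
                                             unsigned gtid) {
  int zero = 0;                               // the ".zero.addr" temporary
  void **shared_args;
  __kmpc_get_shared_variables(&shared_args);  // list published by the master
  int *captured = (int *)shared_args[0];      // unpack one shared variable
  outlined_parallel_fn(&gtid, &zero, captured);
}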
3659 
3660 void CGOpenMPRuntimeNVPTX::emitFunctionProlog(CodeGenFunction &CGF,
3661  const Decl *D) {
3662  if (getDataSharingMode(CGM) != CGOpenMPRuntimeNVPTX::Generic)
3663  return;
3664 
3665  assert(D && "Expected function or captured|block decl.");
3666  assert(FunctionGlobalizedDecls.count(CGF.CurFn) == 0 &&
3667  "Function is registered already.");
3668  const Stmt *Body = nullptr;
3669  bool NeedToDelayGlobalization = false;
3670  if (const auto *FD = dyn_cast<FunctionDecl>(D)) {
3671  Body = FD->getBody();
3672  } else if (const auto *BD = dyn_cast<BlockDecl>(D)) {
3673  Body = BD->getBody();
3674  } else if (const auto *CD = dyn_cast<CapturedDecl>(D)) {
3675  Body = CD->getBody();
3676  NeedToDelayGlobalization = CGF.CapturedStmtInfo->getKind() == CR_OpenMP;
3677  }
3678  if (!Body)
3679  return;
3680  CheckVarsEscapingDeclContext VarChecker(CGF);
3681  VarChecker.Visit(Body);
3682  const RecordDecl *GlobalizedVarsRecord = VarChecker.getGlobalizedRecord();
3683  ArrayRef<const ValueDecl *> EscapedVariableLengthDecls =
3684  VarChecker.getEscapedVariableLengthDecls();
3685  if (!GlobalizedVarsRecord && EscapedVariableLengthDecls.empty())
3686  return;
3687  auto I = FunctionGlobalizedDecls.try_emplace(CGF.CurFn).first;
3688  I->getSecond().MappedParams =
3689  llvm::make_unique<CodeGenFunction::OMPMapVars>();
3690  I->getSecond().GlobalRecord = GlobalizedVarsRecord;
3691  I->getSecond().EscapedParameters.insert(
3692  VarChecker.getEscapedParameters().begin(),
3693  VarChecker.getEscapedParameters().end());
3694  I->getSecond().EscapedVariableLengthDecls.append(
3695  EscapedVariableLengthDecls.begin(), EscapedVariableLengthDecls.end());
3696  DeclToAddrMapTy &Data = I->getSecond().LocalVarData;
3697  for (const ValueDecl *VD : VarChecker.getEscapedDecls()) {
3698  assert(VD->isCanonicalDecl() && "Expected canonical declaration");
3699  const FieldDecl *FD = VarChecker.getFieldForGlobalizedVar(VD);
3700  Data.insert(std::make_pair(VD, std::make_pair(FD, Address::invalid())));
3701  }
3702  if (!NeedToDelayGlobalization) {
3703  emitGenericVarsProlog(CGF, D->getLocStart());
3704  struct GlobalizationScope final : EHScopeStack::Cleanup {
3705  GlobalizationScope() = default;
3706 
3707  void Emit(CodeGenFunction &CGF, Flags flags) override {
3708  static_cast<CGOpenMPRuntimeNVPTX &>(CGF.CGM.getOpenMPRuntime())
3709  .emitGenericVarsEpilog(CGF);
3710  }
3711  };
3712  CGF.EHStack.pushCleanup<GlobalizationScope>(NormalAndEHCleanup);
3713  }
3714 }
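// Illustrative sketch (assumed source, generic data-sharing mode): in the
// fragment below, 'x' escapes the team master into the parallel region, so
// the prolog above records it for globalization instead of leaving it in a
// thread-private stack slot that workers could not reach.
//
//   #pragma omp target teams
//   {
//     int x = init();      // escapes: placed in the globalized record
//     #pragma omp parallel
//     { use(&x); }         // workers dereference the master's 'x'
//   }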
3715 
3716 Address CGOpenMPRuntimeNVPTX::getAddressOfLocalVariable(CodeGenFunction &CGF,
3717  const VarDecl *VD) {
3718  if (getDataSharingMode(CGM) != CGOpenMPRuntimeNVPTX::Generic)
3719  return Address::invalid();
3720 
3721  VD = VD->getCanonicalDecl();
3722  auto I = FunctionGlobalizedDecls.find(CGF.CurFn);
3723  if (I == FunctionGlobalizedDecls.end())
3724  return Address::invalid();
3725  auto VDI = I->getSecond().LocalVarData.find(VD);
3726  if (VDI != I->getSecond().LocalVarData.end())
3727  return VDI->second.second;
3728  if (VD->hasAttrs()) {
3729  for (specific_attr_iterator<OMPReferencedVarAttr> IT(VD->attr_begin()),
3730  E(VD->attr_end());
3731  IT != E; ++IT) {
3732  auto VDI = I->getSecond().LocalVarData.find(
3733  cast<VarDecl>(cast<DeclRefExpr>(IT->getRef())->getDecl())
3734  ->getCanonicalDecl());
3735  if (VDI != I->getSecond().LocalVarData.end())
3736  return VDI->second.second;
3737  }
3738  }
3739  return Address::invalid();
3740 }
3741 
3742 void CGOpenMPRuntimeNVPTX::functionFinished(CodeGenFunction &CGF) {
3743  FunctionGlobalizedDecls.erase(CGF.CurFn);
3744  CGOpenMPRuntime::functionFinished(CGF);
3745 }
const CGFunctionInfo & arrangeBuiltinFunctionDeclaration(QualType resultType, const FunctionArgList &args)
A builtin function is a freestanding function using the default C conventions.
Definition: CGCall.cpp:653
RecordDecl * buildImplicitRecord(StringRef Name, RecordDecl::TagKind TK=TTK_Struct) const
Create a new implicit TU-level CXXRecordDecl or RecordDecl declaration.
QualType getAddrSpaceQualType(QualType T, LangAS AddressSpace) const
Return the uniqued reference to the type for an address space qualified type with the specified type ...
const BlockDecl * getBlockDecl() const
Definition: Expr.h:5065
static const Decl * getCanonicalDecl(const Decl *D)
LValue MakeNaturalAlignPointeeAddrLValue(llvm::Value *V, QualType T)
Given a value of type T* that may not be to a complete object, construct an l-value with the natural ...
static llvm::Value * emitCopyToScratchpad(CodeGenModule &CGM, ArrayRef< const Expr *> Privates, QualType ReductionArrayTy, SourceLocation Loc)
This function emits a helper that stores reduced data from the team master to a scratchpad array in g...
Other implicit parameter.
Definition: Decl.h:1495
A class which contains all the information about a particular captured value.
Definition: Decl.h:3864
if(T->getSizeExpr()) TRY_TO(TraverseStmt(T -> getSizeExpr()))
PointerType - C99 6.7.5.1 - Pointer Declarators.
Definition: Type.h:2393
CanQualType VoidPtrTy
Definition: ASTContext.h:1032
A (possibly-)qualified type.
Definition: Type.h:655
ArrayRef< OMPClause * > clauses()
Definition: StmtOpenMP.h:262
llvm::Type * ConvertTypeForMem(QualType T)
static llvm::Value * getNVPTXLaneID(CodeGenFunction &CGF)
Get the id of the current lane in the Warp.
static bool hasParallelIfNumThreadsClause(ASTContext &Ctx, const OMPExecutableDirective &D)
Check if the parallel directive has an &#39;if&#39; clause with non-constant or false condition.
Address CreateMemTemp(QualType T, const Twine &Name="tmp", Address *Alloca=nullptr)
CreateMemTemp - Create a temporary memory object of the given type, with appropriate alignmen and cas...
Definition: CGExpr.cpp:139
ConstStmtVisitor - This class implements a simple visitor for Stmt subclasses.
Definition: StmtVisitor.h:195
bool HaveInsertPoint() const
HaveInsertPoint - True if an insertion point is defined.
llvm::LLVMContext & getLLVMContext()
void emitSingleReductionCombiner(CodeGenFunction &CGF, const Expr *ReductionOp, const Expr *PrivateRef, const DeclRefExpr *LHS, const DeclRefExpr *RHS)
Emits single reduction combiner.
Address CreateConstGEP(Address Addr, uint64_t Index, CharUnits EltSize, const llvm::Twine &Name="")
Given addr = T* ...
Definition: CGBuilder.h:226
attr_iterator attr_begin() const
Definition: DeclBase.h:501
Stmt - This represents one statement.
Definition: Stmt.h:66
static void getNVPTXBarrier(CodeGenFunction &CGF, int ID, llvm::Value *NumThreads)
Get barrier #ID to synchronize selected (multiple of warp size) threads in a CTA. ...
llvm::Value * emitParallelOutlinedFunction(const OMPExecutableDirective &D, const VarDecl *ThreadIDVar, OpenMPDirectiveKind InnermostKind, const RegionCodeGenTy &CodeGen) override
Emits inlined function for the specified OpenMP parallel.
Decl - This represents one declaration (or definition), e.g.
Definition: DeclBase.h:86
specific_attr_iterator - Iterates over a subrange of an AttrVec, only providing attributes that are o...
Definition: AttrIterator.h:54
Address getParameterAddress(CodeGenFunction &CGF, const VarDecl *NativeParam, const VarDecl *TargetParam) const override
Gets the address of the native argument basing on the address of the target-specific parameter...
llvm::Value * getTypeSize(QualType Ty)
Returns calculated size of the specified type.
static bool stable_sort_comparator(const PrivateDataTy P1, const PrivateDataTy P2)
This represents &#39;if&#39; clause in the &#39;#pragma omp ...&#39; directive.
Definition: OpenMPClause.h:242
llvm::Value * ScratchpadIndex
CapturedStmt * getInnermostCapturedStmt()
Get innermost captured statement for the construct.
Definition: StmtOpenMP.h:228
static llvm::Value * castValueToType(CodeGenFunction &CGF, llvm::Value *Val, QualType ValTy, QualType CastTy, SourceLocation Loc)
Cast value to the specified type.
QualType getNonReferenceType() const
If Type is a reference type (e.g., const int&), returns the type that the reference refers to ("const...
Definition: Type.h:6062
The base class of the type hierarchy.
Definition: Type.h:1428
bool isZero() const
isZero - Test whether the quantity equals zero.
Definition: CharUnits.h:116
The l-value was an access to a declared entity or something equivalently strong, like the address of ...
static bool hasNestedSPMDDirective(ASTContext &Ctx, const OMPExecutableDirective &D)
Check for inner (nested) SPMD construct, if any.
Address EmitLoadOfPointer(Address Ptr, const PointerType *PtrTy, LValueBaseInfo *BaseInfo=nullptr, TBAAAccessInfo *TBAAInfo=nullptr)
Definition: CGExpr.cpp:2277
Describes the capture of a variable or of this, or of a C++1y init-capture.
Definition: LambdaCapture.h:26
llvm::IntegerType * Int8Ty
i8, i16, i32, and i64
Address GetAddrOfLocalVar(const VarDecl *VD)
GetAddrOfLocalVar - Return the address of a local variable.
Represents a variable declaration or definition.
Definition: Decl.h:814
llvm::Value * getThreadID(CodeGenFunction &CGF, SourceLocation Loc)
Gets thread id value for the current thread.
LangAS getLangASFromTargetAS(unsigned TargetAS)
Definition: AddressSpaces.h:67
This represents &#39;num_threads&#39; clause in the &#39;#pragma omp ...&#39; directive.
Definition: OpenMPClause.h:384
virtual void emitOutlinedFunctionCall(CodeGenFunction &CGF, SourceLocation Loc, llvm::Value *OutlinedFn, ArrayRef< llvm::Value *> Args=llvm::None) const
Emits call of the outlined function with the provided arguments, translating these arguments to corre...
llvm::Value * getPointer() const
Definition: Address.h:38
unsigned getAddressSpace() const
Return the address space that this address resides in.
Definition: Address.h:57
SPMD execution mode (all threads are worker threads).
IdentifierInfo * getIdentifier() const
Get the identifier that names this declaration, if there is one.
Definition: Decl.h:269
Represents a struct/union/class.
Definition: Decl.h:3570
DataSharingMode
Target codegen is specialized based on two data-sharing modes: CUDA, in which the local variables are...
Address getAddress() const
Definition: CGValue.h:327
CodeGenFunction - This class organizes the per-function state that is used while generating LLVM code...
llvm::Type * ConvertType(QualType T)
ConvertType - Convert type T into a llvm::Type.
Holds long-lived AST nodes (such as types and decls) that can be referred to throughout the semantic ...
Definition: ASTContext.h:150
attr_iterator attr_end() const
Definition: DeclBase.h:504
The scope used to remap some variables as private in the OpenMP loop body (or other captured region e...
Represents a member of a struct/union/class.
Definition: Decl.h:2534
const CapturedStmt * getCapturedStmt(OpenMPDirectiveKind RegionKind) const
Returns the captured statement associated with the component region within the (combined) directive...
Definition: StmtOpenMP.h:211
static llvm::Value * getMasterThreadID(CodeGenFunction &CGF)
Get the thread id of the OMP master thread.
llvm::CallInst * EmitRuntimeCall(llvm::Value *callee, const Twine &name="")
void startDefinition()
Starts the definition of this tag declaration.
Definition: Decl.cpp:3791
bool isReferenceType() const
Definition: Type.h:6125
void functionFinished(CodeGenFunction &CGF) override
Cleans up references to the objects in finished function.
OpenMPDirectiveKind getDirectiveKind() const
Definition: StmtOpenMP.h:246
Expr * getSubExpr()
Definition: Expr.h:2892
void InitTempAlloca(Address Alloca, llvm::Value *Value)
InitTempAlloca - Provide an initial value for the given alloca which will be observable at all locati...
Definition: CGExpr.cpp:126
void EmitStoreOfScalar(llvm::Value *Value, Address Addr, bool Volatile, QualType Ty, AlignmentSource Source=AlignmentSource::Type, bool isInit=false, bool isNontemporal=false)
EmitStoreOfScalar - Store a scalar value to an address, taking care to appropriately convert from the...
bool EvaluateAsBooleanCondition(bool &Result, const ASTContext &Ctx) const
EvaluateAsBooleanCondition - Return true if this is a constant which we can fold and convert to a boo...
Address CreateElementBitCast(Address Addr, llvm::Type *Ty, const llvm::Twine &Name="")
Cast the element type of the given address to a different type, preserving information like the align...
Definition: CGBuilder.h:157
CharUnits - This is an opaque type for sizes expressed in character units.
Definition: CharUnits.h:38
bool isOpenMPTeamsDirective(OpenMPDirectiveKind DKind)
Checks if the specified directive is a teams-kind directive.
uint32_t Offset
Definition: CacheTokens.cpp:43
CharUnits getAlignment() const
Return the alignment of this pointer.
Definition: Address.h:67
child_range children()
Definition: Stmt.cpp:227
virtual void emitNumThreadsClause(CodeGenFunction &CGF, llvm::Value *NumThreads, SourceLocation Loc) override
Emits call to void __kmpc_push_num_threads(ident_t *loc, kmp_int32 global_tid, kmp_int32 num_threads)...
SourceLocation getLocStart() const LLVM_READONLY
Returns starting location of directive kind.
Definition: StmtOpenMP.h:168
void emitNumTeamsClause(CodeGenFunction &CGF, const Expr *NumTeams, const Expr *ThreadLimit, SourceLocation Loc) override
This function ought to emit, in the general case, a call to.
CharUnits getDeclAlign(const Decl *D, bool ForAlignof=false) const
Return a conservative estimate of the alignment of the specified decl D.
static llvm::Value * getNVPTXWarpID(CodeGenFunction &CGF)
Get the id of the warp in the block.
Scope - A scope is a transient data structure that is used while parsing the program.
Definition: Scope.h:40
static CGOpenMPRuntimeNVPTX::DataSharingMode getDataSharingMode(CodeGenModule &CGM)
llvm::BasicBlock * createBasicBlock(const Twine &name="", llvm::Function *parent=nullptr, llvm::BasicBlock *before=nullptr)
createBasicBlock - Create an LLVM basic block.
void emitParallelCall(CodeGenFunction &CGF, SourceLocation Loc, llvm::Value *OutlinedFn, ArrayRef< llvm::Value *> CapturedVars, const Expr *IfCond) override
Emits code for parallel or serial call of the OutlinedFn with variables captured in a record which ad...
void addCompilerUsedGlobal(llvm::GlobalValue *GV)
Add a global to a list to be added to the llvm.compiler.used metadata.
llvm::Value * emitReductionFunction(CodeGenModule &CGM, SourceLocation Loc, llvm::Type *ArgsType, ArrayRef< const Expr *> Privates, ArrayRef< const Expr *> LHSExprs, ArrayRef< const Expr *> RHSExprs, ArrayRef< const Expr *> ReductionOps)
Emits reduction function.
llvm::Value * emitUpdateLocation(CodeGenFunction &CGF, SourceLocation Loc, unsigned Flags=0)
Emits object of ident_t type with info for source location.
A C++ lambda expression, which produces a function object (of unspecified type) that can be invoked l...
Definition: ExprCXX.h:1649
virtual llvm::Value * emitTeamsOutlinedFunction(const OMPExecutableDirective &D, const VarDecl *ThreadIDVar, OpenMPDirectiveKind InnermostKind, const RegionCodeGenTy &CodeGen)
Emits outlined function for the specified OpenMP teams directive D.
bool isInitCapture(const LambdaCapture *Capture) const
Determine whether one of this lambda&#39;s captures is an init-capture.
Definition: ExprCXX.cpp:971
static llvm::Value * createRuntimeShuffleFunction(CodeGenFunction &CGF, llvm::Value *Elem, QualType ElemType, llvm::Value *Offset, SourceLocation Loc)
This function creates calls to one of two shuffle functions to copy variables between lanes in a warp...
virtual Decl * getCanonicalDecl()
Retrieves the "canonical" declaration of the given declaration.
Definition: DeclBase.h:877
static const Stmt * getSingleCompoundChild(const Stmt *Body)
Checks if the Body is the CompoundStmt and returns its child statement iff there is only one...
LValue EmitLValueForField(LValue Base, const FieldDecl *Field)
Definition: CGExpr.cpp:3807
llvm::Constant * CreateRuntimeFunction(llvm::FunctionType *Ty, StringRef Name, llvm::AttributeList ExtraAttrs=llvm::AttributeList(), bool Local=false)
Create a new runtime function with the specified type and name.
static void syncCTAThreads(CodeGenFunction &CGF)
Synchronize all GPU threads in a block.
llvm::Value * EmitLoadOfScalar(Address Addr, bool Volatile, QualType Ty, SourceLocation Loc, AlignmentSource Source=AlignmentSource::Type, bool isNontemporal=false)
EmitLoadOfScalar - Load a scalar value from an address, taking care to appropriately convert from the...
static ImplicitParamDecl * Create(ASTContext &C, DeclContext *DC, SourceLocation IdLoc, IdentifierInfo *Id, QualType T, ImplicitParamKind ParamKind)
Create implicit parameter.
Definition: Decl.cpp:4290
Describes the capture of either a variable, or &#39;this&#39;, or variable-length array type.
Definition: Stmt.h:2138
bool isOpenMPPrivate(OpenMPClauseKind Kind)
Checks if the specified clause is one of private clauses like &#39;private&#39;, &#39;firstprivate&#39;, &#39;reduction&#39; etc.
QuantityType getQuantity() const
getQuantity - Get the raw integer representation of this quantity.
Definition: CharUnits.h:179
TypeSourceInfo * getTrivialTypeSourceInfo(QualType T, SourceLocation Loc=SourceLocation()) const
Allocate a TypeSourceInfo where all locations have been initialized to a given location, which defaults to the empty location.
Address CreateDefaultAlignTempAlloca(llvm::Type *Ty, const Twine &Name="tmp")
CreateDefaultAlignedTempAlloca - This creates an alloca with the default ABI alignment of the given L...
Definition: CGExpr.cpp:119
const Stmt * getAssociatedStmt() const
Returns statement associated with the directive.
Definition: StmtOpenMP.h:198
Represent the declaration of a variable (in which case it is an lvalue) a function (in which case it ...
Definition: Decl.h:637
Expr - This represents one expression.
Definition: Expr.h:106
virtual llvm::Value * emitParallelOutlinedFunction(const OMPExecutableDirective &D, const VarDecl *ThreadIDVar, OpenMPDirectiveKind InnermostKind, const RegionCodeGenTy &CodeGen)
Emits outlined function for the specified OpenMP parallel directive D.
static Address invalid()
Definition: Address.h:35
Enters a new scope for capturing cleanups, all of which will be executed once the scope is exited...
Address getAddressOfLocalVariable(CodeGenFunction &CGF, const VarDecl *VD) override
Gets the OpenMP-specific address of the local variable.
Stmt * IgnoreContainers(bool IgnoreCaptured=false)
Skip no-op (attributed, compound) container stmts and skip captured stmt at the top, if IgnoreCaptured is true.
Definition: Stmt.cpp:133
bool isOpenMPParallelDirective(OpenMPDirectiveKind DKind)
Checks if the specified directive is a parallel-kind directive.
const CGFunctionInfo & arrangeNullaryFunction()
A nullary function is a freestanding function of type &#39;void ()&#39;.
Definition: CGCall.cpp:695
BlockExpr - Adaptor class for mixing a BlockDecl with expressions.
Definition: Expr.h:5051
const Expr * getCallee() const
Definition: Expr.h:2356
VlaSizePair getVLASize(const VariableArrayType *vla)
Returns an LLVM value that corresponds to the size, in non-variably-sized elements, of a variable length array type, plus that largest non-variably-sized element type.
llvm::PointerType * getType() const
Return the type of the pointer value.
Definition: Address.h:44
CharUnits getTypeAlignInChars(QualType T) const
Return the ABI-specified alignment of a (complete) type T, in characters.
DeclContext * getDeclContext()
Definition: DeclBase.h:428
static llvm::iterator_range< specific_clause_iterator< SpecificClause > > getClausesOfKind(ArrayRef< OMPClause *> Clauses)
Definition: StmtOpenMP.h:130
static CharUnits fromQuantity(QuantityType Quantity)
fromQuantity - Construct a CharUnits quantity from a raw integer type.
Definition: CharUnits.h:63
CanQualType ShortTy
Definition: ASTContext.h:1013
QualType getConstantArrayType(QualType EltTy, const llvm::APInt &ArySize, ArrayType::ArraySizeModifier ASM, unsigned IndexTypeQuals) const
Return the unique reference to the type for a constant array of the specified element type...
QualType getRecordType(const RecordDecl *Decl) const
UnaryOperator - This represents the unary-expression&#39;s (except sizeof and alignof), the postinc/postdec operators from postfix-expression, and various extensions.
Definition: Expr.h:1805
MachineConfiguration
GPU Configuration: This information can be derived from cuda registers, however, providing compile ti...
ValueDecl * getDecl()
Definition: Expr.h:1059
const LangOptions & getLangOpts() const
ASTContext & getContext() const
OpenMPProcBindClauseKind
OpenMP attributes for &#39;proc_bind&#39; clause.
Definition: OpenMPKinds.h:51
static llvm::Value * emitReduceScratchpadFunction(CodeGenModule &CGM, ArrayRef< const Expr *> Privates, QualType ReductionArrayTy, llvm::Value *ReduceFn, SourceLocation Loc)
This function emits a helper that loads data from the scratchpad array and (optionally) reduces it wi...
Non-SPMD execution mode (1 master thread, others are workers).
llvm::Value * ScratchpadWidth
VarDecl * getCanonicalDecl() override
Retrieves the "canonical" declaration of the given declaration.
Definition: Decl.cpp:2006
GlobalDecl - represents a global declaration.
Definition: GlobalDecl.h:35
bool hasClausesOfKind() const
Returns true if the current directive has one or more clauses of a specific kind. ...
Definition: StmtOpenMP.h:162
AttrVec & getAttrs()
Definition: DeclBase.h:480
bool hasAttrs() const
Definition: DeclBase.h:474
The l-value was considered opaque, so the alignment was determined from a type.
llvm::Value * emitTeamsOutlinedFunction(const OMPExecutableDirective &D, const VarDecl *ThreadIDVar, OpenMPDirectiveKind InnermostKind, const RegionCodeGenTy &CodeGen) override
Emits inlined function for the specified OpenMP teams.
SourceLocation getLocStart() const LLVM_READONLY
Definition: DeclBase.h:409
Address CreateBitCast(Address Addr, llvm::Type *Ty, const llvm::Twine &Name="")
Definition: CGBuilder.h:142
void emitOutlinedFunctionCall(CodeGenFunction &CGF, SourceLocation Loc, llvm::Value *OutlinedFn, ArrayRef< llvm::Value *> Args=llvm::None) const override
Emits call of the outlined function with the provided arguments, translating these arguments to corre...
This captures a statement into a function.
Definition: Stmt.h:2125
void emitTeamsCall(CodeGenFunction &CGF, const OMPExecutableDirective &D, SourceLocation Loc, llvm::Value *OutlinedFn, ArrayRef< llvm::Value *> CapturedVars) override
Emits code for teams call of the OutlinedFn with variables captured in a record which address is stor...
void emitOMPIfClause(CodeGenFunction &CGF, const Expr *Cond, const RegionCodeGenTy &ThenGen, const RegionCodeGenTy &ElseGen)
Emits code for OpenMP &#39;if&#39; clause using specified CodeGen function.
Encodes a location in the source.
static llvm::Value * getThreadLimit(CodeGenFunction &CGF, bool IsInSPMDExecutionMode=false)
Get the value of the thread_limit clause in the teams directive.
QualType getUIntPtrType() const
Return a type compatible with "uintptr_t" (C99 7.18.1.4), as defined by the target.
Expr * getSubExpr() const
Definition: Expr.h:1832
bool isVariablyModifiedType() const
Whether this type is a variably-modified type (C99 6.7.5).
Definition: Type.h:1958
void emitCriticalRegion(CodeGenFunction &CGF, StringRef CriticalName, const RegionCodeGenTy &CriticalOpGen, SourceLocation Loc, const Expr *Hint=nullptr) override
Emits a critical region.
This is a basic class for representing single OpenMP executable directive.
Definition: StmtOpenMP.h:33
CastKind getCastKind() const
Definition: Expr.h:2886
DeclStmt - Adaptor class for mixing declarations with statements and expressions. ...
Definition: Stmt.h:503
OpenMPDirectiveKind
OpenMP directives.
Definition: OpenMPKinds.h:23
This file defines OpenMP nodes for declarative directives.
This is a basic class for representing single OpenMP clause.
Definition: OpenMPClause.h:51
CanQualType VoidTy
Definition: ASTContext.h:1004
bool isOpenMPLoopBoundSharingDirective(OpenMPDirectiveKind Kind)
Checks if the specified directive kind is one of the composite or combined directives that need loop ...
arg_range arguments()
Definition: Expr.h:2410
static void setPropertyExecutionMode(CodeGenModule &CGM, StringRef Name, bool Mode)
An aligned address.
Definition: Address.h:25
void StartFunction(GlobalDecl GD, QualType RetTy, llvm::Function *Fn, const CGFunctionInfo &FnInfo, const FunctionArgList &Args, SourceLocation Loc=SourceLocation(), SourceLocation StartLoc=SourceLocation())
Emit code for the start of a function.
ImplicitCastExpr - Allows us to explicitly represent implicit type conversions, which have no direct ...
Definition: Expr.h:2961
Stmt * getCapturedStmt()
Retrieve the statement being captured.
Definition: Stmt.h:2226
OpenMPRTLFunctionNVPTX
bool isLValue() const
isLValue - True if this expression is an "l-value" according to the rules of the current language...
Definition: Expr.h:249
virtual void emitProcBindClause(CodeGenFunction &CGF, OpenMPProcBindClauseKind ProcBind, SourceLocation Loc)
Emit call to void __kmpc_push_proc_bind(ident_t *loc, kmp_int32 global_tid, int proc_bind) to generat...
QualType getType() const
Definition: CGValue.h:264
virtual void functionFinished(CodeGenFunction &CGF)
Cleans up references to the objects in finished function.
const VarDecl * translateParameter(const FieldDecl *FD, const VarDecl *NativeParam) const override
Translates the native parameter of outlined function if this is required for target.
void FinishFunction(SourceLocation EndLoc=SourceLocation())
FinishFunction - Complete IR generation of the current function.
FunctionArgList - Type for representing both the decl and type of parameters to a function...
Definition: CGCall.h:356
void setAction(PrePostActionTy &Action) const
CGFunctionInfo - Class to encapsulate the information about a function definition.
This class organizes the cross-function state that is used while generating LLVM code.
CGOpenMPRuntime & getOpenMPRuntime()
Return a reference to the configured OpenMP runtime.
static ParmVarDecl * Create(ASTContext &C, DeclContext *DC, SourceLocation StartLoc, SourceLocation IdLoc, IdentifierInfo *Id, QualType T, TypeSourceInfo *TInfo, StorageClass S, Expr *DefArg)
Definition: Decl.cpp:2501
Dataflow Directional Tag Classes.
Class provides a way to call simple version of codegen for OpenMP region, or an advanced with possibl...
static void getNVPTXCTABarrier(CodeGenFunction &CGF)
Get barrier to synchronize all threads in a block.
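This presumably lowers to the PTX block-wide barrier; a minimal sketch assuming the llvm.nvvm.barrier0 intrinsic:
static void getNVPTXCTABarrier(CodeGenFunction &CGF) {
  CGBuilderTy &Bld = CGF.Builder;
  // bar.sync 0: every thread in the CTA (block) waits here.
  Bld.CreateCall(llvm::Intrinsic::getDeclaration(
      &CGF.CGM.getModule(), llvm::Intrinsic::nvvm_barrier0));
}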
A qualifier set is used to build a set of qualifiers.
Definition: Type.h:5856
DeclContext - This is used only as base class of specific decl types that can act as declaration cont...
Definition: DeclBase.h:1264
ArrayRef< Capture > captures() const
Definition: Decl.h:3990
A basic class for pre- and post-actions in an advanced codegen sequence for an OpenMP region.
llvm::LoadInst * CreateLoad(Address Addr, const llvm::Twine &Name="")
Definition: CGBuilder.h:70
bool isOpenMPSimdDirective(OpenMPDirectiveKind DKind)
Checks if the specified directive is a simd directive.
static void emitReductionListCopy(CopyAction Action, CodeGenFunction &CGF, QualType ReductionArrayTy, ArrayRef< const Expr *> Privates, Address SrcBase, Address DestBase, CopyOptionsTy CopyOptions={nullptr, nullptr, nullptr})
Emit instructions to copy a Reduce list, which contains partially aggregated values, in the specified direction.
const Type * strip(QualType type)
Collect any qualifiers on the given type and return an unqualified type.
Definition: Type.h:5863
SourceLocation getLocStart() const LLVM_READONLY
Definition: Decl.h:739
Address CreateConstInBoundsGEP(Address Addr, uint64_t Index, CharUnits EltSize, const llvm::Twine &Name="")
Given addr = T* ...
Definition: CGBuilder.h:211
llvm::StoreInst * CreateStore(llvm::Value *Val, Address Addr, bool IsVolatile=false)
Definition: CGBuilder.h:108
bool isInitCapture() const
Whether this variable is the implicit variable for a lambda init-capture.
Definition: Decl.h:1392
llvm::Module & getModule() const
QualType apply(const ASTContext &Context, QualType QT) const
Apply the collected qualifiers to the given type.
Definition: Type.cpp:3385
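The strip/apply pair above is typically used as a round-trip: collect the qualifiers off a type, rebuild the unqualified part, then re-apply the (possibly augmented) qualifier set. An illustrative fragment; Ctx and ArgTy are hypothetical names for an ASTContext and a QualType in scope.
QualifierCollector QC;
const Type *Bare = QC.strip(ArgTy);        // qualifiers now live in QC
QualType Rebuilt = QualType(Bare, /*Quals=*/0);
QC.addRestrict();                          // augment the collected set
Rebuilt = QC.apply(Ctx, Rebuilt);          // restrict + original qualifiers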
LValue MakeAddrLValue(Address Addr, QualType T, AlignmentSource Source=AlignmentSource::Type)
virtual Address emitThreadIDAddress(CodeGenFunction &CGF, SourceLocation Loc)
Emits address of the word in a memory where current thread id is stored.
void getOpenMPCaptureRegions(llvm::SmallVectorImpl< OpenMPDirectiveKind > &CaptureRegions, OpenMPDirectiveKind DKind)
Return the captured regions of an OpenMP directive.
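For example (a hypothetical query; the exact expansion lives in OpenMPKinds.cpp), a combined directive splits into one captured region per component directive:
llvm::SmallVector<OpenMPDirectiveKind, 4> CaptureRegions;
getOpenMPCaptureRegions(CaptureRegions, OMPD_target_teams);
// Expect one entry per component, e.g. {OMPD_target, OMPD_teams},
// each of which gets its own outlined CapturedStmt.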
static void syncParallelThreads(CodeGenFunction &CGF, llvm::Value *NumThreads)
Synchronize worker threads in a parallel region.
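A sketch assuming the numbered llvm.nvvm.barrier intrinsic, which synchronizes only the first NumThreads threads at a given barrier id:
static void syncParallelThreads(CodeGenFunction &CGF, llvm::Value *NumThreads) {
  CGBuilderTy &Bld = CGF.Builder;
  // bar.sync <id>, <n>: only the first NumThreads threads of the CTA
  // participate, leaving the master warp out of the rendezvous.
  Bld.CreateCall(llvm::Intrinsic::getDeclaration(
                     &CGF.CGM.getModule(), llvm::Intrinsic::nvvm_barrier),
                 {Bld.getInt32(/*BarrierID=*/1), NumThreads});
}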
bool isOpenMPDistributeDirective(OpenMPDirectiveKind DKind)
Checks if the specified directive is a distribute directive.
llvm::Constant * createNVPTXRuntimeFunction(unsigned Function)
Returns specified OpenMP runtime function for the current OpenMP implementation.
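Its shape is a switch over the OpenMPRTLFunctionNVPTX enumerators, building an llvm::FunctionType for each entry point and asking CGM for the declaration. A sketch of one case; the remaining cases are elided.
llvm::Constant *
CGOpenMPRuntimeNVPTX::createNVPTXRuntimeFunction(unsigned Function) {
  llvm::Constant *RTLFn = nullptr;
  switch (static_cast<OpenMPRTLFunctionNVPTX>(Function)) {
  case OMPRTL_NVPTX__kmpc_kernel_init: {
    // void __kmpc_kernel_init(kmp_int32 thread_limit,
    //                         int16_t RequiresOMPRuntime);
    llvm::Type *TypeParams[] = {CGM.Int32Ty, CGM.Int16Ty};
    auto *FnTy =
        llvm::FunctionType::get(CGM.VoidTy, TypeParams, /*isVarArg=*/false);
    RTLFn = CGM.CreateRuntimeFunction(FnTy, "__kmpc_kernel_init");
    break;
  }
  // ... remaining entry points follow the same pattern ...
  }
  return RTLFn;
}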
virtual void emitReduction(CodeGenFunction &CGF, SourceLocation Loc, ArrayRef< const Expr *> Privates, ArrayRef< const Expr *> LHSExprs, ArrayRef< const Expr *> RHSExprs, ArrayRef< const Expr *> ReductionOps, ReductionOptionsTy Options) override
Emit a code for reduction clause.
This file defines OpenMP AST classes for executable directives and clauses.
Address CreateConstArrayGEP(Address Addr, uint64_t Index, CharUnits EltSize, const llvm::Twine &Name="")
Given addr = [n x T]* ...
Definition: CGBuilder.h:195
bool isIntegerType() const
isIntegerType() does not include complex integers (a GCC extension).
Definition: Type.h:6374
void addRestrict()
Definition: Type.h:290
T * getAttr() const
Definition: DeclBase.h:534
llvm::Type * getElementType() const
Return the type of the values stored in this address.
Definition: Address.h:52
Opcode getOpcode() const
Definition: Expr.h:1829
decl_range decls()
Definition: Stmt.h:551
void SetInternalFunctionAttributes(GlobalDecl GD, llvm::Function *F, const CGFunctionInfo &FI)
Set the attributes on the LLVM function for the given decl and function info.
static bool supportsSPMDExecutionMode(ASTContext &Ctx, const OMPExecutableDirective &D)
Internal linkage, which indicates that the entity can be referred to from within the translation unit...
Definition: Linkage.h:32
void EmitBlock(llvm::BasicBlock *BB, bool IsFinished=false)
EmitBlock - Emit the given block.
Definition: CGStmt.cpp:445
bool hasAssociatedStmt() const
Returns true if directive has associated statement.
Definition: StmtOpenMP.h:195
ExecutionMode
Defines the execution mode.
void emitFunctionProlog(CodeGenFunction &CGF, const Decl *D) override
Emits OpenMP-specific function prolog.
bool isLValueReferenceType() const
Definition: Type.h:6129
static void shuffleAndStore(CodeGenFunction &CGF, Address SrcAddr, Address DestAddr, QualType ElemType, llvm::Value *Offset, SourceLocation Loc)
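No brief is attached above; a condensed sketch of what a shuffle-and-store helper plausibly does for an element that fits in 64 bits. createRuntimeShuffleFunction here stands in for a helper that dispatches to __kmpc_shuffle_int32/__kmpc_shuffle_int64; wider aggregates would be moved in 64-bit chunks.
static void shuffleAndStore(CodeGenFunction &CGF, Address SrcAddr,
                            Address DestAddr, QualType ElemType,
                            llvm::Value *Offset, SourceLocation Loc) {
  // Load the local element, pull the copy held by the lane Offset lanes
  // away within the warp, and store it to the destination slot.
  llvm::Value *Elem =
      CGF.EmitLoadOfScalar(SrcAddr, /*Volatile=*/false, ElemType, Loc);
  llvm::Value *Remote =
      createRuntimeShuffleFunction(CGF, Elem, ElemType, Offset, Loc);
  CGF.EmitStoreOfScalar(Remote, DestAddr, /*Volatile=*/false, ElemType);
}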
static llvm::Value * emitShuffleAndReduceFunction(CodeGenModule &CGM, ArrayRef< const Expr *> Privates, QualType ReductionArrayTy, llvm::Value *ReduceFn, SourceLocation Loc)
Emit a helper that reduces data across two OpenMP threads (lanes) in the same warp.
int64_t toBits(CharUnits CharSize) const
Convert a size in characters to a size in bits.
virtual void emitReduction(CodeGenFunction &CGF, SourceLocation Loc, ArrayRef< const Expr *> Privates, ArrayRef< const Expr *> LHSExprs, ArrayRef< const Expr *> RHSExprs, ArrayRef< const Expr *> ReductionOps, ReductionOptionsTy Options)
Emit a code for reduction clause.
bool hasSignedIntegerRepresentation() const
Determine whether this type has a signed integer representation of some sort, e.g., it is a signed integer type or a vector.
Definition: Type.cpp:1878
void EmitBranch(llvm::BasicBlock *Block)
EmitBranch - Emit a branch to the specified basic block from the current insert block, taking care to avoid creation of branches from dummy blocks.
Definition: CGStmt.cpp:465
Privates[]
Gets the list of initial values for linear variables.
Definition: OpenMPClause.h:145
virtual void emitProcBindClause(CodeGenFunction &CGF, OpenMPProcBindClauseKind ProcBind, SourceLocation Loc) override
Emit call to void __kmpc_push_proc_bind(ident_t *loc, kmp_int32 global_tid, int proc_bind) to generat...
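A plausible reading of the NVPTX override: proc_bind has no meaning for a level-0 parallel in SPMD mode, so it likely early-outs there and defers to the base implementation otherwise (a sketch, assuming a getExecutionMode() accessor and the EM_SPMD enumerator of the ExecutionMode listed above).
void CGOpenMPRuntimeNVPTX::emitProcBindClause(
    CodeGenFunction &CGF, OpenMPProcBindClauseKind ProcBind,
    SourceLocation Loc) {
  // Nothing to bind in SPMD mode: the team layout is fixed by the kernel.
  if (getExecutionMode() == CGOpenMPRuntimeNVPTX::EM_SPMD)
    return;
  CGOpenMPRuntime::emitProcBindClause(CGF, ProcBind, Loc);
}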
Capturing by reference.
Definition: Lambda.h:38
LValue EmitLValue(const Expr *E)
EmitLValue - Emit code to compute a designator that specifies the location of the expression...
Definition: CGExpr.cpp:1199
QualType getPointerType(QualType T) const
Return the uniqued reference to the type for a pointer to the specified type.
capture_range captures() const
Retrieve this lambda's captures.
Definition: ExprCXX.cpp:984
CallExpr - Represents a function call (C99 6.5.2.2, C++ [expr.call]).
Definition: Expr.h:2316
static llvm::Value * getNVPTXThreadID(CodeGenFunction &CGF)
Get the id of the current thread on the GPU.
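These GPU queries presumably read the PTX special registers; a minimal sketch via the NVVM intrinsic (the sibling helpers for block size and warp size would read the ntid.x and warpsize registers the same way):
static llvm::Value *getNVPTXThreadID(CodeGenFunction &CGF) {
  CGBuilderTy &Bld = CGF.Builder;
  // Reads the tid.x special register: the thread's index within its CTA.
  return Bld.CreateCall(
      llvm::Intrinsic::getDeclaration(
          &CGF.CGM.getModule(), llvm::Intrinsic::nvvm_read_ptx_sreg_tid_x),
      llvm::None, "nvptx_tid");
}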
CGCapturedStmtInfo * CapturedStmtInfo
const VariableArrayType * getAsVariableArrayType(QualType T) const
Definition: ASTContext.h:2398
static llvm::Value * getNVPTXWarpSize(CodeGenFunction &CGF)
Get the GPU warp size.
CanQualType IntTy
Definition: ASTContext.h:1013
llvm::Value * RemoteLaneOffset
void EmitAggregateCopy(LValue Dest, LValue Src, QualType EltTy, AggValueSlot::Overlap_t MayOverlap, bool isVolatile=false)
EmitAggregateCopy - Emit an aggregate copy.
Definition: CGExprAgg.cpp:1818
capture_range captures()
Definition: Stmt.h:2260
A reference to a declared variable, function, enum, etc.
Definition: Expr.h:974
void addAddressSpace(LangAS space)
Definition: Type.h:395
static llvm::Value * emitInterWarpCopyFunction(CodeGenModule &CGM, ArrayRef< const Expr *> Privates, QualType ReductionArrayTy, SourceLocation Loc)
This function emits a helper that gathers Reduce lists from the first lane of every active warp to la...
CharUnits getTypeSizeInChars(QualType T) const
Return the size of the specified (complete) type T, in characters.
static ApplyDebugLocation CreateEmpty(CodeGenFunction &CGF)
Set the IRBuilder to not attach debug locations.
Definition: CGDebugInfo.h:690
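Typical usage is RAII-scoped around compiler-synthesized code so it carries no misleading source locations (illustrative):
{
  // No debug locations while synthesized helper IR is materialized.
  ApplyDebugLocation DL = ApplyDebugLocation::CreateEmpty(CGF);
  // ... emit synthesized IR here ...
}   // previous debug location restored on scope exit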
QualType getType() const
Definition: Decl.h:648
LValue - This represents an lvalue reference.
Definition: CGValue.h:167
Information for lazily generating a cleanup.
Definition: EHScopeStack.h:147
CanQualType BoolTy
Definition: ASTContext.h:1005
unsigned getTargetAddressSpace(QualType T) const
Definition: ASTContext.h:2481
static FieldDecl * Create(const ASTContext &C, DeclContext *DC, SourceLocation StartLoc, SourceLocation IdLoc, IdentifierInfo *Id, QualType T, TypeSourceInfo *TInfo, Expr *BW, bool Mutable, InClassInitStyle InitStyle)
Definition: Decl.cpp:3692
Address CreatePointerBitCastOrAddrSpaceCast(Address Addr, llvm::Type *Ty, const llvm::Twine &Name="")
Definition: CGBuilder.h:164
static llvm::Value * getNVPTXNumThreads(CodeGenFunction &CGF)
Get the maximum number of threads in a block of the GPU.
No in-class initializer.
Definition: Specifiers.h:230
llvm::Value * getPointer() const
Definition: CGValue.h:323
virtual void emitNumThreadsClause(CodeGenFunction &CGF, llvm::Value *NumThreads, SourceLocation Loc)
Emits call to void __kmpc_push_num_threads(ident_t *loc, kmp_int32 global_tid, kmp_int32 num_threads)...
Attr - This represents one attribute.
Definition: Attr.h:43
SourceLocation getLocation() const
Definition: DeclBase.h:419
QualType getIntTypeForBitwidth(unsigned DestWidth, unsigned Signed) const
getIntTypeForBitwidth - returns the integer QualType matching the specified details: bitwidth and signedness.
static OMPLinearClause * Create(const ASTContext &C, SourceLocation StartLoc, SourceLocation LParenLoc, OpenMPLinearClauseKind Modifier, SourceLocation ModifierLoc, SourceLocation ColonLoc, SourceLocation EndLoc, ArrayRef< Expr *> VL, ArrayRef< Expr *> PL, ArrayRef< Expr *> IL, Expr *Step, Expr *CalcStep, Stmt *PreInit, Expr *PostUpdate)
Creates clause with a list of variables VL and a linear step Step.
CanQualType getSizeType() const
Return the unique type for "size_t" (C99 7.17), defined in <stddef.h>.
llvm::FunctionType * GetFunctionType(const CGFunctionInfo &Info)
GetFunctionType - Get the LLVM function type for the given function info.
Definition: CGCall.cpp:1544