//===---- CGOpenMPRuntimeNVPTX.cpp - Interface to OpenMP NVPTX Runtimes ---===//
//
//                     The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
//
// This provides a class for OpenMP runtime code generation specialized to NVPTX
// targets.
//
//===----------------------------------------------------------------------===//

#include "CGOpenMPRuntimeNVPTX.h"
#include "CodeGenFunction.h"
#include "clang/AST/DeclOpenMP.h"
#include "clang/AST/StmtOpenMP.h"

using namespace clang;
using namespace CodeGen;

namespace {
enum OpenMPRTLFunctionNVPTX {
  /// \brief Call to void __kmpc_kernel_init(kmp_int32 thread_limit);
  OMPRTL_NVPTX__kmpc_kernel_init,
  /// \brief Call to void __kmpc_kernel_deinit();
  OMPRTL_NVPTX__kmpc_kernel_deinit,
  /// \brief Call to void __kmpc_spmd_kernel_init(kmp_int32 thread_limit,
  /// short RequiresOMPRuntime, short RequiresDataSharing);
  OMPRTL_NVPTX__kmpc_spmd_kernel_init,
  /// \brief Call to void __kmpc_spmd_kernel_deinit();
  OMPRTL_NVPTX__kmpc_spmd_kernel_deinit,
  /// \brief Call to void __kmpc_kernel_prepare_parallel(void
  /// *outlined_function);
  OMPRTL_NVPTX__kmpc_kernel_prepare_parallel,
  /// \brief Call to bool __kmpc_kernel_parallel(void **outlined_function);
  OMPRTL_NVPTX__kmpc_kernel_parallel,
  /// \brief Call to void __kmpc_kernel_end_parallel();
  OMPRTL_NVPTX__kmpc_kernel_end_parallel,
  /// \brief Call to void __kmpc_serialized_parallel(ident_t *loc, kmp_int32
  /// global_tid);
  OMPRTL_NVPTX__kmpc_serialized_parallel,
  /// \brief Call to void __kmpc_end_serialized_parallel(ident_t *loc,
  /// kmp_int32 global_tid);
  OMPRTL_NVPTX__kmpc_end_serialized_parallel,
  /// \brief Call to int32_t __kmpc_shuffle_int32(int32_t element,
  /// int16_t lane_offset, int16_t warp_size);
  OMPRTL_NVPTX__kmpc_shuffle_int32,
  /// \brief Call to int64_t __kmpc_shuffle_int64(int64_t element,
  /// int16_t lane_offset, int16_t warp_size);
  OMPRTL_NVPTX__kmpc_shuffle_int64,
  /// \brief Call to __kmpc_nvptx_parallel_reduce_nowait(kmp_int32
  /// global_tid, kmp_int32 num_vars, size_t reduce_size, void* reduce_data,
  /// void (*kmp_ShuffleReductFctPtr)(void *rhsData, int16_t lane_id, int16_t
  /// lane_offset, int16_t shortCircuit),
  /// void (*kmp_InterWarpCopyFctPtr)(void* src, int32_t warp_num));
  OMPRTL_NVPTX__kmpc_parallel_reduce_nowait,
  /// \brief Call to __kmpc_nvptx_teams_reduce_nowait(int32_t global_tid,
  /// int32_t num_vars, size_t reduce_size, void *reduce_data,
  /// void (*kmp_ShuffleReductFctPtr)(void *rhs, int16_t lane_id, int16_t
  /// lane_offset, int16_t shortCircuit),
  /// void (*kmp_InterWarpCopyFctPtr)(void* src, int32_t warp_num),
  /// void (*kmp_CopyToScratchpadFctPtr)(void *reduce_data, void * scratchpad,
  /// int32_t index, int32_t width),
  /// void (*kmp_LoadReduceFctPtr)(void *reduce_data, void * scratchpad,
  /// int32_t index, int32_t width, int32_t reduce))
  OMPRTL_NVPTX__kmpc_teams_reduce_nowait,
  /// \brief Call to __kmpc_nvptx_end_reduce_nowait(int32_t global_tid);
  OMPRTL_NVPTX__kmpc_end_reduce_nowait
};

/// Pre(post)-action for different OpenMP constructs specialized for NVPTX.
class NVPTXActionTy final : public PrePostActionTy {
  llvm::Value *EnterCallee;
  ArrayRef<llvm::Value *> EnterArgs;
  llvm::Value *ExitCallee;
  ArrayRef<llvm::Value *> ExitArgs;
  bool Conditional;
  llvm::BasicBlock *ContBlock = nullptr;

public:
  NVPTXActionTy(llvm::Value *EnterCallee, ArrayRef<llvm::Value *> EnterArgs,
                llvm::Value *ExitCallee, ArrayRef<llvm::Value *> ExitArgs,
                bool Conditional = false)
      : EnterCallee(EnterCallee), EnterArgs(EnterArgs), ExitCallee(ExitCallee),
        ExitArgs(ExitArgs), Conditional(Conditional) {}
  void Enter(CodeGenFunction &CGF) override {
    llvm::Value *EnterRes = CGF.EmitRuntimeCall(EnterCallee, EnterArgs);
    if (Conditional) {
      llvm::Value *CallBool = CGF.Builder.CreateIsNotNull(EnterRes);
      auto *ThenBlock = CGF.createBasicBlock("omp_if.then");
      ContBlock = CGF.createBasicBlock("omp_if.end");
      // Generate the branch (if-stmt).
      CGF.Builder.CreateCondBr(CallBool, ThenBlock, ContBlock);
      CGF.EmitBlock(ThenBlock);
    }
  }
  void Done(CodeGenFunction &CGF) {
    // Emit the rest of the blocks/branches.
    CGF.EmitBranch(ContBlock);
    CGF.EmitBlock(ContBlock, true);
  }
  void Exit(CodeGenFunction &CGF) override {
    CGF.EmitRuntimeCall(ExitCallee, ExitArgs);
  }
};

// A class to track the execution mode when codegening directives within
// a target region. The appropriate mode (generic/spmd) is set on entry
// to the target region and used by containing directives such as 'parallel'
// to emit optimized code.
class ExecutionModeRAII {
private:
  CGOpenMPRuntimeNVPTX::ExecutionMode SavedMode;
  CGOpenMPRuntimeNVPTX::ExecutionMode &Mode;

public:
  ExecutionModeRAII(CGOpenMPRuntimeNVPTX::ExecutionMode &Mode,
                    CGOpenMPRuntimeNVPTX::ExecutionMode NewMode)
      : Mode(Mode) {
    SavedMode = Mode;
    Mode = NewMode;
  }
  ~ExecutionModeRAII() { Mode = SavedMode; }
};

/// GPU Configuration: This information can be derived from CUDA registers;
/// however, providing compile-time constants helps generate more efficient
/// code. For all practical purposes this is fine because the configuration
/// is the same for all known NVPTX architectures.
enum MachineConfiguration : unsigned {
  WarpSize = 32,
  /// Number of bits required to represent a lane identifier, which is
  /// computed as log_2(WarpSize).
  LaneIDBits = 5,
  LaneIDMask = WarpSize - 1,

  /// Global memory alignment for performance.
  GlobalMemoryAlignment = 256,
};

enum NamedBarrier : unsigned {
  /// Synchronize on this barrier #ID using a named barrier primitive.
  /// Only the subset of active threads in a parallel region arrive at the
  /// barrier.
  NB_Parallel = 1,
};
} // anonymous namespace

/// Get the GPU warp size.
static llvm::Value *getNVPTXWarpSize(CodeGenFunction &CGF) {
  CGBuilderTy &Bld = CGF.Builder;
  return Bld.CreateCall(
      llvm::Intrinsic::getDeclaration(
          &CGF.CGM.getModule(), llvm::Intrinsic::nvvm_read_ptx_sreg_warpsize),
      llvm::None, "nvptx_warp_size");
}

/// Get the id of the current thread on the GPU.
static llvm::Value *getNVPTXThreadID(CodeGenFunction &CGF) {
  CGBuilderTy &Bld = CGF.Builder;
  return Bld.CreateCall(
      llvm::Intrinsic::getDeclaration(
          &CGF.CGM.getModule(), llvm::Intrinsic::nvvm_read_ptx_sreg_tid_x),
      llvm::None, "nvptx_tid");
}

/// Get the id of the warp in the block.
/// We assume that the warp size is 32, which is always the case
/// on the NVPTX device, to generate more efficient code.
static llvm::Value *getNVPTXWarpID(CodeGenFunction &CGF) {
  CGBuilderTy &Bld = CGF.Builder;
  return Bld.CreateAShr(getNVPTXThreadID(CGF), LaneIDBits, "nvptx_warp_id");
}

/// Get the id of the current lane in the Warp.
/// We assume that the warp size is 32, which is always the case
/// on the NVPTX device, to generate more efficient code.
static llvm::Value *getNVPTXLaneID(CodeGenFunction &CGF) {
  CGBuilderTy &Bld = CGF.Builder;
  return Bld.CreateAnd(getNVPTXThreadID(CGF), Bld.getInt32(LaneIDMask),
                       "nvptx_lane_id");
}
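
// For example, with WarpSize == 32 a thread with id 37 belongs to warp
// 37 >> 5 == 1 and occupies lane 37 & 31 == 5; the two helpers above emit
// exactly these shift and mask operations.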

/// Get the maximum number of threads in a block of the GPU.
static llvm::Value *getNVPTXNumThreads(CodeGenFunction &CGF) {
  CGBuilderTy &Bld = CGF.Builder;
  return Bld.CreateCall(
      llvm::Intrinsic::getDeclaration(
          &CGF.CGM.getModule(), llvm::Intrinsic::nvvm_read_ptx_sreg_ntid_x),
      llvm::None, "nvptx_num_threads");
}

/// Get barrier to synchronize all threads in a block.
static void getNVPTXCTABarrier(CodeGenFunction &CGF) {
  CGBuilderTy &Bld = CGF.Builder;
  Bld.CreateCall(llvm::Intrinsic::getDeclaration(
      &CGF.CGM.getModule(), llvm::Intrinsic::nvvm_barrier0));
}

/// Get barrier #ID to synchronize selected (multiple of warp size) threads in
/// a CTA.
static void getNVPTXBarrier(CodeGenFunction &CGF, int ID,
                            llvm::Value *NumThreads) {
  CGBuilderTy &Bld = CGF.Builder;
  llvm::Value *Args[] = {Bld.getInt32(ID), NumThreads};
  Bld.CreateCall(llvm::Intrinsic::getDeclaration(&CGF.CGM.getModule(),
                                                 llvm::Intrinsic::nvvm_barrier),
                 Args);
}

/// Synchronize all GPU threads in a block.
static void syncCTAThreads(CodeGenFunction &CGF) { getNVPTXCTABarrier(CGF); }

/// Synchronize worker threads in a parallel region.
static void syncParallelThreads(CodeGenFunction &CGF, llvm::Value *NumThreads) {
  return getNVPTXBarrier(CGF, NB_Parallel, NumThreads);
}

/// Get the value of the thread_limit clause in the teams directive.
/// For the 'generic' execution mode, the runtime encodes thread_limit in
/// the launch parameters, always starting thread_limit+warpSize threads per
/// CTA. The threads in the last warp are reserved for master execution.
/// For the 'spmd' execution mode, all threads in a CTA are part of the team.
static llvm::Value *getThreadLimit(CodeGenFunction &CGF,
                                   bool IsInSpmdExecutionMode = false) {
  CGBuilderTy &Bld = CGF.Builder;
  return IsInSpmdExecutionMode
             ? getNVPTXNumThreads(CGF)
             : Bld.CreateSub(getNVPTXNumThreads(CGF), getNVPTXWarpSize(CGF),
                             "thread_limit");
}
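
// For example, in generic mode a CTA launched with 128 threads yields a
// thread_limit of 128 - 32 == 96: threads 0-95 act as workers, while the
// last warp (threads 96-127) is reserved for master execution.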

/// Get the thread id of the OMP master thread.
/// The master thread id is the first thread (lane) of the last warp in the
/// GPU block. Warp size is assumed to be some power of 2.
/// Thread id is 0 indexed.
/// E.g: If NumThreads is 33, master id is 32.
///      If NumThreads is 64, master id is 32.
///      If NumThreads is 1024, master id is 992.
static llvm::Value *getMasterThreadID(CodeGenFunction &CGF) {
  CGBuilderTy &Bld = CGF.Builder;
  llvm::Value *NumThreads = getNVPTXNumThreads(CGF);

  // We assume that the warp size is a power of 2.
  llvm::Value *Mask = Bld.CreateSub(getNVPTXWarpSize(CGF), Bld.getInt32(1));

  return Bld.CreateAnd(Bld.CreateSub(NumThreads, Bld.getInt32(1)),
                       Bld.CreateNot(Mask), "master_tid");
}
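
// Equivalently, master_tid == (NumThreads - 1) & ~(WarpSize - 1), i.e.
// NumThreads - 1 rounded down to a multiple of the warp size.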

CGOpenMPRuntimeNVPTX::WorkerFunctionState::WorkerFunctionState(
    CodeGenModule &CGM)
    : WorkerFn(nullptr), CGFI(nullptr) {
  createWorkerFunction(CGM);
}

void CGOpenMPRuntimeNVPTX::WorkerFunctionState::createWorkerFunction(
    CodeGenModule &CGM) {
  // Create a worker function with no arguments.
  CGFI = &CGM.getTypes().arrangeNullaryFunction();

  WorkerFn = llvm::Function::Create(
      CGM.getTypes().GetFunctionType(*CGFI), llvm::GlobalValue::InternalLinkage,
      /* placeholder */ "_worker", &CGM.getModule());
  CGM.SetInternalFunctionAttributes(/*D=*/nullptr, WorkerFn, *CGFI);
}

bool CGOpenMPRuntimeNVPTX::isInSpmdExecutionMode() const {
  return CurrentExecutionMode == CGOpenMPRuntimeNVPTX::ExecutionMode::Spmd;
}

static CGOpenMPRuntimeNVPTX::ExecutionMode
getExecutionModeForDirective(CodeGenModule &CGM,
                             const OMPExecutableDirective &D) {
  OpenMPDirectiveKind DirectiveKind = D.getDirectiveKind();
  switch (DirectiveKind) {
  case OMPD_target:
  case OMPD_target_teams:
    return CGOpenMPRuntimeNVPTX::ExecutionMode::Generic;
  case OMPD_target_parallel:
    return CGOpenMPRuntimeNVPTX::ExecutionMode::Spmd;
  default:
    llvm_unreachable("Unsupported directive on NVPTX device.");
  }
}

void CGOpenMPRuntimeNVPTX::emitGenericKernel(const OMPExecutableDirective &D,
                                             StringRef ParentName,
                                             llvm::Function *&OutlinedFn,
                                             llvm::Constant *&OutlinedFnID,
                                             bool IsOffloadEntry,
                                             const RegionCodeGenTy &CodeGen) {
  ExecutionModeRAII ModeRAII(CurrentExecutionMode,
                             CGOpenMPRuntimeNVPTX::ExecutionMode::Generic);
  EntryFunctionState EST;
  WorkerFunctionState WST(CGM);
  Work.clear();

  // Emit target region as a standalone region.
  class NVPTXPrePostActionTy : public PrePostActionTy {
    CGOpenMPRuntimeNVPTX &RT;
    CGOpenMPRuntimeNVPTX::EntryFunctionState &EST;
    CGOpenMPRuntimeNVPTX::WorkerFunctionState &WST;

  public:
    NVPTXPrePostActionTy(CGOpenMPRuntimeNVPTX &RT,
                         CGOpenMPRuntimeNVPTX::EntryFunctionState &EST,
                         CGOpenMPRuntimeNVPTX::WorkerFunctionState &WST)
        : RT(RT), EST(EST), WST(WST) {}
    void Enter(CodeGenFunction &CGF) override {
      RT.emitGenericEntryHeader(CGF, EST, WST);
    }
    void Exit(CodeGenFunction &CGF) override {
      RT.emitGenericEntryFooter(CGF, EST);
    }
  } Action(*this, EST, WST);
  CodeGen.setAction(Action);
  emitTargetOutlinedFunctionHelper(D, ParentName, OutlinedFn, OutlinedFnID,
                                   IsOffloadEntry, CodeGen);

  // Create the worker function.
  emitWorkerFunction(WST);

  // Now change the name of the worker function to correspond to this target
  // region's entry function.
  WST.WorkerFn->setName(OutlinedFn->getName() + "_worker");
}

// Setup NVPTX threads for master-worker OpenMP scheme.
void CGOpenMPRuntimeNVPTX::emitGenericEntryHeader(CodeGenFunction &CGF,
                                                  EntryFunctionState &EST,
                                                  WorkerFunctionState &WST) {
  CGBuilderTy &Bld = CGF.Builder;

  llvm::BasicBlock *WorkerBB = CGF.createBasicBlock(".worker");
  llvm::BasicBlock *MasterCheckBB = CGF.createBasicBlock(".mastercheck");
  llvm::BasicBlock *MasterBB = CGF.createBasicBlock(".master");
  EST.ExitBB = CGF.createBasicBlock(".exit");

  auto *IsWorker =
      Bld.CreateICmpULT(getNVPTXThreadID(CGF), getThreadLimit(CGF));
  Bld.CreateCondBr(IsWorker, WorkerBB, MasterCheckBB);

  CGF.EmitBlock(WorkerBB);
  CGF.EmitCallOrInvoke(WST.WorkerFn, llvm::None);
  CGF.EmitBranch(EST.ExitBB);

  CGF.EmitBlock(MasterCheckBB);
  auto *IsMaster =
      Bld.CreateICmpEQ(getNVPTXThreadID(CGF), getMasterThreadID(CGF));
  Bld.CreateCondBr(IsMaster, MasterBB, EST.ExitBB);

  CGF.EmitBlock(MasterBB);
  // First action in sequential region:
  // Initialize the state of the OpenMP runtime library on the GPU.
  llvm::Value *Args[] = {getThreadLimit(CGF)};
  CGF.EmitRuntimeCall(
      createNVPTXRuntimeFunction(OMPRTL_NVPTX__kmpc_kernel_init), Args);
}
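
// In outline, the entry header above emits:
//
//   if (tid < thread_limit) { worker(); goto .exit; }   // .worker
//   if (tid != master_tid)  goto .exit;                 // .mastercheck
//   __kmpc_kernel_init(thread_limit);                   // .master
//   ... master (sequential) region follows ...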

void CGOpenMPRuntimeNVPTX::emitGenericEntryFooter(CodeGenFunction &CGF,
                                                  EntryFunctionState &EST) {
  if (!EST.ExitBB)
    EST.ExitBB = CGF.createBasicBlock(".exit");

  llvm::BasicBlock *TerminateBB = CGF.createBasicBlock(".termination.notifier");
  CGF.EmitBranch(TerminateBB);

  CGF.EmitBlock(TerminateBB);
  // Signal termination condition.
  CGF.EmitRuntimeCall(
      createNVPTXRuntimeFunction(OMPRTL_NVPTX__kmpc_kernel_deinit), None);
  // Barrier to terminate worker threads.
  syncCTAThreads(CGF);
  // Master thread jumps to exit point.
  CGF.EmitBranch(EST.ExitBB);

  CGF.EmitBlock(EST.ExitBB);
  EST.ExitBB = nullptr;
}

void CGOpenMPRuntimeNVPTX::emitSpmdKernel(const OMPExecutableDirective &D,
                                          StringRef ParentName,
                                          llvm::Function *&OutlinedFn,
                                          llvm::Constant *&OutlinedFnID,
                                          bool IsOffloadEntry,
                                          const RegionCodeGenTy &CodeGen) {
  ExecutionModeRAII ModeRAII(CurrentExecutionMode,
                             CGOpenMPRuntimeNVPTX::ExecutionMode::Spmd);
  EntryFunctionState EST;

  // Emit target region as a standalone region.
  class NVPTXPrePostActionTy : public PrePostActionTy {
    CGOpenMPRuntimeNVPTX &RT;
    CGOpenMPRuntimeNVPTX::EntryFunctionState &EST;
    const OMPExecutableDirective &D;

  public:
    NVPTXPrePostActionTy(CGOpenMPRuntimeNVPTX &RT,
                         CGOpenMPRuntimeNVPTX::EntryFunctionState &EST,
                         const OMPExecutableDirective &D)
        : RT(RT), EST(EST), D(D) {}
    void Enter(CodeGenFunction &CGF) override {
      RT.emitSpmdEntryHeader(CGF, EST, D);
    }
    void Exit(CodeGenFunction &CGF) override {
      RT.emitSpmdEntryFooter(CGF, EST);
    }
  } Action(*this, EST, D);
  CodeGen.setAction(Action);
  emitTargetOutlinedFunctionHelper(D, ParentName, OutlinedFn, OutlinedFnID,
                                   IsOffloadEntry, CodeGen);
}

void CGOpenMPRuntimeNVPTX::emitSpmdEntryHeader(
    CodeGenFunction &CGF, EntryFunctionState &EST,
    const OMPExecutableDirective &D) {
  auto &Bld = CGF.Builder;

  // Setup BBs in entry function.
  llvm::BasicBlock *ExecuteBB = CGF.createBasicBlock(".execute");
  EST.ExitBB = CGF.createBasicBlock(".exit");

  // Initialize the OMP state in the runtime; called by all active threads.
  // TODO: Set RequiresOMPRuntime and RequiresDataSharing parameters
  // based on code analysis of the target region.
  llvm::Value *Args[] = {getThreadLimit(CGF, /*IsInSpmdExecutionMode=*/true),
                         /*RequiresOMPRuntime=*/Bld.getInt16(1),
                         /*RequiresDataSharing=*/Bld.getInt16(1)};
  CGF.EmitRuntimeCall(
      createNVPTXRuntimeFunction(OMPRTL_NVPTX__kmpc_spmd_kernel_init), Args);
  CGF.EmitBranch(ExecuteBB);

  CGF.EmitBlock(ExecuteBB);
}

void CGOpenMPRuntimeNVPTX::emitSpmdEntryFooter(CodeGenFunction &CGF,
                                               EntryFunctionState &EST) {
  if (!EST.ExitBB)
    EST.ExitBB = CGF.createBasicBlock(".exit");

  llvm::BasicBlock *OMPDeInitBB = CGF.createBasicBlock(".omp.deinit");
  CGF.EmitBranch(OMPDeInitBB);

  CGF.EmitBlock(OMPDeInitBB);
  // DeInitialize the OMP state in the runtime; called by all active threads.
  CGF.EmitRuntimeCall(
      createNVPTXRuntimeFunction(OMPRTL_NVPTX__kmpc_spmd_kernel_deinit), None);
  CGF.EmitBranch(EST.ExitBB);

  CGF.EmitBlock(EST.ExitBB);
  EST.ExitBB = nullptr;
}

// Create a unique global variable to indicate the execution mode of this
// target region. The execution mode is either 'generic', or 'spmd' depending
// on the target directive. This variable is picked up by the offload library
// to set up the device appropriately before kernel launch. If the execution
// mode is 'generic', the runtime reserves one warp for the master, otherwise,
// all warps participate in parallel work.
static void setPropertyExecutionMode(CodeGenModule &CGM, StringRef Name,
                                     CGOpenMPRuntimeNVPTX::ExecutionMode Mode) {
  (void)new llvm::GlobalVariable(
      CGM.getModule(), CGM.Int8Ty, /*isConstant=*/true,
      llvm::GlobalValue::WeakAnyLinkage,
      llvm::ConstantInt::get(CGM.Int8Ty, Mode), Name + Twine("_exec_mode"));
}
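
// For instance, a target region whose entry function is named "foo" gets a
// companion i8 global named "foo_exec_mode" holding the mode value.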

void CGOpenMPRuntimeNVPTX::emitWorkerFunction(WorkerFunctionState &WST) {
  auto &Ctx = CGM.getContext();

  CodeGenFunction CGF(CGM, /*suppressNewContext=*/true);
  CGF.disableDebugInfo();
  CGF.StartFunction(GlobalDecl(), Ctx.VoidTy, WST.WorkerFn, *WST.CGFI, {});
  emitWorkerLoop(CGF, WST);
  CGF.FinishFunction();
}

void CGOpenMPRuntimeNVPTX::emitWorkerLoop(CodeGenFunction &CGF,
                                          WorkerFunctionState &WST) {
  //
  // The workers enter this loop and wait for parallel work from the master.
  // When the master encounters a parallel region it sets up the work + variable
  // arguments, and wakes up the workers. The workers first check to see if
  // they are required for the parallel region, i.e., within the # of requested
  // parallel threads. The activated workers load the variable arguments and
  // execute the parallel work.
  //
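  // In pseudocode, the loop emitted below is:
  //
  //   for (;;) {
  //     barrier();                                // await work from master
  //     active = __kmpc_kernel_parallel(&work_fn);
  //     if (work_fn == nullptr) break;            // termination signal
  //     if (active) {
  //       work_fn(...);                           // matched outlined function
  //       __kmpc_kernel_end_parallel();
  //     }
  //     barrier();                                // end of parallel region
  //   }
  //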

  CGBuilderTy &Bld = CGF.Builder;

  llvm::BasicBlock *AwaitBB = CGF.createBasicBlock(".await.work");
  llvm::BasicBlock *SelectWorkersBB = CGF.createBasicBlock(".select.workers");
  llvm::BasicBlock *ExecuteBB = CGF.createBasicBlock(".execute.parallel");
  llvm::BasicBlock *TerminateBB = CGF.createBasicBlock(".terminate.parallel");
  llvm::BasicBlock *BarrierBB = CGF.createBasicBlock(".barrier.parallel");
  llvm::BasicBlock *ExitBB = CGF.createBasicBlock(".exit");

  CGF.EmitBranch(AwaitBB);

  // Workers wait for work from master.
  CGF.EmitBlock(AwaitBB);
  // Wait for parallel work.
  syncCTAThreads(CGF);

  Address WorkFn =
      CGF.CreateDefaultAlignTempAlloca(CGF.Int8PtrTy, /*Name=*/"work_fn");
  Address ExecStatus =
      CGF.CreateDefaultAlignTempAlloca(CGF.Int8Ty, /*Name=*/"exec_status");
  CGF.InitTempAlloca(ExecStatus, Bld.getInt8(/*C=*/0));
  CGF.InitTempAlloca(WorkFn, llvm::Constant::getNullValue(CGF.Int8PtrTy));

  llvm::Value *Args[] = {WorkFn.getPointer()};
  llvm::Value *Ret = CGF.EmitRuntimeCall(
      createNVPTXRuntimeFunction(OMPRTL_NVPTX__kmpc_kernel_parallel), Args);
  Bld.CreateStore(Bld.CreateZExt(Ret, CGF.Int8Ty), ExecStatus);

  // On termination condition (work_fn == 0), exit the loop.
  llvm::Value *ShouldTerminate =
      Bld.CreateIsNull(Bld.CreateLoad(WorkFn), "should_terminate");
  Bld.CreateCondBr(ShouldTerminate, ExitBB, SelectWorkersBB);

  // Activate requested workers.
  CGF.EmitBlock(SelectWorkersBB);
  llvm::Value *IsActive =
      Bld.CreateIsNotNull(Bld.CreateLoad(ExecStatus), "is_active");
  Bld.CreateCondBr(IsActive, ExecuteBB, BarrierBB);

  // Signal start of parallel region.
  CGF.EmitBlock(ExecuteBB);

  // Process work items: outlined parallel functions.
  for (auto *W : Work) {
    // Try to match this outlined function.
    llvm::Value *ID = Bld.CreateBitOrPointerCast(W, CGM.Int8PtrTy);

    llvm::Value *WorkFnMatch =
        Bld.CreateICmpEQ(Bld.CreateLoad(WorkFn), ID, "work_match");

    llvm::BasicBlock *ExecuteFNBB = CGF.createBasicBlock(".execute.fn");
    llvm::BasicBlock *CheckNextBB = CGF.createBasicBlock(".check.next");
    Bld.CreateCondBr(WorkFnMatch, ExecuteFNBB, CheckNextBB);

    // Execute this outlined function.
    CGF.EmitBlock(ExecuteFNBB);

    // Insert call to work function.
    // FIXME: Pass arguments to outlined function from master thread.
    auto *Fn = cast<llvm::Function>(W);
    Address ZeroAddr =
        CGF.CreateDefaultAlignTempAlloca(CGF.Int32Ty, /*Name=*/".zero.addr");
    CGF.InitTempAlloca(ZeroAddr, CGF.Builder.getInt32(/*C=*/0));
    llvm::Value *FnArgs[] = {ZeroAddr.getPointer(), ZeroAddr.getPointer()};
    CGF.EmitCallOrInvoke(Fn, FnArgs);

    // Go to end of parallel region.
    CGF.EmitBranch(TerminateBB);

    CGF.EmitBlock(CheckNextBB);
  }

  // Signal end of parallel region.
  CGF.EmitBlock(TerminateBB);
  CGF.EmitRuntimeCall(
      createNVPTXRuntimeFunction(OMPRTL_NVPTX__kmpc_kernel_end_parallel),
      llvm::None);
  CGF.EmitBranch(BarrierBB);

  // All active and inactive workers wait at a barrier after parallel region.
  CGF.EmitBlock(BarrierBB);
  // Barrier after parallel region.
  syncCTAThreads(CGF);
  CGF.EmitBranch(AwaitBB);

  // Exit target region.
  CGF.EmitBlock(ExitBB);
}

/// \brief Returns specified OpenMP runtime function for the current OpenMP
/// implementation. Specialized for the NVPTX device.
/// \param Function OpenMP runtime function.
/// \return Specified function.
llvm::Constant *
CGOpenMPRuntimeNVPTX::createNVPTXRuntimeFunction(unsigned Function) {
  llvm::Constant *RTLFn = nullptr;
  switch (static_cast<OpenMPRTLFunctionNVPTX>(Function)) {
  case OMPRTL_NVPTX__kmpc_kernel_init: {
    // Build void __kmpc_kernel_init(kmp_int32 thread_limit);
    llvm::Type *TypeParams[] = {CGM.Int32Ty};
    llvm::FunctionType *FnTy =
        llvm::FunctionType::get(CGM.VoidTy, TypeParams, /*isVarArg*/ false);
    RTLFn = CGM.CreateRuntimeFunction(FnTy, "__kmpc_kernel_init");
    break;
  }
  case OMPRTL_NVPTX__kmpc_kernel_deinit: {
    // Build void __kmpc_kernel_deinit();
    llvm::FunctionType *FnTy =
        llvm::FunctionType::get(CGM.VoidTy, llvm::None, /*isVarArg*/ false);
    RTLFn = CGM.CreateRuntimeFunction(FnTy, "__kmpc_kernel_deinit");
    break;
  }
  case OMPRTL_NVPTX__kmpc_spmd_kernel_init: {
    // Build void __kmpc_spmd_kernel_init(kmp_int32 thread_limit,
    // short RequiresOMPRuntime, short RequiresDataSharing);
    llvm::Type *TypeParams[] = {CGM.Int32Ty, CGM.Int16Ty, CGM.Int16Ty};
    llvm::FunctionType *FnTy =
        llvm::FunctionType::get(CGM.VoidTy, TypeParams, /*isVarArg*/ false);
    RTLFn = CGM.CreateRuntimeFunction(FnTy, "__kmpc_spmd_kernel_init");
    break;
  }
  case OMPRTL_NVPTX__kmpc_spmd_kernel_deinit: {
    // Build void __kmpc_spmd_kernel_deinit();
    llvm::FunctionType *FnTy =
        llvm::FunctionType::get(CGM.VoidTy, llvm::None, /*isVarArg*/ false);
    RTLFn = CGM.CreateRuntimeFunction(FnTy, "__kmpc_spmd_kernel_deinit");
    break;
  }
  case OMPRTL_NVPTX__kmpc_kernel_prepare_parallel: {
    // Build void __kmpc_kernel_prepare_parallel(void *outlined_function);
    llvm::Type *TypeParams[] = {CGM.Int8PtrTy};
    llvm::FunctionType *FnTy =
        llvm::FunctionType::get(CGM.VoidTy, TypeParams, /*isVarArg*/ false);
    RTLFn = CGM.CreateRuntimeFunction(FnTy, "__kmpc_kernel_prepare_parallel");
    break;
  }
  case OMPRTL_NVPTX__kmpc_kernel_parallel: {
    // Build bool __kmpc_kernel_parallel(void **outlined_function);
    llvm::Type *TypeParams[] = {CGM.Int8PtrPtrTy};
    llvm::Type *RetTy = CGM.getTypes().ConvertType(CGM.getContext().BoolTy);
    llvm::FunctionType *FnTy =
        llvm::FunctionType::get(RetTy, TypeParams, /*isVarArg*/ false);
    RTLFn = CGM.CreateRuntimeFunction(FnTy, "__kmpc_kernel_parallel");
    break;
  }
  case OMPRTL_NVPTX__kmpc_kernel_end_parallel: {
    // Build void __kmpc_kernel_end_parallel();
    llvm::FunctionType *FnTy =
        llvm::FunctionType::get(CGM.VoidTy, llvm::None, /*isVarArg*/ false);
    RTLFn = CGM.CreateRuntimeFunction(FnTy, "__kmpc_kernel_end_parallel");
    break;
  }
  case OMPRTL_NVPTX__kmpc_serialized_parallel: {
    // Build void __kmpc_serialized_parallel(ident_t *loc, kmp_int32
    // global_tid);
    llvm::Type *TypeParams[] = {getIdentTyPointerTy(), CGM.Int32Ty};
    llvm::FunctionType *FnTy =
        llvm::FunctionType::get(CGM.VoidTy, TypeParams, /*isVarArg*/ false);
    RTLFn = CGM.CreateRuntimeFunction(FnTy, "__kmpc_serialized_parallel");
    break;
  }
  case OMPRTL_NVPTX__kmpc_end_serialized_parallel: {
    // Build void __kmpc_end_serialized_parallel(ident_t *loc, kmp_int32
    // global_tid);
    llvm::Type *TypeParams[] = {getIdentTyPointerTy(), CGM.Int32Ty};
    llvm::FunctionType *FnTy =
        llvm::FunctionType::get(CGM.VoidTy, TypeParams, /*isVarArg*/ false);
    RTLFn = CGM.CreateRuntimeFunction(FnTy, "__kmpc_end_serialized_parallel");
    break;
  }
  case OMPRTL_NVPTX__kmpc_shuffle_int32: {
    // Build int32_t __kmpc_shuffle_int32(int32_t element,
    // int16_t lane_offset, int16_t warp_size);
    llvm::Type *TypeParams[] = {CGM.Int32Ty, CGM.Int16Ty, CGM.Int16Ty};
    llvm::FunctionType *FnTy =
        llvm::FunctionType::get(CGM.Int32Ty, TypeParams, /*isVarArg*/ false);
    RTLFn = CGM.CreateRuntimeFunction(FnTy, "__kmpc_shuffle_int32");
    break;
  }
  case OMPRTL_NVPTX__kmpc_shuffle_int64: {
    // Build int64_t __kmpc_shuffle_int64(int64_t element,
    // int16_t lane_offset, int16_t warp_size);
    llvm::Type *TypeParams[] = {CGM.Int64Ty, CGM.Int16Ty, CGM.Int16Ty};
    llvm::FunctionType *FnTy =
        llvm::FunctionType::get(CGM.Int64Ty, TypeParams, /*isVarArg*/ false);
    RTLFn = CGM.CreateRuntimeFunction(FnTy, "__kmpc_shuffle_int64");
    break;
  }
  case OMPRTL_NVPTX__kmpc_parallel_reduce_nowait: {
    // Build int32_t __kmpc_nvptx_parallel_reduce_nowait(kmp_int32 global_tid,
    // kmp_int32 num_vars, size_t reduce_size, void* reduce_data,
    // void (*kmp_ShuffleReductFctPtr)(void *rhsData, int16_t lane_id, int16_t
    // lane_offset, int16_t shortCircuit),
    // void (*kmp_InterWarpCopyFctPtr)(void* src, int32_t warp_num));
    llvm::Type *ShuffleReduceTypeParams[] = {CGM.VoidPtrTy, CGM.Int16Ty,
                                             CGM.Int16Ty, CGM.Int16Ty};
    auto *ShuffleReduceFnTy =
        llvm::FunctionType::get(CGM.VoidTy, ShuffleReduceTypeParams,
                                /*isVarArg=*/false);
    llvm::Type *InterWarpCopyTypeParams[] = {CGM.VoidPtrTy, CGM.Int32Ty};
    auto *InterWarpCopyFnTy =
        llvm::FunctionType::get(CGM.VoidTy, InterWarpCopyTypeParams,
                                /*isVarArg=*/false);
    llvm::Type *TypeParams[] = {CGM.Int32Ty,
                                CGM.Int32Ty,
                                CGM.SizeTy,
                                CGM.VoidPtrTy,
                                ShuffleReduceFnTy->getPointerTo(),
                                InterWarpCopyFnTy->getPointerTo()};
    llvm::FunctionType *FnTy =
        llvm::FunctionType::get(CGM.Int32Ty, TypeParams, /*isVarArg=*/false);
    RTLFn = CGM.CreateRuntimeFunction(
        FnTy, /*Name=*/"__kmpc_nvptx_parallel_reduce_nowait");
    break;
  }
  case OMPRTL_NVPTX__kmpc_teams_reduce_nowait: {
    // Build int32_t __kmpc_nvptx_teams_reduce_nowait(int32_t global_tid,
    // int32_t num_vars, size_t reduce_size, void *reduce_data,
    // void (*kmp_ShuffleReductFctPtr)(void *rhsData, int16_t lane_id, int16_t
    // lane_offset, int16_t shortCircuit),
    // void (*kmp_InterWarpCopyFctPtr)(void* src, int32_t warp_num),
    // void (*kmp_CopyToScratchpadFctPtr)(void *reduce_data, void * scratchpad,
    // int32_t index, int32_t width),
    // void (*kmp_LoadReduceFctPtr)(void *reduce_data, void * scratchpad,
    // int32_t index, int32_t width, int32_t reduce))
    llvm::Type *ShuffleReduceTypeParams[] = {CGM.VoidPtrTy, CGM.Int16Ty,
                                             CGM.Int16Ty, CGM.Int16Ty};
    auto *ShuffleReduceFnTy =
        llvm::FunctionType::get(CGM.VoidTy, ShuffleReduceTypeParams,
                                /*isVarArg=*/false);
    llvm::Type *InterWarpCopyTypeParams[] = {CGM.VoidPtrTy, CGM.Int32Ty};
    auto *InterWarpCopyFnTy =
        llvm::FunctionType::get(CGM.VoidTy, InterWarpCopyTypeParams,
                                /*isVarArg=*/false);
    llvm::Type *CopyToScratchpadTypeParams[] = {CGM.VoidPtrTy, CGM.VoidPtrTy,
                                                CGM.Int32Ty, CGM.Int32Ty};
    auto *CopyToScratchpadFnTy =
        llvm::FunctionType::get(CGM.VoidTy, CopyToScratchpadTypeParams,
                                /*isVarArg=*/false);
    llvm::Type *LoadReduceTypeParams[] = {
        CGM.VoidPtrTy, CGM.VoidPtrTy, CGM.Int32Ty, CGM.Int32Ty, CGM.Int32Ty};
    auto *LoadReduceFnTy =
        llvm::FunctionType::get(CGM.VoidTy, LoadReduceTypeParams,
                                /*isVarArg=*/false);
    llvm::Type *TypeParams[] = {CGM.Int32Ty,
                                CGM.Int32Ty,
                                CGM.SizeTy,
                                CGM.VoidPtrTy,
                                ShuffleReduceFnTy->getPointerTo(),
                                InterWarpCopyFnTy->getPointerTo(),
                                CopyToScratchpadFnTy->getPointerTo(),
                                LoadReduceFnTy->getPointerTo()};
    llvm::FunctionType *FnTy =
        llvm::FunctionType::get(CGM.Int32Ty, TypeParams, /*isVarArg=*/false);
    RTLFn = CGM.CreateRuntimeFunction(
        FnTy, /*Name=*/"__kmpc_nvptx_teams_reduce_nowait");
    break;
  }
  case OMPRTL_NVPTX__kmpc_end_reduce_nowait: {
    // Build void __kmpc_nvptx_end_reduce_nowait(kmp_int32 global_tid);
    llvm::Type *TypeParams[] = {CGM.Int32Ty};
    llvm::FunctionType *FnTy =
        llvm::FunctionType::get(CGM.VoidTy, TypeParams, /*isVarArg=*/false);
    RTLFn = CGM.CreateRuntimeFunction(
        FnTy, /*Name=*/"__kmpc_nvptx_end_reduce_nowait");
    break;
  }
  }
  return RTLFn;
}

void CGOpenMPRuntimeNVPTX::createOffloadEntry(llvm::Constant *ID,
                                              llvm::Constant *Addr,
                                              uint64_t Size, int32_t) {
  auto *F = dyn_cast<llvm::Function>(Addr);
  // TODO: Add support for global variables on the device after declare target
  // support.
  if (!F)
    return;
  llvm::Module *M = F->getParent();
  llvm::LLVMContext &Ctx = M->getContext();

  // Get "nvvm.annotations" metadata node.
  llvm::NamedMDNode *MD = M->getOrInsertNamedMetadata("nvvm.annotations");

  llvm::Metadata *MDVals[] = {
      llvm::ConstantAsMetadata::get(F), llvm::MDString::get(Ctx, "kernel"),
      llvm::ConstantAsMetadata::get(
          llvm::ConstantInt::get(llvm::Type::getInt32Ty(Ctx), 1))};
  // Append metadata to nvvm.annotations.
  MD->addOperand(llvm::MDNode::get(Ctx, MDVals));
}

void CGOpenMPRuntimeNVPTX::emitTargetOutlinedFunction(
    const OMPExecutableDirective &D, StringRef ParentName,
    llvm::Function *&OutlinedFn, llvm::Constant *&OutlinedFnID,
    bool IsOffloadEntry, const RegionCodeGenTy &CodeGen) {
  if (!IsOffloadEntry) // Nothing to do.
    return;

  assert(!ParentName.empty() && "Invalid target region parent name!");

  CGOpenMPRuntimeNVPTX::ExecutionMode Mode =
      getExecutionModeForDirective(CGM, D);
  switch (Mode) {
  case CGOpenMPRuntimeNVPTX::ExecutionMode::Generic:
    emitGenericKernel(D, ParentName, OutlinedFn, OutlinedFnID, IsOffloadEntry,
                      CodeGen);
    break;
  case CGOpenMPRuntimeNVPTX::ExecutionMode::Spmd:
    emitSpmdKernel(D, ParentName, OutlinedFn, OutlinedFnID, IsOffloadEntry,
                   CodeGen);
    break;
  case CGOpenMPRuntimeNVPTX::ExecutionMode::Unknown:
    llvm_unreachable(
        "Unknown programming model for OpenMP directive on NVPTX target.");
  }

  setPropertyExecutionMode(CGM, OutlinedFn->getName(), Mode);
}

CGOpenMPRuntimeNVPTX::CGOpenMPRuntimeNVPTX(CodeGenModule &CGM)
    : CGOpenMPRuntime(CGM), CurrentExecutionMode(ExecutionMode::Unknown) {
  if (!CGM.getLangOpts().OpenMPIsDevice)
    llvm_unreachable("OpenMP NVPTX can only handle device code.");
}

void CGOpenMPRuntimeNVPTX::emitProcBindClause(CodeGenFunction &CGF,
                                              OpenMPProcBindClauseKind ProcBind,
                                              SourceLocation Loc) {
  // Do nothing in case of Spmd mode and L0 parallel.
  // TODO: If in Spmd mode and L1 parallel emit the clause.
  if (isInSpmdExecutionMode())
    return;

  CGOpenMPRuntime::emitProcBindClause(CGF, ProcBind, Loc);
}

void CGOpenMPRuntimeNVPTX::emitNumThreadsClause(CodeGenFunction &CGF,
                                                llvm::Value *NumThreads,
                                                SourceLocation Loc) {
  // Do nothing in case of Spmd mode and L0 parallel.
  // TODO: If in Spmd mode and L1 parallel emit the clause.
  if (isInSpmdExecutionMode())
    return;

  CGOpenMPRuntime::emitNumThreadsClause(CGF, NumThreads, Loc);
}

void CGOpenMPRuntimeNVPTX::emitNumTeamsClause(CodeGenFunction &CGF,
                                              const Expr *NumTeams,
                                              const Expr *ThreadLimit,
                                              SourceLocation Loc) {}

llvm::Value *CGOpenMPRuntimeNVPTX::emitParallelOutlinedFunction(
    const OMPExecutableDirective &D, const VarDecl *ThreadIDVar,
    OpenMPDirectiveKind InnermostKind, const RegionCodeGenTy &CodeGen) {
  return CGOpenMPRuntime::emitParallelOutlinedFunction(D, ThreadIDVar,
                                                       InnermostKind, CodeGen);
}

llvm::Value *CGOpenMPRuntimeNVPTX::emitTeamsOutlinedFunction(
    const OMPExecutableDirective &D, const VarDecl *ThreadIDVar,
    OpenMPDirectiveKind InnermostKind, const RegionCodeGenTy &CodeGen) {

  llvm::Value *OutlinedFunVal = CGOpenMPRuntime::emitTeamsOutlinedFunction(
      D, ThreadIDVar, InnermostKind, CodeGen);
  llvm::Function *OutlinedFun = cast<llvm::Function>(OutlinedFunVal);
  OutlinedFun->removeFnAttr(llvm::Attribute::NoInline);
  OutlinedFun->removeFnAttr(llvm::Attribute::OptimizeNone);
  OutlinedFun->addFnAttr(llvm::Attribute::AlwaysInline);

  return OutlinedFun;
}

void CGOpenMPRuntimeNVPTX::emitTeamsCall(CodeGenFunction &CGF,
                                         const OMPExecutableDirective &D,
                                         SourceLocation Loc,
                                         llvm::Value *OutlinedFn,
                                         ArrayRef<llvm::Value *> CapturedVars) {
  if (!CGF.HaveInsertPoint())
    return;

  Address ZeroAddr =
      CGF.CreateTempAlloca(CGF.Int32Ty, CharUnits::fromQuantity(4),
                           /*Name*/ ".zero.addr");
  CGF.InitTempAlloca(ZeroAddr, CGF.Builder.getInt32(/*C*/ 0));
  llvm::SmallVector<llvm::Value *, 16> OutlinedFnArgs;
  OutlinedFnArgs.push_back(ZeroAddr.getPointer());
  OutlinedFnArgs.push_back(ZeroAddr.getPointer());
  OutlinedFnArgs.append(CapturedVars.begin(), CapturedVars.end());
  CGF.EmitCallOrInvoke(OutlinedFn, OutlinedFnArgs);
}

void CGOpenMPRuntimeNVPTX::emitParallelCall(
    CodeGenFunction &CGF, SourceLocation Loc, llvm::Value *OutlinedFn,
    ArrayRef<llvm::Value *> CapturedVars, const Expr *IfCond) {
  if (!CGF.HaveInsertPoint())
    return;

  if (isInSpmdExecutionMode())
    emitSpmdParallelCall(CGF, Loc, OutlinedFn, CapturedVars, IfCond);
  else
    emitGenericParallelCall(CGF, Loc, OutlinedFn, CapturedVars, IfCond);
}

void CGOpenMPRuntimeNVPTX::emitGenericParallelCall(
    CodeGenFunction &CGF, SourceLocation Loc, llvm::Value *OutlinedFn,
    ArrayRef<llvm::Value *> CapturedVars, const Expr *IfCond) {
  llvm::Function *Fn = cast<llvm::Function>(OutlinedFn);

  auto &&L0ParallelGen = [this, Fn](CodeGenFunction &CGF, PrePostActionTy &) {
    CGBuilderTy &Bld = CGF.Builder;

    // Prepare for parallel region. Indicate the outlined function.
    llvm::Value *Args[] = {Bld.CreateBitOrPointerCast(Fn, CGM.Int8PtrTy)};
    CGF.EmitRuntimeCall(
        createNVPTXRuntimeFunction(OMPRTL_NVPTX__kmpc_kernel_prepare_parallel),
        Args);

    // Activate workers. This barrier is used by the master to signal
    // work for the workers.
    syncCTAThreads(CGF);

    // OpenMP [2.5, Parallel Construct, p.49]
    // There is an implied barrier at the end of a parallel region. After the
    // end of a parallel region, only the master thread of the team resumes
    // execution of the enclosing task region.
    //
    // The master waits at this barrier until all workers are done.
    syncCTAThreads(CGF);

    // Remember for post-processing in worker loop.
    Work.push_back(Fn);
  };

  auto *RTLoc = emitUpdateLocation(CGF, Loc);
  auto *ThreadID = getThreadID(CGF, Loc);
  llvm::Value *Args[] = {RTLoc, ThreadID};

  auto &&SeqGen = [this, Fn, &CapturedVars, &Args](CodeGenFunction &CGF,
                                                   PrePostActionTy &) {
    auto &&CodeGen = [this, Fn, &CapturedVars](CodeGenFunction &CGF,
                                               PrePostActionTy &Action) {
      Action.Enter(CGF);

      llvm::SmallVector<llvm::Value *, 16> OutlinedFnArgs;
      OutlinedFnArgs.push_back(
          llvm::ConstantPointerNull::get(CGM.Int32Ty->getPointerTo()));
      OutlinedFnArgs.push_back(
          llvm::ConstantPointerNull::get(CGM.Int32Ty->getPointerTo()));
      OutlinedFnArgs.append(CapturedVars.begin(), CapturedVars.end());
      CGF.EmitCallOrInvoke(Fn, OutlinedFnArgs);
    };

    RegionCodeGenTy RCG(CodeGen);
    NVPTXActionTy Action(
        createNVPTXRuntimeFunction(OMPRTL_NVPTX__kmpc_serialized_parallel),
        Args,
        createNVPTXRuntimeFunction(OMPRTL_NVPTX__kmpc_end_serialized_parallel),
        Args);
    RCG.setAction(Action);
    RCG(CGF);
  };

  if (IfCond)
    emitOMPIfClause(CGF, IfCond, L0ParallelGen, SeqGen);
  else {
    CodeGenFunction::RunCleanupsScope Scope(CGF);
    RegionCodeGenTy ThenRCG(L0ParallelGen);
    ThenRCG(CGF);
  }
}

void CGOpenMPRuntimeNVPTX::emitSpmdParallelCall(
    CodeGenFunction &CGF, SourceLocation Loc, llvm::Value *OutlinedFn,
    ArrayRef<llvm::Value *> CapturedVars, const Expr *IfCond) {
  // Just call the outlined function to execute the parallel region.
  // OutlinedFn(&GTid, &zero, CapturedStruct);
  //
  // TODO: Do something with IfCond when support for the 'if' clause
  // is added on Spmd target directives.
  llvm::SmallVector<llvm::Value *, 16> OutlinedFnArgs;
  OutlinedFnArgs.push_back(
      llvm::ConstantPointerNull::get(CGM.Int32Ty->getPointerTo()));
  OutlinedFnArgs.push_back(
      llvm::ConstantPointerNull::get(CGM.Int32Ty->getPointerTo()));
  OutlinedFnArgs.append(CapturedVars.begin(), CapturedVars.end());
  CGF.EmitCallOrInvoke(OutlinedFn, OutlinedFnArgs);
}

/// This function creates calls to one of two shuffle functions to copy
/// variables between lanes in a warp.
static llvm::Value *createRuntimeShuffleFunction(CodeGenFunction &CGF,
                                                 QualType ElemTy,
                                                 llvm::Value *Elem,
                                                 llvm::Value *Offset) {
  auto &CGM = CGF.CGM;
  auto &C = CGM.getContext();
  auto &Bld = CGF.Builder;
  CGOpenMPRuntimeNVPTX &RT =
      *(static_cast<CGOpenMPRuntimeNVPTX *>(&CGM.getOpenMPRuntime()));

  unsigned Size = CGM.getContext().getTypeSizeInChars(ElemTy).getQuantity();
  assert(Size <= 8 && "Unsupported bitwidth in shuffle instruction.");

  OpenMPRTLFunctionNVPTX ShuffleFn = Size <= 4
                                         ? OMPRTL_NVPTX__kmpc_shuffle_int32
                                         : OMPRTL_NVPTX__kmpc_shuffle_int64;

  // Cast all types to 32- or 64-bit values before calling shuffle routines.
  auto CastTy = Size <= 4 ? CGM.Int32Ty : CGM.Int64Ty;
  auto *ElemCast = Bld.CreateSExtOrBitCast(Elem, CastTy);
  auto *WarpSize = CGF.EmitScalarConversion(
      getNVPTXWarpSize(CGF), C.getIntTypeForBitwidth(32, /* Signed */ true),
      C.getIntTypeForBitwidth(16, /* Signed */ true), SourceLocation());

  auto *ShuffledVal =
      CGF.EmitRuntimeCall(RT.createNVPTXRuntimeFunction(ShuffleFn),
                          {ElemCast, Offset, WarpSize});

  return Bld.CreateTruncOrBitCast(ShuffledVal, CGF.ConvertTypeForMem(ElemTy));
}
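
// For example, a 2-byte element is sign-extended to i32, moved across lanes
// by __kmpc_shuffle_int32, and the i32 result is truncated back to the
// element's memory type; 5- to 8-byte elements go through the int64 variant.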

namespace {
enum CopyAction : unsigned {
  // RemoteLaneToThread: Copy over a Reduce list from a remote lane in
  // the warp using shuffle instructions.
  RemoteLaneToThread,
  // ThreadCopy: Make a copy of a Reduce list on the thread's stack.
  ThreadCopy,
  // ThreadToScratchpad: Copy a team-reduced array to the scratchpad.
  ThreadToScratchpad,
  // ScratchpadToThread: Copy from a scratchpad array in global memory
  // containing team-reduced data to a thread's stack.
  ScratchpadToThread,
};
} // namespace

struct CopyOptionsTy {
  llvm::Value *RemoteLaneOffset;
  llvm::Value *ScratchpadIndex;
  llvm::Value *ScratchpadWidth;
};

/// Emit instructions to copy a Reduce list, which contains partially
/// aggregated values, in the specified direction.
static void emitReductionListCopy(
    CopyAction Action, CodeGenFunction &CGF, QualType ReductionArrayTy,
    ArrayRef<const Expr *> Privates, Address SrcBase, Address DestBase,
    CopyOptionsTy CopyOptions = {nullptr, nullptr, nullptr}) {

  auto &CGM = CGF.CGM;
  auto &C = CGM.getContext();
  auto &Bld = CGF.Builder;

  auto *RemoteLaneOffset = CopyOptions.RemoteLaneOffset;
  auto *ScratchpadIndex = CopyOptions.ScratchpadIndex;
  auto *ScratchpadWidth = CopyOptions.ScratchpadWidth;

  // Iterate, element by element, through the source Reduce list and
  // make a copy.
  unsigned Idx = 0;
  unsigned Size = Privates.size();
  for (auto &Private : Privates) {
    Address SrcElementAddr = Address::invalid();
    Address DestElementAddr = Address::invalid();
    Address DestElementPtrAddr = Address::invalid();
    // Should we shuffle in an element from a remote lane?
    bool ShuffleInElement = false;
    // Set to true to update the pointer in the dest Reduce list to a
    // newly created element.
    bool UpdateDestListPtr = false;
    // Increment the src or dest pointer to the scratchpad, for each
    // new element.
    bool IncrScratchpadSrc = false;
    bool IncrScratchpadDest = false;

    switch (Action) {
    case RemoteLaneToThread: {
      // Step 1.1: Get the address for the src element in the Reduce list.
      Address SrcElementPtrAddr =
          Bld.CreateConstArrayGEP(SrcBase, Idx, CGF.getPointerSize());
      llvm::Value *SrcElementPtrPtr = CGF.EmitLoadOfScalar(
          SrcElementPtrAddr, /*Volatile=*/false, C.VoidPtrTy, SourceLocation());
      SrcElementAddr =
          Address(SrcElementPtrPtr, C.getTypeAlignInChars(Private->getType()));

      // Step 1.2: Create a temporary to store the element in the destination
      // Reduce list.
      DestElementPtrAddr =
          Bld.CreateConstArrayGEP(DestBase, Idx, CGF.getPointerSize());
      DestElementAddr =
          CGF.CreateMemTemp(Private->getType(), ".omp.reduction.element");
      ShuffleInElement = true;
      UpdateDestListPtr = true;
      break;
    }
    case ThreadCopy: {
      // Step 1.1: Get the address for the src element in the Reduce list.
      Address SrcElementPtrAddr =
          Bld.CreateConstArrayGEP(SrcBase, Idx, CGF.getPointerSize());
      llvm::Value *SrcElementPtrPtr = CGF.EmitLoadOfScalar(
          SrcElementPtrAddr, /*Volatile=*/false, C.VoidPtrTy, SourceLocation());
      SrcElementAddr =
          Address(SrcElementPtrPtr, C.getTypeAlignInChars(Private->getType()));

      // Step 1.2: Get the address for dest element. The destination
      // element has already been created on the thread's stack.
      DestElementPtrAddr =
          Bld.CreateConstArrayGEP(DestBase, Idx, CGF.getPointerSize());
      llvm::Value *DestElementPtr =
          CGF.EmitLoadOfScalar(DestElementPtrAddr, /*Volatile=*/false,
                               C.VoidPtrTy, SourceLocation());
      Address DestElemAddr =
          Address(DestElementPtr, C.getTypeAlignInChars(Private->getType()));
      DestElementAddr = Bld.CreateElementBitCast(
          DestElemAddr, CGF.ConvertTypeForMem(Private->getType()));
      break;
    }
    case ThreadToScratchpad: {
      // Step 1.1: Get the address for the src element in the Reduce list.
      Address SrcElementPtrAddr =
          Bld.CreateConstArrayGEP(SrcBase, Idx, CGF.getPointerSize());
      llvm::Value *SrcElementPtrPtr = CGF.EmitLoadOfScalar(
          SrcElementPtrAddr, /*Volatile=*/false, C.VoidPtrTy, SourceLocation());
      SrcElementAddr =
          Address(SrcElementPtrPtr, C.getTypeAlignInChars(Private->getType()));

      // Step 1.2: Get the address for dest element:
      // address = base + index * ElementSizeInChars.
      unsigned ElementSizeInChars =
          C.getTypeSizeInChars(Private->getType()).getQuantity();
      auto *CurrentOffset =
          Bld.CreateMul(llvm::ConstantInt::get(CGM.SizeTy, ElementSizeInChars),
                        ScratchpadIndex);
      auto *ScratchPadElemAbsolutePtrVal =
          Bld.CreateAdd(DestBase.getPointer(), CurrentOffset);
      ScratchPadElemAbsolutePtrVal =
          Bld.CreateIntToPtr(ScratchPadElemAbsolutePtrVal, CGF.VoidPtrTy);
      Address ScratchpadPtr =
          Address(ScratchPadElemAbsolutePtrVal,
                  C.getTypeAlignInChars(Private->getType()));
      DestElementAddr = Bld.CreateElementBitCast(
          ScratchpadPtr, CGF.ConvertTypeForMem(Private->getType()));
      IncrScratchpadDest = true;
      break;
    }
    case ScratchpadToThread: {
      // Step 1.1: Get the address for the src element in the scratchpad.
      // address = base + index * ElementSizeInChars.
      unsigned ElementSizeInChars =
          C.getTypeSizeInChars(Private->getType()).getQuantity();
      auto *CurrentOffset =
          Bld.CreateMul(llvm::ConstantInt::get(CGM.SizeTy, ElementSizeInChars),
                        ScratchpadIndex);
      auto *ScratchPadElemAbsolutePtrVal =
          Bld.CreateAdd(SrcBase.getPointer(), CurrentOffset);
      ScratchPadElemAbsolutePtrVal =
          Bld.CreateIntToPtr(ScratchPadElemAbsolutePtrVal, CGF.VoidPtrTy);
      SrcElementAddr = Address(ScratchPadElemAbsolutePtrVal,
                               C.getTypeAlignInChars(Private->getType()));
      IncrScratchpadSrc = true;

      // Step 1.2: Create a temporary to store the element in the destination
      // Reduce list.
      DestElementPtrAddr =
          Bld.CreateConstArrayGEP(DestBase, Idx, CGF.getPointerSize());
      DestElementAddr =
          CGF.CreateMemTemp(Private->getType(), ".omp.reduction.element");
      UpdateDestListPtr = true;
      break;
    }
    }

    // Regardless of src and dest of copy, we emit the load of the src
    // element as this is required in all directions.
    SrcElementAddr = Bld.CreateElementBitCast(
        SrcElementAddr, CGF.ConvertTypeForMem(Private->getType()));
    llvm::Value *Elem =
        CGF.EmitLoadOfScalar(SrcElementAddr, /*Volatile=*/false,
                             Private->getType(), SourceLocation());

    // Now that all active lanes have read the element in the
    // Reduce list, shuffle over the value from the remote lane.
    if (ShuffleInElement) {
      Elem = createRuntimeShuffleFunction(CGF, Private->getType(), Elem,
                                          RemoteLaneOffset);
    }

    // Store the source element value to the dest element address.
    CGF.EmitStoreOfScalar(Elem, DestElementAddr, /*Volatile=*/false,
                          Private->getType());

    // Step 3.1: Modify reference in dest Reduce list as needed.
    // Modifying the reference in Reduce list to point to the newly
    // created element. The element is live in the current function
    // scope and that of functions it invokes (i.e., reduce_function).
    // RemoteReduceData[i] = (void*)&RemoteElem
    if (UpdateDestListPtr) {
      CGF.EmitStoreOfScalar(Bld.CreatePointerBitCastOrAddrSpaceCast(
                                DestElementAddr.getPointer(), CGF.VoidPtrTy),
                            DestElementPtrAddr, /*Volatile=*/false,
                            C.VoidPtrTy);
    }

    // Step 4.1: Increment SrcBase/DestBase so that it points to the starting
    // address of the next element in scratchpad memory, unless we're currently
    // processing the last one. Memory alignment is also taken care of here.
    if ((IncrScratchpadDest || IncrScratchpadSrc) && (Idx + 1 < Size)) {
      llvm::Value *ScratchpadBasePtr =
          IncrScratchpadDest ? DestBase.getPointer() : SrcBase.getPointer();
      unsigned ElementSizeInChars =
          C.getTypeSizeInChars(Private->getType()).getQuantity();
      ScratchpadBasePtr = Bld.CreateAdd(
          ScratchpadBasePtr,
          Bld.CreateMul(ScratchpadWidth, llvm::ConstantInt::get(
                                             CGM.SizeTy, ElementSizeInChars)));

      // Take care of global memory alignment for performance.
      ScratchpadBasePtr = Bld.CreateSub(ScratchpadBasePtr,
                                        llvm::ConstantInt::get(CGM.SizeTy, 1));
      ScratchpadBasePtr = Bld.CreateSDiv(
          ScratchpadBasePtr,
          llvm::ConstantInt::get(CGM.SizeTy, GlobalMemoryAlignment));
      ScratchpadBasePtr = Bld.CreateAdd(ScratchpadBasePtr,
                                        llvm::ConstantInt::get(CGM.SizeTy, 1));
      ScratchpadBasePtr = Bld.CreateMul(
          ScratchpadBasePtr,
          llvm::ConstantInt::get(CGM.SizeTy, GlobalMemoryAlignment));
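      // E.g., with GlobalMemoryAlignment == 256 a cumulative base of 1000
      // becomes ((1000 - 1) / 256 + 1) * 256 == 1024, the next aligned
      // boundary.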

      if (IncrScratchpadDest)
        DestBase = Address(ScratchpadBasePtr, CGF.getPointerAlign());
      else /* IncrScratchpadSrc = true */
        SrcBase = Address(ScratchpadBasePtr, CGF.getPointerAlign());
    }

    Idx++;
  }
}

/// This function emits a helper that loads data from the scratchpad array
/// and (optionally) reduces it with the input operand.
///
/// load_and_reduce(local, scratchpad, index, width, should_reduce)
/// reduce_data remote;
/// for elem in remote:
///   remote.elem = Scratchpad[elem_id][index]
/// if (should_reduce)
///   local = local @ remote
/// else
///   local = remote
static llvm::Value *
emitReduceScratchpadFunction(CodeGenModule &CGM,
                             ArrayRef<const Expr *> Privates,
                             QualType ReductionArrayTy, llvm::Value *ReduceFn) {
  auto &C = CGM.getContext();
  auto Int32Ty = C.getIntTypeForBitwidth(32, /* Signed */ true);

  // Destination of the copy.
  ImplicitParamDecl ReduceListArg(C, C.VoidPtrTy, ImplicitParamDecl::Other);
  // Base address of the scratchpad array, with each element storing a
  // Reduce list per team.
  ImplicitParamDecl ScratchPadArg(C, C.VoidPtrTy, ImplicitParamDecl::Other);
  // A source index into the scratchpad array.
  ImplicitParamDecl IndexArg(C, Int32Ty, ImplicitParamDecl::Other);
  // Row width of an element in the scratchpad array, typically
  // the number of teams.
  ImplicitParamDecl WidthArg(C, Int32Ty, ImplicitParamDecl::Other);
  // If should_reduce == 1, then it's load AND reduce,
  // If should_reduce == 0 (or otherwise), then it only loads (+ copy).
  // The latter case is used for initialization.
  ImplicitParamDecl ShouldReduceArg(C, Int32Ty, ImplicitParamDecl::Other);

  FunctionArgList Args;
  Args.push_back(&ReduceListArg);
  Args.push_back(&ScratchPadArg);
  Args.push_back(&IndexArg);
  Args.push_back(&WidthArg);
  Args.push_back(&ShouldReduceArg);

  auto &CGFI = CGM.getTypes().arrangeBuiltinFunctionDeclaration(C.VoidTy, Args);
  auto *Fn = llvm::Function::Create(
      CGM.getTypes().GetFunctionType(CGFI), llvm::GlobalValue::InternalLinkage,
      "_omp_reduction_load_and_reduce", &CGM.getModule());
  CGM.SetInternalFunctionAttributes(/*DC=*/nullptr, Fn, CGFI);
  CodeGenFunction CGF(CGM);
  // We don't need debug information in this function as nothing here refers to
  // user code.
  CGF.disableDebugInfo();
  CGF.StartFunction(GlobalDecl(), C.VoidTy, Fn, CGFI, Args);

  auto &Bld = CGF.Builder;

  // Get local Reduce list pointer.
  Address AddrReduceListArg = CGF.GetAddrOfLocalVar(&ReduceListArg);
  Address ReduceListAddr(
      Bld.CreatePointerBitCastOrAddrSpaceCast(
          CGF.EmitLoadOfScalar(AddrReduceListArg, /*Volatile=*/false,
                               C.VoidPtrTy, SourceLocation()),
          CGF.ConvertTypeForMem(ReductionArrayTy)->getPointerTo()),
      CGF.getPointerAlign());

  Address AddrScratchPadArg = CGF.GetAddrOfLocalVar(&ScratchPadArg);
  llvm::Value *ScratchPadBase = CGF.EmitLoadOfScalar(
      AddrScratchPadArg, /*Volatile=*/false, C.VoidPtrTy, SourceLocation());

  Address AddrIndexArg = CGF.GetAddrOfLocalVar(&IndexArg);
  llvm::Value *IndexVal =
      Bld.CreateIntCast(CGF.EmitLoadOfScalar(AddrIndexArg, /*Volatile=*/false,
                                             Int32Ty, SourceLocation()),
                        CGM.SizeTy, /*isSigned=*/true);

  Address AddrWidthArg = CGF.GetAddrOfLocalVar(&WidthArg);
  llvm::Value *WidthVal =
      Bld.CreateIntCast(CGF.EmitLoadOfScalar(AddrWidthArg, /*Volatile=*/false,
                                             Int32Ty, SourceLocation()),
                        CGM.SizeTy, /*isSigned=*/true);

  Address AddrShouldReduceArg = CGF.GetAddrOfLocalVar(&ShouldReduceArg);
  llvm::Value *ShouldReduceVal = CGF.EmitLoadOfScalar(
      AddrShouldReduceArg, /*Volatile=*/false, Int32Ty, SourceLocation());

  // The absolute ptr address to the base addr of the next element to copy.
  llvm::Value *CumulativeElemBasePtr =
      Bld.CreatePtrToInt(ScratchPadBase, CGM.SizeTy);
  Address SrcDataAddr(CumulativeElemBasePtr, CGF.getPointerAlign());

  // Create a Remote Reduce list to store the elements read from the
  // scratchpad array.
  Address RemoteReduceList =
      CGF.CreateMemTemp(ReductionArrayTy, ".omp.reduction.remote_red_list");

  // Assemble remote Reduce list from scratchpad array.
  emitReductionListCopy(ScratchpadToThread, CGF, ReductionArrayTy, Privates,
                        SrcDataAddr, RemoteReduceList,
                        {/*RemoteLaneOffset=*/nullptr,
                         /*ScratchpadIndex=*/IndexVal,
                         /*ScratchpadWidth=*/WidthVal});

  llvm::BasicBlock *ThenBB = CGF.createBasicBlock("then");
  llvm::BasicBlock *ElseBB = CGF.createBasicBlock("else");
  llvm::BasicBlock *MergeBB = CGF.createBasicBlock("ifcont");

  auto CondReduce = Bld.CreateICmpEQ(ShouldReduceVal, Bld.getInt32(1));
  Bld.CreateCondBr(CondReduce, ThenBB, ElseBB);

  CGF.EmitBlock(ThenBB);
  // We should reduce with the local Reduce list.
  // reduce_function(LocalReduceList, RemoteReduceList)
  llvm::Value *LocalDataPtr = Bld.CreatePointerBitCastOrAddrSpaceCast(
      ReduceListAddr.getPointer(), CGF.VoidPtrTy);
  llvm::Value *RemoteDataPtr = Bld.CreatePointerBitCastOrAddrSpaceCast(
      RemoteReduceList.getPointer(), CGF.VoidPtrTy);
  CGF.EmitCallOrInvoke(ReduceFn, {LocalDataPtr, RemoteDataPtr});
  Bld.CreateBr(MergeBB);

  CGF.EmitBlock(ElseBB);
  // No reduction; just copy:
  // Local Reduce list = Remote Reduce list.
  emitReductionListCopy(ThreadCopy, CGF, ReductionArrayTy, Privates,
                        RemoteReduceList, ReduceListAddr);
  Bld.CreateBr(MergeBB);

  CGF.EmitBlock(MergeBB);

  CGF.FinishFunction();
  return Fn;
}
1364 
1365 /// This function emits a helper that stores reduced data from the team
1366 /// master to a scratchpad array in global memory.
1367 ///
1368 /// for elem in Reduce List:
1369 /// scratchpad[elem_id][index] = elem
1370 ///
1372  ArrayRef<const Expr *> Privates,
1373  QualType ReductionArrayTy) {
1374 
1375  auto &C = CGM.getContext();
1376  auto Int32Ty = C.getIntTypeForBitwidth(32, /* Signed */ true);
1377 
1378  // Source of the copy.
1379  ImplicitParamDecl ReduceListArg(C, C.VoidPtrTy, ImplicitParamDecl::Other);
1380  // Base address of the scratchpad array, with each element storing a
1381  // Reduce list per team.
1382  ImplicitParamDecl ScratchPadArg(C, C.VoidPtrTy, ImplicitParamDecl::Other);
1383  // A destination index into the scratchpad array, typically the team
1384  // identifier.
1385  ImplicitParamDecl IndexArg(C, Int32Ty, ImplicitParamDecl::Other);
1386  // Row width of an element in the scratchpad array, typically
1387  // the number of teams.
1388  ImplicitParamDecl WidthArg(C, Int32Ty, ImplicitParamDecl::Other);
1389 
1390  FunctionArgList Args;
1391  Args.push_back(&ReduceListArg);
1392  Args.push_back(&ScratchPadArg);
1393  Args.push_back(&IndexArg);
1394  Args.push_back(&WidthArg);
1395 
1396  auto &CGFI = CGM.getTypes().arrangeBuiltinFunctionDeclaration(C.VoidTy, Args);
1397  auto *Fn = llvm::Function::Create(
1399  "_omp_reduction_copy_to_scratchpad", &CGM.getModule());
1400  CGM.SetInternalFunctionAttributes(/*DC=*/nullptr, Fn, CGFI);
1401  CodeGenFunction CGF(CGM);
1402  // We don't need debug information in this function as nothing here refers to
1403  // user code.
1404  CGF.disableDebugInfo();
1405  CGF.StartFunction(GlobalDecl(), C.VoidTy, Fn, CGFI, Args);
1406 
1407  auto &Bld = CGF.Builder;
1408 
1409  Address AddrReduceListArg = CGF.GetAddrOfLocalVar(&ReduceListArg);
1410  Address SrcDataAddr(
1412  CGF.EmitLoadOfScalar(AddrReduceListArg, /*Volatile=*/false,
1413  C.VoidPtrTy, SourceLocation()),
1414  CGF.ConvertTypeForMem(ReductionArrayTy)->getPointerTo()),
1415  CGF.getPointerAlign());
1416 
1417  Address AddrScratchPadArg = CGF.GetAddrOfLocalVar(&ScratchPadArg);
1418  llvm::Value *ScratchPadBase = CGF.EmitLoadOfScalar(
1419  AddrScratchPadArg, /*Volatile=*/false, C.VoidPtrTy, SourceLocation());
1420 
1421  Address AddrIndexArg = CGF.GetAddrOfLocalVar(&IndexArg);
1422  llvm::Value *IndexVal =
1423  Bld.CreateIntCast(CGF.EmitLoadOfScalar(AddrIndexArg, /*Volatile=*/false,
1424  Int32Ty, SourceLocation()),
1425  CGF.SizeTy, /*isSigned=*/true);
1426 
1427  Address AddrWidthArg = CGF.GetAddrOfLocalVar(&WidthArg);
1428  llvm::Value *WidthVal =
1429  Bld.CreateIntCast(CGF.EmitLoadOfScalar(AddrWidthArg, /*Volatile=*/false,
1430  Int32Ty, SourceLocation()),
1431  CGF.SizeTy, /*isSigned=*/true);
1432 
1433  // The absolute ptr address to the base addr of the next element to copy.
1434  llvm::Value *CumulativeElemBasePtr =
1435  Bld.CreatePtrToInt(ScratchPadBase, CGM.SizeTy);
1436  Address DestDataAddr(CumulativeElemBasePtr, CGF.getPointerAlign());
1437 
1438  emitReductionListCopy(ThreadToScratchpad, CGF, ReductionArrayTy, Privates,
1439  SrcDataAddr, DestDataAddr,
1440  {/*RemoteLaneOffset=*/nullptr,
1441  /*ScratchpadIndex=*/IndexVal,
1442  /*ScratchpadWidth=*/WidthVal});
1443 
1444  CGF.FinishFunction();
1445  return Fn;
1446 }
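
// A minimal sketch (not part of the original source) of the scratchpad
// addressing implied above, assuming the element-major layout used by
// emitReductionListCopy (rows of 'width' team slots per reduce element;
// 'elem_size' is a hypothetical per-element size table, and any alignment
// padding between rows is omitted):
//
//   char *dest = scratchpad_base;
//   for (unsigned j = 0; j < e; ++j)
//     dest += elem_size[j] * width;   // skip the rows of earlier elements
//   dest += elem_size[e] * index;     // this team's slot in element e's row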
1447 
1448 /// This function emits a helper that gathers Reduce lists from the first
1449 /// lane of every active warp to lanes in the first warp.
1450 ///
1451 ///   void inter_warp_copy_func(void* reduce_data, num_warps)
1452 ///     shared smem[warp_size];
1453 ///     For all data entries D in reduce_data:
1454 ///        If (I am the first lane in each warp)
1455 ///           Copy my local D to smem[warp_id]
1456 ///        sync
1457 ///        if (I am the first warp)
1458 ///           Copy smem[thread_id] to my local D
1459 ///        sync
1460 static llvm::Value *emitInterWarpCopyFunction(CodeGenModule &CGM,
1461                                               ArrayRef<const Expr *> Privates,
1462  QualType ReductionArrayTy) {
1463  auto &C = CGM.getContext();
1464  auto &M = CGM.getModule();
1465 
1466  // ReduceList: thread local Reduce list.
1467  // At the stage of the computation when this function is called, partially
1468  // aggregated values reside in the first lane of every active warp.
1469  ImplicitParamDecl ReduceListArg(C, C.VoidPtrTy, ImplicitParamDecl::Other);
1470  // NumWarps: number of warps active in the parallel region. This could
1471  // be smaller than 32 (max warps in a CTA) for partial block reduction.
1472  ImplicitParamDecl NumWarpsArg(C,
1473                                C.getIntTypeForBitwidth(32, /* Signed */ true),
1474                                ImplicitParamDecl::Other);
1475  FunctionArgList Args;
1476  Args.push_back(&ReduceListArg);
1477  Args.push_back(&NumWarpsArg);
1478 
1479  auto &CGFI = CGM.getTypes().arrangeBuiltinFunctionDeclaration(C.VoidTy, Args);
1480  auto *Fn = llvm::Function::Create(
1481      CGM.getTypes().GetFunctionType(CGFI), llvm::GlobalValue::InternalLinkage,
1482      "_omp_reduction_inter_warp_copy_func", &CGM.getModule());
1483  CGM.SetInternalFunctionAttributes(/*DC=*/nullptr, Fn, CGFI);
1484  CodeGenFunction CGF(CGM);
1485  // We don't need debug information in this function as nothing here refers to
1486  // user code.
1487  CGF.disableDebugInfo();
1488  CGF.StartFunction(GlobalDecl(), C.VoidTy, Fn, CGFI, Args);
1489 
1490  auto &Bld = CGF.Builder;
1491 
1492  // This array is used as a medium to transfer, one reduce element at a time,
1493  // the data from the first lane of every warp to lanes in the first warp
1494  // in order to perform the final step of a reduction in a parallel region
1495  // (reduction across warps). The array is placed in NVPTX __shared__ memory
1496  // for reduced latency, as well as to have a distinct copy for concurrently
1497  // executing target regions. The array is declared with common linkage so
1498  // as to be shared across compilation units.
1499  const char *TransferMediumName =
1500  "__openmp_nvptx_data_transfer_temporary_storage";
1501  llvm::GlobalVariable *TransferMedium =
1502  M.getGlobalVariable(TransferMediumName);
1503  if (!TransferMedium) {
1504  auto *Ty = llvm::ArrayType::get(CGM.Int64Ty, WarpSize);
1505  unsigned SharedAddressSpace = C.getTargetAddressSpace(LangAS::cuda_shared);
1506  TransferMedium = new llvm::GlobalVariable(
1507  M, Ty,
1508  /*isConstant=*/false, llvm::GlobalVariable::CommonLinkage,
1509  llvm::Constant::getNullValue(Ty), TransferMediumName,
1510  /*InsertBefore=*/nullptr, llvm::GlobalVariable::NotThreadLocal,
1511  SharedAddressSpace);
1512  }
1513 
1514  // Get the CUDA thread id of the current OpenMP thread on the GPU.
1515  auto *ThreadID = getNVPTXThreadID(CGF);
1516  // nvptx_lane_id = nvptx_id % warpsize
1517  auto *LaneID = getNVPTXLaneID(CGF);
1518  // nvptx_warp_id = nvptx_id / warpsize
1519  auto *WarpID = getNVPTXWarpID(CGF);
1520 
1521  Address AddrReduceListArg = CGF.GetAddrOfLocalVar(&ReduceListArg);
1522  Address LocalReduceList(
1523      Bld.CreatePointerBitCastOrAddrSpaceCast(
1524  CGF.EmitLoadOfScalar(AddrReduceListArg, /*Volatile=*/false,
1525  C.VoidPtrTy, SourceLocation()),
1526  CGF.ConvertTypeForMem(ReductionArrayTy)->getPointerTo()),
1527  CGF.getPointerAlign());
1528 
1529  unsigned Idx = 0;
1530  for (auto &Private : Privates) {
1531  //
1532  // Warp master copies reduce element to transfer medium in __shared__
1533  // memory.
1534  //
1535  llvm::BasicBlock *ThenBB = CGF.createBasicBlock("then");
1536  llvm::BasicBlock *ElseBB = CGF.createBasicBlock("else");
1537  llvm::BasicBlock *MergeBB = CGF.createBasicBlock("ifcont");
1538 
1539  // if (lane_id == 0)
1540  auto IsWarpMaster =
1541  Bld.CreateICmpEQ(LaneID, Bld.getInt32(0), "warp_master");
1542  Bld.CreateCondBr(IsWarpMaster, ThenBB, ElseBB);
1543  CGF.EmitBlock(ThenBB);
1544 
1545  // Reduce element = LocalReduceList[i]
1546  Address ElemPtrPtrAddr =
1547  Bld.CreateConstArrayGEP(LocalReduceList, Idx, CGF.getPointerSize());
1548  llvm::Value *ElemPtrPtr = CGF.EmitLoadOfScalar(
1549  ElemPtrPtrAddr, /*Volatile=*/false, C.VoidPtrTy, SourceLocation());
1550  // elemptr = (type[i]*)(elemptrptr)
1551  Address ElemPtr =
1552  Address(ElemPtrPtr, C.getTypeAlignInChars(Private->getType()));
1553  ElemPtr = Bld.CreateElementBitCast(
1554  ElemPtr, CGF.ConvertTypeForMem(Private->getType()));
1555  // elem = *elemptr
1556  llvm::Value *Elem = CGF.EmitLoadOfScalar(
1557  ElemPtr, /*Volatile=*/false, Private->getType(), SourceLocation());
1558 
1559  // Get pointer to location in transfer medium.
1560  // MediumPtr = &medium[warp_id]
1561  llvm::Value *MediumPtrVal = Bld.CreateInBoundsGEP(
1562  TransferMedium, {llvm::Constant::getNullValue(CGM.Int64Ty), WarpID});
1563  Address MediumPtr(MediumPtrVal, C.getTypeAlignInChars(Private->getType()));
1564  // Casting to actual data type.
1565  // MediumPtr = (type[i]*)MediumPtrAddr;
1566  MediumPtr = Bld.CreateElementBitCast(
1567  MediumPtr, CGF.ConvertTypeForMem(Private->getType()));
1568 
1569  // *MediumPtr = elem
1570  Bld.CreateStore(Elem, MediumPtr);
1571 
1572  Bld.CreateBr(MergeBB);
1573 
1574  CGF.EmitBlock(ElseBB);
1575  Bld.CreateBr(MergeBB);
1576 
1577  CGF.EmitBlock(MergeBB);
1578 
1579  Address AddrNumWarpsArg = CGF.GetAddrOfLocalVar(&NumWarpsArg);
1580  llvm::Value *NumWarpsVal = CGF.EmitLoadOfScalar(
1581  AddrNumWarpsArg, /*Volatile=*/false, C.IntTy, SourceLocation());
1582 
1583  auto *NumActiveThreads = Bld.CreateNSWMul(
1584  NumWarpsVal, getNVPTXWarpSize(CGF), "num_active_threads");
1585  // named_barrier_sync(ParallelBarrierID, num_active_threads)
1586  syncParallelThreads(CGF, NumActiveThreads);
1587 
1588  //
1589  // Warp 0 copies reduce element from transfer medium.
1590  //
1591  llvm::BasicBlock *W0ThenBB = CGF.createBasicBlock("then");
1592  llvm::BasicBlock *W0ElseBB = CGF.createBasicBlock("else");
1593  llvm::BasicBlock *W0MergeBB = CGF.createBasicBlock("ifcont");
1594 
1595  // Up to 32 threads in warp 0 are active.
1596  auto IsActiveThread =
1597  Bld.CreateICmpULT(ThreadID, NumWarpsVal, "is_active_thread");
1598  Bld.CreateCondBr(IsActiveThread, W0ThenBB, W0ElseBB);
1599 
1600  CGF.EmitBlock(W0ThenBB);
1601 
1602  // SrcMediumPtr = &medium[tid]
1603  llvm::Value *SrcMediumPtrVal = Bld.CreateInBoundsGEP(
1604  TransferMedium, {llvm::Constant::getNullValue(CGM.Int64Ty), ThreadID});
1605  Address SrcMediumPtr(SrcMediumPtrVal,
1606  C.getTypeAlignInChars(Private->getType()));
1607  // SrcMediumVal = *SrcMediumPtr;
1608  SrcMediumPtr = Bld.CreateElementBitCast(
1609  SrcMediumPtr, CGF.ConvertTypeForMem(Private->getType()));
1610  llvm::Value *SrcMediumValue = CGF.EmitLoadOfScalar(
1611  SrcMediumPtr, /*Volatile=*/false, Private->getType(), SourceLocation());
1612 
1613  // TargetElemPtr = (type[i]*)(SrcDataAddr[i])
1614  Address TargetElemPtrPtr =
1615  Bld.CreateConstArrayGEP(LocalReduceList, Idx, CGF.getPointerSize());
1616  llvm::Value *TargetElemPtrVal = CGF.EmitLoadOfScalar(
1617  TargetElemPtrPtr, /*Volatile=*/false, C.VoidPtrTy, SourceLocation());
1618  Address TargetElemPtr =
1619  Address(TargetElemPtrVal, C.getTypeAlignInChars(Private->getType()));
1620  TargetElemPtr = Bld.CreateElementBitCast(
1621  TargetElemPtr, CGF.ConvertTypeForMem(Private->getType()));
1622 
1623  // *TargetElemPtr = SrcMediumVal;
1624  CGF.EmitStoreOfScalar(SrcMediumValue, TargetElemPtr, /*Volatile=*/false,
1625  Private->getType());
1626  Bld.CreateBr(W0MergeBB);
1627 
1628  CGF.EmitBlock(W0ElseBB);
1629  Bld.CreateBr(W0MergeBB);
1630 
1631  CGF.EmitBlock(W0MergeBB);
1632 
1633  // While warp 0 copies values from transfer medium, all other warps must
1634  // wait.
1635  syncParallelThreads(CGF, NumActiveThreads);
1636  Idx++;
1637  }
1638 
1639  CGF.FinishFunction();
1640  return Fn;
1641 }
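
// A minimal sketch (not part of the original source) of the device-level
// behavior of the emitted helper for a Reduce list with one 64-bit element;
// 'named_barrier_sync' stands in for the barrier emitted by
// syncParallelThreads:
//
//   __shared__ int64_t medium[32];
//
//   __device__ void inter_warp_copy(void **reduce_data, int num_warps) {
//     int tid = threadIdx.x, lane = tid % 32, warp = tid / 32;
//     if (lane == 0)                  // warp master publishes its partial
//       medium[warp] = *(int64_t *)reduce_data[0];
//     named_barrier_sync(num_warps * 32);
//     if (tid < num_warps)            // lanes of warp 0 gather the partials
//       *(int64_t *)reduce_data[0] = medium[tid];
//     named_barrier_sync(num_warps * 32);
//   }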
1642 
1643 /// Emit a helper that reduces data across two OpenMP threads (lanes)
1644 /// in the same warp. It uses shuffle instructions to copy over data from
1645 /// a remote lane's stack. The reduction algorithm performed is specified
1646 /// by the fourth parameter.
1647 ///
1648 /// Algorithm Versions.
1649 /// Full Warp Reduce (argument value 0):
1650 ///   This algorithm assumes that all 32 lanes are active and gathers
1651 ///   data from these 32 lanes, producing a single resultant value.
1652 /// Contiguous Partial Warp Reduce (argument value 1):
1653 ///   This algorithm assumes that only a *contiguous* subset of lanes
1654 ///   is active. This happens for the last warp in a parallel region
1655 ///   when the user-specified num_threads is not an integer multiple of
1656 ///   32. This contiguous subset always starts with the zeroth lane.
1657 /// Partial Warp Reduce (argument value 2):
1658 ///   This algorithm gathers data from any number of lanes at any position.
1659 ///   All reduced values are stored in the lowest possible lane. The set
1660 ///   of problems each algorithm addresses is a superset of those
1661 ///   addressable by algorithms with a lower version number. Overhead
1662 ///   increases as the algorithm version increases.
1663 ///
1664 /// Terminology
1665 /// Reduce element:
1666 ///   Reduce element refers to an individual data field of primitive
1667 ///   data type that is to be combined and reduced across threads.
1668 /// Reduce list:
1669 ///   Reduce list refers to a collection of local, thread-private
1670 ///   reduce elements.
1671 /// Remote Reduce list:
1672 ///   Remote Reduce list refers to a collection of remote (relative to
1673 ///   the current thread) reduce elements.
1674 ///
1675 /// We distinguish between three states of threads that are important to
1676 /// the implementation of this function.
1677 /// Alive threads:
1678 ///   Threads in a warp executing the SIMT instruction, as distinguished from
1679 ///   threads that are inactive due to divergent control flow.
1680 /// Active threads:
1681 ///   The minimal set of threads that has to be alive upon entry to this
1682 ///   function. The computation is correct iff active threads are alive.
1683 ///   Some threads are alive but not active because they do not
1684 ///   contribute to the computation in any useful manner. Turning them off
1685 ///   may introduce control flow overheads without any tangible benefits.
1686 /// Effective threads:
1687 ///   In order to comply with the argument requirements of the shuffle
1688 ///   function, we must keep all lanes holding data alive. But at most
1689 ///   half of them perform value aggregation; we refer to this half of
1690 ///   threads as effective. The other half simply hands off its data.
1692 ///
1693 /// Procedure
1694 /// Value shuffle:
1695 ///   In this step, active threads transfer data from higher lane positions
1696 ///   in the warp to lower lane positions, creating the Remote Reduce list.
1697 /// Value aggregation:
1698 ///   In this step, effective threads combine their thread-local Reduce list
1699 ///   with the Remote Reduce list and store the result in the thread-local
1700 ///   Reduce list.
1701 /// Value copy:
1702 ///   In this step, we deal with the assumption made by algorithm 2
1703 ///   (i.e., the contiguity assumption). When we have an odd number of
1704 ///   lanes active, say 2k+1, only k threads will be effective and
1705 ///   therefore k new values will be produced. However, the Reduce list
1706 ///   owned by the (2k+1)th thread is ignored in the value aggregation.
1707 ///   Therefore we copy the Reduce list from the (2k+1)th lane to the
1708 ///   (k+1)th lane so that the contiguity assumption still holds.
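///
/// As an illustrative example: with 5 = 2k+1 active lanes (k = 2) and a
/// remote-lane offset of k = 2, lanes 0 and 1 are effective and aggregate
/// the values shuffled in from lanes 2 and 3, while lane 4's Reduce list is
/// not consumed by any effective lane. The value-copy step moves lane 4's
/// Reduce list down to lane 2, so lanes 0..2 again form a contiguous prefix
/// holding every partial result for the next round.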
1709 static llvm::Value *
1710 emitShuffleAndReduceFunction(CodeGenModule &CGM,
1711                              ArrayRef<const Expr *> Privates,
1712  QualType ReductionArrayTy, llvm::Value *ReduceFn) {
1713  auto &C = CGM.getContext();
1714 
1715  // Thread local Reduce list used to host the values of data to be reduced.
1716  ImplicitParamDecl ReduceListArg(C, C.VoidPtrTy, ImplicitParamDecl::Other);
1717  // Current lane id; could be logical.
1718  ImplicitParamDecl LaneIDArg(C, C.ShortTy, ImplicitParamDecl::Other);
1719  // Offset of the remote source lane relative to the current lane.
1720  ImplicitParamDecl RemoteLaneOffsetArg(C, C.ShortTy,
1721                                        ImplicitParamDecl::Other);
1722  // Algorithm version. This is expected to be known at compile time.
1723  ImplicitParamDecl AlgoVerArg(C, C.ShortTy, ImplicitParamDecl::Other);
1724  FunctionArgList Args;
1725  Args.push_back(&ReduceListArg);
1726  Args.push_back(&LaneIDArg);
1727  Args.push_back(&RemoteLaneOffsetArg);
1728  Args.push_back(&AlgoVerArg);
1729 
1730  auto &CGFI = CGM.getTypes().arrangeBuiltinFunctionDeclaration(C.VoidTy, Args);
1731  auto *Fn = llvm::Function::Create(
1732      CGM.getTypes().GetFunctionType(CGFI), llvm::GlobalValue::InternalLinkage,
1733      "_omp_reduction_shuffle_and_reduce_func", &CGM.getModule());
1734  CGM.SetInternalFunctionAttributes(/*D=*/nullptr, Fn, CGFI);
1735  CodeGenFunction CGF(CGM);
1736  // We don't need debug information in this function as nothing here refers to
1737  // user code.
1738  CGF.disableDebugInfo();
1739  CGF.StartFunction(GlobalDecl(), C.VoidTy, Fn, CGFI, Args);
1740 
1741  auto &Bld = CGF.Builder;
1742 
1743  Address AddrReduceListArg = CGF.GetAddrOfLocalVar(&ReduceListArg);
1744  Address LocalReduceList(
1745      Bld.CreatePointerBitCastOrAddrSpaceCast(
1746  CGF.EmitLoadOfScalar(AddrReduceListArg, /*Volatile=*/false,
1747  C.VoidPtrTy, SourceLocation()),
1748  CGF.ConvertTypeForMem(ReductionArrayTy)->getPointerTo()),
1749  CGF.getPointerAlign());
1750 
1751  Address AddrLaneIDArg = CGF.GetAddrOfLocalVar(&LaneIDArg);
1752  llvm::Value *LaneIDArgVal = CGF.EmitLoadOfScalar(
1753  AddrLaneIDArg, /*Volatile=*/false, C.ShortTy, SourceLocation());
1754 
1755  Address AddrRemoteLaneOffsetArg = CGF.GetAddrOfLocalVar(&RemoteLaneOffsetArg);
1756  llvm::Value *RemoteLaneOffsetArgVal = CGF.EmitLoadOfScalar(
1757  AddrRemoteLaneOffsetArg, /*Volatile=*/false, C.ShortTy, SourceLocation());
1758 
1759  Address AddrAlgoVerArg = CGF.GetAddrOfLocalVar(&AlgoVerArg);
1760  llvm::Value *AlgoVerArgVal = CGF.EmitLoadOfScalar(
1761  AddrAlgoVerArg, /*Volatile=*/false, C.ShortTy, SourceLocation());
1762 
1763  // Create a local thread-private variable to host the Reduce list
1764  // from a remote lane.
1765  Address RemoteReduceList =
1766  CGF.CreateMemTemp(ReductionArrayTy, ".omp.reduction.remote_reduce_list");
1767 
1768  // This loop iterates through the list of reduce elements and copies,
1769  // element by element, from a remote lane in the warp to RemoteReduceList,
1770  // hosted on the thread's stack.
1771  emitReductionListCopy(RemoteLaneToThread, CGF, ReductionArrayTy, Privates,
1772  LocalReduceList, RemoteReduceList,
1773  {/*RemoteLaneOffset=*/RemoteLaneOffsetArgVal,
1774  /*ScratchpadIndex=*/nullptr,
1775  /*ScratchpadWidth=*/nullptr});
1776 
1777  // The actions to be performed on the Remote Reduce list depend
1778  // on the algorithm version.
1779  //
1780  // if (AlgoVer==0) || (AlgoVer==1 && (LaneId < Offset)) || (AlgoVer==2 &&
1781  // LaneId % 2 == 0 && Offset > 0):
1782  // do the reduction value aggregation
1783  //
1784  // The thread-local Reduce list is mutated in place to host the
1785  // reduced data, which is the aggregated value produced from local and
1786  // remote lanes.
1787  //
1788  // Note that AlgoVer is expected to be a constant integer known at compile
1789  // time.
1790  // When AlgoVer==0, the first conjunction evaluates to true, making
1791  // the entire predicate true at compile time.
1792  // When AlgoVer==1, only the second part of the second conjunction must
1793  // be evaluated at runtime; the other conjunctions fold to false at
1794  // compile time.
1795  // When AlgoVer==2, only the second part of the third conjunction must
1796  // be evaluated at runtime; the other conjunctions fold to false at
1797  // compile time.
1798  auto CondAlgo0 = Bld.CreateICmpEQ(AlgoVerArgVal, Bld.getInt16(0));
1799 
1800  auto Algo1 = Bld.CreateICmpEQ(AlgoVerArgVal, Bld.getInt16(1));
1801  auto CondAlgo1 = Bld.CreateAnd(
1802  Algo1, Bld.CreateICmpULT(LaneIDArgVal, RemoteLaneOffsetArgVal));
1803 
1804  auto Algo2 = Bld.CreateICmpEQ(AlgoVerArgVal, Bld.getInt16(2));
1805  auto CondAlgo2 = Bld.CreateAnd(
1806  Algo2,
1807  Bld.CreateICmpEQ(Bld.CreateAnd(LaneIDArgVal, Bld.getInt16(1)),
1808  Bld.getInt16(0)));
1809  CondAlgo2 = Bld.CreateAnd(
1810  CondAlgo2, Bld.CreateICmpSGT(RemoteLaneOffsetArgVal, Bld.getInt16(0)));
1811 
1812  auto CondReduce = Bld.CreateOr(CondAlgo0, CondAlgo1);
1813  CondReduce = Bld.CreateOr(CondReduce, CondAlgo2);
1814 
1815  llvm::BasicBlock *ThenBB = CGF.createBasicBlock("then");
1816  llvm::BasicBlock *ElseBB = CGF.createBasicBlock("else");
1817  llvm::BasicBlock *MergeBB = CGF.createBasicBlock("ifcont");
1818  Bld.CreateCondBr(CondReduce, ThenBB, ElseBB);
1819 
1820  CGF.EmitBlock(ThenBB);
1821  // reduce_function(LocalReduceList, RemoteReduceList)
1822  llvm::Value *LocalReduceListPtr = Bld.CreatePointerBitCastOrAddrSpaceCast(
1823  LocalReduceList.getPointer(), CGF.VoidPtrTy);
1824  llvm::Value *RemoteReduceListPtr = Bld.CreatePointerBitCastOrAddrSpaceCast(
1825  RemoteReduceList.getPointer(), CGF.VoidPtrTy);
1826  CGF.EmitCallOrInvoke(ReduceFn, {LocalReduceListPtr, RemoteReduceListPtr});
1827  Bld.CreateBr(MergeBB);
1828 
1829  CGF.EmitBlock(ElseBB);
1830  Bld.CreateBr(MergeBB);
1831 
1832  CGF.EmitBlock(MergeBB);
1833 
1834  // if (AlgoVer==1 && (LaneId >= Offset)) copy Remote Reduce list to local
1835  // Reduce list.
1836  Algo1 = Bld.CreateICmpEQ(AlgoVerArgVal, Bld.getInt16(1));
1837  auto CondCopy = Bld.CreateAnd(
1838  Algo1, Bld.CreateICmpUGE(LaneIDArgVal, RemoteLaneOffsetArgVal));
1839 
1840  llvm::BasicBlock *CpyThenBB = CGF.createBasicBlock("then");
1841  llvm::BasicBlock *CpyElseBB = CGF.createBasicBlock("else");
1842  llvm::BasicBlock *CpyMergeBB = CGF.createBasicBlock("ifcont");
1843  Bld.CreateCondBr(CondCopy, CpyThenBB, CpyElseBB);
1844 
1845  CGF.EmitBlock(CpyThenBB);
1846  emitReductionListCopy(ThreadCopy, CGF, ReductionArrayTy, Privates,
1847  RemoteReduceList, LocalReduceList);
1848  Bld.CreateBr(CpyMergeBB);
1849 
1850  CGF.EmitBlock(CpyElseBB);
1851  Bld.CreateBr(CpyMergeBB);
1852 
1853  CGF.EmitBlock(CpyMergeBB);
1854 
1855  CGF.FinishFunction();
1856  return Fn;
1857 }
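
// A minimal sketch (not part of the original source) of the helper emitted
// above for a Reduce list holding a single 32-bit int combined with '+',
// using the CUDA '__shfl_down' intrinsic of this era; in the real helper
// the addition would instead be a call to the generated reduce_func:
//
//   __device__ void shuffle_and_reduce(int *elem, short lane_id,
//                                      short offset, short algo) {
//     int remote = __shfl_down(*elem, offset, 32);       // value shuffle
//     bool do_reduce = (algo == 0) ||
//                      (algo == 1 && lane_id < offset) ||
//                      (algo == 2 && (lane_id & 1) == 0 && offset > 0);
//     if (do_reduce)
//       *elem += remote;                                 // value aggregation
//     else if (algo == 1 && lane_id >= offset)
//       *elem = remote;                                  // value copy
//   }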
1858 
1859 ///
1860 /// Design of OpenMP reductions on the GPU
1861 ///
1862 /// Consider a typical OpenMP program with one or more reduction
1863 /// clauses:
1864 ///
1865 ///      float foo;
1866 ///      double bar;
1867 ///      #pragma omp target teams distribute parallel for \
1868 ///                       reduction(+:foo) reduction(*:bar)
1869 ///      for (int i = 0; i < N; i++) {
1870 ///        foo += A[i]; bar *= B[i];
1871 ///      }
1872 ///
1873 /// where 'foo' and 'bar' are reduced across all OpenMP threads in
1874 /// all teams. In our OpenMP implementation on the NVPTX device an
1875 /// OpenMP team is mapped to a CUDA threadblock and OpenMP threads
1876 /// within a team are mapped to CUDA threads within a threadblock.
1877 /// Our goal is to efficiently aggregate values across all OpenMP
1878 /// threads such that:
1879 ///
1880 /// - the compiler and runtime are logically concise, and
1881 /// - the reduction is performed efficiently in a hierarchical
1882 /// manner as follows: within OpenMP threads in the same warp,
1883 /// across warps in a threadblock, and finally across teams on
1884 /// the NVPTX device.
1885 ///
1886 /// Introduction to Decoupling
1887 ///
1888 /// We would like to decouple the compiler and the runtime so that the
1889 /// latter is ignorant of the reduction variables (number, data types)
1890 /// and the reduction operators. This allows a simpler interface
1891 /// and implementation while still attaining good performance.
1892 ///
1893 /// Pseudocode for the aforementioned OpenMP program generated by the
1894 /// compiler is as follows:
1895 ///
1896 /// 1. Create private copies of reduction variables on each OpenMP
1897 /// thread: 'foo_private', 'bar_private'
1898 /// 2. Each OpenMP thread reduces the chunk of 'A' and 'B' assigned
1899 /// to it and writes the result in 'foo_private' and 'bar_private'
1900 /// respectively.
1901 /// 3. Call the OpenMP runtime on the GPU to reduce within a team
1902 /// and store the result on the team master:
1903 ///
1904 ///      __kmpc_nvptx_parallel_reduce_nowait(...,
1905 ///        reduceData, shuffleReduceFn, interWarpCpyFn)
1906 ///
1907 /// where:
1908 ///       struct ReduceData {
1909 ///         double *foo;
1910 ///         double *bar;
1911 ///       } reduceData
1912 ///       reduceData.foo = &foo_private
1913 ///       reduceData.bar = &bar_private
1914 ///
1915 /// 'shuffleReduceFn' and 'interWarpCpyFn' are pointers to two
1916 /// auxiliary functions generated by the compiler that operate on
1917 /// variables of type 'ReduceData'. They help the runtime perform
1918 /// algorithmic steps in a data-agnostic manner.
1919 ///
1920 /// 'shuffleReduceFn' is a pointer to a function that reduces data
1921 /// of type 'ReduceData' across two OpenMP threads (lanes) in the
1922 /// same warp. It takes the following arguments as input:
1923 ///
1924 /// a. variable of type 'ReduceData' on the calling lane,
1925 /// b. its lane_id,
1926 /// c. an offset relative to the current lane_id to generate a
1927 /// remote_lane_id. The remote lane contains the second
1928 /// variable of type 'ReduceData' that is to be reduced.
1929 /// d. an algorithm version parameter determining which reduction
1930 /// algorithm to use.
1931 ///
1932 /// 'shuffleReduceFn' retrieves data from the remote lane using
1933 /// efficient GPU shuffle intrinsics and reduces, using the
1934 /// algorithm specified by the 4th parameter, the two operands
1935 /// element-wise. The result is written to the first operand.
1936 ///
1937 /// Different reduction algorithms are implemented in different
1938 /// runtime functions, all calling 'shuffleReduceFn' to perform
1939 /// the essential reduction step. Therefore, based on the 4th
1940 /// parameter, this function behaves slightly differently to
1941 /// cooperate with the runtime to ensure correctness under
1942 /// different circumstances.
1943 ///
1944 /// 'InterWarpCpyFn' is a pointer to a function that transfers
1945 /// reduced variables across warps. It tunnels, through CUDA
1946 /// shared memory, the thread-private data of type 'ReduceData'
1947 /// from lane 0 of each warp to a lane in the first warp.
1948 /// 4. Call the OpenMP runtime on the GPU to reduce across teams.
1949 /// The last team writes the global reduced value to memory.
1950 ///
1951 ///      ret = __kmpc_nvptx_teams_reduce_nowait(...,
1952 ///        reduceData, shuffleReduceFn, interWarpCpyFn,
1953 ///        scratchpadCopyFn, loadAndReduceFn)
1954 ///
1955 /// 'scratchpadCopyFn' is a helper that stores reduced
1956 /// data from the team master to a scratchpad array in
1957 /// global memory.
1958 ///
1959 /// 'loadAndReduceFn' is a helper that loads data from
1960 /// the scratchpad array and reduces it with the input
1961 /// operand.
1962 ///
1963 /// These compiler generated functions hide address
1964 /// calculation and alignment information from the runtime.
1965 /// 5. if ret == 1:
1966 /// The team master of the last team stores the reduced
1967 /// result to the globals in memory.
1968 /// foo += reduceData.foo; bar *= reduceData.bar
1969 ///
1970 ///
1971 /// Warp Reduction Algorithms
1972 ///
1973 /// On the warp level, we have three algorithms implemented in the
1974 /// OpenMP runtime depending on the number of active lanes:
1975 ///
1976 /// Full Warp Reduction
1977 ///
1978 /// The reduce algorithm within a warp where all lanes are active
1979 /// is implemented in the runtime as follows:
1980 ///
1981 ///   full_warp_reduce(void *reduce_data,
1982 ///                    kmp_ShuffleReductFctPtr ShuffleReduceFn) {
1983 ///     for (int offset = WARPSIZE/2; offset > 0; offset /= 2)
1984 ///       ShuffleReduceFn(reduce_data, 0, offset, 0);
1985 ///   }
1986 ///
1987 /// The algorithm completes in log(2, WARPSIZE) steps.
1988 ///
1989 /// 'ShuffleReduceFn' is invoked here with lane_id set to 0 because the
1990 /// lane_id is not used in this version; we thus save instructions by not
1991 /// retrieving it from the corresponding special registers. The 4th
1992 /// parameter, which represents the version of the algorithm being used,
1993 /// is set to 0 to signify full warp reduction.
1994 ///
1995 /// In this version, 'ShuffleReduceFn' behaves, per element, as follows:
1996 ///
1997 ///   #reduce_elem refers to an element in the local lane's data structure
1998 ///   #remote_elem is retrieved from a remote lane
1999 ///   remote_elem = shuffle_down(reduce_elem, offset, WARPSIZE);
2000 ///   reduce_elem = reduce_elem REDUCE_OP remote_elem;
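///
/// As an illustrative trace (not in the original comment): with WARPSIZE=32
/// the loop runs with offset = 16, 8, 4, 2, 1; after the first step each of
/// lanes 0..15 holds v[i] REDUCE_OP v[i+16], and after all five steps lane 0
/// holds the combination of all 32 values, i.e. log(2, 32) = 5 steps.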
2001 ///
2002 /// Contiguous Partial Warp Reduction
2003 ///
2004 /// This reduce algorithm is used within a warp where only the first
2005 /// 'n' (n <= WARPSIZE) lanes are active. It is typically used when the
2006 /// number of OpenMP threads in a parallel region is not a multiple of
2007 /// WARPSIZE. The algorithm is implemented in the runtime as follows:
2008 ///
2009 ///   void
2010 ///   contiguous_partial_reduce(void *reduce_data,
2011 ///                             kmp_ShuffleReductFctPtr ShuffleReduceFn,
2012 ///                             int size, int lane_id) {
2013 ///     int curr_size;
2014 ///     int offset;
2015 ///     curr_size = size;
2016 ///     offset = curr_size/2;
2017 ///     while (offset>0) {
2018 ///       ShuffleReduceFn(reduce_data, lane_id, offset, 1);
2019 ///       curr_size = (curr_size+1)/2;
2020 ///       offset = curr_size/2;
2021 ///     }
2022 ///   }
2023 ///
2024 /// In this version, 'ShuffleReduceFn' behaves, per element, as follows:
2025 ///
2026 ///   remote_elem = shuffle_down(reduce_elem, offset, WARPSIZE);
2027 ///   if (lane_id < offset)
2028 ///     reduce_elem = reduce_elem REDUCE_OP remote_elem
2029 ///   else
2030 ///     reduce_elem = remote_elem
2031 ///
2032 /// This algorithm assumes that the data to be reduced are located in a
2033 /// contiguous subset of lanes starting from the first. When there is
2034 /// an odd number of active lanes, the data in the last lane is not
2035 /// aggregated with any other lane's data but is instead copied over.
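///
/// As an illustrative trace (not in the original comment): for size = 7 the
/// loop issues offsets 3, 2, 1. With offset 3, lanes 0..2 aggregate the
/// values shuffled in from lanes 3..5 while lane 3 receives a copy of lane
/// 6's value; with offset 2, lanes 0..1 aggregate lanes 2..3; with offset 1,
/// lane 0 aggregates lane 1, leaving the full result in lane 0.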
2036 ///
2037 /// Dispersed Partial Warp Reduction
2038 ///
2039 /// This algorithm is used within a warp when any discontiguous subset of
2040 /// lanes is active. It is used to implement the reduction operation
2041 /// across lanes in an OpenMP simd region or in a nested parallel region.
2042 ///
2043 ///   void
2044 ///   dispersed_partial_reduce(void *reduce_data,
2045 ///                            kmp_ShuffleReductFctPtr ShuffleReduceFn) {
2046 ///     int size, remote_id;
2047 ///     int logical_lane_id = number_of_active_lanes_before_me() * 2;
2048 ///     do {
2049 ///       remote_id = next_active_lane_id_right_after_me();
2050 ///       # the above function returns 0 if no active lane
2051 ///       # is present right after the current lane.
2052 ///       size = number_of_active_lanes_in_this_warp();
2053 ///       logical_lane_id /= 2;
2054 ///       ShuffleReduceFn(reduce_data, logical_lane_id,
2055 ///                       remote_id-1-threadIdx.x, 2);
2056 ///     } while (logical_lane_id % 2 == 0 && size > 1);
2057 ///   }
2058 ///
2059 /// There is no assumption made about the initial state of the reduction.
2060 /// Any number of lanes (>=1) could be active at any position. The reduction
2061 /// result is returned in the first active lane.
2062 ///
2063 /// In this version, 'ShuffleReduceFn' behaves, per element, as follows:
2064 ///
2065 ///   remote_elem = shuffle_down(reduce_elem, offset, WARPSIZE);
2066 ///   if (lane_id % 2 == 0 && offset > 0)
2067 ///     reduce_elem = reduce_elem REDUCE_OP remote_elem
2068 ///   else
2069 ///     reduce_elem = remote_elem
2070 ///
2071 ///
2072 /// Intra-Team Reduction
2073 ///
2074 /// This function, as implemented in the runtime call
2075 /// '__kmpc_nvptx_parallel_reduce_nowait', aggregates data across OpenMP
2076 /// threads in a team. It first reduces within a warp using the
2077 /// aforementioned algorithms. We then proceed to gather all such
2078 /// reduced values at the first warp.
2079 ///
2080 /// The runtime makes use of the function 'InterWarpCpyFn', which copies
2081 /// data from each warp master (the zeroth lane of each warp, where
2082 /// warp-reduced data is held) to the zeroth warp. This step reduces (in
2083 /// a mathematical sense) the problem of reduction across warp masters in
2084 /// a block to the problem of warp reduction.
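///
/// Roughly, as a sketch (not in the original comment), the runtime-side
/// flow of '__kmpc_nvptx_parallel_reduce_nowait' is:
///
///   warp_reduce(reduce_data, ShuffleReduceFn);     // per-warp partials
///   InterWarpCpyFn(reduce_data, num_warps);        // gather into warp 0
///   if (warp_id == 0)
///     warp_reduce(reduce_data, ShuffleReduceFn);   // combine the partials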
2085 ///
2086 ///
2087 /// Inter-Team Reduction
2088 ///
2089 /// Once a team has reduced its data to a single value, it is stored in
2090 /// a global scratchpad array. Since each team has a distinct slot, this
2091 /// can be done without locking.
2092 ///
2093 /// The last team to write to the scratchpad array proceeds to reduce the
2094 /// scratchpad array. One or more workers in the last team use the helper
2095 /// 'loadAndReduceFn' to load and reduce values from the array, i.e.,
2096 /// the k'th worker reduces every k'th element.
2097 ///
2098 /// Finally, a call is made to '__kmpc_nvptx_parallel_reduce_nowait' to
2099 /// reduce across workers and compute a globally reduced value.
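///
/// For instance (a sketch, not in the original comment), the last team's
/// workers can cover the scratchpad in a strided loop before the final
/// within-team reduction:
///
///   for (i = thread_id; i < num_teams; i += num_threads)
///     loadAndReduceFn(reduceData, scratchpad, i, num_teams, /*reduce=*/1);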
2100 ///
2101 void CGOpenMPRuntimeNVPTX::emitReduction(
2102     CodeGenFunction &CGF, SourceLocation Loc, ArrayRef<const Expr *> Privates,
2103     ArrayRef<const Expr *> LHSExprs, ArrayRef<const Expr *> RHSExprs,
2104     ArrayRef<const Expr *> ReductionOps, ReductionOptionsTy Options) {
2105   if (!CGF.HaveInsertPoint())
2106  return;
2107 
2108  bool ParallelReduction = isOpenMPParallelDirective(Options.ReductionKind);
2109  bool TeamsReduction = isOpenMPTeamsDirective(Options.ReductionKind);
2110  // FIXME: Add support for simd reduction.
2111  assert((TeamsReduction || ParallelReduction) &&
2112  "Invalid reduction selection in emitReduction.");
2113 
2114  auto &C = CGM.getContext();
2115 
2116  // 1. Build a list of reduction variables.
2117  // void *RedList[<n>] = {<ReductionVars>[0], ..., <ReductionVars>[<n>-1]};
2118  auto Size = RHSExprs.size();
2119  for (auto *E : Privates) {
2120  if (E->getType()->isVariablyModifiedType())
2121  // Reserve a slot for the array size.
2122  ++Size;
2123  }
2124  llvm::APInt ArraySize(/*unsigned int numBits=*/32, Size);
2125  QualType ReductionArrayTy =
2126  C.getConstantArrayType(C.VoidPtrTy, ArraySize, ArrayType::Normal,
2127  /*IndexTypeQuals=*/0);
2128  Address ReductionList =
2129  CGF.CreateMemTemp(ReductionArrayTy, ".omp.reduction.red_list");
2130  auto IPriv = Privates.begin();
2131  unsigned Idx = 0;
2132  for (unsigned I = 0, E = RHSExprs.size(); I < E; ++I, ++IPriv, ++Idx) {
2133  Address Elem = CGF.Builder.CreateConstArrayGEP(ReductionList, Idx,
2134  CGF.getPointerSize());
2135  CGF.Builder.CreateStore(
2136      CGF.Builder.CreatePointerBitCastOrAddrSpaceCast(
2137  CGF.EmitLValue(RHSExprs[I]).getPointer(), CGF.VoidPtrTy),
2138  Elem);
2139  if ((*IPriv)->getType()->isVariablyModifiedType()) {
2140  // Store array size.
2141  ++Idx;
2142  Elem = CGF.Builder.CreateConstArrayGEP(ReductionList, Idx,
2143  CGF.getPointerSize());
2144  llvm::Value *Size = CGF.Builder.CreateIntCast(
2145  CGF.getVLASize(
2146  CGF.getContext().getAsVariableArrayType((*IPriv)->getType()))
2147  .first,
2148  CGF.SizeTy, /*isSigned=*/false);
2149  CGF.Builder.CreateStore(CGF.Builder.CreateIntToPtr(Size, CGF.VoidPtrTy),
2150  Elem);
2151  }
2152  }
2153 
2154  // 2. Emit reduce_func().
2155  auto *ReductionFn = emitReductionFunction(
2156  CGM, CGF.ConvertTypeForMem(ReductionArrayTy)->getPointerTo(), Privates,
2157  LHSExprs, RHSExprs, ReductionOps);
2158 
2159  // 3. Build res = __kmpc_reduce{_nowait}(<gtid>, <n>, sizeof(RedList),
2160  //    RedList, shuffle_reduce_func, interwarp_copy_func);
2161  auto *ThreadId = getThreadID(CGF, Loc);
2162  auto *ReductionArrayTySize = CGF.getTypeSize(ReductionArrayTy);
2163  auto *RL = CGF.Builder.CreatePointerBitCastOrAddrSpaceCast(
2164      ReductionList.getPointer(), CGF.VoidPtrTy);
2165 
2166  auto *ShuffleAndReduceFn = emitShuffleAndReduceFunction(
2167  CGM, Privates, ReductionArrayTy, ReductionFn);
2168  auto *InterWarpCopyFn =
2169  emitInterWarpCopyFunction(CGM, Privates, ReductionArrayTy);
2170 
2171  llvm::Value *Res = nullptr;
2172  if (ParallelReduction) {
2173  llvm::Value *Args[] = {ThreadId,
2174  CGF.Builder.getInt32(RHSExprs.size()),
2175  ReductionArrayTySize,
2176  RL,
2177  ShuffleAndReduceFn,
2178  InterWarpCopyFn};
2179 
2180  Res = CGF.EmitRuntimeCall(
2181  createNVPTXRuntimeFunction(OMPRTL_NVPTX__kmpc_parallel_reduce_nowait),
2182  Args);
2183  }
2184 
2185  if (TeamsReduction) {
2186  auto *ScratchPadCopyFn =
2187  emitCopyToScratchpad(CGM, Privates, ReductionArrayTy);
2188  auto *LoadAndReduceFn = emitReduceScratchpadFunction(
2189  CGM, Privates, ReductionArrayTy, ReductionFn);
2190 
2191  llvm::Value *Args[] = {ThreadId,
2192  CGF.Builder.getInt32(RHSExprs.size()),
2193  ReductionArrayTySize,
2194  RL,
2195  ShuffleAndReduceFn,
2196  InterWarpCopyFn,
2197  ScratchPadCopyFn,
2198  LoadAndReduceFn};
2199  Res = CGF.EmitRuntimeCall(
2200  createNVPTXRuntimeFunction(OMPRTL_NVPTX__kmpc_teams_reduce_nowait),
2201  Args);
2202  }
2203 
2204  // 4. Build switch(res)
2205  auto *DefaultBB = CGF.createBasicBlock(".omp.reduction.default");
2206  auto *SwInst = CGF.Builder.CreateSwitch(Res, DefaultBB, /*NumCases=*/1);
2207 
2208  // 5. Build case 1: where we have reduced values in the master
2209  //    thread in each team.
2210  //    __kmpc_end_reduce{_nowait}(<gtid>);
2211  //    break;
2212  auto *Case1BB = CGF.createBasicBlock(".omp.reduction.case1");
2213  SwInst->addCase(CGF.Builder.getInt32(1), Case1BB);
2214  CGF.EmitBlock(Case1BB);
2215 
2216  // Add emission of __kmpc_end_reduce{_nowait}(<gtid>);
2217  llvm::Value *EndArgs[] = {ThreadId};
2218  auto &&CodeGen = [&Privates, &LHSExprs, &RHSExprs, &ReductionOps,
2219  this](CodeGenFunction &CGF, PrePostActionTy &Action) {
2220  auto IPriv = Privates.begin();
2221  auto ILHS = LHSExprs.begin();
2222  auto IRHS = RHSExprs.begin();
2223  for (auto *E : ReductionOps) {
2224  emitSingleReductionCombiner(CGF, E, *IPriv, cast<DeclRefExpr>(*ILHS),
2225  cast<DeclRefExpr>(*IRHS));
2226  ++IPriv;
2227  ++ILHS;
2228  ++IRHS;
2229  }
2230  };
2231  RegionCodeGenTy RCG(CodeGen);
2232  NVPTXActionTy Action(
2233  nullptr, llvm::None,
2234  createNVPTXRuntimeFunction(OMPRTL_NVPTX__kmpc_end_reduce_nowait),
2235  EndArgs);
2236  RCG.setAction(Action);
2237  RCG(CGF);
2238  CGF.EmitBranch(DefaultBB);
2239  CGF.EmitBlock(DefaultBB, /*IsFinished=*/true);
2240 }
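
// A minimal sketch (not part of the original source) of the code shape this
// function emits for a parallel reduction over n variables:
//
//   void *red_list[n] = { &rhs_0, /* ... */ &rhs_n_minus_1 };
//   int32_t res = __kmpc_nvptx_parallel_reduce_nowait(
//       gtid, n, sizeof(red_list), red_list,
//       shuffle_and_reduce_func, inter_warp_copy_func);
//   switch (res) {
//   case 1:  // this thread holds the fully reduced values
//     lhs_i = lhs_i REDUCE_OP rhs_i;  // one combiner per reduction clause
//     __kmpc_nvptx_end_reduce_nowait(gtid);
//     break;
//   default:
//     break;
//   }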