using namespace clang;
using namespace CodeGen;
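/// OpenMP runtime functions specific to the NVPTX device; each enumerator is
/// turned into a runtime function declaration by createNVPTXRuntimeFunction()
/// below.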
  OMPRTL_NVPTX__kmpc_kernel_init,
  OMPRTL_NVPTX__kmpc_kernel_deinit,
  OMPRTL_NVPTX__kmpc_spmd_kernel_init,
  OMPRTL_NVPTX__kmpc_spmd_kernel_deinit,
  OMPRTL_NVPTX__kmpc_kernel_prepare_parallel,
  OMPRTL_NVPTX__kmpc_kernel_parallel,
  OMPRTL_NVPTX__kmpc_kernel_end_parallel,
  OMPRTL_NVPTX__kmpc_serialized_parallel,
  OMPRTL_NVPTX__kmpc_end_serialized_parallel,
  OMPRTL_NVPTX__kmpc_shuffle_int32,
  OMPRTL_NVPTX__kmpc_shuffle_int64,
  OMPRTL_NVPTX__kmpc_parallel_reduce_nowait,
  OMPRTL_NVPTX__kmpc_teams_reduce_nowait,
  OMPRTL_NVPTX__kmpc_end_reduce_nowait
  llvm::BasicBlock *ContBlock = nullptr;
      : EnterCallee(EnterCallee), EnterArgs(EnterArgs), ExitCallee(ExitCallee),
    CGF.Builder.CreateCondBr(CallBool, ThenBlock, ContBlock);
class ExecutionModeRAII {
  ~ExecutionModeRAII() { Mode = SavedMode; }
  LaneIDMask = WarpSize - 1,
  GlobalMemoryAlignment = 256,
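/// Helpers that read NVPTX special registers through intrinsics: the warp
/// size, the id of the current thread, and the maximum number of threads in a
/// CTA, plus the barrier intrinsics used to synchronize the whole block or a
/// selected (multiple-of-warp-size) subset of its threads.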
  return Bld.CreateCall(
      llvm::Intrinsic::getDeclaration(
          &CGF.CGM.getModule(), llvm::Intrinsic::nvvm_read_ptx_sreg_warpsize),
  return Bld.CreateCall(
      llvm::Intrinsic::getDeclaration(
          &CGF.CGM.getModule(), llvm::Intrinsic::nvvm_read_ptx_sreg_tid_x),
  return Bld.CreateCall(
      llvm::Intrinsic::getDeclaration(
          &CGF.CGM.getModule(), llvm::Intrinsic::nvvm_read_ptx_sreg_ntid_x),
  Bld.CreateCall(llvm::Intrinsic::getDeclaration(
  llvm::Value *Args[] = {Bld.getInt32(ID), NumThreads};
  Bld.CreateCall(llvm::Intrinsic::getDeclaration(&CGF.CGM.getModule(),
                                                 llvm::Intrinsic::nvvm_barrier),
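/// getThreadLimit() returns the value of the thread_limit clause on the teams
/// directive; getMasterThreadID() computes the thread id of the OMP master
/// thread, i.e. the first thread of the last warp in the CTA.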
                                   bool IsInSpmdExecutionMode = false) {
  return IsInSpmdExecutionMode
  return Bld.CreateAnd(Bld.CreateSub(NumThreads, Bld.getInt32(1)),
                       Bld.CreateNot(Mask), "master_tid");
CGOpenMPRuntimeNVPTX::WorkerFunctionState::WorkerFunctionState(
    : WorkerFn(nullptr), CGFI(nullptr) {
  createWorkerFunction(CGM);
void CGOpenMPRuntimeNVPTX::WorkerFunctionState::createWorkerFunction(
bool CGOpenMPRuntimeNVPTX::isInSpmdExecutionMode() const {
  return CurrentExecutionMode == CGOpenMPRuntimeNVPTX::ExecutionMode::Spmd;
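/// Target codegen is specialized for two programming models: the 'generic'
/// fork-join model of OpenMP and the SPMD model; the directive kind selects
/// between them.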
  switch (DirectiveKind) {
  case OMPD_target_teams:
    return CGOpenMPRuntimeNVPTX::ExecutionMode::Generic;
  case OMPD_target_parallel:
    return CGOpenMPRuntimeNVPTX::ExecutionMode::Spmd;
    llvm_unreachable("Unsupported directive on NVPTX device.");
  llvm_unreachable("Unsupported directive on NVPTX device.");
/// Generic-mode kernel emission: a single master thread executes the target
/// region while the remaining threads of the CTA wait in the worker loop
/// emitted below for parallel work handed off by the master.
                                         StringRef ParentName,
                                         llvm::Function *&OutlinedFn,
                                         llvm::Constant *&OutlinedFnID,
  ExecutionModeRAII ModeRAII(CurrentExecutionMode,
                             CGOpenMPRuntimeNVPTX::ExecutionMode::Generic);
  EntryFunctionState EST;
  WorkerFunctionState WST(CGM);
    CGOpenMPRuntimeNVPTX::EntryFunctionState &EST;
    CGOpenMPRuntimeNVPTX::WorkerFunctionState &WST;
                 CGOpenMPRuntimeNVPTX::EntryFunctionState &EST,
                 CGOpenMPRuntimeNVPTX::WorkerFunctionState &WST)
        : RT(RT), EST(EST), WST(WST) {}
      RT.emitGenericEntryHeader(CGF, EST, WST);
      RT.emitGenericEntryFooter(CGF, EST);
  } Action(*this, EST, WST);
  IsOffloadEntry, CodeGen);
  emitWorkerFunction(WST);
  WST.WorkerFn->setName(OutlinedFn->getName() + "_worker");
void CGOpenMPRuntimeNVPTX::emitGenericEntryHeader(CodeGenFunction &CGF,
                                                  EntryFunctionState &EST,
                                                  WorkerFunctionState &WST) {
  Bld.CreateCondBr(IsWorker, WorkerBB, MasterCheckBB);
  Bld.CreateCondBr(IsMaster, MasterBB, EST.ExitBB);
void CGOpenMPRuntimeNVPTX::emitGenericEntryFooter(CodeGenFunction &CGF,
                                                  EntryFunctionState &EST) {
  llvm::BasicBlock *TerminateBB = CGF.createBasicBlock(".termination.notifier");
  EST.ExitBB = nullptr;
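/// SPMD-mode kernel emission: every thread of the CTA executes the target
/// region, so no worker loop is needed. setPropertyExecutionMode() records the
/// chosen mode in a weak global named <kernel>_exec_mode.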
                                      StringRef ParentName,
                                      llvm::Function *&OutlinedFn,
                                      llvm::Constant *&OutlinedFnID,
  ExecutionModeRAII ModeRAII(CurrentExecutionMode,
                             CGOpenMPRuntimeNVPTX::ExecutionMode::Spmd);
  EntryFunctionState EST;
    CGOpenMPRuntimeNVPTX::EntryFunctionState &EST;
                 CGOpenMPRuntimeNVPTX::EntryFunctionState &EST,
        : RT(RT), EST(EST), D(D) {}
      RT.emitSpmdEntryHeader(CGF, EST, D);
      RT.emitSpmdEntryFooter(CGF, EST);
  IsOffloadEntry, CodeGen);
void CGOpenMPRuntimeNVPTX::emitSpmdEntryHeader(
                                               EntryFunctionState &EST) {
  EST.ExitBB = nullptr;
  (void)new llvm::GlobalVariable(
      llvm::GlobalValue::WeakAnyLinkage,
      llvm::ConstantInt::get(CGM.Int8Ty, Mode), Name + Twine("_exec_mode"));
/// Worker state machine: each worker synchronizes at a CTA barrier, asks
/// __kmpc_kernel_parallel for a work function, terminates when that function
/// is null, and otherwise runs the matching outlined parallel region before
/// signalling completion through __kmpc_kernel_end_parallel.
void CGOpenMPRuntimeNVPTX::emitWorkerFunction(WorkerFunctionState &WST) {
  emitWorkerLoop(CGF, WST);
                                        WorkerFunctionState &WST) {
  llvm::BasicBlock *SelectWorkersBB = CGF.createBasicBlock(".select.workers");
  llvm::BasicBlock *TerminateBB = CGF.createBasicBlock(".terminate.parallel");
      createNVPTXRuntimeFunction(OMPRTL_NVPTX__kmpc_kernel_parallel), Args);
      Bld.CreateIsNull(Bld.CreateLoad(WorkFn), "should_terminate");
  Bld.CreateCondBr(ShouldTerminate, ExitBB, SelectWorkersBB);
      Bld.CreateIsNotNull(Bld.CreateLoad(ExecStatus), "is_active");
  Bld.CreateCondBr(IsActive, ExecuteBB, BarrierBB);
  for (auto *W : Work) {
        Bld.CreateICmpEQ(Bld.CreateLoad(WorkFn), ID, "work_match");
    Bld.CreateCondBr(WorkFnMatch, ExecuteFNBB, CheckNextBB);
    auto *Fn = cast<llvm::Function>(W);
    llvm::Value *FnArgs[] = {ZeroAddr.getPointer(), ZeroAddr.getPointer()};
      createNVPTXRuntimeFunction(OMPRTL_NVPTX__kmpc_kernel_end_parallel),
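/// Returns the declaration of the specified OpenMP runtime function for the
/// NVPTX device, creating it on first use.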
CGOpenMPRuntimeNVPTX::createNVPTXRuntimeFunction(unsigned Function) {
  llvm::Constant *RTLFn = nullptr;
  switch (static_cast<OpenMPRTLFunctionNVPTX>(Function)) {
  case OMPRTL_NVPTX__kmpc_kernel_init: {
    llvm::Type *TypeParams[] = {CGM.Int32Ty};
    llvm::FunctionType *FnTy =
        llvm::FunctionType::get(CGM.VoidTy, TypeParams, false);
  case OMPRTL_NVPTX__kmpc_kernel_deinit: {
    llvm::FunctionType *FnTy =
  case OMPRTL_NVPTX__kmpc_spmd_kernel_init: {
    llvm::FunctionType *FnTy =
        llvm::FunctionType::get(CGM.VoidTy, TypeParams, false);
  case OMPRTL_NVPTX__kmpc_spmd_kernel_deinit: {
    llvm::FunctionType *FnTy =
  case OMPRTL_NVPTX__kmpc_kernel_prepare_parallel: {
    llvm::Type *TypeParams[] = {CGM.Int8PtrTy};
    llvm::FunctionType *FnTy =
        llvm::FunctionType::get(CGM.VoidTy, TypeParams, false);
  case OMPRTL_NVPTX__kmpc_kernel_parallel: {
    llvm::FunctionType *FnTy =
        llvm::FunctionType::get(RetTy, TypeParams, false);
  case OMPRTL_NVPTX__kmpc_kernel_end_parallel: {
    llvm::FunctionType *FnTy =
  case OMPRTL_NVPTX__kmpc_serialized_parallel: {
    llvm::Type *TypeParams[] = {getIdentTyPointerTy(), CGM.Int32Ty};
    llvm::FunctionType *FnTy =
        llvm::FunctionType::get(CGM.VoidTy, TypeParams, false);
  case OMPRTL_NVPTX__kmpc_end_serialized_parallel: {
    llvm::Type *TypeParams[] = {getIdentTyPointerTy(), CGM.Int32Ty};
    llvm::FunctionType *FnTy =
        llvm::FunctionType::get(CGM.VoidTy, TypeParams, false);
  case OMPRTL_NVPTX__kmpc_shuffle_int32: {
    llvm::FunctionType *FnTy =
        llvm::FunctionType::get(CGM.Int32Ty, TypeParams, false);
  case OMPRTL_NVPTX__kmpc_shuffle_int64: {
    llvm::FunctionType *FnTy =
        llvm::FunctionType::get(CGM.Int64Ty, TypeParams, false);
  case OMPRTL_NVPTX__kmpc_parallel_reduce_nowait: {
    auto *ShuffleReduceFnTy =
        llvm::FunctionType::get(CGM.VoidTy, ShuffleReduceTypeParams,
    auto *InterWarpCopyFnTy =
        llvm::FunctionType::get(CGM.VoidTy, InterWarpCopyTypeParams,
    llvm::Type *TypeParams[] = {CGM.Int32Ty,
                                ShuffleReduceFnTy->getPointerTo(),
                                InterWarpCopyFnTy->getPointerTo()};
    llvm::FunctionType *FnTy =
        llvm::FunctionType::get(CGM.Int32Ty, TypeParams, false);
        FnTy, "__kmpc_nvptx_parallel_reduce_nowait");
  case OMPRTL_NVPTX__kmpc_teams_reduce_nowait: {
    auto *ShuffleReduceFnTy =
        llvm::FunctionType::get(CGM.VoidTy, ShuffleReduceTypeParams,
    auto *InterWarpCopyFnTy =
        llvm::FunctionType::get(CGM.VoidTy, InterWarpCopyTypeParams,
    auto *CopyToScratchpadFnTy =
        llvm::FunctionType::get(CGM.VoidTy, CopyToScratchpadTypeParams,
    llvm::Type *LoadReduceTypeParams[] = {
    auto *LoadReduceFnTy =
        llvm::FunctionType::get(CGM.VoidTy, LoadReduceTypeParams,
    llvm::Type *TypeParams[] = {CGM.Int32Ty,
                                ShuffleReduceFnTy->getPointerTo(),
                                InterWarpCopyFnTy->getPointerTo(),
                                CopyToScratchpadFnTy->getPointerTo(),
                                LoadReduceFnTy->getPointerTo()};
    llvm::FunctionType *FnTy =
        llvm::FunctionType::get(CGM.Int32Ty, TypeParams, false);
        FnTy, "__kmpc_nvptx_teams_reduce_nowait");
  case OMPRTL_NVPTX__kmpc_end_reduce_nowait: {
    llvm::Type *TypeParams[] = {CGM.Int32Ty};
    llvm::FunctionType *FnTy =
        llvm::FunctionType::get(CGM.VoidTy, TypeParams, false);
        FnTy, "__kmpc_nvptx_end_reduce_nowait");
/// Mark the offload entry as a CUDA kernel by attaching !nvvm.annotations
/// metadata of the form {F, "kernel", 1} to its module.
void CGOpenMPRuntimeNVPTX::createOffloadEntry(llvm::Constant *ID,
                                              llvm::Constant *Addr,
                                              uint64_t Size, int32_t) {
  auto *F = dyn_cast<llvm::Function>(Addr);
  llvm::Module *M = F->getParent();
  llvm::LLVMContext &Ctx = M->getContext();
  llvm::NamedMDNode *MD = M->getOrInsertNamedMetadata("nvvm.annotations");
  llvm::Metadata *MDVals[] = {
      llvm::ConstantAsMetadata::get(F), llvm::MDString::get(Ctx, "kernel"),
      llvm::ConstantAsMetadata::get(
          llvm::ConstantInt::get(llvm::Type::getInt32Ty(Ctx), 1))};
  MD->addOperand(llvm::MDNode::get(Ctx, MDVals));
    llvm::Function *&OutlinedFn, llvm::Constant *&OutlinedFnID,
  assert(!ParentName.empty() && "Invalid target region parent name!");
  case CGOpenMPRuntimeNVPTX::ExecutionMode::Generic:
    emitGenericKernel(D, ParentName, OutlinedFn, OutlinedFnID, IsOffloadEntry,
  case CGOpenMPRuntimeNVPTX::ExecutionMode::Spmd:
    emitSpmdKernel(D, ParentName, OutlinedFn, OutlinedFnID, IsOffloadEntry,
        "Unknown programming model for OpenMP directive on NVPTX target.");
  llvm_unreachable("OpenMP NVPTX can only handle device code.");
  if (isInSpmdExecutionMode())
  if (isInSpmdExecutionMode())
                                               const Expr *NumTeams,
                                               const Expr *ThreadLimit,
      InnermostKind, CodeGen);
      D, ThreadIDVar, InnermostKind, CodeGen);
  llvm::Function *OutlinedFun = cast<llvm::Function>(OutlinedFunVal);
  OutlinedFun->removeFnAttr(llvm::Attribute::NoInline);
  OutlinedFun->removeFnAttr(llvm::Attribute::OptimizeNone);
  OutlinedFun->addFnAttr(llvm::Attribute::AlwaysInline);
  OutlinedFnArgs.push_back(ZeroAddr.getPointer());
  OutlinedFnArgs.push_back(ZeroAddr.getPointer());
  OutlinedFnArgs.append(CapturedVars.begin(), CapturedVars.end());
  if (isInSpmdExecutionMode())
    emitSpmdParallelCall(CGF, Loc, OutlinedFn, CapturedVars, IfCond);
    emitGenericParallelCall(CGF, Loc, OutlinedFn, CapturedVars, IfCond);
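/// In Generic mode a parallel region is handed off to the worker threads (or
/// serialized with __kmpc_serialized_parallel when it cannot run in parallel);
/// in SPMD mode every thread simply calls the outlined function directly.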
void CGOpenMPRuntimeNVPTX::emitGenericParallelCall(
  llvm::Function *Fn = cast<llvm::Function>(OutlinedFn);
    OutlinedFnArgs.push_back(
        llvm::ConstantPointerNull::get(CGM.Int32Ty->getPointerTo()));
    OutlinedFnArgs.push_back(
        llvm::ConstantPointerNull::get(CGM.Int32Ty->getPointerTo()));
    OutlinedFnArgs.append(CapturedVars.begin(), CapturedVars.end());
    CGF.EmitCallOrInvoke(Fn, OutlinedFnArgs);
void CGOpenMPRuntimeNVPTX::emitSpmdParallelCall(
  OutlinedFnArgs.push_back(
      llvm::ConstantPointerNull::get(CGM.Int32Ty->getPointerTo()));
  OutlinedFnArgs.push_back(
      llvm::ConstantPointerNull::get(CGM.Int32Ty->getPointerTo()));
  OutlinedFnArgs.append(CapturedVars.begin(), CapturedVars.end());
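/// Creates a call to one of the two shuffle runtime functions
/// (__kmpc_shuffle_int32 or __kmpc_shuffle_int64) to copy a variable between
/// lanes in a warp.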
  assert(Size <= 8 && "Unsupported bitwidth in shuffle instruction.");
          ? OMPRTL_NVPTX__kmpc_shuffle_int32
          : OMPRTL_NVPTX__kmpc_shuffle_int64;
  auto *ElemCast = Bld.CreateSExtOrBitCast(Elem, CastTy);
      {ElemCast, Offset, WarpSize});
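/// Emits instructions to copy a Reduce list, which contains partially
/// aggregated values, in the direction given by the CopyAction:
/// RemoteLaneToThread, ThreadToScratchpad, or ScratchpadToThread.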
  auto &CGM = CGF.CGM;
  auto *RemoteLaneOffset = CopyOptions.RemoteLaneOffset;
  auto *ScratchpadIndex = CopyOptions.ScratchpadIndex;
  auto *ScratchpadWidth = CopyOptions.ScratchpadWidth;
  unsigned Size = Privates.size();
  for (auto &Private : Privates) {
    bool ShuffleInElement = false;
    bool UpdateDestListPtr = false;
    bool IncrScratchpadSrc = false;
    bool IncrScratchpadDest = false;
    case RemoteLaneToThread: {
          Address(SrcElementPtrPtr, C.getTypeAlignInChars(Private->getType()));
      DestElementPtrAddr =
          CGF.CreateMemTemp(Private->getType(), ".omp.reduction.element");
      ShuffleInElement = true;
      UpdateDestListPtr = true;
          Address(SrcElementPtrPtr, C.getTypeAlignInChars(Private->getType()));
      DestElementPtrAddr =
          Address(DestElementPtr, C.getTypeAlignInChars(Private->getType()));
    case ThreadToScratchpad: {
          Address(SrcElementPtrPtr, C.getTypeAlignInChars(Private->getType()));
      unsigned ElementSizeInChars =
          C.getTypeSizeInChars(Private->getType()).getQuantity();
      auto *CurrentOffset =
          Bld.CreateMul(llvm::ConstantInt::get(CGM.SizeTy, ElementSizeInChars),
      auto *ScratchPadElemAbsolutePtrVal =
          Bld.CreateAdd(DestBase.getPointer(), CurrentOffset);
      ScratchPadElemAbsolutePtrVal =
          Bld.CreateIntToPtr(ScratchPadElemAbsolutePtrVal, CGF.VoidPtrTy);
          Address(ScratchPadElemAbsolutePtrVal,
                  C.getTypeAlignInChars(Private->getType()));
      IncrScratchpadDest = true;
    case ScratchpadToThread: {
      unsigned ElementSizeInChars =
          C.getTypeSizeInChars(Private->getType()).getQuantity();
      auto *CurrentOffset =
          Bld.CreateMul(llvm::ConstantInt::get(CGM.SizeTy, ElementSizeInChars),
      auto *ScratchPadElemAbsolutePtrVal =
          Bld.CreateAdd(SrcBase.getPointer(), CurrentOffset);
      ScratchPadElemAbsolutePtrVal =
          Bld.CreateIntToPtr(ScratchPadElemAbsolutePtrVal, CGF.VoidPtrTy);
      SrcElementAddr = Address(ScratchPadElemAbsolutePtrVal,
                               C.getTypeAlignInChars(Private->getType()));
      IncrScratchpadSrc = true;
      DestElementPtrAddr =
          CGF.CreateMemTemp(Private->getType(), ".omp.reduction.element");
      UpdateDestListPtr = true;
    if (ShuffleInElement) {
    if (UpdateDestListPtr) {
          DestElementPtrAddr, false,
    if ((IncrScratchpadDest || IncrScratchpadSrc) && (Idx + 1 < Size)) {
      unsigned ElementSizeInChars =
          C.getTypeSizeInChars(Private->getType()).getQuantity();
      ScratchpadBasePtr = Bld.CreateAdd(
          Bld.CreateMul(ScratchpadWidth,
                        llvm::ConstantInt::get(CGM.SizeTy, ElementSizeInChars)));
      ScratchpadBasePtr = Bld.CreateSub(ScratchpadBasePtr,
                                        llvm::ConstantInt::get(CGM.SizeTy, 1));
      ScratchpadBasePtr = Bld.CreateSDiv(
          llvm::ConstantInt::get(CGM.SizeTy, GlobalMemoryAlignment));
      ScratchpadBasePtr = Bld.CreateAdd(ScratchpadBasePtr,
                                        llvm::ConstantInt::get(CGM.SizeTy, 1));
      ScratchpadBasePtr = Bld.CreateMul(
          llvm::ConstantInt::get(CGM.SizeTy, GlobalMemoryAlignment));
      if (IncrScratchpadDest)
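/// Emits a helper that loads data from the scratchpad array in global memory
/// and (optionally) reduces it with the contents of a thread's Reduce list.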
  auto Int32Ty = C.getIntTypeForBitwidth(32, true);
  Args.push_back(&ReduceListArg);
  Args.push_back(&ScratchPadArg);
  Args.push_back(&IndexArg);
  Args.push_back(&WidthArg);
  Args.push_back(&ShouldReduceArg);
      "_omp_reduction_load_and_reduce", &CGM.getModule());
      Bld.CreatePtrToInt(ScratchPadBase, CGM.SizeTy);
      CGF.CreateMemTemp(ReductionArrayTy, ".omp.reduction.remote_red_list");
                        SrcDataAddr, RemoteReduceList,
  auto CondReduce = Bld.CreateICmpEQ(ShouldReduceVal, Bld.getInt32(1));
  Bld.CreateCondBr(CondReduce, ThenBB, ElseBB);
      ReduceListAddr.getPointer(), CGF.VoidPtrTy);
      RemoteReduceList.getPointer(), CGF.VoidPtrTy);
  Bld.CreateBr(MergeBB);
                        RemoteReduceList, ReduceListAddr);
  Bld.CreateBr(MergeBB);
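/// Emits a helper that stores reduced data from the team master to a
/// scratchpad array in global memory.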
  auto Int32Ty = C.getIntTypeForBitwidth(32, true);
  Args.push_back(&ReduceListArg);
  Args.push_back(&ScratchPadArg);
  Args.push_back(&IndexArg);
  Args.push_back(&WidthArg);
      "_omp_reduction_copy_to_scratchpad", &CGM.getModule());
      Bld.CreatePtrToInt(ScratchPadBase, CGM.SizeTy);
                        SrcDataAddr, DestDataAddr,
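/// Emits a helper that gathers the Reduce lists from the first lane of every
/// active warp to lanes in the first warp, staging the data through a
/// transfer array in shared memory.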
      C.getIntTypeForBitwidth(32, true),
  Args.push_back(&ReduceListArg);
  Args.push_back(&NumWarpsArg);
      "_omp_reduction_inter_warp_copy_func", &CGM.getModule());
  const char *TransferMediumName =
      "__openmp_nvptx_data_transfer_temporary_storage";
  llvm::GlobalVariable *TransferMedium =
      M.getGlobalVariable(TransferMediumName);
  if (!TransferMedium) {
    auto *Ty = llvm::ArrayType::get(CGM.Int64Ty, WarpSize);
    TransferMedium = new llvm::GlobalVariable(
        false, llvm::GlobalVariable::CommonLinkage,
        llvm::Constant::getNullValue(Ty), TransferMediumName,
        nullptr, llvm::GlobalVariable::NotThreadLocal,
        SharedAddressSpace);
  for (auto &Private : Privates) {
        Bld.CreateICmpEQ(LaneID, Bld.getInt32(0), "warp_master");
    Bld.CreateCondBr(IsWarpMaster, ThenBB, ElseBB);
        Address(ElemPtrPtr, C.getTypeAlignInChars(Private->getType()));
    llvm::Value *MediumPtrVal = Bld.CreateInBoundsGEP(
        TransferMedium, {llvm::Constant::getNullValue(CGM.Int64Ty), WarpID});
    Address MediumPtr(MediumPtrVal, C.getTypeAlignInChars(Private->getType()));
    Bld.CreateBr(MergeBB);
    Bld.CreateBr(MergeBB);
    auto *NumActiveThreads = Bld.CreateNSWMul(
    auto IsActiveThread =
        Bld.CreateICmpULT(ThreadID, NumWarpsVal, "is_active_thread");
    Bld.CreateCondBr(IsActiveThread, W0ThenBB, W0ElseBB);
    llvm::Value *SrcMediumPtrVal = Bld.CreateInBoundsGEP(
        TransferMedium, {llvm::Constant::getNullValue(CGM.Int64Ty), ThreadID});
    Address SrcMediumPtr(SrcMediumPtrVal,
                         C.getTypeAlignInChars(Private->getType()));
        Address(TargetElemPtrVal, C.getTypeAlignInChars(Private->getType()));
    Bld.CreateBr(W0MergeBB);
    Bld.CreateBr(W0MergeBB);
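/// Emits a helper that reduces data across two OpenMP threads (lanes) in the
/// same warp; whether a lane reduces or merely copies the remote Reduce list
/// depends on the algorithm version and the remote lane offset checked below.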
  Args.push_back(&ReduceListArg);
  Args.push_back(&LaneIDArg);
  Args.push_back(&RemoteLaneOffsetArg);
  Args.push_back(&AlgoVerArg);
      "_omp_reduction_shuffle_and_reduce_func", &CGM.getModule());
      CGF.CreateMemTemp(ReductionArrayTy, ".omp.reduction.remote_reduce_list");
                        LocalReduceList, RemoteReduceList,
                        {RemoteLaneOffsetArgVal,
  auto CondAlgo0 = Bld.CreateICmpEQ(AlgoVerArgVal, Bld.getInt16(0));
  auto Algo1 = Bld.CreateICmpEQ(AlgoVerArgVal, Bld.getInt16(1));
  auto CondAlgo1 = Bld.CreateAnd(
      Algo1, Bld.CreateICmpULT(LaneIDArgVal, RemoteLaneOffsetArgVal));
  auto Algo2 = Bld.CreateICmpEQ(AlgoVerArgVal, Bld.getInt16(2));
  auto CondAlgo2 = Bld.CreateAnd(
      Bld.CreateICmpEQ(Bld.CreateAnd(LaneIDArgVal, Bld.getInt16(1)),
  CondAlgo2 = Bld.CreateAnd(
      CondAlgo2, Bld.CreateICmpSGT(RemoteLaneOffsetArgVal, Bld.getInt16(0)));
  auto CondReduce = Bld.CreateOr(CondAlgo0, CondAlgo1);
  CondReduce = Bld.CreateOr(CondReduce, CondAlgo2);
  Bld.CreateCondBr(CondReduce, ThenBB, ElseBB);
      LocalReduceList.getPointer(), CGF.VoidPtrTy);
  Bld.CreateBr(MergeBB);
  Bld.CreateBr(MergeBB);
  Algo1 = Bld.CreateICmpEQ(AlgoVerArgVal, Bld.getInt16(1));
  auto CondCopy = Bld.CreateAnd(
      Algo1, Bld.CreateICmpUGE(LaneIDArgVal, RemoteLaneOffsetArgVal));
  Bld.CreateCondBr(CondCopy, CpyThenBB, CpyElseBB);
                        RemoteReduceList, LocalReduceList);
  Bld.CreateBr(CpyMergeBB);
  Bld.CreateBr(CpyMergeBB);
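/// Emits the code for a reduction clause: the private copies are packed into a
/// Reduce list array, __kmpc_nvptx_parallel_reduce_nowait or
/// __kmpc_nvptx_teams_reduce_nowait is called with the shuffle/copy helpers
/// emitted above, and the final combination runs in the switch case below.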
  assert((TeamsReduction || ParallelReduction) &&
         "Invalid reduction selection in emitReduction.");
  auto Size = RHSExprs.size();
  for (auto *E : Privates) {
    if (E->getType()->isVariablyModifiedType())
  llvm::APInt ArraySize(32, Size);
      CGF.CreateMemTemp(ReductionArrayTy, ".omp.reduction.red_list");
  auto IPriv = Privates.begin();
  for (unsigned I = 0, E = RHSExprs.size(); I < E; ++I, ++IPriv, ++Idx) {
    if ((*IPriv)->getType()->isVariablyModifiedType()) {
      LHSExprs, RHSExprs, ReductionOps);
  auto *ReductionArrayTySize = CGF.getTypeSize(ReductionArrayTy);
      CGM, Privates, ReductionArrayTy, ReductionFn);
  auto *InterWarpCopyFn =
  if (ParallelReduction) {
                           CGF.Builder.getInt32(RHSExprs.size()),
                           ReductionArrayTySize,
  if (TeamsReduction) {
    auto *ScratchPadCopyFn =
        CGM, Privates, ReductionArrayTy, ReductionFn);
                           CGF.Builder.getInt32(RHSExprs.size()),
                           ReductionArrayTySize,
  auto *SwInst = CGF.Builder.CreateSwitch(Res, DefaultBB, 1);
  SwInst->addCase(CGF.Builder.getInt32(1), Case1BB);
  auto &&CodeGen = [&Privates, &LHSExprs, &RHSExprs, &ReductionOps,
    auto IPriv = Privates.begin();
    auto ILHS = LHSExprs.begin();
    auto IRHS = RHSExprs.begin();
    for (auto *E : ReductionOps) {
                                      cast<DeclRefExpr>(*IRHS));
    CGF.EmitBranch(DefaultBB);
    CGF.EmitBlock(DefaultBB, true);