//===- OpenMPIRBuilder.cpp - Builder for LLVM-IR for OpenMP directives ----===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
/// \file
///
/// This file implements the OpenMPIRBuilder class, which is used as a
/// convenient way to create LLVM instructions for OpenMP directives.
///
//===----------------------------------------------------------------------===//

#include "llvm/Frontend/OpenMP/OMPIRBuilder.h"
#include "llvm/ADT/Sequence.h"
#include "llvm/ADT/SmallSet.h"
#include "llvm/ADT/SmallString.h"
#include "llvm/ADT/StringRef.h"
#include "llvm/Analysis/TargetLibraryInfo.h"
#include "llvm/Frontend/OpenMP/OMPGridValues.h"
#include "llvm/IR/Attributes.h"
#include "llvm/IR/BasicBlock.h"
#include "llvm/IR/CFG.h"
#include "llvm/IR/CallingConv.h"
#include "llvm/IR/Constant.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/DIBuilder.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/LLVMContext.h"
#include "llvm/IR/MDBuilder.h"
#include "llvm/IR/Metadata.h"
#include "llvm/IR/PassManager.h"
#include "llvm/IR/Value.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/TargetParser/Triple.h"
#include "llvm/Transforms/Utils/BasicBlockUtils.h"
#include "llvm/Transforms/Utils/CodeExtractor.h"

#include <cstdint>
#include <optional>

#define DEBUG_TYPE "openmp-ir-builder"

using namespace llvm;
using namespace omp;

static cl::opt<bool>
    OptimisticAttributes("openmp-ir-builder-optimistic-attributes", cl::Hidden,
                         cl::desc("Use optimistic attributes describing "
                                  "'as-if' properties of runtime calls."),
                         cl::init(false));

static cl::opt<double> UnrollThresholdFactor(
    "openmp-ir-builder-unroll-threshold-factor", cl::Hidden,
    cl::desc("Factor for the unroll threshold to account for code "
             "simplifications still taking place"),
    cl::init(1.5));

#ifndef NDEBUG
/// Return whether IP1 and IP2 are ambiguous, i.e. whether inserting
/// instructions at position IP1 may change the meaning of IP2 or vice versa.
/// This is because an InsertPoint stores the instruction before which new
/// instructions are inserted. For instance, if both point to the same
/// instruction, two IRBuilders alternately creating instructions will cause
/// the instructions to be interleaved.
static bool isConflictIP(IRBuilder<>::InsertPoint IP1,
                         IRBuilder<>::InsertPoint IP2) {
  if (!IP1.isSet() || !IP2.isSet())
    return false;
  return IP1.getBlock() == IP2.getBlock() && IP1.getPoint() == IP2.getPoint();
}
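
// For illustration: two builders whose saved insert points alias conflict.
//   IRBuilder<> B1(BB, It), B2(BB, It);
//   B1.CreateCall(F); // inserted before It
//   B2.CreateCall(G); // also inserted before It
// Alternating between B1 and B2 interleaves the created calls instead of
// keeping each builder's instructions grouped together.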

static bool isValidWorkshareLoopScheduleType(OMPScheduleType SchedType) {
  // Valid ordered/unordered and base algorithm combinations.
  switch (SchedType & ~OMPScheduleType::MonotonicityMask) {
  case OMPScheduleType::UnorderedStaticChunked:
  case OMPScheduleType::UnorderedStatic:
  case OMPScheduleType::UnorderedDynamicChunked:
  case OMPScheduleType::UnorderedGuidedChunked:
  case OMPScheduleType::UnorderedRuntime:
  case OMPScheduleType::UnorderedAuto:
  case OMPScheduleType::UnorderedTrapezoidal:
  case OMPScheduleType::UnorderedGreedy:
  case OMPScheduleType::UnorderedBalanced:
  case OMPScheduleType::UnorderedGuidedIterativeChunked:
  case OMPScheduleType::UnorderedGuidedAnalyticalChunked:
  case OMPScheduleType::UnorderedSteal:
  case OMPScheduleType::UnorderedStaticBalancedChunked:
  case OMPScheduleType::UnorderedGuidedSimd:
  case OMPScheduleType::UnorderedRuntimeSimd:
  case OMPScheduleType::OrderedStaticChunked:
  case OMPScheduleType::OrderedStatic:
  case OMPScheduleType::OrderedDynamicChunked:
  case OMPScheduleType::OrderedGuidedChunked:
  case OMPScheduleType::OrderedRuntime:
  case OMPScheduleType::OrderedAuto:
  case OMPScheduleType::OrderdTrapezoidal:
  case OMPScheduleType::NomergeUnorderedStaticChunked:
  case OMPScheduleType::NomergeUnorderedStatic:
  case OMPScheduleType::NomergeUnorderedDynamicChunked:
  case OMPScheduleType::NomergeUnorderedGuidedChunked:
  case OMPScheduleType::NomergeUnorderedRuntime:
  case OMPScheduleType::NomergeUnorderedAuto:
  case OMPScheduleType::NomergeUnorderedTrapezoidal:
  case OMPScheduleType::NomergeUnorderedGreedy:
  case OMPScheduleType::NomergeUnorderedBalanced:
  case OMPScheduleType::NomergeUnorderedGuidedIterativeChunked:
  case OMPScheduleType::NomergeUnorderedGuidedAnalyticalChunked:
  case OMPScheduleType::NomergeUnorderedSteal:
  case OMPScheduleType::NomergeOrderedStaticChunked:
  case OMPScheduleType::NomergeOrderedStatic:
  case OMPScheduleType::NomergeOrderedDynamicChunked:
  case OMPScheduleType::NomergeOrderedGuidedChunked:
  case OMPScheduleType::NomergeOrderedRuntime:
  case OMPScheduleType::NomergeOrderedAuto:
  case OMPScheduleType::NomergeOrderedTrapezoidal:
    break;
  default:
    return false;
  }

  // Must not set both monotonicity modifiers at the same time.
  OMPScheduleType MonotonicityFlags =
      SchedType & OMPScheduleType::MonotonicityMask;
  if (MonotonicityFlags == OMPScheduleType::MonotonicityMask)
    return false;

  return true;
}
#endif

static const omp::GV &getGridValue(const Triple &T, Function *Kernel) {
  if (T.isAMDGPU()) {
    StringRef Features =
        Kernel->getFnAttribute("target-features").getValueAsString();
    if (Features.count("+wavefrontsize64"))
      return omp::getAMDGPUGridValues<64>();
    return omp::getAMDGPUGridValues<32>();
  }
  if (T.isNVPTX())
    return omp::NVPTXGridValues;
  llvm_unreachable("No grid value available for this architecture!");
}

/// Determine which scheduling algorithm to use from the schedule clause
/// arguments.
static OMPScheduleType
getOpenMPBaseScheduleType(llvm::omp::ScheduleKind ClauseKind, bool HasChunks,
                          bool HasSimdModifier) {
  // Currently, the default schedule is static.
  switch (ClauseKind) {
  case OMP_SCHEDULE_Default:
  case OMP_SCHEDULE_Static:
    return HasChunks ? OMPScheduleType::BaseStaticChunked
                     : OMPScheduleType::BaseStatic;
  case OMP_SCHEDULE_Dynamic:
    return OMPScheduleType::BaseDynamicChunked;
  case OMP_SCHEDULE_Guided:
    return HasSimdModifier ? OMPScheduleType::BaseGuidedSimd
                           : OMPScheduleType::BaseGuidedChunked;
  case OMP_SCHEDULE_Auto:
    return OMPScheduleType::BaseAuto;
  case OMP_SCHEDULE_Runtime:
    return HasSimdModifier ? OMPScheduleType::BaseRuntimeSimd
                           : OMPScheduleType::BaseRuntime;
  }
  llvm_unreachable("unhandled schedule clause argument");
}

/// Adds ordering modifier flags to schedule type.
static OMPScheduleType
getOpenMPOrderingScheduleType(OMPScheduleType BaseScheduleType,
                              bool HasOrderedClause) {
  assert((BaseScheduleType & OMPScheduleType::ModifierMask) ==
             OMPScheduleType::None &&
         "Must not have ordering nor monotonicity flags already set");

  OMPScheduleType OrderingModifier = HasOrderedClause
                                         ? OMPScheduleType::ModifierOrdered
                                         : OMPScheduleType::ModifierUnordered;
  OMPScheduleType OrderingScheduleType = BaseScheduleType | OrderingModifier;

  // Unsupported combinations
  if (OrderingScheduleType ==
      (OMPScheduleType::BaseGuidedSimd | OMPScheduleType::ModifierOrdered))
    return OMPScheduleType::OrderedGuidedChunked;
  else if (OrderingScheduleType == (OMPScheduleType::BaseRuntimeSimd |
                                    OMPScheduleType::ModifierOrdered))
    return OMPScheduleType::OrderedRuntime;

  return OrderingScheduleType;
}

/// Adds monotonicity modifier flags to schedule type.
static OMPScheduleType
getOpenMPMonotonicityScheduleType(OMPScheduleType ScheduleType,
                                  bool HasSimdModifier, bool HasMonotonic,
                                  bool HasNonmonotonic, bool HasOrderedClause) {
  assert((ScheduleType & OMPScheduleType::MonotonicityMask) ==
             OMPScheduleType::None &&
         "Must not have monotonicity flags already set");
  assert((!HasMonotonic || !HasNonmonotonic) &&
         "Monotonic and Nonmonotonic are contradicting each other");

  if (HasMonotonic) {
    return ScheduleType | OMPScheduleType::ModifierMonotonic;
  } else if (HasNonmonotonic) {
    return ScheduleType | OMPScheduleType::ModifierNonmonotonic;
  } else {
    // OpenMP 5.1, 2.11.4 Worksharing-Loop Construct, Description.
    // If the static schedule kind is specified or if the ordered clause is
    // specified, and if the nonmonotonic modifier is not specified, the
    // effect is as if the monotonic modifier is specified. Otherwise, unless
    // the monotonic modifier is specified, the effect is as if the
    // nonmonotonic modifier is specified.
    OMPScheduleType BaseScheduleType =
        ScheduleType & ~OMPScheduleType::ModifierMask;
    if ((BaseScheduleType == OMPScheduleType::BaseStatic) ||
        (BaseScheduleType == OMPScheduleType::BaseStaticChunked) ||
        HasOrderedClause) {
      // Monotonic is the default in the OpenMP runtime library, so there is
      // no need to set the flag.
      return ScheduleType;
    } else {
      return ScheduleType | OMPScheduleType::ModifierNonmonotonic;
    }
  }
}

/// Determine the schedule type using schedule and ordering clause arguments.
static OMPScheduleType
computeOpenMPScheduleType(ScheduleKind ClauseKind, bool HasChunks,
                          bool HasSimdModifier, bool HasMonotonicModifier,
                          bool HasNonmonotonicModifier, bool HasOrderedClause) {
  OMPScheduleType BaseSchedule =
      getOpenMPBaseScheduleType(ClauseKind, HasChunks, HasSimdModifier);
  OMPScheduleType OrderedSchedule =
      getOpenMPOrderingScheduleType(BaseSchedule, HasOrderedClause);
  OMPScheduleType Result = getOpenMPMonotonicityScheduleType(
      OrderedSchedule, HasSimdModifier, HasMonotonicModifier,
      HasNonmonotonicModifier, HasOrderedClause);

  assert(isValidWorkshareLoopScheduleType(Result));
  return Result;
}
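
// Example: `#pragma omp for schedule(dynamic, 4)` with no ordered clause and
// no monotonicity modifier yields BaseDynamicChunked, gains ModifierUnordered
// (UnorderedDynamicChunked), and then, per the OpenMP 5.1 rule cited above,
// ModifierNonmonotonic.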

/// Emit an implicit cast to convert \p XRead to type of variable \p V
static llvm::Value *emitImplicitCast(IRBuilder<> &Builder, llvm::Value *XRead,
                                     llvm::Value *V) {
  // TODO: Add this functionality to the `AtomicInfo` interface
  llvm::Type *XReadType = XRead->getType();
  llvm::Type *VType = V->getType();
  if (llvm::AllocaInst *vAlloca = dyn_cast<llvm::AllocaInst>(V))
    VType = vAlloca->getAllocatedType();

  if (XReadType->isStructTy() && VType->isStructTy())
    // No need to extract or convert. A direct
    // `store` will suffice.
    return XRead;

  if (XReadType->isStructTy())
    XRead = Builder.CreateExtractValue(XRead, /*Idxs=*/0);
  if (VType->isIntegerTy() && XReadType->isFloatingPointTy())
    XRead = Builder.CreateFPToSI(XRead, VType);
  else if (VType->isFloatingPointTy() && XReadType->isIntegerTy())
    XRead = Builder.CreateSIToFP(XRead, VType);
  else if (VType->isIntegerTy() && XReadType->isIntegerTy())
    XRead = Builder.CreateIntCast(XRead, VType, true);
  else if (VType->isFloatingPointTy() && XReadType->isFloatingPointTy())
    XRead = Builder.CreateFPCast(XRead, VType);
  return XRead;
}
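
// Example: if XRead is a float read from the atomic location and V is an
// alloca of i32, the value is converted with CreateFPToSI before the caller
// stores it back; matching struct types are stored directly with no cast.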

/// Make \p Source branch to \p Target.
///
/// Handles two situations:
/// * \p Source already has an unconditional branch.
/// * \p Source is a degenerate block (no terminator because the BB is
///       the current head of the IR construction).
static void redirectTo(BasicBlock *Source, BasicBlock *Target, DebugLoc DL) {
  if (Instruction *Term = Source->getTerminator()) {
    auto *Br = cast<BranchInst>(Term);
    assert(!Br->isConditional() &&
           "BB's terminator must be an unconditional branch (or degenerate)");
    BasicBlock *Succ = Br->getSuccessor(0);
    Succ->removePredecessor(Source, /*KeepOneInputPHIs=*/true);
    Br->setSuccessor(0, Target);
    return;
  }

  auto *NewBr = BranchInst::Create(Target, Source);
  NewBr->setDebugLoc(DL);
}

void llvm::spliceBB(IRBuilderBase::InsertPoint IP, BasicBlock *New,
                    bool CreateBranch) {
  assert(New->getFirstInsertionPt() == New->begin() &&
         "Target BB must not have PHI nodes");

  // Move instructions to new block.
  BasicBlock *Old = IP.getBlock();
  New->splice(New->begin(), Old, IP.getPoint(), Old->end());

  if (CreateBranch)
    BranchInst::Create(New, Old);
}

void llvm::spliceBB(IRBuilder<> &Builder, BasicBlock *New, bool CreateBranch) {
  DebugLoc DebugLoc = Builder.getCurrentDebugLocation();
  BasicBlock *Old = Builder.GetInsertBlock();

  spliceBB(Builder.saveIP(), New, CreateBranch);
  if (CreateBranch)
    Builder.SetInsertPoint(Old->getTerminator());
  else
    Builder.SetInsertPoint(Old);

  // SetInsertPoint also updates the Builder's debug location, but we want to
  // keep the one the Builder was configured to use.
  Builder.SetCurrentDebugLocation(DebugLoc);
}

BasicBlock *llvm::splitBB(IRBuilderBase::InsertPoint IP, bool CreateBranch,
                          llvm::Twine Name) {
  BasicBlock *Old = IP.getBlock();
  BasicBlock *New = BasicBlock::Create(
      Old->getContext(), Name.isTriviallyEmpty() ? Old->getName() : Name,
      Old->getParent(), Old->getNextNode());
  spliceBB(IP, New, CreateBranch);
  New->replaceSuccessorsPhiUsesWith(Old, New);
  return New;
}

BasicBlock *llvm::splitBB(IRBuilderBase &Builder, bool CreateBranch,
                          llvm::Twine Name) {
  DebugLoc DebugLoc = Builder.getCurrentDebugLocation();
  BasicBlock *New = splitBB(Builder.saveIP(), CreateBranch, Name);
  if (CreateBranch)
    Builder.SetInsertPoint(Builder.GetInsertBlock()->getTerminator());
  else
    Builder.SetInsertPoint(Builder.GetInsertBlock());
  // SetInsertPoint also updates the Builder's debug location, but we want to
  // keep the one the Builder was configured to use.
  Builder.SetCurrentDebugLocation(DebugLoc);
  return New;
}

BasicBlock *llvm::splitBB(IRBuilder<> &Builder, bool CreateBranch,
                          llvm::Twine Name) {
  DebugLoc DebugLoc = Builder.getCurrentDebugLocation();
  BasicBlock *New = splitBB(Builder.saveIP(), CreateBranch, Name);
  if (CreateBranch)
    Builder.SetInsertPoint(Builder.GetInsertBlock()->getTerminator());
  else
    Builder.SetInsertPoint(Builder.GetInsertBlock());
  // SetInsertPoint also updates the Builder's debug location, but we want to
  // keep the one the Builder was configured to use.
  Builder.SetCurrentDebugLocation(DebugLoc);
  return New;
}

BasicBlock *llvm::splitBBWithSuffix(IRBuilderBase &Builder, bool CreateBranch,
                                    llvm::Twine Suffix) {
  BasicBlock *Old = Builder.GetInsertBlock();
  return splitBB(Builder, CreateBranch, Old->getName() + Suffix);
}
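
// Typical use: split at the current insertion point while keeping the builder
// in the old block, e.g.
//   BasicBlock *ContBB =
//       splitBBWithSuffix(Builder, /*CreateBranch=*/true, ".cont");
// With CreateBranch set, the builder is left just before the newly created
// unconditional branch, so more code can still be appended to the old block.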

// This function creates a fake integer value and a fake use for the integer
// value. It returns the fake value created. This is useful in modeling the
// extra arguments to the outlined functions.
static Value *createFakeIntVal(IRBuilderBase &Builder,
                               OpenMPIRBuilder::InsertPointTy OuterAllocaIP,
                               llvm::SmallVectorImpl<Instruction *> &ToBeDeleted,
                               OpenMPIRBuilder::InsertPointTy InnerAllocaIP,
                               const Twine &Name = "", bool AsPtr = true) {
  Builder.restoreIP(OuterAllocaIP);
  Instruction *FakeVal;
  AllocaInst *FakeValAddr =
      Builder.CreateAlloca(Builder.getInt32Ty(), nullptr, Name + ".addr");
  ToBeDeleted.push_back(FakeValAddr);

  if (AsPtr) {
    FakeVal = FakeValAddr;
  } else {
    FakeVal =
        Builder.CreateLoad(Builder.getInt32Ty(), FakeValAddr, Name + ".val");
    ToBeDeleted.push_back(FakeVal);
  }

  // Generate a fake use of this value
  Builder.restoreIP(InnerAllocaIP);
  Instruction *UseFakeVal;
  if (AsPtr) {
    UseFakeVal =
        Builder.CreateLoad(Builder.getInt32Ty(), FakeVal, Name + ".use");
  } else {
    UseFakeVal =
        cast<BinaryOperator>(Builder.CreateAdd(FakeVal, Builder.getInt32(10)));
  }
  ToBeDeleted.push_back(UseFakeVal);
  return FakeVal;
}

//===----------------------------------------------------------------------===//
// OpenMPIRBuilderConfig
//===----------------------------------------------------------------------===//

namespace {
LLVM_ENABLE_BITMASK_ENUMS_IN_NAMESPACE();
/// Values for bit flags for marking which requires clauses have been used.
enum OpenMPOffloadingRequiresDirFlags {
  /// flag undefined.
  OMP_REQ_UNDEFINED = 0x000,
  /// no requires directive present.
  OMP_REQ_NONE = 0x001,
  /// reverse_offload clause.
  OMP_REQ_REVERSE_OFFLOAD = 0x002,
  /// unified_address clause.
  OMP_REQ_UNIFIED_ADDRESS = 0x004,
  /// unified_shared_memory clause.
  OMP_REQ_UNIFIED_SHARED_MEMORY = 0x008,
  /// dynamic_allocators clause.
  OMP_REQ_DYNAMIC_ALLOCATORS = 0x010,
  LLVM_MARK_AS_BITMASK_ENUM(/*LargestValue=*/OMP_REQ_DYNAMIC_ALLOCATORS)
};

} // anonymous namespace
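
// For example, `#pragma omp requires reverse_offload unified_shared_memory`
// results in RequiresFlags == (OMP_REQ_REVERSE_OFFLOAD |
// OMP_REQ_UNIFIED_SHARED_MEMORY), i.e. 0x00A, via the setters below.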

OpenMPIRBuilderConfig::OpenMPIRBuilderConfig()
    : RequiresFlags(OMP_REQ_UNDEFINED) {}

OpenMPIRBuilderConfig::OpenMPIRBuilderConfig(
    bool IsTargetDevice, bool IsGPU, bool OpenMPOffloadMandatory,
    bool HasRequiresReverseOffload, bool HasRequiresUnifiedAddress,
    bool HasRequiresUnifiedSharedMemory, bool HasRequiresDynamicAllocators)
    : IsTargetDevice(IsTargetDevice), IsGPU(IsGPU),
      OpenMPOffloadMandatory(OpenMPOffloadMandatory),
      RequiresFlags(OMP_REQ_UNDEFINED) {
  if (HasRequiresReverseOffload)
    RequiresFlags |= OMP_REQ_REVERSE_OFFLOAD;
  if (HasRequiresUnifiedAddress)
    RequiresFlags |= OMP_REQ_UNIFIED_ADDRESS;
  if (HasRequiresUnifiedSharedMemory)
    RequiresFlags |= OMP_REQ_UNIFIED_SHARED_MEMORY;
  if (HasRequiresDynamicAllocators)
    RequiresFlags |= OMP_REQ_DYNAMIC_ALLOCATORS;
}

bool OpenMPIRBuilderConfig::hasRequiresReverseOffload() const {
  return RequiresFlags & OMP_REQ_REVERSE_OFFLOAD;
}

bool OpenMPIRBuilderConfig::hasRequiresUnifiedAddress() const {
  return RequiresFlags & OMP_REQ_UNIFIED_ADDRESS;
}

bool OpenMPIRBuilderConfig::hasRequiresUnifiedSharedMemory() const {
  return RequiresFlags & OMP_REQ_UNIFIED_SHARED_MEMORY;
}

bool OpenMPIRBuilderConfig::hasRequiresDynamicAllocators() const {
  return RequiresFlags & OMP_REQ_DYNAMIC_ALLOCATORS;
}

int64_t OpenMPIRBuilderConfig::getRequiresFlags() const {
  return hasRequiresFlags() ? RequiresFlags
                            : static_cast<int64_t>(OMP_REQ_NONE);
}

void OpenMPIRBuilderConfig::setHasRequiresReverseOffload(bool Value) {
  if (Value)
    RequiresFlags |= OMP_REQ_REVERSE_OFFLOAD;
  else
    RequiresFlags &= ~OMP_REQ_REVERSE_OFFLOAD;
}

void OpenMPIRBuilderConfig::setHasRequiresUnifiedAddress(bool Value) {
  if (Value)
    RequiresFlags |= OMP_REQ_UNIFIED_ADDRESS;
  else
    RequiresFlags &= ~OMP_REQ_UNIFIED_ADDRESS;
}

void OpenMPIRBuilderConfig::setHasRequiresUnifiedSharedMemory(bool Value) {
  if (Value)
    RequiresFlags |= OMP_REQ_UNIFIED_SHARED_MEMORY;
  else
    RequiresFlags &= ~OMP_REQ_UNIFIED_SHARED_MEMORY;
}

void OpenMPIRBuilderConfig::setHasRequiresDynamicAllocators(bool Value) {
  if (Value)
    RequiresFlags |= OMP_REQ_DYNAMIC_ALLOCATORS;
  else
    RequiresFlags &= ~OMP_REQ_DYNAMIC_ALLOCATORS;
}

//===----------------------------------------------------------------------===//
// OpenMPIRBuilder
//===----------------------------------------------------------------------===//
521
523 IRBuilderBase &Builder,
524 SmallVector<Value *> &ArgsVector) {
526 Value *PointerNum = Builder.getInt32(KernelArgs.NumTargetItems);
527 auto Int32Ty = Type::getInt32Ty(Builder.getContext());
528 constexpr const size_t MaxDim = 3;
529 Value *ZeroArray = Constant::getNullValue(ArrayType::get(Int32Ty, MaxDim));
530 Value *Flags = Builder.getInt64(KernelArgs.HasNoWait);
531
532 assert(!KernelArgs.NumTeams.empty() && !KernelArgs.NumThreads.empty());
533
534 Value *NumTeams3D =
535 Builder.CreateInsertValue(ZeroArray, KernelArgs.NumTeams[0], {0});
536 Value *NumThreads3D =
537 Builder.CreateInsertValue(ZeroArray, KernelArgs.NumThreads[0], {0});
538 for (unsigned I :
539 seq<unsigned>(1, std::min(KernelArgs.NumTeams.size(), MaxDim)))
540 NumTeams3D =
541 Builder.CreateInsertValue(NumTeams3D, KernelArgs.NumTeams[I], {I});
542 for (unsigned I :
543 seq<unsigned>(1, std::min(KernelArgs.NumThreads.size(), MaxDim)))
544 NumThreads3D =
545 Builder.CreateInsertValue(NumThreads3D, KernelArgs.NumThreads[I], {I});
546
547 ArgsVector = {Version,
548 PointerNum,
549 KernelArgs.RTArgs.BasePointersArray,
550 KernelArgs.RTArgs.PointersArray,
551 KernelArgs.RTArgs.SizesArray,
552 KernelArgs.RTArgs.MapTypesArray,
553 KernelArgs.RTArgs.MapNamesArray,
554 KernelArgs.RTArgs.MappersArray,
555 KernelArgs.NumIterations,
556 Flags,
557 NumTeams3D,
558 NumThreads3D,
559 KernelArgs.DynCGGroupMem};
560}
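
// Example: with KernelArgs.NumTeams == {8} and KernelArgs.NumThreads == {128},
// NumTeams3D becomes the array {8, 0, 0} and NumThreads3D becomes {128, 0, 0};
// additional dimensions, when present, fill the remaining slots.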

void OpenMPIRBuilder::addAttributes(omp::RuntimeFunction FnID, Function &Fn) {
  LLVMContext &Ctx = Fn.getContext();

  // Get the function's current attributes.
  auto Attrs = Fn.getAttributes();
  auto FnAttrs = Attrs.getFnAttrs();
  auto RetAttrs = Attrs.getRetAttrs();
  SmallVector<AttributeSet, 4> ArgAttrs;
  for (size_t ArgNo = 0; ArgNo < Fn.arg_size(); ++ArgNo)
    ArgAttrs.emplace_back(Attrs.getParamAttrs(ArgNo));

  // Add AS to FnAS while taking special care with integer extensions.
  auto addAttrSet = [&](AttributeSet &FnAS, const AttributeSet &AS,
                        bool Param = true) -> void {
    bool HasSignExt = AS.hasAttribute(Attribute::SExt);
    bool HasZeroExt = AS.hasAttribute(Attribute::ZExt);
    if (HasSignExt || HasZeroExt) {
      assert(AS.getNumAttributes() == 1 &&
             "Currently not handling extension attr combined with others.");
      if (Param) {
        if (auto AK = TargetLibraryInfo::getExtAttrForI32Param(T, HasSignExt))
          FnAS = FnAS.addAttribute(Ctx, AK);
      } else if (auto AK =
                     TargetLibraryInfo::getExtAttrForI32Return(T, HasSignExt))
        FnAS = FnAS.addAttribute(Ctx, AK);
    } else {
      FnAS = FnAS.addAttributes(Ctx, AS);
    }
  };

#define OMP_ATTRS_SET(VarName, AttrSet) AttributeSet VarName = AttrSet;
#include "llvm/Frontend/OpenMP/OMPKinds.def"

  // Add attributes to the function declaration.
  switch (FnID) {
#define OMP_RTL_ATTRS(Enum, FnAttrSet, RetAttrSet, ArgAttrSets)                \
  case Enum:                                                                   \
    FnAttrs = FnAttrs.addAttributes(Ctx, FnAttrSet);                           \
    addAttrSet(RetAttrs, RetAttrSet, /*Param*/ false);                         \
    for (size_t ArgNo = 0; ArgNo < ArgAttrSets.size(); ++ArgNo)                \
      addAttrSet(ArgAttrs[ArgNo], ArgAttrSets[ArgNo]);                         \
    Fn.setAttributes(AttributeList::get(Ctx, FnAttrs, RetAttrs, ArgAttrs));    \
    break;
#include "llvm/Frontend/OpenMP/OMPKinds.def"
  default:
    // Attributes are optional.
    break;
  }
}

FunctionCallee
OpenMPIRBuilder::getOrCreateRuntimeFunction(Module &M, RuntimeFunction FnID) {
  FunctionType *FnTy = nullptr;
  Function *Fn = nullptr;

  // Try to find the declaration in the module first.
  switch (FnID) {
#define OMP_RTL(Enum, Str, IsVarArg, ReturnType, ...)                          \
  case Enum:                                                                   \
    FnTy = FunctionType::get(ReturnType, ArrayRef<Type *>{__VA_ARGS__},        \
                             IsVarArg);                                        \
    Fn = M.getFunction(Str);                                                   \
    break;
#include "llvm/Frontend/OpenMP/OMPKinds.def"
  }

  if (!Fn) {
    // Create a new declaration if we need one.
    switch (FnID) {
#define OMP_RTL(Enum, Str, ...)                                                \
  case Enum:                                                                   \
    Fn = Function::Create(FnTy, GlobalValue::ExternalLinkage, Str, M);         \
    break;
#include "llvm/Frontend/OpenMP/OMPKinds.def"
    }

    // Add information if the runtime function takes a callback function
    if (FnID == OMPRTL___kmpc_fork_call || FnID == OMPRTL___kmpc_fork_teams) {
      if (!Fn->hasMetadata(LLVMContext::MD_callback)) {
        LLVMContext &Ctx = Fn->getContext();
        MDBuilder MDB(Ctx);
        // Annotate the callback behavior of the runtime function:
        //  - The callback callee is argument number 2 (microtask).
        //  - The first two arguments of the callback callee are unknown (-1).
        //  - All variadic arguments to the runtime function are passed to the
        //    callback callee.
        Fn->addMetadata(
            LLVMContext::MD_callback,
            *MDNode::get(Ctx, {MDB.createCallbackEncoding(
                                  2, {-1, -1}, /* VarArgsArePassed */ true)}));
      }
    }

    LLVM_DEBUG(dbgs() << "Created OpenMP runtime function " << Fn->getName()
                      << " with type " << *Fn->getFunctionType() << "\n");
    addAttributes(FnID, *Fn);

  } else {
    LLVM_DEBUG(dbgs() << "Found OpenMP runtime function " << Fn->getName()
                      << " with type " << *Fn->getFunctionType() << "\n");
  }

  assert(Fn && "Failed to create OpenMP runtime function");

  return {FnTy, Fn};
}

Function *OpenMPIRBuilder::getOrCreateRuntimeFunctionPtr(RuntimeFunction FnID) {
  FunctionCallee RTLFn = getOrCreateRuntimeFunction(M, FnID);
  auto *Fn = dyn_cast<llvm::Function>(RTLFn.getCallee());
  assert(Fn && "Failed to create OpenMP runtime function pointer");
  return Fn;
}
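
// Typical use:
//   Function *BarrierFn = getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_barrier);
// declares `void @__kmpc_barrier(ptr, i32)` on first use and returns the
// existing declaration afterwards.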

void OpenMPIRBuilder::initialize() { initializeTypes(M); }

static void raiseUserConstantDataAllocasToEntryBlock(IRBuilderBase &Builder,
                                                     Function *Function) {
  BasicBlock &EntryBlock = Function->getEntryBlock();
  Instruction *MoveLocInst = EntryBlock.getFirstNonPHI();

  // Loop over blocks looking for constant allocas, skipping the entry block
  // as any allocas there are already in the desired location.
  for (auto Block = std::next(Function->begin(), 1); Block != Function->end();
       Block++) {
    for (auto Inst = Block->getReverseIterator()->begin();
         Inst != Block->getReverseIterator()->end();) {
      if (auto *AllocaInst = dyn_cast_if_present<llvm::AllocaInst>(Inst)) {
        Inst++;
        if (!isa<ConstantData>(AllocaInst->getArraySize()))
          continue;
        AllocaInst->moveBeforePreserving(MoveLocInst);
      } else {
        Inst++;
      }
    }
  }
}

void OpenMPIRBuilder::finalize(Function *Fn) {
  SmallPtrSet<BasicBlock *, 32> ParallelRegionBlockSet;
  SmallVector<BasicBlock *, 32> Blocks;
  SmallVector<OutlineInfo, 16> DeferredOutlines;
  for (OutlineInfo &OI : OutlineInfos) {
    // Skip functions that have not finalized yet; may happen with nested
    // function generation.
    if (Fn && OI.getFunction() != Fn) {
      DeferredOutlines.push_back(OI);
      continue;
    }

    ParallelRegionBlockSet.clear();
    Blocks.clear();
    OI.collectBlocks(ParallelRegionBlockSet, Blocks);

    Function *OuterFn = OI.getFunction();
    CodeExtractorAnalysisCache CEAC(*OuterFn);
    // If we generate code for the target device, we need to allocate the
    // struct for aggregate params in the device default alloca address space.
    // The OpenMP runtime requires that the params of the extracted functions
    // are passed as zero address space pointers. This flag ensures that
    // CodeExtractor generates correct code for extracted functions
    // which are used by the OpenMP runtime.
    bool ArgsInZeroAddressSpace = Config.isTargetDevice();
    CodeExtractor Extractor(Blocks, /* DominatorTree */ nullptr,
                            /* AggregateArgs */ true,
                            /* BlockFrequencyInfo */ nullptr,
                            /* BranchProbabilityInfo */ nullptr,
                            /* AssumptionCache */ nullptr,
                            /* AllowVarArgs */ true,
                            /* AllowAlloca */ true,
                            /* AllocaBlock*/ OI.OuterAllocaBB,
                            /* Suffix */ ".omp_par", ArgsInZeroAddressSpace);

    LLVM_DEBUG(dbgs() << "Before outlining: " << *OuterFn << "\n");
    LLVM_DEBUG(dbgs() << "Entry " << OI.EntryBB->getName()
                      << " Exit: " << OI.ExitBB->getName() << "\n");
    assert(Extractor.isEligible() &&
           "Expected OpenMP outlining to be possible!");

    for (auto *V : OI.ExcludeArgsFromAggregate)
      Extractor.excludeArgFromAggregate(V);

    Function *OutlinedFn = Extractor.extractCodeRegion(CEAC);

    // Forward target-cpu, target-features attributes to the outlined function.
    auto TargetCpuAttr = OuterFn->getFnAttribute("target-cpu");
    if (TargetCpuAttr.isStringAttribute())
      OutlinedFn->addFnAttr(TargetCpuAttr);

    auto TargetFeaturesAttr = OuterFn->getFnAttribute("target-features");
    if (TargetFeaturesAttr.isStringAttribute())
      OutlinedFn->addFnAttr(TargetFeaturesAttr);

    LLVM_DEBUG(dbgs() << "After outlining: " << *OuterFn << "\n");
    LLVM_DEBUG(dbgs() << "   Outlined function: " << *OutlinedFn << "\n");
    assert(OutlinedFn->getReturnType()->isVoidTy() &&
           "OpenMP outlined functions should not return a value!");

    // For compatibility with the clang CG we move the outlined function after
    // the one with the parallel region.
    OutlinedFn->removeFromParent();
    M.getFunctionList().insertAfter(OuterFn->getIterator(), OutlinedFn);

    // Remove the artificial entry introduced by the extractor right away, we
    // made our own entry block after all.
    {
      BasicBlock &ArtificialEntry = OutlinedFn->getEntryBlock();
      assert(ArtificialEntry.getUniqueSuccessor() == OI.EntryBB);
      assert(OI.EntryBB->getUniquePredecessor() == &ArtificialEntry);
      // Move instructions from the to-be-deleted ArtificialEntry to the entry
      // basic block of the parallel region. CodeExtractor generates
      // instructions to unwrap the aggregate argument and may sink
      // allocas/bitcasts for values that are solely used in the outlined
      // region and do not escape.
      assert(!ArtificialEntry.empty() &&
             "Expected instructions to add in the outlined region entry");
      for (BasicBlock::reverse_iterator It = ArtificialEntry.rbegin(),
                                        End = ArtificialEntry.rend();
           It != End;) {
        Instruction &I = *It;
        It++;

        if (I.isTerminator())
          continue;

        I.moveBeforePreserving(*OI.EntryBB, OI.EntryBB->getFirstInsertionPt());
      }

      OI.EntryBB->moveBefore(&ArtificialEntry);
      ArtificialEntry.eraseFromParent();
    }
    assert(&OutlinedFn->getEntryBlock() == OI.EntryBB);
    assert(OutlinedFn && OutlinedFn->getNumUses() == 1);

    // Run a user callback, e.g. to add attributes.
    if (OI.PostOutlineCB)
      OI.PostOutlineCB(*OutlinedFn);
  }

  // Remove work items that have been completed.
  OutlineInfos = std::move(DeferredOutlines);

  // The createTarget functions embed user-written code into the target region,
  // which may inject allocas that need to be moved to the entry block of our
  // target, or we risk malformed optimisations by later passes. This is only
  // relevant for the device pass, which appears to be a little more delicate
  // when it comes to optimisations (however, we do not block on that here;
  // it's up to the inserter to the list to do so).
  // This notably has to occur after the OutlinedInfo candidates have been
  // extracted, so we have an end product that will not be implicitly adversely
  // affected by any raises unless intentionally appended to the list.
  // NOTE: This only does so for ConstantData. It could be extended to
  // ConstantExprs with further effort; however, they should largely be folded
  // by the time they get here. Extending it to runtime-defined/read+writeable
  // allocation sizes would be non-trivial (we would need to factor in movement
  // of any stores to variables the allocation size depends on, as well as the
  // usual loads; otherwise it'll yield the wrong result after movement) and
  // would likely be more suitable as an LLVM optimisation pass.
  for (auto *F : ConstantAllocaRaiseCandidates)
    raiseUserConstantDataAllocasToEntryBlock(Builder, F);

  EmitMetadataErrorReportFunctionTy &&ErrorReportFn =
      [](EmitMetadataErrorKind Kind,
         const TargetRegionEntryInfo &EntryInfo) -> void {
    errs() << "Error of kind: " << Kind
           << " when emitting offload entries and metadata during "
              "OMPIRBuilder finalization \n";
  };

  if (!OffloadInfoManager.empty())
    createOffloadEntriesAndInfoMetadata(ErrorReportFn);

  if (Config.EmitLLVMUsedMetaInfo.value_or(false)) {
    std::vector<WeakTrackingVH> LLVMCompilerUsed = {
        M.getGlobalVariable("__openmp_nvptx_data_transfer_temporary_storage")};
    emitUsed("llvm.compiler.used", LLVMCompilerUsed);
  }
}

OpenMPIRBuilder::~OpenMPIRBuilder() {
  assert(OutlineInfos.empty() && "There must be no outstanding outlinings");
}

GlobalValue *OpenMPIRBuilder::createGlobalFlag(unsigned Value, StringRef Name) {
  Type *I32Ty = Type::getInt32Ty(M.getContext());
  auto *GV =
      new GlobalVariable(M, I32Ty,
                         /* isConstant = */ true, GlobalValue::WeakODRLinkage,
                         ConstantInt::get(I32Ty, Value), Name);
  GV->setVisibility(GlobalValue::HiddenVisibility);

  return GV;
}

void OpenMPIRBuilder::emitUsed(StringRef Name, ArrayRef<WeakTrackingVH> List) {
  if (List.empty())
    return;

  // Convert List to what ConstantArray needs.
  SmallVector<Constant *, 8> UsedArray;
  UsedArray.resize(List.size());
  for (unsigned I = 0, E = List.size(); I != E; ++I)
    UsedArray[I] = ConstantExpr::getPointerBitCastOrAddrSpaceCast(
        cast<Constant>(&*List[I]), Builder.getPtrTy());

  if (UsedArray.empty())
    return;
  ArrayType *ATy = ArrayType::get(Builder.getPtrTy(), UsedArray.size());

  auto *GV = new GlobalVariable(M, ATy, false, GlobalValue::AppendingLinkage,
                                ConstantArray::get(ATy, UsedArray), Name);

  GV->setSection("llvm.metadata");
}

GlobalVariable *
OpenMPIRBuilder::emitKernelExecutionMode(StringRef KernelName,
                                         OMPTgtExecModeFlags Mode) {
  auto *Int8Ty = Builder.getInt8Ty();
  auto *GVMode = new GlobalVariable(
      M, Int8Ty, /*isConstant=*/true, GlobalValue::WeakAnyLinkage,
      ConstantInt::get(Int8Ty, Mode), Twine(KernelName, "_exec_mode"));
  GVMode->setVisibility(GlobalVariable::ProtectedVisibility);
  return GVMode;
}

Constant *OpenMPIRBuilder::getOrCreateIdent(Constant *SrcLocStr,
                                            uint32_t SrcLocStrSize,
                                            IdentFlag LocFlags,
                                            unsigned Reserve2Flags) {
  // Enable "C-mode".
  LocFlags |= OMP_IDENT_FLAG_KMPC;

  Constant *&Ident =
      IdentMap[{SrcLocStr, uint64_t(LocFlags) << 31 | Reserve2Flags}];
  if (!Ident) {
    Constant *I32Null = ConstantInt::getNullValue(Int32);
    Constant *IdentData[] = {I32Null,
                             ConstantInt::get(Int32, uint32_t(LocFlags)),
                             ConstantInt::get(Int32, Reserve2Flags),
                             ConstantInt::get(Int32, SrcLocStrSize), SrcLocStr};
    Constant *Initializer =
        ConstantStruct::get(OpenMPIRBuilder::Ident, IdentData);

    // Look for existing encoding of the location + flags, not needed but
    // minimizes the difference to the existing solution while we transition.
    for (GlobalVariable &GV : M.globals())
      if (GV.getValueType() == OpenMPIRBuilder::Ident && GV.hasInitializer())
        if (GV.getInitializer() == Initializer)
          Ident = &GV;

    if (!Ident) {
      auto *GV = new GlobalVariable(
          M, OpenMPIRBuilder::Ident,
          /* isConstant = */ true, GlobalValue::PrivateLinkage, Initializer, "",
          nullptr, GlobalValue::NotThreadLocal,
          M.getDataLayout().getDefaultGlobalsAddressSpace());
      GV->setUnnamedAddr(GlobalValue::UnnamedAddr::Global);
      GV->setAlignment(Align(8));
      Ident = GV;
    }
  }

  return ConstantExpr::getPointerBitCastOrAddrSpaceCast(Ident, IdentPtr);
}

Constant *OpenMPIRBuilder::getOrCreateSrcLocStr(StringRef LocStr,
                                                uint32_t &SrcLocStrSize) {
  SrcLocStrSize = LocStr.size();
  Constant *&SrcLocStr = SrcLocStrMap[LocStr];
  if (!SrcLocStr) {
    Constant *Initializer =
        ConstantDataArray::getString(M.getContext(), LocStr);

    // Look for existing encoding of the location, not needed but minimizes the
    // difference to the existing solution while we transition.
    for (GlobalVariable &GV : M.globals())
      if (GV.isConstant() && GV.hasInitializer() &&
          GV.getInitializer() == Initializer)
        return SrcLocStr = ConstantExpr::getPointerCast(&GV, Int8Ptr);

    SrcLocStr = Builder.CreateGlobalString(LocStr, /* Name */ "",
                                           /* AddressSpace */ 0, &M);
  }
  return SrcLocStr;
}

Constant *OpenMPIRBuilder::getOrCreateSrcLocStr(StringRef FunctionName,
                                                StringRef FileName,
                                                unsigned Line, unsigned Column,
                                                uint32_t &SrcLocStrSize) {
  SmallString<128> Buffer;
  Buffer.push_back(';');
  Buffer.append(FileName);
  Buffer.push_back(';');
  Buffer.append(FunctionName);
  Buffer.push_back(';');
  Buffer.append(std::to_string(Line));
  Buffer.push_back(';');
  Buffer.append(std::to_string(Column));
  Buffer.push_back(';');
  Buffer.push_back(';');
  return getOrCreateSrcLocStr(Buffer.str(), SrcLocStrSize);
}
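
// Example: FunctionName "foo", FileName "bar.c", Line 3, Column 7 yields the
// string ";bar.c;foo;3;7;;", the psource encoding libomp expects in ident_t.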

Constant *
OpenMPIRBuilder::getOrCreateDefaultSrcLocStr(uint32_t &SrcLocStrSize) {
  StringRef UnknownLoc = ";unknown;unknown;0;0;;";
  return getOrCreateSrcLocStr(UnknownLoc, SrcLocStrSize);
}

Constant *OpenMPIRBuilder::getOrCreateSrcLocStr(DebugLoc DL,
                                                uint32_t &SrcLocStrSize,
                                                Function *F) {
  DILocation *DIL = DL.get();
  if (!DIL)
    return getOrCreateDefaultSrcLocStr(SrcLocStrSize);
  StringRef FileName = M.getName();
  if (DIFile *DIF = DIL->getFile())
    if (std::optional<StringRef> Source = DIF->getSource())
      FileName = *Source;
  StringRef Function = DIL->getScope()->getSubprogram()->getName();
  if (Function.empty() && F)
    Function = F->getName();
  return getOrCreateSrcLocStr(Function, FileName, DIL->getLine(),
                              DIL->getColumn(), SrcLocStrSize);
}

Constant *OpenMPIRBuilder::getOrCreateSrcLocStr(const LocationDescription &Loc,
                                                uint32_t &SrcLocStrSize) {
  return getOrCreateSrcLocStr(Loc.DL, SrcLocStrSize,
                              Loc.IP.getBlock()->getParent());
}

Value *OpenMPIRBuilder::getOrCreateThreadID(Value *Ident) {
  return Builder.CreateCall(
      getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_global_thread_num), Ident,
      "omp_global_thread_num");
}

OpenMPIRBuilder::InsertPointOrErrorTy
OpenMPIRBuilder::createBarrier(const LocationDescription &Loc, Directive Kind,
                               bool ForceSimpleCall, bool CheckCancelFlag) {
  if (!updateToLocation(Loc))
    return Loc.IP;

  // Build call __kmpc_cancel_barrier(loc, thread_id) or
  //            __kmpc_barrier(loc, thread_id);

  IdentFlag BarrierLocFlags;
  switch (Kind) {
  case OMPD_for:
    BarrierLocFlags = OMP_IDENT_FLAG_BARRIER_IMPL_FOR;
    break;
  case OMPD_sections:
    BarrierLocFlags = OMP_IDENT_FLAG_BARRIER_IMPL_SECTIONS;
    break;
  case OMPD_single:
    BarrierLocFlags = OMP_IDENT_FLAG_BARRIER_IMPL_SINGLE;
    break;
  case OMPD_barrier:
    BarrierLocFlags = OMP_IDENT_FLAG_BARRIER_EXPL;
    break;
  default:
    BarrierLocFlags = OMP_IDENT_FLAG_BARRIER_IMPL;
    break;
  }

  uint32_t SrcLocStrSize;
  Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
  Value *Args[] = {
      getOrCreateIdent(SrcLocStr, SrcLocStrSize, BarrierLocFlags),
      getOrCreateThreadID(getOrCreateIdent(SrcLocStr, SrcLocStrSize))};

  // If we are in a cancellable parallel region, barriers are cancellation
  // points.
  // TODO: Check why we would force simple calls or to ignore the cancel flag.
  bool UseCancelBarrier =
      !ForceSimpleCall && isLastFinalizationInfoCancellable(OMPD_parallel);

  Value *Result =
      Builder.CreateCall(getOrCreateRuntimeFunctionPtr(
                             UseCancelBarrier ? OMPRTL___kmpc_cancel_barrier
                                              : OMPRTL___kmpc_barrier),
                         Args);

  if (UseCancelBarrier && CheckCancelFlag)
    if (Error Err = emitCancelationCheckImpl(Result, OMPD_parallel))
      return Err;

  return Builder.saveIP();
}
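
// For a plain barrier in a non-cancellable region this emits, e.g.:
//   call void @__kmpc_barrier(ptr @<ident>, i32 %omp_global_thread_num)
// In a cancellable parallel region, __kmpc_cancel_barrier is used instead and
// returns an i32 flag that feeds the cancellation check above.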

OpenMPIRBuilder::InsertPointOrErrorTy
OpenMPIRBuilder::createCancel(const LocationDescription &Loc,
                              Value *IfCondition,
                              omp::Directive CanceledDirective) {
  if (!updateToLocation(Loc))
    return Loc.IP;

  // LLVM utilities like blocks with terminators.
  auto *UI = Builder.CreateUnreachable();

  Instruction *ThenTI = UI, *ElseTI = nullptr;
  if (IfCondition)
    SplitBlockAndInsertIfThenElse(IfCondition, UI, &ThenTI, &ElseTI);
  Builder.SetInsertPoint(ThenTI);

  Value *CancelKind = nullptr;
  switch (CanceledDirective) {
#define OMP_CANCEL_KIND(Enum, Str, DirectiveEnum, Value)                       \
  case DirectiveEnum:                                                          \
    CancelKind = Builder.getInt32(Value);                                      \
    break;
#include "llvm/Frontend/OpenMP/OMPKinds.def"
  default:
    llvm_unreachable("Unknown cancel kind!");
  }

  uint32_t SrcLocStrSize;
  Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
  Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
  Value *Args[] = {Ident, getOrCreateThreadID(Ident), CancelKind};
  Value *Result = Builder.CreateCall(
      getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_cancel), Args);
  auto ExitCB = [this, CanceledDirective, Loc](InsertPointTy IP) -> Error {
    if (CanceledDirective == OMPD_parallel) {
      IRBuilder<>::InsertPointGuard IPG(Builder);
      Builder.restoreIP(IP);
      return createBarrier(LocationDescription(Builder.saveIP(), Loc.DL),
                           omp::Directive::OMPD_unknown,
                           /* ForceSimpleCall */ false,
                           /* CheckCancelFlag */ false)
          .takeError();
    }
    return Error::success();
  };

  // The actual cancel logic is shared with others, e.g., cancel_barriers.
  if (Error Err = emitCancelationCheckImpl(Result, CanceledDirective, ExitCB))
    return Err;

  // Update the insertion point and remove the terminator we introduced.
  Builder.SetInsertPoint(UI->getParent());
  UI->eraseFromParent();

  return Builder.saveIP();
}

OpenMPIRBuilder::InsertPointTy OpenMPIRBuilder::emitTargetKernel(
    const LocationDescription &Loc, InsertPointTy AllocaIP, Value *&Return,
    Value *Ident, Value *DeviceID, Value *NumTeams, Value *NumThreads,
    Value *HostPtr, ArrayRef<Value *> KernelArgs) {
  if (!updateToLocation(Loc))
    return Loc.IP;

  Builder.restoreIP(AllocaIP);
  auto *KernelArgsPtr =
      Builder.CreateAlloca(OpenMPIRBuilder::KernelArgs, nullptr, "kernel_args");
  Builder.restoreIP(Loc.IP);

  for (unsigned I = 0, Size = KernelArgs.size(); I != Size; ++I) {
    llvm::Value *Arg =
        Builder.CreateStructGEP(OpenMPIRBuilder::KernelArgs, KernelArgsPtr, I);
    Builder.CreateAlignedStore(
        KernelArgs[I], Arg,
        M.getDataLayout().getPrefTypeAlign(KernelArgs[I]->getType()));
  }

  SmallVector<Value *> OffloadingArgs{Ident,      DeviceID, NumTeams,
                                      NumThreads, HostPtr,  KernelArgsPtr};

  Return = Builder.CreateCall(
      getOrCreateRuntimeFunction(M, OMPRTL___tgt_target_kernel),
      OffloadingArgs);

  return Builder.saveIP();
}

OpenMPIRBuilder::InsertPointOrErrorTy OpenMPIRBuilder::emitKernelLaunch(
    const LocationDescription &Loc, Value *OutlinedFnID,
    EmitFallbackCallbackTy EmitTargetCallFallbackCB, TargetKernelArgs &Args,
    Value *DeviceID, Value *RTLoc, InsertPointTy AllocaIP) {

  if (!updateToLocation(Loc))
    return Loc.IP;

  Builder.restoreIP(Loc.IP);
  // On top of the arrays that were filled up, the target offloading call
  // takes as arguments the device id as well as the host pointer. The host
  // pointer is used by the runtime library to identify the current target
  // region, so it only has to be unique and not necessarily point to
  // anything. It could be the pointer to the outlined function that
  // implements the target region, but we aren't using that so that the
  // compiler doesn't need to keep that, and could therefore inline the host
  // function if proven worthwhile during optimization.

  // From this point on, we need to have an ID of the target region defined.
  assert(OutlinedFnID && "Invalid outlined function ID!");
  (void)OutlinedFnID;

  // Return value of the runtime offloading call.
  Value *Return = nullptr;

  // Arguments for the target kernel.
  SmallVector<Value *> ArgsVector;
  getKernelArgsVector(Args, Builder, ArgsVector);

  // The target region is an outlined function launched by the runtime
  // via calls to __tgt_target_kernel().
  //
  // Note that on the host and CPU targets, the runtime implementation of
  // these calls simply call the outlined function without forking threads.
  // The outlined functions themselves have runtime calls to
  // __kmpc_fork_teams() and __kmpc_fork() for this purpose, codegen'd by
  // the compiler in emitTeamsCall() and emitParallelCall().
  //
  // In contrast, on the NVPTX target, the implementation of
  // __tgt_target_teams() launches a GPU kernel with the requested number
  // of teams and threads so no additional calls to the runtime are required.
  // Check the error code and execute the host version if required.
  Builder.restoreIP(emitTargetKernel(
      Builder, AllocaIP, Return, RTLoc, DeviceID, Args.NumTeams.front(),
      Args.NumThreads.front(), OutlinedFnID, ArgsVector));

  BasicBlock *OffloadFailedBlock =
      BasicBlock::Create(Builder.getContext(), "omp_offload.failed");
  BasicBlock *OffloadContBlock =
      BasicBlock::Create(Builder.getContext(), "omp_offload.cont");
  Value *Failed = Builder.CreateIsNotNull(Return);
  Builder.CreateCondBr(Failed, OffloadFailedBlock, OffloadContBlock);

  auto CurFn = Builder.GetInsertBlock()->getParent();
  emitBlock(OffloadFailedBlock, CurFn);
  InsertPointOrErrorTy AfterIP = EmitTargetCallFallbackCB(Builder.saveIP());
  if (!AfterIP)
    return AfterIP.takeError();
  Builder.restoreIP(*AfterIP);
  emitBranch(OffloadContBlock);
  emitBlock(OffloadContBlock, CurFn, /*IsFinished=*/true);
  return Builder.saveIP();
}

Error OpenMPIRBuilder::emitCancelationCheckImpl(
    Value *CancelFlag, omp::Directive CanceledDirective,
    FinalizeCallbackTy ExitCB) {
  assert(isLastFinalizationInfoCancellable(CanceledDirective) &&
         "Unexpected cancellation!");

  // For a cancel barrier we create two new blocks.
  BasicBlock *BB = Builder.GetInsertBlock();
  BasicBlock *NonCancellationBlock;
  if (Builder.GetInsertPoint() == BB->end()) {
    // TODO: This branch will not be needed once we have moved to the
    // OpenMPIRBuilder codegen completely.
    NonCancellationBlock = BasicBlock::Create(
        BB->getContext(), BB->getName() + ".cont", BB->getParent());
  } else {
    NonCancellationBlock = SplitBlock(BB, &*Builder.GetInsertPoint());
    BB->getTerminator()->eraseFromParent();
    Builder.SetInsertPoint(BB);
  }
  BasicBlock *CancellationBlock = BasicBlock::Create(
      BB->getContext(), BB->getName() + ".cncl", BB->getParent());

  // Jump to them based on the return value.
  Value *Cmp = Builder.CreateIsNull(CancelFlag);
  Builder.CreateCondBr(Cmp, NonCancellationBlock, CancellationBlock,
                       /* TODO weight */ nullptr, nullptr);

  // From the cancellation block we finalize all variables and go to the
  // post finalization block that is known to the FiniCB callback.
  Builder.SetInsertPoint(CancellationBlock);
  if (ExitCB)
    if (Error Err = ExitCB(Builder.saveIP()))
      return Err;
  auto &FI = FinalizationStack.back();
  if (Error Err = FI.FiniCB(Builder.saveIP()))
    return Err;

  // The continuation block is where code generation continues.
  Builder.SetInsertPoint(NonCancellationBlock, NonCancellationBlock->begin());
  return Error::success();
}
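
// The generated control flow looks like:
//
//   <cancel flag == 0?> --true--> <BB>.cont (codegen continues here)
//            |
//          false
//            v
//   <BB>.cncl (run finalization callbacks, then branch to the region exit)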

// Callback used to create OpenMP runtime calls to support
// omp parallel clause for the device.
// We need to use this callback to replace the call to the OutlinedFn in
// OuterFn with the call to the OpenMP DeviceRTL runtime function
// (kmpc_parallel_51)
static void targetParallelCallback(
    OpenMPIRBuilder *OMPIRBuilder, Function &OutlinedFn, Function *OuterFn,
    BasicBlock *OuterAllocaBB, Value *Ident, Value *IfCondition,
    Value *NumThreads, Instruction *PrivTID, AllocaInst *PrivTIDAddr,
    Value *ThreadID, const SmallVector<Instruction *, 4> &ToBeDeleted) {
  // Add some known attributes.
  IRBuilder<> &Builder = OMPIRBuilder->Builder;
  OutlinedFn.addParamAttr(0, Attribute::NoAlias);
  OutlinedFn.addParamAttr(1, Attribute::NoAlias);
  OutlinedFn.addParamAttr(0, Attribute::NoUndef);
  OutlinedFn.addParamAttr(1, Attribute::NoUndef);
  OutlinedFn.addFnAttr(Attribute::NoUnwind);

  assert(OutlinedFn.arg_size() >= 2 &&
         "Expected at least tid and bounded tid as arguments");
  unsigned NumCapturedVars = OutlinedFn.arg_size() - /* tid & bounded tid */ 2;

  CallInst *CI = cast<CallInst>(OutlinedFn.user_back());
  assert(CI && "Expected call instruction to outlined function");
  CI->getParent()->setName("omp_parallel");

  Builder.SetInsertPoint(CI);
  Type *PtrTy = OMPIRBuilder->VoidPtr;
  Value *NullPtrValue = Constant::getNullValue(PtrTy);

  // Add alloca for kernel args
  OpenMPIRBuilder::InsertPointTy CurrentIP = Builder.saveIP();
  Builder.SetInsertPoint(OuterAllocaBB, OuterAllocaBB->getFirstInsertionPt());
  AllocaInst *ArgsAlloca =
      Builder.CreateAlloca(ArrayType::get(PtrTy, NumCapturedVars));
  Value *Args = ArgsAlloca;
  // Add address space cast if array for storing arguments is not allocated
  // in address space 0
  if (ArgsAlloca->getAddressSpace())
    Args = Builder.CreatePointerCast(ArgsAlloca, PtrTy);
  Builder.restoreIP(CurrentIP);

  // Store captured vars which are used by kmpc_parallel_51
  for (unsigned Idx = 0; Idx < NumCapturedVars; Idx++) {
    Value *V = *(CI->arg_begin() + 2 + Idx);
    Value *StoreAddress = Builder.CreateConstInBoundsGEP2_64(
        ArrayType::get(PtrTy, NumCapturedVars), Args, 0, Idx);
    Builder.CreateStore(V, StoreAddress);
  }

  Value *Cond =
      IfCondition ? Builder.CreateSExtOrTrunc(IfCondition, OMPIRBuilder->Int32)
                  : Builder.getInt32(1);

  // Build kmpc_parallel_51 call
  Value *Parallel51CallArgs[] = {
      /* identifier*/ Ident,
      /* global thread num*/ ThreadID,
      /* if expression */ Cond,
      /* number of threads */ NumThreads ? NumThreads : Builder.getInt32(-1),
      /* Proc bind */ Builder.getInt32(-1),
      /* outlined function */
      Builder.CreateBitCast(&OutlinedFn, OMPIRBuilder->ParallelTaskPtr),
      /* wrapper function */ NullPtrValue,
      /* arguments of the outlined function */ Args,
      /* number of arguments */ Builder.getInt64(NumCapturedVars)};

  FunctionCallee RTLFn =
      OMPIRBuilder->getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_parallel_51);

  Builder.CreateCall(RTLFn, Parallel51CallArgs);

  LLVM_DEBUG(dbgs() << "With kmpc_parallel_51 placed: "
                    << *Builder.GetInsertBlock()->getParent() << "\n");

  // Initialize the local TID stack location with the argument value.
  Builder.SetInsertPoint(PrivTID);
  Function::arg_iterator OutlinedAI = OutlinedFn.arg_begin();
  Builder.CreateStore(Builder.CreateLoad(OMPIRBuilder->Int32, OutlinedAI),
                      PrivTIDAddr);

  // Remove redundant call to the outlined function.
  CI->eraseFromParent();

  for (Instruction *I : ToBeDeleted) {
    I->eraseFromParent();
  }
}
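
// For reference, the DeviceRTL entry point being targeted has (roughly) the
// signature:
//   void __kmpc_parallel_51(ident_t *ident, i32 gtid, i32 if_expr,
//                           i32 num_threads, i32 proc_bind, void *fn,
//                           void *wrapper_fn, void **args, i64 nargs);
// which matches the nine arguments assembled above.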

// Callback used to create OpenMP runtime calls to support
// omp parallel clause for the host.
// We need to use this callback to replace the call to the OutlinedFn in
// OuterFn with the call to the OpenMP host runtime function
// (__kmpc_fork_call[_if])
static void
hostParallelCallback(OpenMPIRBuilder *OMPIRBuilder, Function &OutlinedFn,
                     Function *OuterFn, Value *Ident, Value *IfCondition,
                     Instruction *PrivTID, AllocaInst *PrivTIDAddr,
                     const SmallVector<Instruction *, 4> &ToBeDeleted) {
  IRBuilder<> &Builder = OMPIRBuilder->Builder;
  FunctionCallee RTLFn;
  if (IfCondition) {
    RTLFn =
        OMPIRBuilder->getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_fork_call_if);
  } else {
    RTLFn =
        OMPIRBuilder->getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_fork_call);
  }
  if (auto *F = dyn_cast<Function>(RTLFn.getCallee())) {
    if (!F->hasMetadata(LLVMContext::MD_callback)) {
      LLVMContext &Ctx = F->getContext();
      MDBuilder MDB(Ctx);
      // Annotate the callback behavior of the __kmpc_fork_call:
      //  - The callback callee is argument number 2 (microtask).
      //  - The first two arguments of the callback callee are unknown (-1).
      //  - All variadic arguments to the __kmpc_fork_call are passed to the
      //    callback callee.
      F->addMetadata(LLVMContext::MD_callback,
                     *MDNode::get(Ctx, {MDB.createCallbackEncoding(
                                           2, {-1, -1},
                                           /* VarArgsArePassed */ true)}));
    }
  }
  // Add some known attributes.
  OutlinedFn.addParamAttr(0, Attribute::NoAlias);
  OutlinedFn.addParamAttr(1, Attribute::NoAlias);
  OutlinedFn.addFnAttr(Attribute::NoUnwind);

  assert(OutlinedFn.arg_size() >= 2 &&
         "Expected at least tid and bounded tid as arguments");
  unsigned NumCapturedVars = OutlinedFn.arg_size() - /* tid & bounded tid */ 2;

  CallInst *CI = cast<CallInst>(OutlinedFn.user_back());
  CI->getParent()->setName("omp_parallel");
  Builder.SetInsertPoint(CI);

  // Build call __kmpc_fork_call[_if](Ident, n, microtask, var1, .., varn);
  Value *ForkCallArgs[] = {
      Ident, Builder.getInt32(NumCapturedVars),
      Builder.CreateBitCast(&OutlinedFn, OMPIRBuilder->ParallelTaskPtr)};

  SmallVector<Value *, 16> RealArgs;
  RealArgs.append(std::begin(ForkCallArgs), std::end(ForkCallArgs));
  if (IfCondition) {
    Value *Cond = Builder.CreateSExtOrTrunc(IfCondition, OMPIRBuilder->Int32);
    RealArgs.push_back(Cond);
  }
  RealArgs.append(CI->arg_begin() + /* tid & bound tid */ 2, CI->arg_end());

  // __kmpc_fork_call_if always expects a void ptr as the last argument.
  // If there are no arguments, pass a null pointer.
  auto PtrTy = OMPIRBuilder->VoidPtr;
  if (IfCondition && NumCapturedVars == 0) {
    Value *NullPtrValue = Constant::getNullValue(PtrTy);
    RealArgs.push_back(NullPtrValue);
  }
  if (IfCondition && RealArgs.back()->getType() != PtrTy)
    RealArgs.back() = Builder.CreateBitCast(RealArgs.back(), PtrTy);

  Builder.CreateCall(RTLFn, RealArgs);

  LLVM_DEBUG(dbgs() << "With fork_call placed: "
                    << *Builder.GetInsertBlock()->getParent() << "\n");

  // Initialize the local TID stack location with the argument value.
  Builder.SetInsertPoint(PrivTID);
  Function::arg_iterator OutlinedAI = OutlinedFn.arg_begin();
  Builder.CreateStore(Builder.CreateLoad(OMPIRBuilder->Int32, OutlinedAI),
                      PrivTIDAddr);

  // Remove redundant call to the outlined function.
  CI->eraseFromParent();

  for (Instruction *I : ToBeDeleted) {
    I->eraseFromParent();
  }
}
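
// For reference, the host runtime entry point is variadic:
//   void __kmpc_fork_call(ident_t *loc, kmp_int32 argc, kmpc_micro microtask,
//                         ...);
// so the captured values collected in RealArgs are passed as trailing
// arguments and forwarded by libomp to the outlined microtask.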

OpenMPIRBuilder::InsertPointOrErrorTy OpenMPIRBuilder::createParallel(
    const LocationDescription &Loc, InsertPointTy OuterAllocaIP,
    BodyGenCallbackTy BodyGenCB, PrivatizeCallbackTy PrivCB,
    FinalizeCallbackTy FiniCB, Value *IfCondition, Value *NumThreads,
    omp::ProcBindKind ProcBind, bool IsCancellable) {
  assert(!isConflictIP(Loc.IP, OuterAllocaIP) && "IPs must not be ambiguous");

  if (!updateToLocation(Loc))
    return Loc.IP;

  uint32_t SrcLocStrSize;
  Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
  Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
  Value *ThreadID = getOrCreateThreadID(Ident);
  // If we generate code for the target device, we need to allocate the
  // struct for aggregate params in the device default alloca address space.
  // The OpenMP runtime requires that the params of the extracted functions
  // are passed as zero address space pointers. This flag ensures that
  // extracted function arguments are declared in zero address space.
  bool ArgsInZeroAddressSpace = Config.isTargetDevice();

  // Build call __kmpc_push_num_threads(&Ident, global_tid, num_threads)
  // only if we compile for host side.
  if (NumThreads && !Config.isTargetDevice()) {
    Value *Args[] = {
        Ident, ThreadID,
        Builder.CreateIntCast(NumThreads, Int32, /*isSigned*/ false)};
    Builder.CreateCall(
        getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_push_num_threads), Args);
  }

  if (ProcBind != OMP_PROC_BIND_default) {
    // Build call __kmpc_push_proc_bind(&Ident, global_tid, proc_bind)
    Value *Args[] = {
        Ident, ThreadID,
        ConstantInt::get(Int32, unsigned(ProcBind), /*isSigned=*/true)};
    Builder.CreateCall(
        getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_push_proc_bind), Args);
  }

  BasicBlock *InsertBB = Builder.GetInsertBlock();
  Function *OuterFn = InsertBB->getParent();

  // Save the outer alloca block because the insertion iterator may get
  // invalidated and we still need this later.
  BasicBlock *OuterAllocaBlock = OuterAllocaIP.getBlock();

  // Vector to remember instructions we used only during the modeling but which
  // we want to delete at the end.
  SmallVector<Instruction *, 4> ToBeDeleted;

  // Change the location to the outer alloca insertion point to create and
  // initialize the allocas we pass into the parallel region.
  InsertPointTy NewOuter(OuterAllocaBlock, OuterAllocaBlock->begin());
  Builder.restoreIP(NewOuter);
  AllocaInst *TIDAddrAlloca = Builder.CreateAlloca(Int32, nullptr, "tid.addr");
  AllocaInst *ZeroAddrAlloca =
      Builder.CreateAlloca(Int32, nullptr, "zero.addr");
  Instruction *TIDAddr = TIDAddrAlloca;
  Instruction *ZeroAddr = ZeroAddrAlloca;
  if (ArgsInZeroAddressSpace && M.getDataLayout().getAllocaAddrSpace() != 0) {
    // Add additional casts to enforce pointers in zero address space
    TIDAddr = new AddrSpaceCastInst(
        TIDAddrAlloca, PointerType::get(M.getContext(), 0), "tid.addr.ascast");
    TIDAddr->insertAfter(TIDAddrAlloca);
    ToBeDeleted.push_back(TIDAddr);
    ZeroAddr = new AddrSpaceCastInst(ZeroAddrAlloca,
                                     PointerType::get(M.getContext(), 0),
                                     "zero.addr.ascast");
    ZeroAddr->insertAfter(ZeroAddrAlloca);
    ToBeDeleted.push_back(ZeroAddr);
  }

  // We only need TIDAddr and ZeroAddr for modeling purposes to get the
  // associated arguments in the outlined function, so we delete them later.
  ToBeDeleted.push_back(TIDAddrAlloca);
  ToBeDeleted.push_back(ZeroAddrAlloca);

  // Create an artificial insertion point that will also ensure the blocks we
  // are about to split are not degenerated.
  auto *UI = new UnreachableInst(Builder.getContext(), InsertBB);

  BasicBlock *EntryBB = UI->getParent();
  BasicBlock *PRegEntryBB = EntryBB->splitBasicBlock(UI, "omp.par.entry");
  BasicBlock *PRegBodyBB = PRegEntryBB->splitBasicBlock(UI, "omp.par.region");
  BasicBlock *PRegPreFiniBB =
      PRegBodyBB->splitBasicBlock(UI, "omp.par.pre_finalize");
  BasicBlock *PRegExitBB = PRegPreFiniBB->splitBasicBlock(UI, "omp.par.exit");

  auto FiniCBWrapper = [&](InsertPointTy IP) {
    // Hide "open-ended" blocks from the given FiniCB by setting the right jump
    // target to the region exit block.
    if (IP.getBlock()->end() == IP.getPoint()) {
      IRBuilder<>::InsertPointGuard IPG(Builder);
      Builder.restoreIP(IP);
      Instruction *I = Builder.CreateBr(PRegExitBB);
      IP = InsertPointTy(I->getParent(), I->getIterator());
    }
    assert(IP.getBlock()->getTerminator()->getNumSuccessors() == 1 &&
           IP.getBlock()->getTerminator()->getSuccessor(0) == PRegExitBB &&
           "Unexpected insertion point for finalization call!");
    return FiniCB(IP);
  };

  FinalizationStack.push_back({FiniCBWrapper, OMPD_parallel, IsCancellable});

  // Generate the privatization allocas in the block that will become the entry
  // of the outlined function.
  Builder.SetInsertPoint(PRegEntryBB->getTerminator());
  InsertPointTy InnerAllocaIP = Builder.saveIP();

  AllocaInst *PrivTIDAddr =
      Builder.CreateAlloca(Int32, nullptr, "tid.addr.local");
  Instruction *PrivTID = Builder.CreateLoad(Int32, PrivTIDAddr, "tid");

  // Add some fake uses for OpenMP provided arguments.
  ToBeDeleted.push_back(Builder.CreateLoad(Int32, TIDAddr, "tid.addr.use"));
  Instruction *ZeroAddrUse =
      Builder.CreateLoad(Int32, ZeroAddr, "zero.addr.use");
  ToBeDeleted.push_back(ZeroAddrUse);

  // EntryBB
  //   |
  //   V
  // PRegionEntryBB         <- Privatization allocas are placed here.
  //   |
  //   V
  // PRegionBodyBB          <- BodyGen is invoked here.
  //   |
  //   V
  // PRegPreFiniBB          <- The block we will start finalization from.
  //   |
  //   V
  // PRegionExitBB          <- A common exit to simplify block collection.
  //

  LLVM_DEBUG(dbgs() << "Before body codegen: " << *OuterFn << "\n");

  // Let the caller create the body.
  assert(BodyGenCB && "Expected body generation callback!");
  InsertPointTy CodeGenIP(PRegBodyBB, PRegBodyBB->begin());
  if (Error Err = BodyGenCB(InnerAllocaIP, CodeGenIP))
    return Err;

  LLVM_DEBUG(dbgs() << "After body codegen: " << *OuterFn << "\n");

  OutlineInfo OI;
  if (Config.isTargetDevice()) {
    // Generate OpenMP target specific runtime call
    OI.PostOutlineCB = [=, ToBeDeletedVec =
                               std::move(ToBeDeleted)](Function &OutlinedFn) {
      targetParallelCallback(this, OutlinedFn, OuterFn, OuterAllocaBlock, Ident,
                             IfCondition, NumThreads, PrivTID, PrivTIDAddr,
                             ThreadID, ToBeDeletedVec);
    };
  } else {
    // Generate OpenMP host runtime call
    OI.PostOutlineCB = [=, ToBeDeletedVec =
                               std::move(ToBeDeleted)](Function &OutlinedFn) {
      hostParallelCallback(this, OutlinedFn, OuterFn, Ident, IfCondition,
                           PrivTID, PrivTIDAddr, ToBeDeletedVec);
    };
  }

  OI.OuterAllocaBB = OuterAllocaBlock;
  OI.EntryBB = PRegEntryBB;
  OI.ExitBB = PRegExitBB;

  SmallPtrSet<BasicBlock *, 32> ParallelRegionBlockSet;
  SmallVector<BasicBlock *, 32> Blocks;
  OI.collectBlocks(ParallelRegionBlockSet, Blocks);

  // Ensure a single exit node for the outlined region by creating one.
  // We might have multiple incoming edges to the exit now due to finalizations,
  // e.g., cancel calls that cause the control flow to leave the region.
  BasicBlock *PRegOutlinedExitBB = PRegExitBB;
  PRegExitBB = SplitBlock(PRegExitBB, &*PRegExitBB->getFirstInsertionPt());
  PRegOutlinedExitBB->setName("omp.par.outlined.exit");
  Blocks.push_back(PRegOutlinedExitBB);

  CodeExtractorAnalysisCache CEAC(*OuterFn);
  CodeExtractor Extractor(Blocks, /* DominatorTree */ nullptr,
                          /* AggregateArgs */ false,
                          /* BlockFrequencyInfo */ nullptr,
                          /* BranchProbabilityInfo */ nullptr,
                          /* AssumptionCache */ nullptr,
                          /* AllowVarArgs */ true,
                          /* AllowAlloca */ true,
                          /* AllocationBlock */ OuterAllocaBlock,
                          /* Suffix */ ".omp_par", ArgsInZeroAddressSpace);

  // Find inputs to, outputs from the code region.
  BasicBlock *CommonExit = nullptr;
  SetVector<Value *> Inputs, Outputs, SinkingCands, HoistingCands;
  Extractor.findAllocas(CEAC, SinkingCands, HoistingCands, CommonExit);

  Extractor.findInputsOutputs(Inputs, Outputs, SinkingCands,
                              /*CollectGlobalInputs=*/true);

  Inputs.remove_if([&](Value *I) {
    if (auto *GV = dyn_cast_if_present<GlobalVariable>(I))
      return GV->getValueType() == OpenMPIRBuilder::Ident;

    return false;
  });
1632
1633 LLVM_DEBUG(dbgs() << "Before privatization: " << *OuterFn << "\n");
1634
1635 FunctionCallee TIDRTLFn =
1636 getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_global_thread_num);
1637
1638 auto PrivHelper = [&](Value &V) -> Error {
1639 if (&V == TIDAddr || &V == ZeroAddr) {
1640 OI.ExcludeArgsFromAggregate.push_back(&V);
1641 return Error::success();
1642 }
1643
1644 SetVector<Use *> Uses;
1645 for (Use &U : V.uses())
1646 if (auto *UserI = dyn_cast<Instruction>(U.getUser()))
1647 if (ParallelRegionBlockSet.count(UserI->getParent()))
1648 Uses.insert(&U);
1649
1650 // __kmpc_fork_call expects extra arguments as pointers. If the input
1651 // already has a pointer type, everything is fine. Otherwise, store the
1652 // value onto the stack and load it back inside the to-be-outlined region.
1653 // This ensures that only the pointer is passed to the function.
1654 // FIXME: if there are more than 15 trailing arguments, they must be
1655 // additionally packed in a struct.
1656 Value *Inner = &V;
1657 if (!V.getType()->isPointerTy()) {
1658 IRBuilder<>::InsertPointGuard Guard(Builder);
1659 LLVM_DEBUG(llvm::dbgs() << "Forwarding input as pointer: " << V << "\n");
1660
1661 Builder.restoreIP(OuterAllocaIP);
1662 Value *Ptr =
1663 Builder.CreateAlloca(V.getType(), nullptr, V.getName() + ".reloaded");
1664
1665 // Store to stack at end of the block that currently branches to the entry
1666 // block of the to-be-outlined region.
1667 Builder.SetInsertPoint(InsertBB,
1668 InsertBB->getTerminator()->getIterator());
1669 Builder.CreateStore(&V, Ptr);
1670
1671 // Load back next to allocations in the to-be-outlined region.
1672 Builder.restoreIP(InnerAllocaIP);
1673 Inner = Builder.CreateLoad(V.getType(), Ptr);
1674 }
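// For illustration, with a hypothetical captured scalar `int x`, the
// wrapping above behaves roughly as:
//
// \code{c}
// int *x_reloaded = alloca(sizeof(int)); /* at the outer alloca point */
// *x_reloaded = x; /* just before branching into the region */
// int x_inner = *x_reloaded; /* reloaded at the inner alloca point */
// \endcode
//
// so only the pointer `x_reloaded` has to be forwarded to the outlined
// function.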
1675
1676 Value *ReplacementValue = nullptr;
1677 CallInst *CI = dyn_cast<CallInst>(&V);
1678 if (CI && CI->getCalledFunction() == TIDRTLFn.getCallee()) {
1679 ReplacementValue = PrivTID;
1680 } else {
1681 InsertPointOrErrorTy AfterIP =
1682 PrivCB(InnerAllocaIP, Builder.saveIP(), V, *Inner, ReplacementValue);
1683 if (!AfterIP)
1684 return AfterIP.takeError();
1685 Builder.restoreIP(*AfterIP);
1686 InnerAllocaIP = {
1687 InnerAllocaIP.getBlock(),
1688 InnerAllocaIP.getBlock()->getTerminator()->getIterator()};
1689
1690 assert(ReplacementValue &&
1691 "Expected copy/create callback to set replacement value!");
1692 if (ReplacementValue == &V)
1693 return Error::success();
1694 }
1695
1696 for (Use *UPtr : Uses)
1697 UPtr->set(ReplacementValue);
1698
1699 return Error::success();
1700 };
1701
1702 // Reset the inner alloca insertion as it will be used for loading the values
1703 // wrapped into pointers before passing them into the to-be-outlined region.
1704 // Configure it to insert immediately after the fake use of the zero address
1705 // so that the reloaded values are available in the generated body and so
1706 // that the OpenMP-related values (thread ID and zero address pointers)
1707 // remain leading in the argument list.
1708 InnerAllocaIP = IRBuilder<>::InsertPoint(
1709 ZeroAddrUse->getParent(), ZeroAddrUse->getNextNode()->getIterator());
1710
1711 // Reset the outer alloca insertion point to the entry of the relevant block
1712 // in case it was invalidated.
1713 OuterAllocaIP = IRBuilder<>::InsertPoint(
1714 OuterAllocaBlock, OuterAllocaBlock->getFirstInsertionPt());
1715
1716 for (Value *Input : Inputs) {
1717 LLVM_DEBUG(dbgs() << "Captured input: " << *Input << "\n");
1718 if (Error Err = PrivHelper(*Input))
1719 return Err;
1720 }
1721 LLVM_DEBUG({
1722 for (Value *Output : Outputs)
1723 LLVM_DEBUG(dbgs() << "Captured output: " << *Output << "\n");
1724 });
1725 assert(Outputs.empty() &&
1726 "OpenMP outlining should not produce live-out values!");
1727
1728 LLVM_DEBUG(dbgs() << "After privatization: " << *OuterFn << "\n");
1729 LLVM_DEBUG({
1730 for (auto *BB : Blocks)
1731 dbgs() << " PBR: " << BB->getName() << "\n";
1732 });
1733
1734 // Adjust the finalization stack, verify the adjustment, and call the
1735 // finalize function a last time to finalize values between the pre-fini
1736 // block and the exit block if we left the parallel region "the normal way".
1737 auto FiniInfo = FinalizationStack.pop_back_val();
1738 (void)FiniInfo;
1739 assert(FiniInfo.DK == OMPD_parallel &&
1740 "Unexpected finalization stack state!");
1741
1742 Instruction *PRegPreFiniTI = PRegPreFiniBB->getTerminator();
1743
1744 InsertPointTy PreFiniIP(PRegPreFiniBB, PRegPreFiniTI->getIterator());
1745 if (Error Err = FiniCB(PreFiniIP))
1746 return Err;
1747
1748 // Register the outlined info.
1749 addOutlineInfo(std::move(OI));
1750
1751 InsertPointTy AfterIP(UI->getParent(), UI->getParent()->end());
1752 UI->eraseFromParent();
1753
1754 return AfterIP;
1755}
1756
1757 void OpenMPIRBuilder::emitFlush(const LocationDescription &Loc) {
1758 // Build call void __kmpc_flush(ident_t *loc)
1759 uint32_t SrcLocStrSize;
1760 Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
1761 Value *Args[] = {getOrCreateIdent(SrcLocStr, SrcLocStrSize)};
1762
1763 Builder.CreateCall(getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_flush), Args);
1764}
1765
1766 void OpenMPIRBuilder::createFlush(const LocationDescription &Loc) {
1767 if (!updateToLocation(Loc))
1768 return;
1769 emitFlush(Loc);
1770}
1771
1772 void OpenMPIRBuilder::emitTaskwaitImpl(const LocationDescription &Loc) {
1773 // Build call kmp_int32 __kmpc_omp_taskwait(ident_t *loc, kmp_int32
1774 // global_tid);
1775 uint32_t SrcLocStrSize;
1776 Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
1777 Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
1778 Value *Args[] = {Ident, getOrCreateThreadID(Ident)};
1779
1780 // Ignore return result until untied tasks are supported.
1781 Builder.CreateCall(getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_omp_taskwait),
1782 Args);
1783}
1784
1785 void OpenMPIRBuilder::createTaskwait(const LocationDescription &Loc) {
1786 if (!updateToLocation(Loc))
1787 return;
1788 emitTaskwaitImpl(Loc);
1789}
1790
1791 void OpenMPIRBuilder::emitTaskyieldImpl(const LocationDescription &Loc) {
1792 // Build call __kmpc_omp_taskyield(loc, thread_id, 0);
1793 uint32_t SrcLocStrSize;
1794 Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
1795 Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
1796 Constant *I32Null = ConstantInt::getNullValue(Int32);
1797 Value *Args[] = {Ident, getOrCreateThreadID(Ident), I32Null};
1798
1799 Builder.CreateCall(getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_omp_taskyield),
1800 Args);
1801}
1802
1803 void OpenMPIRBuilder::createTaskyield(const LocationDescription &Loc) {
1804 if (!updateToLocation(Loc))
1805 return;
1806 emitTaskyieldImpl(Loc);
1807}
1808
1809// Processes the dependencies in Dependencies and does the following
1810// - Allocates space on the stack of an array of DependInfo objects
1811// - Populates each DependInfo object with relevant information of
1812// the corresponding dependence.
1813 // - All code is inserted in the entry block of the current function.
1814 static Value *emitTaskDependencies(
1815 OpenMPIRBuilder &OMPBuilder,
1816 const SmallVector<llvm::OpenMPIRBuilder::DependData> &Dependencies) {
1817 // Early return if we have no dependencies to process
1818 if (Dependencies.empty())
1819 return nullptr;
1820
1821 // Given a vector of DependData objects, in this function we create an
1822 // array on the stack that holds kmp_dep_info objects corresponding
1823 // to each dependency. This is then passed to the OpenMP runtime.
1824 // For example, if there are 'n' dependencies then the following pseudo
1825 // code is generated. Assume the first dependence is on a variable 'a'.
1826 //
1827 // \code{c}
1828 // DepArray = alloc(n x sizeof(kmp_depend_info));
1829 // idx = 0;
1830 // DepArray[idx].base_addr = ptrtoint(&a);
1831 // DepArray[idx].len = 8;
1832 // DepArray[idx].flags = Dep.DepKind; /* (See OMPConstants.h for DepKind) */
1833 // ++idx;
1834 // DepArray[idx].base_addr = ...;
1835 // \endcode
1836
1837 IRBuilderBase &Builder = OMPBuilder.Builder;
1838 Type *DependInfo = OMPBuilder.DependInfo;
1839 Module &M = OMPBuilder.M;
1840
1841 Value *DepArray = nullptr;
1842 OpenMPIRBuilder::InsertPointTy OldIP = Builder.saveIP();
1843 Builder.SetInsertPoint(
1844 OldIP.getBlock()->getParent()->getEntryBlock().getTerminator());
1845
1846 Type *DepArrayTy = ArrayType::get(DependInfo, Dependencies.size());
1847 DepArray = Builder.CreateAlloca(DepArrayTy, nullptr, ".dep.arr.addr");
1848
1849 for (const auto &[DepIdx, Dep] : enumerate(Dependencies)) {
1850 Value *Base =
1851 Builder.CreateConstInBoundsGEP2_64(DepArrayTy, DepArray, 0, DepIdx);
1852 // Store the pointer to the variable
1853 Value *Addr = Builder.CreateStructGEP(
1854 DependInfo, Base,
1855 static_cast<unsigned int>(RTLDependInfoFields::BaseAddr));
1856 Value *DepValPtr = Builder.CreatePtrToInt(Dep.DepVal, Builder.getInt64Ty());
1857 Builder.CreateStore(DepValPtr, Addr);
1858 // Store the size of the variable
1859 Value *Size = Builder.CreateStructGEP(
1860 DependInfo, Base, static_cast<unsigned int>(RTLDependInfoFields::Len));
1861 Builder.CreateStore(
1862 Builder.getInt64(M.getDataLayout().getTypeStoreSize(Dep.DepValueType)),
1863 Size);
1864 // Store the dependency kind
1865 Value *Flags = Builder.CreateStructGEP(
1866 DependInfo, Base,
1867 static_cast<unsigned int>(RTLDependInfoFields::Flags));
1868 Builder.CreateStore(
1869 ConstantInt::get(Builder.getInt8Ty(),
1870 static_cast<unsigned int>(Dep.DepKind)),
1871 Flags);
1872 }
1873 Builder.restoreIP(OldIP);
1874 return DepArray;
1875}
1876
1877 OpenMPIRBuilder::InsertPointOrErrorTy OpenMPIRBuilder::createTask(
1878 const LocationDescription &Loc, InsertPointTy AllocaIP,
1879 BodyGenCallbackTy BodyGenCB, bool Tied, Value *Final, Value *IfCondition,
1880 SmallVector<DependData> Dependencies, bool Mergeable, Value *EventHandle,
1881 Value *Priority) {
1882
1883 if (!updateToLocation(Loc))
1884 return InsertPointTy();
1885
1886 uint32_t SrcLocStrSize;
1887 Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
1888 Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
1889 // The current basic block is split into four basic blocks. After outlining,
1890 // they will be mapped as follows:
1891 // ```
1892 // def current_fn() {
1893 // current_basic_block:
1894 // br label %task.exit
1895 // task.exit:
1896 // ; instructions after task
1897 // }
1898 // def outlined_fn() {
1899 // task.alloca:
1900 // br label %task.body
1901 // task.body:
1902 // ret void
1903 // }
1904 // ```
1905 BasicBlock *TaskExitBB = splitBB(Builder, /*CreateBranch=*/true, "task.exit");
1906 BasicBlock *TaskBodyBB = splitBB(Builder, /*CreateBranch=*/true, "task.body");
1907 BasicBlock *TaskAllocaBB =
1908 splitBB(Builder, /*CreateBranch=*/true, "task.alloca");
1909
1910 InsertPointTy TaskAllocaIP =
1911 InsertPointTy(TaskAllocaBB, TaskAllocaBB->begin());
1912 InsertPointTy TaskBodyIP = InsertPointTy(TaskBodyBB, TaskBodyBB->begin());
1913 if (Error Err = BodyGenCB(TaskAllocaIP, TaskBodyIP))
1914 return Err;
1915
1916 OutlineInfo OI;
1917 OI.EntryBB = TaskAllocaBB;
1918 OI.OuterAllocaBB = AllocaIP.getBlock();
1919 OI.ExitBB = TaskExitBB;
1920
1921 // Add the thread ID argument.
1922 SmallVector<Instruction *, 4> ToBeDeleted;
1923 OI.ExcludeArgsFromAggregate.push_back(createFakeIntVal(
1924 Builder, AllocaIP, ToBeDeleted, TaskAllocaIP, "global.tid", false));
1925
1926 OI.PostOutlineCB = [this, Ident, Tied, Final, IfCondition, Dependencies,
1927 Mergeable, Priority, EventHandle, TaskAllocaBB,
1928 ToBeDeleted](Function &OutlinedFn) mutable {
1929 // Replace the Stale CI by appropriate RTL function call.
1930 assert(OutlinedFn.getNumUses() == 1 &&
1931 "there must be a single user for the outlined function");
1932 CallInst *StaleCI = cast<CallInst>(OutlinedFn.user_back());
1933
1934 // HasShareds is true if any variables are captured in the outlined region,
1935 // false otherwise.
1936 bool HasShareds = StaleCI->arg_size() > 1;
1937 Builder.SetInsertPoint(StaleCI);
1938
1939 // Gather the arguments for emitting the runtime call for
1940 // @__kmpc_omp_task_alloc
1941 Function *TaskAllocFn =
1942 getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_omp_task_alloc);
1943
1944 // Arguments - `loc_ref` (Ident) and `gtid` (ThreadID) for the runtime
1945 // call.
1946 Value *ThreadID = getOrCreateThreadID(Ident);
1947
1948 // Argument - `flags`
1949 // Task is tied iff (Flags & 1) == 1.
1950 // Task is untied iff (Flags & 1) == 0.
1951 // Task is final iff (Flags & 2) == 2.
1952 // Task is not final iff (Flags & 2) == 0.
1953 // Task is mergeable iff (Flags & 4) == 4.
1954 // Task is not mergeable iff (Flags & 4) == 0.
1955 // Task is priority iff (Flags & 32) == 32.
1956 // Task is not priority iff (Flags & 32) == 0.
1957 // TODO: Handle the other flags.
1958 Value *Flags = Builder.getInt32(Tied);
1959 if (Final) {
1960 Value *FinalFlag =
1961 Builder.CreateSelect(Final, Builder.getInt32(2), Builder.getInt32(0));
1962 Flags = Builder.CreateOr(FinalFlag, Flags);
1963 }
1964
1965 if (Mergeable)
1966 Flags = Builder.CreateOr(Builder.getInt32(4), Flags);
1967 if (Priority)
1968 Flags = Builder.CreateOr(Builder.getInt32(32), Flags);
1969
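// Worked example of the flag encoding above: a tied task (Tied == true)
// with final and mergeable clauses ends up with Flags = 1 | 2 | 4 = 7,
// while an untied task carrying only a priority clause gets Flags = 32.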
1970 // Argument - `sizeof_kmp_task_t` (TaskSize)
1971 // Tasksize refers to the size in bytes of kmp_task_t data structure
1972 // including private vars accessed in task.
1973 // TODO: add kmp_task_t_with_privates (privates)
1974 Value *TaskSize = Builder.getInt64(
1975 divideCeil(M.getDataLayout().getTypeSizeInBits(Task), 8));
1976
1977 // Argument - `sizeof_shareds` (SharedsSize)
1978 // SharedsSize refers to the shareds array size in the kmp_task_t data
1979 // structure.
1980 Value *SharedsSize = Builder.getInt64(0);
1981 if (HasShareds) {
1982 AllocaInst *ArgStructAlloca =
1983 dyn_cast<AllocaInst>(StaleCI->getArgOperand(1));
1984 assert(ArgStructAlloca &&
1985 "Unable to find the alloca instruction corresponding to arguments "
1986 "for extracted function");
1987 StructType *ArgStructType =
1988 dyn_cast<StructType>(ArgStructAlloca->getAllocatedType());
1989 assert(ArgStructType && "Unable to find struct type corresponding to "
1990 "arguments for extracted function");
1991 SharedsSize =
1992 M.getDataLayout().getTypeStoreSize(ArgStructType);
1993 }
1994 // Emit the @__kmpc_omp_task_alloc runtime call
1995 // The runtime call returns a pointer to an area where the task captured
1996 // variables must be copied before the task is run (TaskData)
1997 CallInst *TaskData = Builder.CreateCall(
1998 TaskAllocFn, {/*loc_ref=*/Ident, /*gtid=*/ThreadID, /*flags=*/Flags,
1999 /*sizeof_task=*/TaskSize, /*sizeof_shared=*/SharedsSize,
2000 /*task_func=*/&OutlinedFn});
2001
2002 // Emit detach clause initialization.
2003 // evt = (typeof(evt))__kmpc_task_allow_completion_event(loc, tid,
2004 // task_descriptor);
2005 if (EventHandle) {
2006 Function *TaskDetachFn = getOrCreateRuntimeFunctionPtr(
2007 OMPRTL___kmpc_task_allow_completion_event);
2008 llvm::Value *EventVal =
2009 Builder.CreateCall(TaskDetachFn, {Ident, ThreadID, TaskData});
2010 llvm::Value *EventHandleAddr =
2011 Builder.CreatePointerBitCastOrAddrSpaceCast(EventHandle,
2012 Builder.getPtrTy(0));
2013 EventVal = Builder.CreatePtrToInt(EventVal, Builder.getInt64Ty());
2014 Builder.CreateStore(EventVal, EventHandleAddr);
2015 }
2016 // Copy the arguments for outlined function
2017 if (HasShareds) {
2018 Value *Shareds = StaleCI->getArgOperand(1);
2019 Align Alignment = TaskData->getPointerAlignment(M.getDataLayout());
2020 Value *TaskShareds = Builder.CreateLoad(VoidPtr, TaskData);
2021 Builder.CreateMemCpy(TaskShareds, Alignment, Shareds, Alignment,
2022 SharedsSize);
2023 }
2024
2025 if (Priority) {
2026 //
2027 // The return type of "__kmpc_omp_task_alloc" is "kmp_task_t *",
2028 // we populate the priority information into the "kmp_task_t" here
2029 //
2030 // The struct "kmp_task_t" definition is available in kmp.h
2031 // kmp_task_t = { shareds, routine, part_id, data1, data2 }
2032 // data2 is used for priority
2033 //
2034 Type *Int32Ty = Builder.getInt32Ty();
2035 Constant *Zero = ConstantInt::get(Int32Ty, 0);
2036 // kmp_task_t* => { ptr }
2037 Type *TaskPtr = StructType::get(VoidPtr);
2038 Value *TaskGEP =
2039 Builder.CreateInBoundsGEP(TaskPtr, TaskData, {Zero, Zero});
2040 // kmp_task_t => { ptr, ptr, i32, ptr, ptr }
2041 Type *TaskStructType = StructType::get(
2042 VoidPtr, VoidPtr, Builder.getInt32Ty(), VoidPtr, VoidPtr);
2043 Value *PriorityData = Builder.CreateInBoundsGEP(
2044 TaskStructType, TaskGEP, {Zero, ConstantInt::get(Int32Ty, 4)});
2045 // kmp_cmplrdata_t => { ptr, ptr }
2046 Type *CmplrStructType = StructType::get(VoidPtr, VoidPtr);
2047 Value *CmplrData = Builder.CreateInBoundsGEP(CmplrStructType,
2048 PriorityData, {Zero, Zero});
2049 Builder.CreateStore(Priority, CmplrData);
2050 }
2051
2052 Value *DepArray = nullptr;
2053 if (Dependencies.size()) {
2054 InsertPointTy OldIP = Builder.saveIP();
2055 Builder.SetInsertPoint(
2056 &OldIP.getBlock()->getParent()->getEntryBlock().back());
2057
2058 Type *DepArrayTy = ArrayType::get(DependInfo, Dependencies.size());
2059 DepArray = Builder.CreateAlloca(DepArrayTy, nullptr, ".dep.arr.addr");
2060
2061 unsigned P = 0;
2062 for (const DependData &Dep : Dependencies) {
2063 Value *Base =
2064 Builder.CreateConstInBoundsGEP2_64(DepArrayTy, DepArray, 0, P);
2065 // Store the pointer to the variable
2066 Value *Addr = Builder.CreateStructGEP(
2067 DependInfo, Base,
2068 static_cast<unsigned int>(RTLDependInfoFields::BaseAddr));
2069 Value *DepValPtr =
2070 Builder.CreatePtrToInt(Dep.DepVal, Builder.getInt64Ty());
2071 Builder.CreateStore(DepValPtr, Addr);
2072 // Store the size of the variable
2073 Value *Size = Builder.CreateStructGEP(
2074 DependInfo, Base,
2075 static_cast<unsigned int>(RTLDependInfoFields::Len));
2076 Builder.CreateStore(Builder.getInt64(M.getDataLayout().getTypeStoreSize(
2077 Dep.DepValueType)),
2078 Size);
2079 // Store the dependency kind
2080 Value *Flags = Builder.CreateStructGEP(
2081 DependInfo, Base,
2082 static_cast<unsigned int>(RTLDependInfoFields::Flags));
2083 Builder.CreateStore(
2084 ConstantInt::get(Builder.getInt8Ty(),
2085 static_cast<unsigned int>(Dep.DepKind)),
2086 Flags);
2087 ++P;
2088 }
2089
2090 Builder.restoreIP(OldIP);
2091 }
2092
2093 // In the presence of the `if` clause, the following IR is generated:
2094 // ...
2095 // %data = call @__kmpc_omp_task_alloc(...)
2096 // br i1 %if_condition, label %then, label %else
2097 // then:
2098 // call @__kmpc_omp_task(...)
2099 // br label %exit
2100 // else:
2101 // ;; Wait for resolution of dependencies, if any, before
2102 // ;; beginning the task
2103 // call @__kmpc_omp_wait_deps(...)
2104 // call @__kmpc_omp_task_begin_if0(...)
2105 // call @outlined_fn(...)
2106 // call @__kmpc_omp_task_complete_if0(...)
2107 // br label %exit
2108 // exit:
2109 // ...
2110 if (IfCondition) {
2111 // `SplitBlockAndInsertIfThenElse` requires the block to have a
2112 // terminator.
2113 splitBB(Builder, /*CreateBranch=*/true, "if.end");
2114 Instruction *IfTerminator =
2115 Builder.GetInsertPoint()->getParent()->getTerminator();
2116 Instruction *ThenTI = IfTerminator, *ElseTI = nullptr;
2117 Builder.SetInsertPoint(IfTerminator);
2118 SplitBlockAndInsertIfThenElse(IfCondition, IfTerminator, &ThenTI,
2119 &ElseTI);
2120 Builder.SetInsertPoint(ElseTI);
2121
2122 if (Dependencies.size()) {
2123 Function *TaskWaitFn =
2124 getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_omp_wait_deps);
2125 Builder.CreateCall(
2126 TaskWaitFn,
2127 {Ident, ThreadID, Builder.getInt32(Dependencies.size()), DepArray,
2128 ConstantInt::get(Builder.getInt32Ty(), 0),
2129 ConstantPointerNull::get(PointerType::getUnqual(M.getContext()))});
2130 }
2131 Function *TaskBeginFn =
2132 getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_omp_task_begin_if0);
2133 Function *TaskCompleteFn =
2134 getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_omp_task_complete_if0);
2135 Builder.CreateCall(TaskBeginFn, {Ident, ThreadID, TaskData});
2136 CallInst *CI = nullptr;
2137 if (HasShareds)
2138 CI = Builder.CreateCall(&OutlinedFn, {ThreadID, TaskData});
2139 else
2140 CI = Builder.CreateCall(&OutlinedFn, {ThreadID});
2141 CI->setDebugLoc(StaleCI->getDebugLoc());
2142 Builder.CreateCall(TaskCompleteFn, {Ident, ThreadID, TaskData});
2143 Builder.SetInsertPoint(ThenTI);
2144 }
2145
2146 if (Dependencies.size()) {
2147 Function *TaskFn =
2148 getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_omp_task_with_deps);
2149 Builder.CreateCall(
2150 TaskFn,
2151 {Ident, ThreadID, TaskData, Builder.getInt32(Dependencies.size()),
2152 DepArray, ConstantInt::get(Builder.getInt32Ty(), 0),
2153 ConstantPointerNull::get(PointerType::getUnqual(M.getContext()))});
2154
2155 } else {
2156 // Emit the @__kmpc_omp_task runtime call to spawn the task
2157 Function *TaskFn = getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_omp_task);
2158 Builder.CreateCall(TaskFn, {Ident, ThreadID, TaskData});
2159 }
2160
2161 StaleCI->eraseFromParent();
2162
2163 Builder.SetInsertPoint(TaskAllocaBB, TaskAllocaBB->begin());
2164 if (HasShareds) {
2165 LoadInst *Shareds = Builder.CreateLoad(VoidPtr, OutlinedFn.getArg(1));
2166 OutlinedFn.getArg(1)->replaceUsesWithIf(
2167 Shareds, [Shareds](Use &U) { return U.getUser() != Shareds; });
2168 }
2169
2170 for (Instruction *I : llvm::reverse(ToBeDeleted))
2171 I->eraseFromParent();
2172 };
2173
2174 addOutlineInfo(std::move(OI));
2175 Builder.SetInsertPoint(TaskExitBB, TaskExitBB->begin());
2176
2177 return Builder.saveIP();
2178}
2179
2180 OpenMPIRBuilder::InsertPointOrErrorTy
2181 OpenMPIRBuilder::createTaskgroup(const LocationDescription &Loc,
2182 InsertPointTy AllocaIP,
2183 BodyGenCallbackTy BodyGenCB) {
2184 if (!updateToLocation(Loc))
2185 return InsertPointTy();
2186
2187 uint32_t SrcLocStrSize;
2188 Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
2189 Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
2190 Value *ThreadID = getOrCreateThreadID(Ident);
2191
2192 // Emit the @__kmpc_taskgroup runtime call to start the taskgroup
2193 Function *TaskgroupFn =
2194 getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_taskgroup);
2195 Builder.CreateCall(TaskgroupFn, {Ident, ThreadID});
2196
2197 BasicBlock *TaskgroupExitBB = splitBB(Builder, true, "taskgroup.exit");
2198 if (Error Err = BodyGenCB(AllocaIP, Builder.saveIP()))
2199 return Err;
2200
2201 Builder.SetInsertPoint(TaskgroupExitBB);
2202 // Emit the @__kmpc_end_taskgroup runtime call to end the taskgroup
2203 Function *EndTaskgroupFn =
2204 getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_end_taskgroup);
2205 Builder.CreateCall(EndTaskgroupFn, {Ident, ThreadID});
2206
2207 return Builder.saveIP();
2208}
2209
2210 OpenMPIRBuilder::InsertPointOrErrorTy OpenMPIRBuilder::createSections(
2211 const LocationDescription &Loc, InsertPointTy AllocaIP,
2212 ArrayRef<StorableBodyGenCallbackTy> SectionCBs, PrivatizeCallbackTy PrivCB,
2213 FinalizeCallbackTy FiniCB, bool IsCancellable, bool IsNowait) {
2214 assert(!isConflictIP(AllocaIP, Loc.IP) && "Dedicated IP allocas required");
2215
2216 if (!updateToLocation(Loc))
2217 return Loc.IP;
2218
2219 auto FiniCBWrapper = [&](InsertPointTy IP) {
2220 if (IP.getBlock()->end() != IP.getPoint())
2221 return FiniCB(IP);
2222 // This must be done otherwise any nested constructs using FinalizeOMPRegion
2223 // will fail because that function requires the Finalization Basic Block to
2224 // have a terminator, which is already removed by EmitOMPRegionBody.
2225 // IP is currently at the cancellation block.
2226 // We need to backtrack to the condition block to fetch
2227 // the exit block and create a branch from the cancellation
2228 // block to the exit block.
2229 IRBuilder<>::InsertPointGuard IPG(Builder);
2230 Builder.restoreIP(IP);
2231 auto *CaseBB = IP.getBlock()->getSinglePredecessor();
2232 auto *CondBB = CaseBB->getSinglePredecessor()->getSinglePredecessor();
2233 auto *ExitBB = CondBB->getTerminator()->getSuccessor(1);
2234 Instruction *I = Builder.CreateBr(ExitBB);
2235 IP = InsertPointTy(I->getParent(), I->getIterator());
2236 return FiniCB(IP);
2237 };
2238
2239 FinalizationStack.push_back({FiniCBWrapper, OMPD_sections, IsCancellable});
2240
2241 // Each section is emitted as a switch case
2242 // Each finalization callback is handled from clang.EmitOMPSectionDirective()
2243 // -> OMP.createSection() which generates the IR for each section
2244 // Iterate through all sections and emit a switch construct:
2245 // switch (IV) {
2246 // case 0:
2247 // <SectionStmt[0]>;
2248 // break;
2249 // ...
2250 // case <NumSection> - 1:
2251 // <SectionStmt[<NumSection> - 1]>;
2252 // break;
2253 // }
2254 // ...
2255 // section_loop.after:
2256 // <FiniCB>;
2257 auto LoopBodyGenCB = [&](InsertPointTy CodeGenIP, Value *IndVar) -> Error {
2258 Builder.restoreIP(CodeGenIP);
2259 BasicBlock *Continue =
2260 splitBBWithSuffix(Builder, /*CreateBranch=*/false, ".sections.after");
2261 Function *CurFn = Continue->getParent();
2262 SwitchInst *SwitchStmt = Builder.CreateSwitch(IndVar, Continue);
2263
2264 unsigned CaseNumber = 0;
2265 for (auto SectionCB : SectionCBs) {
2266 BasicBlock *CaseBB = BasicBlock::Create(
2267 M.getContext(), "omp_section_loop.body.case", CurFn, Continue);
2268 SwitchStmt->addCase(Builder.getInt32(CaseNumber), CaseBB);
2269 Builder.SetInsertPoint(CaseBB);
2270 BranchInst *CaseEndBr = Builder.CreateBr(Continue);
2271 if (Error Err = SectionCB(InsertPointTy(), {CaseEndBr->getParent(),
2272 CaseEndBr->getIterator()}))
2273 return Err;
2274 CaseNumber++;
2275 }
2276 // remove the existing terminator from body BB since there can be no
2277 // terminators after switch/case
2278 return Error::success();
2279 };
2280 // Loop body ends here.
2281 // LowerBound, UpperBound, and Stride for createCanonicalLoop.
2282 Type *I32Ty = Type::getInt32Ty(M.getContext());
2283 Value *LB = ConstantInt::get(I32Ty, 0);
2284 Value *UB = ConstantInt::get(I32Ty, SectionCBs.size());
2285 Value *ST = ConstantInt::get(I32Ty, 1);
2286 Expected<CanonicalLoopInfo *> LoopInfo = createCanonicalLoop(
2287 Loc, LoopBodyGenCB, LB, UB, ST, true, false, AllocaIP, "section_loop");
2288 if (!LoopInfo)
2289 return LoopInfo.takeError();
2290
2291 InsertPointOrErrorTy WsloopIP =
2292 applyStaticWorkshareLoop(Loc.DL, *LoopInfo, AllocaIP, !IsNowait);
2293 if (!WsloopIP)
2294 return WsloopIP.takeError();
2295 InsertPointTy AfterIP = *WsloopIP;
2296
2297 // Apply the finalization callback in LoopAfterBB
2298 auto FiniInfo = FinalizationStack.pop_back_val();
2299 assert(FiniInfo.DK == OMPD_sections &&
2300 "Unexpected finalization stack state!");
2301 if (FinalizeCallbackTy &CB = FiniInfo.FiniCB) {
2302 Builder.restoreIP(AfterIP);
2303 BasicBlock *FiniBB =
2304 splitBBWithSuffix(Builder, /*CreateBranch=*/true, "sections.fini");
2305 if (Error Err = CB(Builder.saveIP()))
2306 return Err;
2307 AfterIP = {FiniBB, FiniBB->begin()};
2308 }
2309
2310 return AfterIP;
2311}
2312
2313 OpenMPIRBuilder::InsertPointOrErrorTy
2314 OpenMPIRBuilder::createSection(const LocationDescription &Loc,
2315 BodyGenCallbackTy BodyGenCB,
2316 FinalizeCallbackTy FiniCB) {
2317 if (!updateToLocation(Loc))
2318 return Loc.IP;
2319
2320 auto FiniCBWrapper = [&](InsertPointTy IP) {
2321 if (IP.getBlock()->end() != IP.getPoint())
2322 return FiniCB(IP);
2323 // This must be done otherwise any nested constructs using FinalizeOMPRegion
2324 // will fail because that function requires the Finalization Basic Block to
2325 // have a terminator, which is already removed by EmitOMPRegionBody.
2326 // IP is currently at the cancellation block.
2327 // We need to backtrack to the condition block to fetch
2328 // the exit block and create a branch from the cancellation
2329 // block to the exit block.
2330 IRBuilder<>::InsertPointGuard IPG(Builder);
2331 Builder.restoreIP(IP);
2332 auto *CaseBB = Loc.IP.getBlock();
2333 auto *CondBB = CaseBB->getSinglePredecessor()->getSinglePredecessor();
2334 auto *ExitBB = CondBB->getTerminator()->getSuccessor(1);
2335 Instruction *I = Builder.CreateBr(ExitBB);
2336 IP = InsertPointTy(I->getParent(), I->getIterator());
2337 return FiniCB(IP);
2338 };
2339
2340 Directive OMPD = Directive::OMPD_sections;
2341 // Since we are using Finalization Callback here, HasFinalize
2342 // and IsCancellable have to be true
2343 return EmitOMPInlinedRegion(OMPD, nullptr, nullptr, BodyGenCB, FiniCBWrapper,
2344 /*Conditional*/ false, /*hasFinalize*/ true,
2345 /*IsCancellable*/ true);
2346}
2347
2348 static OpenMPIRBuilder::InsertPointTy getInsertPointAfterInstr(Instruction *I) {
2349 BasicBlock::iterator IT(I);
2350 IT++;
2351 return OpenMPIRBuilder::InsertPointTy(I->getParent(), IT);
2352}
2353
2354Value *OpenMPIRBuilder::getGPUThreadID() {
2355 return Builder.CreateCall(
2356 getOrCreateRuntimeFunction(M,
2357 OMPRTL___kmpc_get_hardware_thread_id_in_block),
2358 {});
2359}
2360
2361Value *OpenMPIRBuilder::getGPUWarpSize() {
2362 return Builder.CreateCall(
2363 getOrCreateRuntimeFunction(M, OMPRTL___kmpc_get_warp_size), {});
2364}
2365
2366Value *OpenMPIRBuilder::getNVPTXWarpID() {
2367 unsigned LaneIDBits = Log2_32(Config.getGridValue().GV_Warp_Size);
2368 return Builder.CreateAShr(getGPUThreadID(), LaneIDBits, "nvptx_warp_id");
2369}
2370
2371Value *OpenMPIRBuilder::getNVPTXLaneID() {
2372 unsigned LaneIDBits = Log2_32(Config.getGridValue().GV_Warp_Size);
2373 assert(LaneIDBits < 32 && "Invalid LaneIDBits size in NVPTX device.");
2374 unsigned LaneIDMask = ~0u >> (32u - LaneIDBits);
2375 return Builder.CreateAnd(getGPUThreadID(), Builder.getInt32(LaneIDMask),
2376 "nvptx_lane_id");
2377}
2378
2379Value *OpenMPIRBuilder::castValueToType(InsertPointTy AllocaIP, Value *From,
2380 Type *ToType) {
2381 Type *FromType = From->getType();
2382 uint64_t FromSize = M.getDataLayout().getTypeStoreSize(FromType);
2383 uint64_t ToSize = M.getDataLayout().getTypeStoreSize(ToType);
2384 assert(FromSize > 0 && "From size must be greater than zero");
2385 assert(ToSize > 0 && "To size must be greater than zero");
2386 if (FromType == ToType)
2387 return From;
2388 if (FromSize == ToSize)
2389 return Builder.CreateBitCast(From, ToType);
2390 if (ToType->isIntegerTy() && FromType->isIntegerTy())
2391 return Builder.CreateIntCast(From, ToType, /*isSigned*/ true);
2392 InsertPointTy SaveIP = Builder.saveIP();
2393 Builder.restoreIP(AllocaIP);
2394 Value *CastItem = Builder.CreateAlloca(ToType);
2395 Builder.restoreIP(SaveIP);
2396
2397 Value *ValCastItem = Builder.CreatePointerBitCastOrAddrSpaceCast(
2398 CastItem, Builder.getPtrTy(0));
2399 Builder.CreateStore(From, ValCastItem);
2400 return Builder.CreateLoad(ToType, CastItem);
2401}
2402
2403Value *OpenMPIRBuilder::createRuntimeShuffleFunction(InsertPointTy AllocaIP,
2404 Value *Element,
2405 Type *ElementType,
2406 Value *Offset) {
2407 uint64_t Size = M.getDataLayout().getTypeStoreSize(ElementType);
2408 assert(Size <= 8 && "Unsupported bitwidth in shuffle instruction");
2409
2410 // Cast all types to 32- or 64-bit values before calling shuffle routines.
2411 Type *CastTy = Builder.getIntNTy(Size <= 4 ? 32 : 64);
2412 Value *ElemCast = castValueToType(AllocaIP, Element, CastTy);
2413 Value *WarpSize =
2414 Builder.CreateIntCast(getGPUWarpSize(), Builder.getInt16Ty(), true);
2415 Function *ShuffleFunc = getOrCreateRuntimeFunctionPtr(
2416 Size <= 4 ? RuntimeFunction::OMPRTL___kmpc_shuffle_int32
2417 : RuntimeFunction::OMPRTL___kmpc_shuffle_int64);
2418 Value *WarpSizeCast =
2419 Builder.CreateIntCast(WarpSize, Builder.getInt16Ty(), /*isSigned=*/true);
2420 Value *ShuffleCall =
2421 Builder.CreateCall(ShuffleFunc, {ElemCast, Offset, WarpSizeCast});
2422 return castValueToType(AllocaIP, ShuffleCall, CastTy);
2423}
2424
2425void OpenMPIRBuilder::shuffleAndStore(InsertPointTy AllocaIP, Value *SrcAddr,
2426 Value *DstAddr, Type *ElemType,
2427 Value *Offset, Type *ReductionArrayTy) {
2428 uint64_t Size = M.getDataLayout().getTypeStoreSize(ElemType);
2429 // Create the loop over the big sized data.
2430 // ptr = (void*)Elem;
2431 // ptrEnd = (void*) Elem + 1;
2432 // Step = 8;
2433 // while (ptr + Step < ptrEnd)
2434 // shuffle((int64_t)*ptr);
2435 // Step = 4;
2436 // while (ptr + Step < ptrEnd)
2437 // shuffle((int32_t)*ptr);
2438 // ...
2439 Type *IndexTy = Builder.getIndexTy(
2440 M.getDataLayout(), M.getDataLayout().getDefaultGlobalsAddressSpace());
2441 Value *ElemPtr = DstAddr;
2442 Value *Ptr = SrcAddr;
2443 for (unsigned IntSize = 8; IntSize >= 1; IntSize /= 2) {
2444 if (Size < IntSize)
2445 continue;
2446 Type *IntType = Builder.getIntNTy(IntSize * 8);
2447 Ptr = Builder.CreatePointerBitCastOrAddrSpaceCast(
2448 Ptr, Builder.getPtrTy(0), Ptr->getName() + ".ascast");
2449 Value *SrcAddrGEP =
2450 Builder.CreateGEP(ElemType, SrcAddr, {ConstantInt::get(IndexTy, 1)});
2451 ElemPtr = Builder.CreatePointerBitCastOrAddrSpaceCast(
2452 ElemPtr, Builder.getPtrTy(0), ElemPtr->getName() + ".ascast");
2453
2454 Function *CurFunc = Builder.GetInsertBlock()->getParent();
2455 if ((Size / IntSize) > 1) {
2456 Value *PtrEnd = Builder.CreatePointerBitCastOrAddrSpaceCast(
2457 SrcAddrGEP, Builder.getPtrTy());
2458 BasicBlock *PreCondBB =
2459 BasicBlock::Create(M.getContext(), ".shuffle.pre_cond");
2460 BasicBlock *ThenBB = BasicBlock::Create(M.getContext(), ".shuffle.then");
2461 BasicBlock *ExitBB = BasicBlock::Create(M.getContext(), ".shuffle.exit");
2462 BasicBlock *CurrentBB = Builder.GetInsertBlock();
2463 emitBlock(PreCondBB, CurFunc);
2464 PHINode *PhiSrc =
2465 Builder.CreatePHI(Ptr->getType(), /*NumReservedValues=*/2);
2466 PhiSrc->addIncoming(Ptr, CurrentBB);
2467 PHINode *PhiDest =
2468 Builder.CreatePHI(ElemPtr->getType(), /*NumReservedValues=*/2);
2469 PhiDest->addIncoming(ElemPtr, CurrentBB);
2470 Ptr = PhiSrc;
2471 ElemPtr = PhiDest;
2472 Value *PtrDiff = Builder.CreatePtrDiff(
2473 Builder.getInt8Ty(), PtrEnd,
2474 Builder.CreatePointerBitCastOrAddrSpaceCast(Ptr, Builder.getPtrTy()));
2475 Builder.CreateCondBr(
2476 Builder.CreateICmpSGT(PtrDiff, Builder.getInt64(IntSize - 1)), ThenBB,
2477 ExitBB);
2478 emitBlock(ThenBB, CurFunc);
2479 Value *Res = createRuntimeShuffleFunction(
2480 AllocaIP,
2481 Builder.CreateAlignedLoad(
2482 IntType, Ptr, M.getDataLayout().getPrefTypeAlign(ElemType)),
2483 IntType, Offset);
2484 Builder.CreateAlignedStore(Res, ElemPtr,
2485 M.getDataLayout().getPrefTypeAlign(ElemType));
2486 Value *LocalPtr =
2487 Builder.CreateGEP(IntType, Ptr, {ConstantInt::get(IndexTy, 1)});
2488 Value *LocalElemPtr =
2489 Builder.CreateGEP(IntType, ElemPtr, {ConstantInt::get(IndexTy, 1)});
2490 PhiSrc->addIncoming(LocalPtr, ThenBB);
2491 PhiDest->addIncoming(LocalElemPtr, ThenBB);
2492 emitBranch(PreCondBB);
2493 emitBlock(ExitBB, CurFunc);
2494 } else {
2495 Value *Res = createRuntimeShuffleFunction(
2496 AllocaIP, Builder.CreateLoad(IntType, Ptr), IntType, Offset);
2497 if (ElemType->isIntegerTy() && ElemType->getScalarSizeInBits() <
2498 Res->getType()->getScalarSizeInBits())
2499 Res = Builder.CreateTrunc(Res, ElemType);
2500 Builder.CreateStore(Res, ElemPtr);
2501 Ptr = Builder.CreateGEP(IntType, Ptr, {ConstantInt::get(IndexTy, 1)});
2502 ElemPtr =
2503 Builder.CreateGEP(IntType, ElemPtr, {ConstantInt::get(IndexTy, 1)});
2504 }
2505 Size = Size % IntSize;
2506 }
2507}
2508
2509void OpenMPIRBuilder::emitReductionListCopy(
2510 InsertPointTy AllocaIP, CopyAction Action, Type *ReductionArrayTy,
2511 ArrayRef<ReductionInfo> ReductionInfos, Value *SrcBase, Value *DestBase,
2512 CopyOptionsTy CopyOptions) {
2513 Type *IndexTy = Builder.getIndexTy(
2514 M.getDataLayout(), M.getDataLayout().getDefaultGlobalsAddressSpace());
2515 Value *RemoteLaneOffset = CopyOptions.RemoteLaneOffset;
2516
2517 // Iterate, element by element, through the source Reduce list and
2518 // make a copy.
2519 for (auto En : enumerate(ReductionInfos)) {
2520 const ReductionInfo &RI = En.value();
2521 Value *SrcElementAddr = nullptr;
2522 Value *DestElementAddr = nullptr;
2523 Value *DestElementPtrAddr = nullptr;
2524 // Should we shuffle in an element from a remote lane?
2525 bool ShuffleInElement = false;
2526 // Set to true to update the pointer in the dest Reduce list to a
2527 // newly created element.
2528 bool UpdateDestListPtr = false;
2529
2530 // Step 1.1: Get the address for the src element in the Reduce list.
2531 Value *SrcElementPtrAddr = Builder.CreateInBoundsGEP(
2532 ReductionArrayTy, SrcBase,
2533 {ConstantInt::get(IndexTy, 0), ConstantInt::get(IndexTy, En.index())});
2534 SrcElementAddr = Builder.CreateLoad(Builder.getPtrTy(), SrcElementPtrAddr);
2535
2536 // Step 1.2: Create a temporary to store the element in the destination
2537 // Reduce list.
2538 DestElementPtrAddr = Builder.CreateInBoundsGEP(
2539 ReductionArrayTy, DestBase,
2540 {ConstantInt::get(IndexTy, 0), ConstantInt::get(IndexTy, En.index())});
2541 switch (Action) {
2542 case CopyAction::RemoteLaneToThread: {
2543 InsertPointTy CurIP = Builder.saveIP();
2544 Builder.restoreIP(AllocaIP);
2545 AllocaInst *DestAlloca = Builder.CreateAlloca(RI.ElementType, nullptr,
2546 ".omp.reduction.element");
2547 DestAlloca->setAlignment(
2548 M.getDataLayout().getPrefTypeAlign(RI.ElementType));
2549 DestElementAddr = DestAlloca;
2550 DestElementAddr =
2551 Builder.CreateAddrSpaceCast(DestElementAddr, Builder.getPtrTy(),
2552 DestElementAddr->getName() + ".ascast");
2553 Builder.restoreIP(CurIP);
2554 ShuffleInElement = true;
2555 UpdateDestListPtr = true;
2556 break;
2557 }
2558 case CopyAction::ThreadCopy: {
2559 DestElementAddr =
2560 Builder.CreateLoad(Builder.getPtrTy(), DestElementPtrAddr);
2561 break;
2562 }
2563 }
2564
2565 // Now that all active lanes have read the element in the
2566 // Reduce list, shuffle over the value from the remote lane.
2567 if (ShuffleInElement) {
2568 shuffleAndStore(AllocaIP, SrcElementAddr, DestElementAddr, RI.ElementType,
2569 RemoteLaneOffset, ReductionArrayTy);
2570 } else {
2571 switch (RI.EvaluationKind) {
2572 case EvalKind::Scalar: {
2573 Value *Elem = Builder.CreateLoad(RI.ElementType, SrcElementAddr);
2574 // Store the source element value to the dest element address.
2575 Builder.CreateStore(Elem, DestElementAddr);
2576 break;
2577 }
2578 case EvalKind::Complex: {
2579 Value *SrcRealPtr = Builder.CreateConstInBoundsGEP2_32(
2580 RI.ElementType, SrcElementAddr, 0, 0, ".realp");
2581 Value *SrcReal = Builder.CreateLoad(
2582 RI.ElementType->getStructElementType(0), SrcRealPtr, ".real");
2583 Value *SrcImgPtr = Builder.CreateConstInBoundsGEP2_32(
2584 RI.ElementType, SrcElementAddr, 0, 1, ".imagp");
2585 Value *SrcImg = Builder.CreateLoad(
2586 RI.ElementType->getStructElementType(1), SrcImgPtr, ".imag");
2587
2588 Value *DestRealPtr = Builder.CreateConstInBoundsGEP2_32(
2589 RI.ElementType, DestElementAddr, 0, 0, ".realp");
2590 Value *DestImgPtr = Builder.CreateConstInBoundsGEP2_32(
2591 RI.ElementType, DestElementAddr, 0, 1, ".imagp");
2592 Builder.CreateStore(SrcReal, DestRealPtr);
2593 Builder.CreateStore(SrcImg, DestImgPtr);
2594 break;
2595 }
2596 case EvalKind::Aggregate: {
2597 Value *SizeVal = Builder.getInt64(
2598 M.getDataLayout().getTypeStoreSize(RI.ElementType));
2599 Builder.CreateMemCpy(
2600 DestElementAddr, M.getDataLayout().getPrefTypeAlign(RI.ElementType),
2601 SrcElementAddr, M.getDataLayout().getPrefTypeAlign(RI.ElementType),
2602 SizeVal, false);
2603 break;
2604 }
2605 };
2606 }
2607
2608 // Step 3.1: Modify reference in dest Reduce list as needed.
2609 // Modifying the reference in Reduce list to point to the newly
2610 // created element. The element is live in the current function
2611 // scope and that of functions it invokes (i.e., reduce_function).
2612 // RemoteReduceData[i] = (void*)&RemoteElem
2613 if (UpdateDestListPtr) {
2614 Value *CastDestAddr = Builder.CreatePointerBitCastOrAddrSpaceCast(
2615 DestElementAddr, Builder.getPtrTy(),
2616 DestElementAddr->getName() + ".ascast");
2617 Builder.CreateStore(CastDestAddr, DestElementPtrAddr);
2618 }
2619 }
2620}
2621
2622Expected<Function *> OpenMPIRBuilder::emitInterWarpCopyFunction(
2623 const LocationDescription &Loc, ArrayRef<ReductionInfo> ReductionInfos,
2624 AttributeList FuncAttrs) {
2625 InsertPointTy SavedIP = Builder.saveIP();
2626 LLVMContext &Ctx = M.getContext();
2627 FunctionType *FuncTy = FunctionType::get(
2628 Builder.getVoidTy(), {Builder.getPtrTy(), Builder.getInt32Ty()},
2629 /* IsVarArg */ false);
2630 Function *WcFunc =
2631 Function::Create(FuncTy, GlobalVariable::InternalLinkage,
2632 "_omp_reduction_inter_warp_copy_func", &M);
2633 WcFunc->setAttributes(FuncAttrs);
2634 WcFunc->addParamAttr(0, Attribute::NoUndef);
2635 WcFunc->addParamAttr(1, Attribute::NoUndef);
2636 BasicBlock *EntryBB = BasicBlock::Create(M.getContext(), "entry", WcFunc);
2637 Builder.SetInsertPoint(EntryBB);
2638
2639 // ReduceList: thread local Reduce list.
2640 // At the stage of the computation when this function is called, partially
2641 // aggregated values reside in the first lane of every active warp.
2642 Argument *ReduceListArg = WcFunc->getArg(0);
2643 // NumWarps: number of warps active in the parallel region. This could
2644 // be smaller than 32 (max warps in a CTA) for partial block reduction.
2645 Argument *NumWarpsArg = WcFunc->getArg(1);
2646
2647 // This array is used as a medium to transfer, one reduce element at a time,
2648 // the data from the first lane of every warp to lanes in the first warp
2649 // in order to perform the final step of a reduction in a parallel region
2650 // (reduction across warps). The array is placed in NVPTX __shared__ memory
2651 // for reduced latency, as well as to have a distinct copy for concurrently
2652 // executing target regions. The array is declared with common linkage so
2653 // as to be shared across compilation units.
2654 StringRef TransferMediumName =
2655 "__openmp_nvptx_data_transfer_temporary_storage";
2656 GlobalVariable *TransferMedium = M.getGlobalVariable(TransferMediumName);
2657 unsigned WarpSize = Config.getGridValue().GV_Warp_Size;
2658 ArrayType *ArrayTy = ArrayType::get(Builder.getInt32Ty(), WarpSize);
2659 if (!TransferMedium) {
2660 TransferMedium = new GlobalVariable(
2661 M, ArrayTy, /*isConstant=*/false, GlobalVariable::WeakAnyLinkage,
2662 UndefValue::get(ArrayTy), TransferMediumName,
2663 /*InsertBefore=*/nullptr, GlobalVariable::NotThreadLocal,
2664 /*AddressSpace=*/3);
2665 }
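// Worked sizing example: with a warp size of 32 the transfer medium is a
// [32 x i32] array in addrspace(3), one i32 slot per warp; wider reduce
// elements are staged through it in 4-, 2-, and 1-byte chunks by the
// TySize loop below.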
2666
2667 // Get the CUDA thread id of the current OpenMP thread on the GPU.
2668 Value *GPUThreadID = getGPUThreadID();
2669 // nvptx_lane_id = nvptx_id % warpsize
2670 Value *LaneID = getNVPTXLaneID();
2671 // nvptx_warp_id = nvptx_id / warpsize
2672 Value *WarpID = getNVPTXWarpID();
2673
2674 InsertPointTy AllocaIP =
2675 InsertPointTy(EntryBB,
2676 EntryBB->getFirstInsertionPt());
2677 Type *Arg0Type = ReduceListArg->getType();
2678 Type *Arg1Type = NumWarpsArg->getType();
2679 Builder.restoreIP(AllocaIP);
2680 AllocaInst *ReduceListAlloca = Builder.CreateAlloca(
2681 Arg0Type, nullptr, ReduceListArg->getName() + ".addr");
2682 AllocaInst *NumWarpsAlloca =
2683 Builder.CreateAlloca(Arg1Type, nullptr, NumWarpsArg->getName() + ".addr");
2684 Value *ReduceListAddrCast = Builder.CreatePointerBitCastOrAddrSpaceCast(
2685 ReduceListAlloca, Arg0Type, ReduceListAlloca->getName() + ".ascast");
2686 Value *NumWarpsAddrCast = Builder.CreatePointerBitCastOrAddrSpaceCast(
2687 NumWarpsAlloca, Builder.getPtrTy(0),
2688 NumWarpsAlloca->getName() + ".ascast");
2689 Builder.CreateStore(ReduceListArg, ReduceListAddrCast);
2690 Builder.CreateStore(NumWarpsArg, NumWarpsAddrCast);
2691 AllocaIP = getInsertPointAfterInstr(NumWarpsAlloca);
2692 InsertPointTy CodeGenIP =
2693 getInsertPointAfterInstr(&Builder.GetInsertBlock()->back());
2694 Builder.restoreIP(CodeGenIP);
2695
2696 Value *ReduceList =
2697 Builder.CreateLoad(Builder.getPtrTy(), ReduceListAddrCast);
2698
2699 for (auto En : enumerate(ReductionInfos)) {
2700 //
2701 // Warp master copies reduce element to transfer medium in __shared__
2702 // memory.
2703 //
2704 const ReductionInfo &RI = En.value();
2705 unsigned RealTySize = M.getDataLayout().getTypeAllocSize(RI.ElementType);
2706 for (unsigned TySize = 4; TySize > 0 && RealTySize > 0; TySize /= 2) {
2707 Type *CType = Builder.getIntNTy(TySize * 8);
2708
2709 unsigned NumIters = RealTySize / TySize;
2710 if (NumIters == 0)
2711 continue;
2712 Value *Cnt = nullptr;
2713 Value *CntAddr = nullptr;
2714 BasicBlock *PrecondBB = nullptr;
2715 BasicBlock *ExitBB = nullptr;
2716 if (NumIters > 1) {
2717 CodeGenIP = Builder.saveIP();
2718 Builder.restoreIP(AllocaIP);
2719 CntAddr =
2720 Builder.CreateAlloca(Builder.getInt32Ty(), nullptr, ".cnt.addr");
2721
2722 CntAddr = Builder.CreateAddrSpaceCast(CntAddr, Builder.getPtrTy(),
2723 CntAddr->getName() + ".ascast");
2724 Builder.restoreIP(CodeGenIP);
2725 Builder.CreateStore(Constant::getNullValue(Builder.getInt32Ty()),
2726 CntAddr,
2727 /*Volatile=*/false);
2728 PrecondBB = BasicBlock::Create(Ctx, "precond");
2729 ExitBB = BasicBlock::Create(Ctx, "exit");
2730 BasicBlock *BodyBB = BasicBlock::Create(Ctx, "body");
2731 emitBlock(PrecondBB, Builder.GetInsertBlock()->getParent());
2732 Cnt = Builder.CreateLoad(Builder.getInt32Ty(), CntAddr,
2733 /*Volatile=*/false);
2734 Value *Cmp = Builder.CreateICmpULT(
2735 Cnt, ConstantInt::get(Builder.getInt32Ty(), NumIters));
2736 Builder.CreateCondBr(Cmp, BodyBB, ExitBB);
2737 emitBlock(BodyBB, Builder.GetInsertBlock()->getParent());
2738 }
2739
2740 // kmpc_barrier.
2741 InsertPointOrErrorTy BarrierIP1 =
2742 createBarrier(LocationDescription(Builder.saveIP(), Loc.DL),
2743 omp::Directive::OMPD_unknown,
2744 /* ForceSimpleCall */ false,
2745 /* CheckCancelFlag */ true);
2746 if (!BarrierIP1)
2747 return BarrierIP1.takeError();
2748 BasicBlock *ThenBB = BasicBlock::Create(Ctx, "then");
2749 BasicBlock *ElseBB = BasicBlock::Create(Ctx, "else");
2750 BasicBlock *MergeBB = BasicBlock::Create(Ctx, "ifcont");
2751
2752 // if (lane_id == 0)
2753 Value *IsWarpMaster = Builder.CreateIsNull(LaneID, "warp_master");
2754 Builder.CreateCondBr(IsWarpMaster, ThenBB, ElseBB);
2755 emitBlock(ThenBB, Builder.GetInsertBlock()->getParent());
2756
2757 // Reduce element = LocalReduceList[i]
2758 auto *RedListArrayTy =
2759 ArrayType::get(Builder.getPtrTy(), ReductionInfos.size());
2760 Type *IndexTy = Builder.getIndexTy(
2761 M.getDataLayout(), M.getDataLayout().getDefaultGlobalsAddressSpace());
2762 Value *ElemPtrPtr =
2763 Builder.CreateInBoundsGEP(RedListArrayTy, ReduceList,
2764 {ConstantInt::get(IndexTy, 0),
2765 ConstantInt::get(IndexTy, En.index())});
2766 // elemptr = ((CopyType*)(elemptrptr)) + I
2767 Value *ElemPtr = Builder.CreateLoad(Builder.getPtrTy(), ElemPtrPtr);
2768 if (NumIters > 1)
2769 ElemPtr = Builder.CreateGEP(Builder.getInt32Ty(), ElemPtr, Cnt);
2770
2771 // Get pointer to location in transfer medium.
2772 // MediumPtr = &medium[warp_id]
2773 Value *MediumPtr = Builder.CreateInBoundsGEP(
2774 ArrayTy, TransferMedium, {Builder.getInt64(0), WarpID});
2775 // elem = *elemptr
2776 //*MediumPtr = elem
2777 Value *Elem = Builder.CreateLoad(CType, ElemPtr);
2778 // Store the source element value to the dest element address.
2779 Builder.CreateStore(Elem, MediumPtr,
2780 /*IsVolatile*/ true);
2781 Builder.CreateBr(MergeBB);
2782
2783 // else
2784 emitBlock(ElseBB, Builder.GetInsertBlock()->getParent());
2785 Builder.CreateBr(MergeBB);
2786
2787 // endif
2788 emitBlock(MergeBB, Builder.GetInsertBlock()->getParent());
2789 InsertPointOrErrorTy BarrierIP2 =
2790 createBarrier(LocationDescription(Builder.saveIP(), Loc.DL),
2791 omp::Directive::OMPD_unknown,
2792 /* ForceSimpleCall */ false,
2793 /* CheckCancelFlag */ true);
2794 if (!BarrierIP2)
2795 return BarrierIP2.takeError();
2796
2797 // Warp 0 copies reduce element from transfer medium
2798 BasicBlock *W0ThenBB = BasicBlock::Create(Ctx, "then");
2799 BasicBlock *W0ElseBB = BasicBlock::Create(Ctx, "else");
2800 BasicBlock *W0MergeBB = BasicBlock::Create(Ctx, "ifcont");
2801
2802 Value *NumWarpsVal =
2803 Builder.CreateLoad(Builder.getInt32Ty(), NumWarpsAddrCast);
2804 // Up to 32 threads in warp 0 are active.
2805 Value *IsActiveThread =
2806 Builder.CreateICmpULT(GPUThreadID, NumWarpsVal, "is_active_thread");
2807 Builder.CreateCondBr(IsActiveThread, W0ThenBB, W0ElseBB);
2808
2809 emitBlock(W0ThenBB, Builder.GetInsertBlock()->getParent());
2810
2811 // SrcMediumPtr = &medium[tid]
2812 // SrcMediumVal = *SrcMediumPtr
2813 Value *SrcMediumPtrVal = Builder.CreateInBoundsGEP(
2814 ArrayTy, TransferMedium, {Builder.getInt64(0), GPUThreadID});
2815 // TargetElemPtr = (CopyType*)(SrcDataAddr[i]) + I
2816 Value *TargetElemPtrPtr =
2817 Builder.CreateInBoundsGEP(RedListArrayTy, ReduceList,
2818 {ConstantInt::get(IndexTy, 0),
2819 ConstantInt::get(IndexTy, En.index())});
2820 Value *TargetElemPtrVal =
2821 Builder.CreateLoad(Builder.getPtrTy(), TargetElemPtrPtr);
2822 Value *TargetElemPtr = TargetElemPtrVal;
2823 if (NumIters > 1)
2824 TargetElemPtr =
2825 Builder.CreateGEP(Builder.getInt32Ty(), TargetElemPtr, Cnt);
2826
2827 // *TargetElemPtr = SrcMediumVal;
2828 Value *SrcMediumValue =
2829 Builder.CreateLoad(CType, SrcMediumPtrVal, /*IsVolatile*/ true);
2830 Builder.CreateStore(SrcMediumValue, TargetElemPtr);
2831 Builder.CreateBr(W0MergeBB);
2832
2833 emitBlock(W0ElseBB, Builder.GetInsertBlock()->getParent());
2834 Builder.CreateBr(W0MergeBB);
2835
2836 emitBlock(W0MergeBB, Builder.GetInsertBlock()->getParent());
2837
2838 if (NumIters > 1) {
2839 Cnt = Builder.CreateNSWAdd(
2840 Cnt, ConstantInt::get(Builder.getInt32Ty(), /*V=*/1));
2841 Builder.CreateStore(Cnt, CntAddr, /*Volatile=*/false);
2842
2843 auto *CurFn = Builder.GetInsertBlock()->getParent();
2844 emitBranch(PrecondBB);
2845 emitBlock(ExitBB, CurFn);
2846 }
2847 RealTySize %= TySize;
2848 }
2849 }
2850
2851 Builder.CreateRetVoid();
2852 Builder.restoreIP(SavedIP);
2853
2854 return WcFunc;
2855}
2856
2857Function *OpenMPIRBuilder::emitShuffleAndReduceFunction(
2858 ArrayRef<ReductionInfo> ReductionInfos, Function *ReduceFn,
2859 AttributeList FuncAttrs) {
2860 LLVMContext &Ctx = M.getContext();
2861 FunctionType *FuncTy =
2862 FunctionType::get(Builder.getVoidTy(),
2863 {Builder.getPtrTy(), Builder.getInt16Ty(),
2864 Builder.getInt16Ty(), Builder.getInt16Ty()},
2865 /* IsVarArg */ false);
2866 Function *SarFunc =
2867 Function::Create(FuncTy, GlobalVariable::InternalLinkage,
2868 "_omp_reduction_shuffle_and_reduce_func", &M);
2869 SarFunc->setAttributes(FuncAttrs);
2870 SarFunc->addParamAttr(0, Attribute::NoUndef);
2871 SarFunc->addParamAttr(1, Attribute::NoUndef);
2872 SarFunc->addParamAttr(2, Attribute::NoUndef);
2873 SarFunc->addParamAttr(3, Attribute::NoUndef);
2874 SarFunc->addParamAttr(1, Attribute::SExt);
2875 SarFunc->addParamAttr(2, Attribute::SExt);
2876 SarFunc->addParamAttr(3, Attribute::SExt);
2877 BasicBlock *EntryBB = BasicBlock::Create(M.getContext(), "entry", SarFunc);
2878 Builder.SetInsertPoint(EntryBB);
2879
2880 // Thread local Reduce list used to host the values of data to be reduced.
2881 Argument *ReduceListArg = SarFunc->getArg(0);
2882 // Current lane id; could be logical.
2883 Argument *LaneIDArg = SarFunc->getArg(1);
2884 // Offset of the remote source lane relative to the current lane.
2885 Argument *RemoteLaneOffsetArg = SarFunc->getArg(2);
2886 // Algorithm version. This is expected to be known at compile time.
2887 Argument *AlgoVerArg = SarFunc->getArg(3);
2888
2889 Type *ReduceListArgType = ReduceListArg->getType();
2890 Type *LaneIDArgType = LaneIDArg->getType();
2891 Type *LaneIDArgPtrType = Builder.getPtrTy(0);
2892 Value *ReduceListAlloca = Builder.CreateAlloca(
2893 ReduceListArgType, nullptr, ReduceListArg->getName() + ".addr");
2894 Value *LaneIdAlloca = Builder.CreateAlloca(LaneIDArgType, nullptr,
2895 LaneIDArg->getName() + ".addr");
2896 Value *RemoteLaneOffsetAlloca = Builder.CreateAlloca(
2897 LaneIDArgType, nullptr, RemoteLaneOffsetArg->getName() + ".addr");
2898 Value *AlgoVerAlloca = Builder.CreateAlloca(LaneIDArgType, nullptr,
2899 AlgoVerArg->getName() + ".addr");
2900 ArrayType *RedListArrayTy =
2901 ArrayType::get(Builder.getPtrTy(), ReductionInfos.size());
2902
2903 // Create a local thread-private variable to host the Reduce list
2904 // from a remote lane.
2905 Instruction *RemoteReductionListAlloca = Builder.CreateAlloca(
2906 RedListArrayTy, nullptr, ".omp.reduction.remote_reduce_list");
2907
2908 Value *ReduceListAddrCast = Builder.CreatePointerBitCastOrAddrSpaceCast(
2909 ReduceListAlloca, ReduceListArgType,
2910 ReduceListAlloca->getName() + ".ascast");
2911 Value *LaneIdAddrCast = Builder.CreatePointerBitCastOrAddrSpaceCast(
2912 LaneIdAlloca, LaneIDArgPtrType, LaneIdAlloca->getName() + ".ascast");
2913 Value *RemoteLaneOffsetAddrCast = Builder.CreatePointerBitCastOrAddrSpaceCast(
2914 RemoteLaneOffsetAlloca, LaneIDArgPtrType,
2915 RemoteLaneOffsetAlloca->getName() + ".ascast");
2916 Value *AlgoVerAddrCast = Builder.CreatePointerBitCastOrAddrSpaceCast(
2917 AlgoVerAlloca, LaneIDArgPtrType, AlgoVerAlloca->getName() + ".ascast");
2918 Value *RemoteListAddrCast = Builder.CreatePointerBitCastOrAddrSpaceCast(
2919 RemoteReductionListAlloca, Builder.getPtrTy(),
2920 RemoteReductionListAlloca->getName() + ".ascast");
2921
2922 Builder.CreateStore(ReduceListArg, ReduceListAddrCast);
2923 Builder.CreateStore(LaneIDArg, LaneIdAddrCast);
2924 Builder.CreateStore(RemoteLaneOffsetArg, RemoteLaneOffsetAddrCast);
2925 Builder.CreateStore(AlgoVerArg, AlgoVerAddrCast);
2926
2927 Value *ReduceList = Builder.CreateLoad(ReduceListArgType, ReduceListAddrCast);
2928 Value *LaneId = Builder.CreateLoad(LaneIDArgType, LaneIdAddrCast);
2929 Value *RemoteLaneOffset =
2930 Builder.CreateLoad(LaneIDArgType, RemoteLaneOffsetAddrCast);
2931 Value *AlgoVer = Builder.CreateLoad(LaneIDArgType, AlgoVerAddrCast);
2932
2933 InsertPointTy AllocaIP = getInsertPointAfterInstr(RemoteReductionListAlloca);
2934
2935 // This loop iterates through the list of reduce elements and copies,
2936 // element by element, from a remote lane in the warp to RemoteReduceList,
2937 // hosted on the thread's stack.
2938 emitReductionListCopy(
2939 AllocaIP, CopyAction::RemoteLaneToThread, RedListArrayTy, ReductionInfos,
2940 ReduceList, RemoteListAddrCast, {RemoteLaneOffset, nullptr, nullptr});
2941
2942 // The actions to be performed on the Remote Reduce list are dependent
2943 // on the algorithm version.
2944 //
2945 // if (AlgoVer==0) || (AlgoVer==1 && (LaneId < Offset)) || (AlgoVer==2 &&
2946 // LaneId % 2 == 0 && Offset > 0):
2947 // do the reduction value aggregation
2948 //
2949 // The thread local variable Reduce list is mutated in place to host the
2950 // reduced data, which is the aggregated value produced from local and
2951 // remote lanes.
2952 //
2953 // Note that AlgoVer is expected to be a constant integer known at compile
2954 // time.
2955 // When AlgoVer==0, the first conjunction evaluates to true, making
2956 // the entire predicate true at compile time.
2957 // When AlgoVer==1, only the second part of the second conjunction needs
2958 // to be evaluated at runtime; the other conjunctions evaluate to false
2959 // at compile time.
2960 // When AlgoVer==2, only the second part of the third conjunction needs
2961 // to be evaluated at runtime; the other conjunctions evaluate to false
2962 // at compile time.
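// Worked example: for AlgoVer == 1 the predicate folds to
// (LaneId < Offset), so with Offset == 16 lanes 0-15 combine their value
// with the value shuffled in from lane (LaneId + 16), while lanes 16-31
// take the copy path guarded by CondCopy below.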
2963 Value *CondAlgo0 = Builder.CreateIsNull(AlgoVer);
2964 Value *Algo1 = Builder.CreateICmpEQ(AlgoVer, Builder.getInt16(1));
2965 Value *LaneComp = Builder.CreateICmpULT(LaneId, RemoteLaneOffset);
2966 Value *CondAlgo1 = Builder.CreateAnd(Algo1, LaneComp);
2967 Value *Algo2 = Builder.CreateICmpEQ(AlgoVer, Builder.getInt16(2));
2968 Value *LaneIdAnd1 = Builder.CreateAnd(LaneId, Builder.getInt16(1));
2969 Value *LaneIdComp = Builder.CreateIsNull(LaneIdAnd1);
2970 Value *Algo2AndLaneIdComp = Builder.CreateAnd(Algo2, LaneIdComp);
2971 Value *RemoteOffsetComp =
2972 Builder.CreateICmpSGT(RemoteLaneOffset, Builder.getInt16(0));
2973 Value *CondAlgo2 = Builder.CreateAnd(Algo2AndLaneIdComp, RemoteOffsetComp);
2974 Value *CA0OrCA1 = Builder.CreateOr(CondAlgo0, CondAlgo1);
2975 Value *CondReduce = Builder.CreateOr(CA0OrCA1, CondAlgo2);
2976
2977 BasicBlock *ThenBB = BasicBlock::Create(Ctx, "then");
2978 BasicBlock *ElseBB = BasicBlock::Create(Ctx, "else");
2979 BasicBlock *MergeBB = BasicBlock::Create(Ctx, "ifcont");
2980
2981 Builder.CreateCondBr(CondReduce, ThenBB, ElseBB);
2982 emitBlock(ThenBB, Builder.GetInsertBlock()->getParent());
2983 Value *LocalReduceListPtr = Builder.CreatePointerBitCastOrAddrSpaceCast(
2984 ReduceList, Builder.getPtrTy());
2985 Value *RemoteReduceListPtr = Builder.CreatePointerBitCastOrAddrSpaceCast(
2986 RemoteListAddrCast, Builder.getPtrTy());
2987 Builder.CreateCall(ReduceFn, {LocalReduceListPtr, RemoteReduceListPtr})
2988 ->addFnAttr(Attribute::NoUnwind);
2989 Builder.CreateBr(MergeBB);
2990
2991 emitBlock(ElseBB, Builder.GetInsertBlock()->getParent());
2992 Builder.CreateBr(MergeBB);
2993
2994 emitBlock(MergeBB, Builder.GetInsertBlock()->getParent());
2995
2996 // if (AlgoVer==1 && (LaneId >= Offset)) copy Remote Reduce list to local
2997 // Reduce list.
2998 Algo1 = Builder.CreateICmpEQ(AlgoVer, Builder.getInt16(1));
2999 Value *LaneIdGtOffset = Builder.CreateICmpUGE(LaneId, RemoteLaneOffset);
3000 Value *CondCopy = Builder.CreateAnd(Algo1, LaneIdGtOffset);
3001
3002 BasicBlock *CpyThenBB = BasicBlock::Create(Ctx, "then");
3003 BasicBlock *CpyElseBB = BasicBlock::Create(Ctx, "else");
3004 BasicBlock *CpyMergeBB = BasicBlock::Create(Ctx, "ifcont");
3005 Builder.CreateCondBr(CondCopy, CpyThenBB, CpyElseBB);
3006
3007 emitBlock(CpyThenBB, Builder.GetInsertBlock()->getParent());
3008 emitReductionListCopy(AllocaIP, CopyAction::ThreadCopy, RedListArrayTy,
3009 ReductionInfos, RemoteListAddrCast, ReduceList);
3010 Builder.CreateBr(CpyMergeBB);
3011
3012 emitBlock(CpyElseBB, Builder.GetInsertBlock()->getParent());
3013 Builder.CreateBr(CpyMergeBB);
3014
3015 emitBlock(CpyMergeBB, Builder.GetInsertBlock()->getParent());
3016
3017 Builder.CreateRetVoid();
3018
3019 return SarFunc;
3020}
3021
3022Function *OpenMPIRBuilder::emitListToGlobalCopyFunction(
3023 ArrayRef<ReductionInfo> ReductionInfos, Type *ReductionsBufferTy,
3024 AttributeList FuncAttrs) {
3025 OpenMPIRBuilder::InsertPointTy OldIP = Builder.saveIP();
3026 LLVMContext &Ctx = M.getContext();
3027 FunctionType *FuncTy = FunctionType::get(
3028 Builder.getVoidTy(),
3029 {Builder.getPtrTy(), Builder.getInt32Ty(), Builder.getPtrTy()},
3030 /* IsVarArg */ false);
3031 Function *LtGCFunc =
3032 Function::Create(FuncTy, GlobalVariable::InternalLinkage,
3033 "_omp_reduction_list_to_global_copy_func", &M);
3034 LtGCFunc->setAttributes(FuncAttrs);
3035 LtGCFunc->addParamAttr(0, Attribute::NoUndef);
3036 LtGCFunc->addParamAttr(1, Attribute::NoUndef);
3037 LtGCFunc->addParamAttr(2, Attribute::NoUndef);
3038
3039 BasicBlock *EntryBlock = BasicBlock::Create(Ctx, "entry", LtGCFunc);
3040 Builder.SetInsertPoint(EntryBlock);
3041
3042 // Buffer: global reduction buffer.
3043 Argument *BufferArg = LtGCFunc->getArg(0);
3044 // Idx: index of the buffer.
3045 Argument *IdxArg = LtGCFunc->getArg(1);
3046 // ReduceList: thread local Reduce list.
3047 Argument *ReduceListArg = LtGCFunc->getArg(2);
3048
3049 Value *BufferArgAlloca = Builder.CreateAlloca(Builder.getPtrTy(), nullptr,
3050 BufferArg->getName() + ".addr");
3051 Value *IdxArgAlloca = Builder.CreateAlloca(Builder.getInt32Ty(), nullptr,
3052 IdxArg->getName() + ".addr");
3053 Value *ReduceListArgAlloca = Builder.CreateAlloca(
3054 Builder.getPtrTy(), nullptr, ReduceListArg->getName() + ".addr");
3055 Value *BufferArgAddrCast = Builder.CreatePointerBitCastOrAddrSpaceCast(
3056 BufferArgAlloca, Builder.getPtrTy(),
3057 BufferArgAlloca->getName() + ".ascast");
3058 Value *IdxArgAddrCast = Builder.CreatePointerBitCastOrAddrSpaceCast(
3059 IdxArgAlloca, Builder.getPtrTy(), IdxArgAlloca->getName() + ".ascast");
3060 Value *ReduceListArgAddrCast = Builder.CreatePointerBitCastOrAddrSpaceCast(
3061 ReduceListArgAlloca, Builder.getPtrTy(),
3062 ReduceListArgAlloca->getName() + ".ascast");
3063
3064 Builder.CreateStore(BufferArg, BufferArgAddrCast);
3065 Builder.CreateStore(IdxArg, IdxArgAddrCast);
3066 Builder.CreateStore(ReduceListArg, ReduceListArgAddrCast);
3067
3068 Value *LocalReduceList =
3069 Builder.CreateLoad(Builder.getPtrTy(), ReduceListArgAddrCast);
3070 Value *BufferArgVal =
3071 Builder.CreateLoad(Builder.getPtrTy(), BufferArgAddrCast);
3072 Value *Idxs[] = {Builder.CreateLoad(Builder.getInt32Ty(), IdxArgAddrCast)};
3073 Type *IndexTy = Builder.getIndexTy(
3074 M.getDataLayout(), M.getDataLayout().getDefaultGlobalsAddressSpace());
3075 for (auto En : enumerate(ReductionInfos)) {
3076 const ReductionInfo &RI = En.value();
3077 auto *RedListArrayTy =
3078 ArrayType::get(Builder.getPtrTy(), ReductionInfos.size());
3079 // Reduce element = LocalReduceList[i]
3080 Value *ElemPtrPtr = Builder.CreateInBoundsGEP(
3081 RedListArrayTy, LocalReduceList,
3082 {ConstantInt::get(IndexTy, 0), ConstantInt::get(IndexTy, En.index())});
3083 // elemptr = ((CopyType*)(elemptrptr)) + I
3084 Value *ElemPtr = Builder.CreateLoad(Builder.getPtrTy(), ElemPtrPtr);
3085
3086 // Global = Buffer.VD[Idx];
3087 Value *BufferVD =
3088 Builder.CreateInBoundsGEP(ReductionsBufferTy, BufferArgVal, Idxs);
3089 Value *GlobVal = Builder.CreateConstInBoundsGEP2_32(
3090 ReductionsBufferTy, BufferVD, 0, En.index());
3091
3092 switch (RI.EvaluationKind) {
3093 case EvalKind::Scalar: {
3094 Value *TargetElement = Builder.CreateLoad(RI.ElementType, ElemPtr);
3095 Builder.CreateStore(TargetElement, GlobVal);
3096 break;
3097 }
3098 case EvalKind::Complex: {
3099 Value *SrcRealPtr = Builder.CreateConstInBoundsGEP2_32(
3100 RI.ElementType, ElemPtr, 0, 0, ".realp");
3101 Value *SrcReal = Builder.CreateLoad(
3102 RI.ElementType->getStructElementType(0), SrcRealPtr, ".real");
3103 Value *SrcImgPtr = Builder.CreateConstInBoundsGEP2_32(
3104 RI.ElementType, ElemPtr, 0, 1, ".imagp");
3105 Value *SrcImg = Builder.CreateLoad(
3106 RI.ElementType->getStructElementType(1), SrcImgPtr, ".imag");
3107
3108 Value *DestRealPtr = Builder.CreateConstInBoundsGEP2_32(
3109 RI.ElementType, GlobVal, 0, 0, ".realp");
3110 Value *DestImgPtr = Builder.CreateConstInBoundsGEP2_32(
3111 RI.ElementType, GlobVal, 0, 1, ".imagp");
3112 Builder.CreateStore(SrcReal, DestRealPtr);
3113 Builder.CreateStore(SrcImg, DestImgPtr);
3114 break;
3115 }
3116 case EvalKind::Aggregate: {
3117 Value *SizeVal =
3118 Builder.getInt64(M.getDataLayout().getTypeStoreSize(RI.ElementType));
3119 Builder.CreateMemCpy(
3120 GlobVal, M.getDataLayout().getPrefTypeAlign(RI.ElementType), ElemPtr,
3121 M.getDataLayout().getPrefTypeAlign(RI.ElementType), SizeVal, false);
3122 break;
3123 }
3124 }
3125 }
3126
3127 Builder.CreateRetVoid();
3128 Builder.restoreIP(OldIP);
3129 return LtGCFunc;
3130}
3131
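// A matching hypothetical-C sketch for the helper emitted below: it gathers
// pointers to the global buffer entry into a local list and hands both lists
// to the previously created reduction function.
//
//   void list_to_global_reduce(void *buffer, int idx, void *reduce_list) {
//     void *glob_list[NumReductions];
//     for (int i = 0; i < NumReductions; ++i)
//       glob_list[i] = &((GlobalTy *)buffer)[idx].elem[i];
//     reduce_function(glob_list, reduce_list);
//   }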
3132Function *OpenMPIRBuilder::emitListToGlobalReduceFunction(
3133 ArrayRef<ReductionInfo> ReductionInfos, Function *ReduceFn,
3134 Type *ReductionsBufferTy, AttributeList FuncAttrs) {
3135 OpenMPIRBuilder::InsertPointTy OldIP = Builder.saveIP();
3136 LLVMContext &Ctx = M.getContext();
3137 auto *FuncTy = FunctionType::get(
3138 Builder.getVoidTy(),
3139 {Builder.getPtrTy(), Builder.getInt32Ty(), Builder.getPtrTy()},
3140 /* IsVarArg */ false);
3141 Function *LtGRFunc =
3142 Function::Create(FuncTy, GlobalVariable::InternalLinkage,
3143 "_omp_reduction_list_to_global_reduce_func", &M);
3144 LtGRFunc->setAttributes(FuncAttrs);
3145 LtGRFunc->addParamAttr(0, Attribute::NoUndef);
3146 LtGRFunc->addParamAttr(1, Attribute::NoUndef);
3147 LtGRFunc->addParamAttr(2, Attribute::NoUndef);
3148
3149 BasicBlock *EntryBlock = BasicBlock::Create(Ctx, "entry", LtGRFunc);
3150 Builder.SetInsertPoint(EntryBlock);
3151
3152 // Buffer: global reduction buffer.
3153 Argument *BufferArg = LtGRFunc->getArg(0);
3154 // Idx: index of the buffer.
3155 Argument *IdxArg = LtGRFunc->getArg(1);
3156 // ReduceList: thread local Reduce list.
3157 Argument *ReduceListArg = LtGRFunc->getArg(2);
3158
3159 Value *BufferArgAlloca = Builder.CreateAlloca(Builder.getPtrTy(), nullptr,
3160 BufferArg->getName() + ".addr");
3161 Value *IdxArgAlloca = Builder.CreateAlloca(Builder.getInt32Ty(), nullptr,
3162 IdxArg->getName() + ".addr");
3163 Value *ReduceListArgAlloca = Builder.CreateAlloca(
3164 Builder.getPtrTy(), nullptr, ReduceListArg->getName() + ".addr");
3165 auto *RedListArrayTy =
3166 ArrayType::get(Builder.getPtrTy(), ReductionInfos.size());
3167
3168 // 1. Build a list of reduction variables.
3169 // void *RedList[<n>] = {<ReductionVars>[0], ..., <ReductionVars>[<n>-1]};
3170 Value *LocalReduceList =
3171 Builder.CreateAlloca(RedListArrayTy, nullptr, ".omp.reduction.red_list");
3172
3173 Value *BufferArgAddrCast = Builder.CreatePointerBitCastOrAddrSpaceCast(
3174 BufferArgAlloca, Builder.getPtrTy(),
3175 BufferArgAlloca->getName() + ".ascast");
3176 Value *IdxArgAddrCast = Builder.CreatePointerBitCastOrAddrSpaceCast(
3177 IdxArgAlloca, Builder.getPtrTy(), IdxArgAlloca->getName() + ".ascast");
3178 Value *ReduceListArgAddrCast = Builder.CreatePointerBitCastOrAddrSpaceCast(
3179 ReduceListArgAlloca, Builder.getPtrTy(),
3180 ReduceListArgAlloca->getName() + ".ascast");
3181 Value *LocalReduceListAddrCast = Builder.CreatePointerBitCastOrAddrSpaceCast(
3182 LocalReduceList, Builder.getPtrTy(),
3183 LocalReduceList->getName() + ".ascast");
3184
3185 Builder.CreateStore(BufferArg, BufferArgAddrCast);
3186 Builder.CreateStore(IdxArg, IdxArgAddrCast);
3187 Builder.CreateStore(ReduceListArg, ReduceListArgAddrCast);
3188
3189 Value *BufferVal = Builder.CreateLoad(Builder.getPtrTy(), BufferArgAddrCast);
3190 Value *Idxs[] = {Builder.CreateLoad(Builder.getInt32Ty(), IdxArgAddrCast)};
3191 Type *IndexTy = Builder.getIndexTy(
3192 M.getDataLayout(), M.getDataLayout().getDefaultGlobalsAddressSpace());
3193 for (auto En : enumerate(ReductionInfos)) {
3194 Value *TargetElementPtrPtr = Builder.CreateInBoundsGEP(
3195 RedListArrayTy, LocalReduceListAddrCast,
3196 {ConstantInt::get(IndexTy, 0), ConstantInt::get(IndexTy, En.index())});
3197 Value *BufferVD =
3198 Builder.CreateInBoundsGEP(ReductionsBufferTy, BufferVal, Idxs);
3199 // Global = Buffer.VD[Idx];
3200 Value *GlobValPtr = Builder.CreateConstInBoundsGEP2_64(
3201 ReductionsBufferTy, BufferVD, 0, En.index());
3202 Builder.CreateStore(GlobValPtr, TargetElementPtrPtr);
3203 }
3204
3205 // Call reduce_function(GlobalReduceList, ReduceList)
3206 Value *ReduceList =
3207 Builder.CreateLoad(Builder.getPtrTy(), ReduceListArgAddrCast);
3208 Builder.CreateCall(ReduceFn, {LocalReduceListAddrCast, ReduceList})
3209 ->addFnAttr(Attribute::NoUnwind);
3210 Builder.CreateRetVoid();
3211 Builder.restoreIP(OldIP);
3212 return LtGRFunc;
3213}
3214
3215Function *OpenMPIRBuilder::emitGlobalToListCopyFunction(
3216 ArrayRef<ReductionInfo> ReductionInfos, Type *ReductionsBufferTy,
3217 AttributeList FuncAttrs) {
3218 OpenMPIRBuilder::InsertPointTy OldIP = Builder.saveIP();
3219 LLVMContext &Ctx = M.getContext();
3220 auto *FuncTy = FunctionType::get(
3221 Builder.getVoidTy(),
3222 {Builder.getPtrTy(), Builder.getInt32Ty(), Builder.getPtrTy()},
3223 /* IsVarArg */ false);
3224 Function *LtGCFunc =
3225 Function::Create(FuncTy, GlobalVariable::InternalLinkage,
3226 "_omp_reduction_global_to_list_copy_func", &M);
3227 LtGCFunc->setAttributes(FuncAttrs);
3228 LtGCFunc->addParamAttr(0, Attribute::NoUndef);
3229 LtGCFunc->addParamAttr(1, Attribute::NoUndef);
3230 LtGCFunc->addParamAttr(2, Attribute::NoUndef);
3231
3232 BasicBlock *EntryBlock = BasicBlock::Create(Ctx, "entry", LtGCFunc);
3233 Builder.SetInsertPoint(EntryBlock);
3234
3235 // Buffer: global reduction buffer.
3236 Argument *BufferArg = LtGCFunc->getArg(0);
3237 // Idx: index of the buffer.
3238 Argument *IdxArg = LtGCFunc->getArg(1);
3239 // ReduceList: thread local Reduce list.
3240 Argument *ReduceListArg = LtGCFunc->getArg(2);
3241
3242 Value *BufferArgAlloca = Builder.CreateAlloca(Builder.getPtrTy(), nullptr,
3243 BufferArg->getName() + ".addr");
3244 Value *IdxArgAlloca = Builder.CreateAlloca(Builder.getInt32Ty(), nullptr,
3245 IdxArg->getName() + ".addr");
3246 Value *ReduceListArgAlloca = Builder.CreateAlloca(
3247 Builder.getPtrTy(), nullptr, ReduceListArg->getName() + ".addr");
3248 Value *BufferArgAddrCast = Builder.CreatePointerBitCastOrAddrSpaceCast(
3249 BufferArgAlloca, Builder.getPtrTy(),
3250 BufferArgAlloca->getName() + ".ascast");
3251 Value *IdxArgAddrCast = Builder.CreatePointerBitCastOrAddrSpaceCast(
3252 IdxArgAlloca, Builder.getPtrTy(), IdxArgAlloca->getName() + ".ascast");
3253 Value *ReduceListArgAddrCast = Builder.CreatePointerBitCastOrAddrSpaceCast(
3254 ReduceListArgAlloca, Builder.getPtrTy(),
3255 ReduceListArgAlloca->getName() + ".ascast");
3256 Builder.CreateStore(BufferArg, BufferArgAddrCast);
3257 Builder.CreateStore(IdxArg, IdxArgAddrCast);
3258 Builder.CreateStore(ReduceListArg, ReduceListArgAddrCast);
3259
3260 Value *LocalReduceList =
3261 Builder.CreateLoad(Builder.getPtrTy(), ReduceListArgAddrCast);
3262 Value *BufferVal = Builder.CreateLoad(Builder.getPtrTy(), BufferArgAddrCast);
3263 Value *Idxs[] = {Builder.CreateLoad(Builder.getInt32Ty(), IdxArgAddrCast)};
3264 Type *IndexTy = Builder.getIndexTy(
3265 M.getDataLayout(), M.getDataLayout().getDefaultGlobalsAddressSpace());
3266 for (auto En : enumerate(ReductionInfos)) {
3267 const OpenMPIRBuilder::ReductionInfo &RI = En.value();
3268 auto *RedListArrayTy =
3269 ArrayType::get(Builder.getPtrTy(), ReductionInfos.size());
3270 // Reduce element = LocalReduceList[i]
3271 Value *ElemPtrPtr = Builder.CreateInBoundsGEP(
3272 RedListArrayTy, LocalReduceList,
3273 {ConstantInt::get(IndexTy, 0), ConstantInt::get(IndexTy, En.index())});
3274 // elemptr = ((CopyType*)(elemptrptr)) + I
3275 Value *ElemPtr = Builder.CreateLoad(Builder.getPtrTy(), ElemPtrPtr);
3276 // Global = Buffer.VD[Idx];
3277 Value *BufferVD =
3278 Builder.CreateInBoundsGEP(ReductionsBufferTy, BufferVal, Idxs);
3279 Value *GlobValPtr = Builder.CreateConstInBoundsGEP2_64(
3280 ReductionsBufferTy, BufferVD, 0, En.index());
3281
3282 switch (RI.EvaluationKind) {
3283 case EvalKind::Scalar: {
3284 Value *TargetElement = Builder.CreateLoad(RI.ElementType, GlobValPtr);
3285 Builder.CreateStore(TargetElement, ElemPtr);
3286 break;
3287 }
3288 case EvalKind::Complex: {
3289 Value *SrcRealPtr = Builder.CreateConstInBoundsGEP2_32(
3290 RI.ElementType, GlobValPtr, 0, 0, ".realp");
3291 Value *SrcReal = Builder.CreateLoad(
3292 RI.ElementType->getStructElementType(0), SrcRealPtr, ".real");
3293 Value *SrcImgPtr = Builder.CreateConstInBoundsGEP2_32(
3294 RI.ElementType, GlobValPtr, 0, 1, ".imagp");
3295 Value *SrcImg = Builder.CreateLoad(
3296 RI.ElementType->getStructElementType(1), SrcImgPtr, ".imag");
3297
3298 Value *DestRealPtr = Builder.CreateConstInBoundsGEP2_32(
3299 RI.ElementType, ElemPtr, 0, 0, ".realp");
3300 Value *DestImgPtr = Builder.CreateConstInBoundsGEP2_32(
3301 RI.ElementType, ElemPtr, 0, 1, ".imagp");
3302 Builder.CreateStore(SrcReal, DestRealPtr);
3303 Builder.CreateStore(SrcImg, DestImgPtr);
3304 break;
3305 }
3306 case EvalKind::Aggregate: {
3307 Value *SizeVal =
3308 Builder.getInt64(M.getDataLayout().getTypeStoreSize(RI.ElementType));
3309 Builder.CreateMemCpy(
3310 ElemPtr, M.getDataLayout().getPrefTypeAlign(RI.ElementType),
3311 GlobValPtr, M.getDataLayout().getPrefTypeAlign(RI.ElementType),
3312 SizeVal, false);
3313 break;
3314 }
3315 }
3316 }
3317
3318 Builder.CreateRetVoid();
3319 Builder.restoreIP(OldIP);
3320 return LtGCFunc;
3321}
3322
3323Function *OpenMPIRBuilder::emitGlobalToListReduceFunction(
3324 ArrayRef<ReductionInfo> ReductionInfos, Function *ReduceFn,
3325 Type *ReductionsBufferTy, AttributeList FuncAttrs) {
3326 OpenMPIRBuilder::InsertPointTy OldIP = Builder.saveIP();
3327 LLVMContext &Ctx = M.getContext();
3328 auto *FuncTy = FunctionType::get(
3329 Builder.getVoidTy(),
3330 {Builder.getPtrTy(), Builder.getInt32Ty(), Builder.getPtrTy()},
3331 /* IsVarArg */ false);
3332 Function *LtGRFunc =
3333 Function::Create(FuncTy, GlobalVariable::InternalLinkage,
3334 "_omp_reduction_global_to_list_reduce_func", &M);
3335 LtGRFunc->setAttributes(FuncAttrs);
3336 LtGRFunc->addParamAttr(0, Attribute::NoUndef);
3337 LtGRFunc->addParamAttr(1, Attribute::NoUndef);
3338 LtGRFunc->addParamAttr(2, Attribute::NoUndef);
3339
3340 BasicBlock *EntryBlock = BasicBlock::Create(Ctx, "entry", LtGRFunc);
3341 Builder.SetInsertPoint(EntryBlock);
3342
3343 // Buffer: global reduction buffer.
3344 Argument *BufferArg = LtGRFunc->getArg(0);
3345 // Idx: index of the buffer.
3346 Argument *IdxArg = LtGRFunc->getArg(1);
3347 // ReduceList: thread local Reduce list.
3348 Argument *ReduceListArg = LtGRFunc->getArg(2);
3349
3350 Value *BufferArgAlloca = Builder.CreateAlloca(Builder.getPtrTy(), nullptr,
3351 BufferArg->getName() + ".addr");
3352 Value *IdxArgAlloca = Builder.CreateAlloca(Builder.getInt32Ty(), nullptr,
3353 IdxArg->getName() + ".addr");
3354 Value *ReduceListArgAlloca = Builder.CreateAlloca(
3355 Builder.getPtrTy(), nullptr, ReduceListArg->getName() + ".addr");
3356 ArrayType *RedListArrayTy =
3357 ArrayType::get(Builder.getPtrTy(), ReductionInfos.size());
3358
3359 // 1. Build a list of reduction variables.
3360 // void *RedList[<n>] = {<ReductionVars>[0], ..., <ReductionVars>[<n>-1]};
3361 Value *LocalReduceList =
3362 Builder.CreateAlloca(RedListArrayTy, nullptr, ".omp.reduction.red_list");
3363
3364 Value *BufferArgAddrCast = Builder.CreatePointerBitCastOrAddrSpaceCast(
3365 BufferArgAlloca, Builder.getPtrTy(),
3366 BufferArgAlloca->getName() + ".ascast");
3367 Value *IdxArgAddrCast = Builder.CreatePointerBitCastOrAddrSpaceCast(
3368 IdxArgAlloca, Builder.getPtrTy(), IdxArgAlloca->getName() + ".ascast");
3369 Value *ReduceListArgAddrCast = Builder.CreatePointerBitCastOrAddrSpaceCast(
3370 ReduceListArgAlloca, Builder.getPtrTy(),
3371 ReduceListArgAlloca->getName() + ".ascast");
3372 Value *ReductionList = Builder.CreatePointerBitCastOrAddrSpaceCast(
3373 LocalReduceList, Builder.getPtrTy(),
3374 LocalReduceList->getName() + ".ascast");
3375
3376 Builder.CreateStore(BufferArg, BufferArgAddrCast);
3377 Builder.CreateStore(IdxArg, IdxArgAddrCast);
3378 Builder.CreateStore(ReduceListArg, ReduceListArgAddrCast);
3379
3380 Value *BufferVal = Builder.CreateLoad(Builder.getPtrTy(), BufferArgAddrCast);
3381 Value *Idxs[] = {Builder.CreateLoad(Builder.getInt32Ty(), IdxArgAddrCast)};
3382 Type *IndexTy = Builder.getIndexTy(
3383 M.getDataLayout(), M.getDataLayout().getDefaultGlobalsAddressSpace());
3384 for (auto En : enumerate(ReductionInfos)) {
3385 Value *TargetElementPtrPtr = Builder.CreateInBoundsGEP(
3386 RedListArrayTy, ReductionList,
3387 {ConstantInt::get(IndexTy, 0), ConstantInt::get(IndexTy, En.index())});
3388 // Global = Buffer.VD[Idx];
3389 Value *BufferVD =
3390 Builder.CreateInBoundsGEP(ReductionsBufferTy, BufferVal, Idxs);
3391 Value *GlobValPtr = Builder.CreateConstInBoundsGEP2_64(
3392 ReductionsBufferTy, BufferVD, 0, En.index());
3393 Builder.CreateStore(GlobValPtr, TargetElementPtrPtr);
3394 }
3395
3396 // Call reduce_function(ReduceList, GlobalReduceList)
3397 Value *ReduceList =
3398 Builder.CreateLoad(Builder.getPtrTy(), ReduceListArgAddrCast);
3399 Builder.CreateCall(ReduceFn, {ReduceList, ReductionList})
3400 ->addFnAttr(Attribute::NoUnwind);
3401 Builder.CreateRetVoid();
3402 Builder.restoreIP(OldIP);
3403 return LtGRFunc;
3404}
3405
3406std::string OpenMPIRBuilder::getReductionFuncName(StringRef Name) const {
3407 std::string Suffix =
3408 createPlatformSpecificName({"omp", "reduction", "reduction_func"});
3409 return (Name + Suffix).str();
3410}
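// With the default configuration this yields names of the form
// "<Name>.omp.reduction.reduction_func"; the separator inserted by
// createPlatformSpecificName is target-dependent.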
3411
3412Expected<Function *> OpenMPIRBuilder::createReductionFunction(
3413 StringRef ReducerName, ArrayRef<ReductionInfo> ReductionInfos,
3414 ReductionGenCBKind ReductionGenCBKind, AttributeList FuncAttrs) {
3415 auto *FuncTy = FunctionType::get(Builder.getVoidTy(),
3416 {Builder.getPtrTy(), Builder.getPtrTy()},
3417 /* IsVarArg */ false);
3418 std::string Name = getReductionFuncName(ReducerName);
3419 Function *ReductionFunc =
3420 Function::Create(FuncTy, GlobalVariable::InternalLinkage, Name, &M);
3421 ReductionFunc->setAttributes(FuncAttrs);
3422 ReductionFunc->addParamAttr(0, Attribute::NoUndef);
3423 ReductionFunc->addParamAttr(1, Attribute::NoUndef);
3424 BasicBlock *EntryBB =
3425 BasicBlock::Create(M.getContext(), "entry", ReductionFunc);
3426 Builder.SetInsertPoint(EntryBB);
3427
3428 // Need to alloca memory here and deal with the pointers before getting
3429 // LHS/RHS pointers out
3430 Value *LHSArrayPtr = nullptr;
3431 Value *RHSArrayPtr = nullptr;
3432 Argument *Arg0 = ReductionFunc->getArg(0);
3433 Argument *Arg1 = ReductionFunc->getArg(1);
3434 Type *Arg0Type = Arg0->getType();
3435 Type *Arg1Type = Arg1->getType();
3436
3437 Value *LHSAlloca =
3438 Builder.CreateAlloca(Arg0Type, nullptr, Arg0->getName() + ".addr");
3439 Value *RHSAlloca =
3440 Builder.CreateAlloca(Arg1Type, nullptr, Arg1->getName() + ".addr");
3441 Value *LHSAddrCast = Builder.CreatePointerBitCastOrAddrSpaceCast(
3442 LHSAlloca, Arg0Type, LHSAlloca->getName() + ".ascast");
3443 Value *RHSAddrCast = Builder.CreatePointerBitCastOrAddrSpaceCast(
3444 RHSAlloca, Arg1Type, RHSAlloca->getName() + ".ascast");
3445 Builder.CreateStore(Arg0, LHSAddrCast);
3446 Builder.CreateStore(Arg1, RHSAddrCast);
3447 LHSArrayPtr = Builder.CreateLoad(Arg0Type, LHSAddrCast);
3448 RHSArrayPtr = Builder.CreateLoad(Arg1Type, RHSAddrCast);
3449
3450 Type *RedArrayTy = ArrayType::get(Builder.getPtrTy(), ReductionInfos.size());
3451 Type *IndexTy = Builder.getIndexTy(
3452 M.getDataLayout(), M.getDataLayout().getDefaultGlobalsAddressSpace());
3453 SmallVector<Value *> LHSPtrs, RHSPtrs;
3454 for (auto En : enumerate(ReductionInfos)) {
3455 const ReductionInfo &RI = En.value();
3456 Value *RHSI8PtrPtr = Builder.CreateInBoundsGEP(
3457 RedArrayTy, RHSArrayPtr,
3458 {ConstantInt::get(IndexTy, 0), ConstantInt::get(IndexTy, En.index())});
3459 Value *RHSI8Ptr = Builder.CreateLoad(Builder.getPtrTy(), RHSI8PtrPtr);
3460 Value *RHSPtr = Builder.CreatePointerBitCastOrAddrSpaceCast(
3461 RHSI8Ptr, RI.PrivateVariable->getType(),
3462 RHSI8Ptr->getName() + ".ascast");
3463
3464 Value *LHSI8PtrPtr = Builder.CreateInBoundsGEP(
3465 RedArrayTy, LHSArrayPtr,
3466 {ConstantInt::get(IndexTy, 0), ConstantInt::get(IndexTy, En.index())});
3467 Value *LHSI8Ptr = Builder.CreateLoad(Builder.getPtrTy(), LHSI8PtrPtr);
3468 Value *LHSPtr = Builder.CreatePointerBitCastOrAddrSpaceCast(
3469 LHSI8Ptr, RI.Variable->getType(), LHSI8Ptr->getName() + ".ascast");
3470
3471 if (ReductionGenCBKind == ReductionGenCBKind::Clang) {
3472 LHSPtrs.emplace_back(LHSPtr);
3473 RHSPtrs.emplace_back(RHSPtr);
3474 } else {
3475 Value *LHS = Builder.CreateLoad(RI.ElementType, LHSPtr);
3476 Value *RHS = Builder.CreateLoad(RI.ElementType, RHSPtr);
3477 Value *Reduced;
3478 InsertPointOrErrorTy AfterIP =
3479 RI.ReductionGen(Builder.saveIP(), LHS, RHS, Reduced);
3480 if (!AfterIP)
3481 return AfterIP.takeError();
3482 if (!Builder.GetInsertBlock())
3483 return ReductionFunc;
3484 Builder.CreateStore(Reduced, LHSPtr);
3485 }
3486 }
3487
3488 if (ReductionGenCBKind == ReductionGenCBKind::Clang)
3489 for (auto En : enumerate(ReductionInfos)) {
3490 unsigned Index = En.index();
3491 const ReductionInfo &RI = En.value();
3492 Value *LHSFixupPtr, *RHSFixupPtr;
3493 Builder.restoreIP(RI.ReductionGenClang(
3494 Builder.saveIP(), Index, &LHSFixupPtr, &RHSFixupPtr, ReductionFunc));
3495
3496 // Fix the CallBack code generated to use the correct Values for the LHS
3497 // and RHS
3498 LHSFixupPtr->replaceUsesWithIf(
3499 LHSPtrs[Index], [ReductionFunc](const Use &U) {
3500 return cast<Instruction>(U.getUser())->getParent()->getParent() ==
3501 ReductionFunc;
3502 });
3503 RHSFixupPtr->replaceUsesWithIf(
3504 RHSPtrs[Index], [ReductionFunc](const Use &U) {
3505 return cast<Instruction>(U.getUser())->getParent()->getParent() ==
3506 ReductionFunc;
3507 });
3508 }
3509
3510 Builder.CreateRetVoid();
3511 return ReductionFunc;
3512}
3513
3514static void
3515 checkReductionInfos(ArrayRef<OpenMPIRBuilder::ReductionInfo> ReductionInfos,
3516 bool IsGPU) {
3517 for (const OpenMPIRBuilder::ReductionInfo &RI : ReductionInfos) {
3518 (void)RI;
3519 assert(RI.Variable && "expected non-null variable");
3520 assert(RI.PrivateVariable && "expected non-null private variable");
3521 assert((RI.ReductionGen || RI.ReductionGenClang) &&
3522 "expected non-null reduction generator callback");
3523 if (!IsGPU) {
3524 assert(
3525 RI.Variable->getType() == RI.PrivateVariable->getType() &&
3526 "expected variables and their private equivalents to have the same "
3527 "type");
3528 }
3529 assert(RI.Variable->getType()->isPointerTy() &&
3530 "expected variables to be pointers");
3531 }
3532}
3533
3534 OpenMPIRBuilder::InsertPointOrErrorTy OpenMPIRBuilder::createReductionsGPU(
3535 const LocationDescription &Loc, InsertPointTy AllocaIP,
3536 InsertPointTy CodeGenIP, ArrayRef<ReductionInfo> ReductionInfos,
3537 bool IsNoWait, bool IsTeamsReduction, bool HasDistribute,
3538 ReductionGenCBKind ReductionGenCBKind, std::optional<omp::GV> GridValue,
3539 unsigned ReductionBufNum, Value *SrcLocInfo) {
3540 if (!updateToLocation(Loc))
3541 return InsertPointTy();
3542 Builder.restoreIP(CodeGenIP);
3543 checkReductionInfos(ReductionInfos, /*IsGPU*/ true);
3544 LLVMContext &Ctx = M.getContext();
3545
3546 // Source location for the ident struct
3547 if (!SrcLocInfo) {
3548 uint32_t SrcLocStrSize;
3549 Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
3550 SrcLocInfo = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
3551 }
3552
3553 if (ReductionInfos.size() == 0)
3554 return Builder.saveIP();
3555
3556 Function *CurFunc = Builder.GetInsertBlock()->getParent();
3557 AttributeList FuncAttrs;
3558 AttrBuilder AttrBldr(Ctx);
3559 for (auto Attr : CurFunc->getAttributes().getFnAttrs())
3560 AttrBldr.addAttribute(Attr);
3561 AttrBldr.removeAttribute(Attribute::OptimizeNone);
3562 FuncAttrs = FuncAttrs.addFnAttributes(Ctx, AttrBldr);
3563
3564 CodeGenIP = Builder.saveIP();
3565 Expected<Function *> ReductionResult =
3566 createReductionFunction(Builder.GetInsertBlock()->getParent()->getName(),
3567 ReductionInfos, ReductionGenCBKind, FuncAttrs);
3568 if (!ReductionResult)
3569 return ReductionResult.takeError();
3570 Function *ReductionFunc = *ReductionResult;
3571 Builder.restoreIP(CodeGenIP);
3572
3573 // Set the grid value in the config needed for lowering later on
3574 if (GridValue.has_value())
3575 Config.setGridValue(GridValue.value());
3576 else
3577 Config.setGridValue(getGridValue(T, ReductionFunc));
3578
3579 // Build res = __kmpc_reduce{_nowait}(<gtid>, <n>, sizeof(RedList),
3580 // RedList, shuffle_reduce_func, interwarp_copy_func);
3581 // or
3582 // Build res = __kmpc_reduce_teams_nowait_simple(<loc>, <gtid>, <lck>);
3583 Value *Res;
3584
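// For the non-teams case below, the emitted call has roughly this shape
// (function and value names are illustrative; the exact runtime signature is
// defined in OMPKinds.def):
//   %res = call i32 @__kmpc_nvptx_parallel_reduce_nowait_v2(
//              ptr %ident, i64 %reduce_data_size, ptr %red_list,
//              ptr @shuffle_and_reduce_func, ptr @inter_warp_copy_func)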
3585 // 1. Build a list of reduction variables.
3586 // void *RedList[<n>] = {<ReductionVars>[0], ..., <ReductionVars>[<n>-1]};
3587 auto Size = ReductionInfos.size();
3588 Type *PtrTy = PointerType::getUnqual(Ctx);
3589 Type *RedArrayTy = ArrayType::get(PtrTy, Size);
3590 CodeGenIP = Builder.saveIP();
3591 Builder.restoreIP(AllocaIP);
3592 Value *ReductionListAlloca =
3593 Builder.CreateAlloca(RedArrayTy, nullptr, ".omp.reduction.red_list");
3594 Value *ReductionList = Builder.CreatePointerBitCastOrAddrSpaceCast(
3595 ReductionListAlloca, PtrTy, ReductionListAlloca->getName() + ".ascast");
3596 Builder.restoreIP(CodeGenIP);
3597 Type *IndexTy = Builder.getIndexTy(
3598 M.getDataLayout(), M.getDataLayout().getDefaultGlobalsAddressSpace());
3599 for (auto En : enumerate(ReductionInfos)) {
3600 const ReductionInfo &RI = En.value();
3601 Value *ElemPtr = Builder.CreateInBoundsGEP(
3602 RedArrayTy, ReductionList,
3603 {ConstantInt::get(IndexTy, 0), ConstantInt::get(IndexTy, En.index())});
3604 Value *CastElem =
3605 Builder.CreatePointerBitCastOrAddrSpaceCast(RI.PrivateVariable, PtrTy);
3606 Builder.CreateStore(CastElem, ElemPtr);
3607 }
3608 CodeGenIP = Builder.saveIP();
3609 Function *SarFunc =
3610 emitShuffleAndReduceFunction(ReductionInfos, ReductionFunc, FuncAttrs);
3611 Expected<Function *> CopyResult =
3612 emitInterWarpCopyFunction(Loc, ReductionInfos, FuncAttrs);
3613 if (!CopyResult)
3614 return CopyResult.takeError();
3615 Function *WcFunc = *CopyResult;
3616 Builder.restoreIP(CodeGenIP);
3617
3618 Value *RL = Builder.CreatePointerBitCastOrAddrSpaceCast(ReductionList, PtrTy);
3619
3620 unsigned MaxDataSize = 0;
3621 SmallVector<Type *> ReductionTypeArgs;
3622 for (auto En : enumerate(ReductionInfos)) {
3623 auto Size = M.getDataLayout().getTypeStoreSize(En.value().ElementType);
3624 if (Size > MaxDataSize)
3625 MaxDataSize = Size;
3626 ReductionTypeArgs.emplace_back(En.value().ElementType);
3627 }
3628 Value *ReductionDataSize =
3629 Builder.getInt64(MaxDataSize * ReductionInfos.size());
3630 if (!IsTeamsReduction) {
3631 Value *SarFuncCast =
3632 Builder.CreatePointerBitCastOrAddrSpaceCast(SarFunc, PtrTy);
3633 Value *WcFuncCast =
3634 Builder.CreatePointerBitCastOrAddrSpaceCast(WcFunc, PtrTy);
3635 Value *Args[] = {SrcLocInfo, ReductionDataSize, RL, SarFuncCast,
3636 WcFuncCast};
3637 Function *Pv2Ptr = getOrCreateRuntimeFunctionPtr(
3638 RuntimeFunction::OMPRTL___kmpc_nvptx_parallel_reduce_nowait_v2);
3639 Res = Builder.CreateCall(Pv2Ptr, Args);
3640 } else {
3641 CodeGenIP = Builder.saveIP();
3642 StructType *ReductionsBufferTy = StructType::create(
3643 Ctx, ReductionTypeArgs, "struct._globalized_locals_ty");
3644 Function *RedFixedBuferFn = getOrCreateRuntimeFunctionPtr(
3645 RuntimeFunction::OMPRTL___kmpc_reduction_get_fixed_buffer);
3646 Function *LtGCFunc = emitListToGlobalCopyFunction(
3647 ReductionInfos, ReductionsBufferTy, FuncAttrs);
3648 Function *LtGRFunc = emitListToGlobalReduceFunction(
3649 ReductionInfos, ReductionFunc, ReductionsBufferTy, FuncAttrs);
3650 Function *GtLCFunc = emitGlobalToListCopyFunction(
3651 ReductionInfos, ReductionsBufferTy, FuncAttrs);
3652 Function *GtLRFunc = emitGlobalToListReduceFunction(
3653 ReductionInfos, ReductionFunc, ReductionsBufferTy, FuncAttrs);
3654 Builder.restoreIP(CodeGenIP);
3655
3656 Value *KernelTeamsReductionPtr = Builder.CreateCall(
3657 RedFixedBuferFn, {}, "_openmp_teams_reductions_buffer_$_$ptr");
3658
3659 Value *Args3[] = {SrcLocInfo,
3660 KernelTeamsReductionPtr,
3661 Builder.getInt32(ReductionBufNum),
3662 ReductionDataSize,
3663 RL,
3664 SarFunc,
3665 WcFunc,
3666 LtGCFunc,
3667 LtGRFunc,
3668 GtLCFunc,
3669 GtLRFunc};
3670
3671 Function *TeamsReduceFn = getOrCreateRuntimeFunctionPtr(
3672 RuntimeFunction::OMPRTL___kmpc_nvptx_teams_reduce_nowait_v2);
3673 Res = Builder.CreateCall(TeamsReduceFn, Args3);
3674 }
3675
3676 // 5. Build if (res == 1)
3677 BasicBlock *ExitBB = BasicBlock::Create(Ctx, ".omp.reduction.done");
3678 BasicBlock *ThenBB = BasicBlock::Create(Ctx, ".omp.reduction.then");
3679 Value *Cond = Builder.CreateICmpEQ(Res, Builder.getInt32(1));
3680 Builder.CreateCondBr(Cond, ThenBB, ExitBB);
3681
3682 // 6. Build then branch: where we have reduced values in the master
3683 // thread in each team.
3684 // __kmpc_end_reduce{_nowait}(<gtid>);
3685 // break;
3686 emitBlock(ThenBB, CurFunc);
3687
3688 // Add emission of __kmpc_end_reduce{_nowait}(<gtid>);
3689 for (auto En : enumerate(ReductionInfos)) {
3690 const ReductionInfo &RI = En.value();
3691 Value *LHS = RI.Variable;
3692 Value *RHS =
3693 Builder.CreatePointerBitCastOrAddrSpaceCast(RI.PrivateVariable, PtrTy);
3694
3695 if (ReductionGenCBKind == ReductionGenCBKind::Clang) {
3696 Value *LHSPtr, *RHSPtr;
3697 Builder.restoreIP(RI.ReductionGenClang(Builder.saveIP(), En.index(),
3698 &LHSPtr, &RHSPtr, CurFunc));
3699
3700 // Fix the CallBack code generated to use the correct Values for the LHS
3701 // and RHS
3702 LHSPtr->replaceUsesWithIf(LHS, [ReductionFunc](const Use &U) {
3703 return cast<Instruction>(U.getUser())->getParent()->getParent() ==
3704 ReductionFunc;
3705 });
3706 RHSPtr->replaceUsesWithIf(RHS, [ReductionFunc](const Use &U) {
3707 return cast<Instruction>(U.getUser())->getParent()->getParent() ==
3708 ReductionFunc;
3709 });
3710 } else {
3711 assert(false && "Unhandled ReductionGenCBKind");
3712 }
3713 }
3714 emitBlock(ExitBB, CurFunc);
3715
3716 Config.setEmitLLVMUsed();
3717
3718 return Builder.saveIP();
3719}
3720
3721 static Function *getFreshReductionFunc(Module &M) {
3722 Type *VoidTy = Type::getVoidTy(M.getContext());
3723 Type *Int8PtrTy = PointerType::getUnqual(M.getContext());
3724 auto *FuncTy =
3725 FunctionType::get(VoidTy, {Int8PtrTy, Int8PtrTy}, /* IsVarArg */ false);
3726 return Function::Create(FuncTy, GlobalVariable::InternalLinkage,
3727 ".omp.reduction.func", &M);
3728}
3729
3730 OpenMPIRBuilder::InsertPointOrErrorTy
3731 OpenMPIRBuilder::createReductions(const LocationDescription &Loc,
3732 InsertPointTy AllocaIP,
3733 ArrayRef<ReductionInfo> ReductionInfos,
3734 ArrayRef<bool> IsByRef, bool IsNoWait) {
3735 assert(ReductionInfos.size() == IsByRef.size());
3736 for (const ReductionInfo &RI : ReductionInfos) {
3737 (void)RI;
3738 assert(RI.Variable && "expected non-null variable");
3739 assert(RI.PrivateVariable && "expected non-null private variable");
3740 assert(RI.ReductionGen && "expected non-null reduction generator callback");
3741 assert(RI.Variable->getType() == RI.PrivateVariable->getType() &&
3742 "expected variables and their private equivalents to have the same "
3743 "type");
3744 assert(RI.Variable->getType()->isPointerTy() &&
3745 "expected variables to be pointers");
3746 }
3747
3748 if (!updateToLocation(Loc))
3749 return InsertPointTy();
3750
3751 BasicBlock *InsertBlock = Loc.IP.getBlock();
3752 BasicBlock *ContinuationBlock =
3753 InsertBlock->splitBasicBlock(Loc.IP.getPoint(), "reduce.finalize");
3754 InsertBlock->getTerminator()->eraseFromParent();
3755
3756 // Create and populate array of type-erased pointers to private reduction
3757 // values.
3758 unsigned NumReductions = ReductionInfos.size();
3759 Type *RedArrayTy = ArrayType::get(Builder.getPtrTy(), NumReductions);
3760 Builder.SetInsertPoint(AllocaIP.getBlock()->getTerminator());
3761 Value *RedArray = Builder.CreateAlloca(RedArrayTy, nullptr, "red.array");
3762
3763 Builder.SetInsertPoint(InsertBlock, InsertBlock->end());
3764
3765 for (auto En : enumerate(ReductionInfos)) {
3766 unsigned Index = En.index();
3767 const ReductionInfo &RI = En.value();
3768 Value *RedArrayElemPtr = Builder.CreateConstInBoundsGEP2_64(
3769 RedArrayTy, RedArray, 0, Index, "red.array.elem." + Twine(Index));
3770 Builder.CreateStore(RI.PrivateVariable, RedArrayElemPtr);
3771 }
3772
3773 // Emit a call to the runtime function that orchestrates the reduction.
3774 // Declare the reduction function in the process.
3775 Function *Func = Builder.GetInsertBlock()->getParent();
3776 Module *Module = Func->getParent();
3777 uint32_t SrcLocStrSize;
3778 Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
3779 bool CanGenerateAtomic = all_of(ReductionInfos, [](const ReductionInfo &RI) {
3780 return RI.AtomicReductionGen;
3781 });
3782 Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize,
3783 CanGenerateAtomic
3784 ? IdentFlag::OMP_IDENT_FLAG_ATOMIC_REDUCE
3785 : IdentFlag(0));
3786 Value *ThreadId = getOrCreateThreadID(Ident);
3787 Constant *NumVariables = Builder.getInt32(NumReductions);
3788 const DataLayout &DL = Module->getDataLayout();
3789 unsigned RedArrayByteSize = DL.getTypeStoreSize(RedArrayTy);
3790 Constant *RedArraySize = Builder.getInt64(RedArrayByteSize);
3791 Function *ReductionFunc = getFreshReductionFunc(*Module);
3792 Value *Lock = getOMPCriticalRegionLock(".reduction");
3793 Function *ReduceFunc = getOrCreateRuntimeFunctionPtr(
3794 IsNoWait ? RuntimeFunction::OMPRTL___kmpc_reduce_nowait
3795 : RuntimeFunction::OMPRTL___kmpc_reduce);
3796 CallInst *ReduceCall =
3797 Builder.CreateCall(ReduceFunc,
3798 {Ident, ThreadId, NumVariables, RedArraySize, RedArray,
3799 ReductionFunc, Lock},
3800 "reduce");
3801
3802 // Create final reduction entry blocks for the atomic and non-atomic case.
3803 // Emit IR that dispatches control flow to one of the blocks based on the
3804 // reduction supporting the atomic mode.
3805 BasicBlock *NonAtomicRedBlock =
3806 BasicBlock::Create(Module->getContext(), "reduce.switch.nonatomic", Func);
3807 BasicBlock *AtomicRedBlock =
3808 BasicBlock::Create(Module->getContext(), "reduce.switch.atomic", Func);
3809 SwitchInst *Switch =
3810 Builder.CreateSwitch(ReduceCall, ContinuationBlock, /* NumCases */ 2);
3811 Switch->addCase(Builder.getInt32(1), NonAtomicRedBlock);
3812 Switch->addCase(Builder.getInt32(2), AtomicRedBlock);
3813
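// Per the runtime contract, __kmpc_reduce{_nowait} returns 1 when the caller
// should perform the reduction non-atomically, 2 when the atomic fallback
// must be used, and 0 otherwise, so the dispatch emitted here looks like
// (sketch):
//   switch i32 %reduce, label %reduce.finalize [
//     i32 1, label %reduce.switch.nonatomic
//     i32 2, label %reduce.switch.atomic
//   ]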
3814 // Populate the non-atomic reduction using the elementwise reduction function.
3815 // This loads the elements from the global and private variables and reduces
3816 // them before storing back the result to the global variable.
3817 Builder.SetInsertPoint(NonAtomicRedBlock);
3818 for (auto En : enumerate(ReductionInfos)) {
3819 const ReductionInfo &RI = En.value();
3820 Type *ValueType = RI.ElementType;
3821 // We have one less load for the by-ref case because that load is now
3822 // inside of the reduction region.
3823 Value *RedValue = RI.Variable;
3824 if (!IsByRef[En.index()]) {
3825 RedValue = Builder.CreateLoad(ValueType, RI.Variable,
3826 "red.value." + Twine(En.index()));
3827 }
3828 Value *PrivateRedValue =
3829 Builder.CreateLoad(ValueType, RI.PrivateVariable,
3830 "red.private.value." + Twine(En.index()));
3831 Value *Reduced;
3832 InsertPointOrErrorTy AfterIP =
3833 RI.ReductionGen(Builder.saveIP(), RedValue, PrivateRedValue, Reduced);
3834 if (!AfterIP)
3835 return AfterIP.takeError();
3836 Builder.restoreIP(*AfterIP);
3837
3838 if (!Builder.GetInsertBlock())
3839 return InsertPointTy();
3840 // for by-ref case, the load is inside of the reduction region
3841 if (!IsByRef[En.index()])
3842 Builder.CreateStore(Reduced, RI.Variable);
3843 }
3844 Function *EndReduceFunc = getOrCreateRuntimeFunctionPtr(
3845 IsNoWait ? RuntimeFunction::OMPRTL___kmpc_end_reduce_nowait
3846 : RuntimeFunction::OMPRTL___kmpc_end_reduce);
3847 Builder.CreateCall(EndReduceFunc, {Ident, ThreadId, Lock});
3848 Builder.CreateBr(ContinuationBlock);
3849
3850 // Populate the atomic reduction using the atomic elementwise reduction
3851 // function. There are no loads/stores here because they will be happening
3852 // inside the atomic elementwise reduction.
3853 Builder.SetInsertPoint(AtomicRedBlock);
3854 if (CanGenerateAtomic && llvm::none_of(IsByRef, [](bool P) { return P; })) {
3855 for (const ReductionInfo &RI : ReductionInfos) {
3856 InsertPointOrErrorTy AfterIP = RI.AtomicReductionGen(
3857 Builder.saveIP(), RI.ElementType, RI.Variable, RI.PrivateVariable);
3858 if (!AfterIP)
3859 return AfterIP.takeError();
3860 Builder.restoreIP(*AfterIP);
3861 if (!Builder.GetInsertBlock())
3862 return InsertPointTy();
3863 }
3864 Builder.CreateBr(ContinuationBlock);
3865 } else {
3866 Builder.CreateUnreachable();
3867 }
3868
3869 // Populate the outlined reduction function using the elementwise reduction
3870 // function. Partial values are extracted from the type-erased array of
3871 // pointers to private variables.
3872 BasicBlock *ReductionFuncBlock =
3873 BasicBlock::Create(Module->getContext(), "", ReductionFunc);
3874 Builder.SetInsertPoint(ReductionFuncBlock);
3875 Value *LHSArrayPtr = ReductionFunc->getArg(0);
3876 Value *RHSArrayPtr = ReductionFunc->getArg(1);
3877
3878 for (auto En : enumerate(ReductionInfos)) {
3879 const ReductionInfo &RI = En.value();
3880 Value *LHSI8PtrPtr = Builder.CreateConstInBoundsGEP2_64(
3881 RedArrayTy, LHSArrayPtr, 0, En.index());
3882 Value *LHSI8Ptr = Builder.CreateLoad(Builder.getPtrTy(), LHSI8PtrPtr);
3883 Value *LHSPtr = Builder.CreateBitCast(LHSI8Ptr, RI.Variable->getType());
3884 Value *LHS = Builder.CreateLoad(RI.ElementType, LHSPtr);
3885 Value *RHSI8PtrPtr = Builder.CreateConstInBoundsGEP2_64(
3886 RedArrayTy, RHSArrayPtr, 0, En.index());
3887 Value *RHSI8Ptr = Builder.CreateLoad(Builder.getPtrTy(), RHSI8PtrPtr);
3888 Value *RHSPtr =
3889 Builder.CreateBitCast(RHSI8Ptr, RI.PrivateVariable->getType());
3890 Value *RHS = Builder.CreateLoad(RI.ElementType, RHSPtr);
3891 Value *Reduced;
3892 InsertPointOrErrorTy AfterIP =
3893 RI.ReductionGen(Builder.saveIP(), LHS, RHS, Reduced);
3894 if (!AfterIP)
3895 return AfterIP.takeError();
3896 Builder.restoreIP(*AfterIP);
3897 if (!Builder.GetInsertBlock())
3898 return InsertPointTy();
3899 // store is inside of the reduction region when using by-ref
3900 if (!IsByRef[En.index()])
3901 Builder.CreateStore(Reduced, LHSPtr);
3902 }
3903 Builder.CreateRetVoid();
3904
3905 Builder.SetInsertPoint(ContinuationBlock);
3906 return Builder.saveIP();
3907}
3908
3909 OpenMPIRBuilder::InsertPointOrErrorTy
3910 OpenMPIRBuilder::createMaster(const LocationDescription &Loc,
3911 BodyGenCallbackTy BodyGenCB,
3912 FinalizeCallbackTy FiniCB) {
3913 if (!updateToLocation(Loc))
3914 return Loc.IP;
3915
3916 Directive OMPD = Directive::OMPD_master;
3917 uint32_t SrcLocStrSize;
3918 Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
3919 Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
3920 Value *ThreadId = getOrCreateThreadID(Ident);
3921 Value *Args[] = {Ident, ThreadId};
3922
3923 Function *EntryRTLFn = getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_master);
3924 Instruction *EntryCall = Builder.CreateCall(EntryRTLFn, Args);
3925
3926 Function *ExitRTLFn = getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_end_master);
3927 Instruction *ExitCall = Builder.CreateCall(ExitRTLFn, Args);
3928
3929 return EmitOMPInlinedRegion(OMPD, EntryCall, ExitCall, BodyGenCB, FiniCB,
3930 /*Conditional*/ true, /*hasFinalize*/ true);
3931}
3932
3933 OpenMPIRBuilder::InsertPointOrErrorTy
3934 OpenMPIRBuilder::createMasked(const LocationDescription &Loc,
3935 BodyGenCallbackTy BodyGenCB,
3936 FinalizeCallbackTy FiniCB, Value *Filter) {
3937 if (!updateToLocation(Loc))
3938 return Loc.IP;
3939
3940 Directive OMPD = Directive::OMPD_masked;
3941 uint32_t SrcLocStrSize;
3942 Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
3943 Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
3944 Value *ThreadId = getOrCreateThreadID(Ident);
3945 Value *Args[] = {Ident, ThreadId, Filter};
3946 Value *ArgsEnd[] = {Ident, ThreadId};
3947
3948 Function *EntryRTLFn = getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_masked);
3949 Instruction *EntryCall = Builder.CreateCall(EntryRTLFn, Args);
3950
3951 Function *ExitRTLFn = getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_end_masked);
3952 Instruction *ExitCall = Builder.CreateCall(ExitRTLFn, ArgsEnd);
3953
3954 return EmitOMPInlinedRegion(OMPD, EntryCall, ExitCall, BodyGenCB, FiniCB,
3955 /*Conditional*/ true, /*hasFinalize*/ true);
3956}
3957
3958 CanonicalLoopInfo *OpenMPIRBuilder::createLoopSkeleton(
3959 DebugLoc DL, Value *TripCount, Function *F, BasicBlock *PreInsertBefore,
3960 BasicBlock *PostInsertBefore, const Twine &Name) {
3961 Module *M = F->getParent();
3962 LLVMContext &Ctx = M->getContext();
3963 Type *IndVarTy = TripCount->getType();
3964
3965 // Create the basic block structure.
3966 BasicBlock *Preheader =
3967 BasicBlock::Create(Ctx, "omp_" + Name + ".preheader", F, PreInsertBefore);
3968 BasicBlock *Header =
3969 BasicBlock::Create(Ctx, "omp_" + Name + ".header", F, PreInsertBefore);
3970 BasicBlock *Cond =
3971 BasicBlock::Create(Ctx, "omp_" + Name + ".cond", F, PreInsertBefore);
3972 BasicBlock *Body =
3973 BasicBlock::Create(Ctx, "omp_" + Name + ".body", F, PreInsertBefore);
3974 BasicBlock *Latch =
3975 BasicBlock::Create(Ctx, "omp_" + Name + ".inc", F, PostInsertBefore);
3976 BasicBlock *Exit =
3977 BasicBlock::Create(Ctx, "omp_" + Name + ".exit", F, PostInsertBefore);
3978 BasicBlock *After =
3979 BasicBlock::Create(Ctx, "omp_" + Name + ".after", F, PostInsertBefore);
3980
3981 // Use specified DebugLoc for new instructions.
3982 Builder.SetCurrentDebugLocation(DL);
3983
3984 Builder.SetInsertPoint(Preheader);
3985 Builder.CreateBr(Header);
3986
3987 Builder.SetInsertPoint(Header);
3988 PHINode *IndVarPHI = Builder.CreatePHI(IndVarTy, 2, "omp_" + Name + ".iv");
3989 IndVarPHI->addIncoming(ConstantInt::get(IndVarTy, 0), Preheader);
3990 Builder.CreateBr(Cond);
3991
3992 Builder.SetInsertPoint(Cond);
3993 Value *Cmp =
3994 Builder.CreateICmpULT(IndVarPHI, TripCount, "omp_" + Name + ".cmp");
3995 Builder.CreateCondBr(Cmp, Body, Exit);
3996
3997 Builder.SetInsertPoint(Body);
3998 Builder.CreateBr(Latch);
3999
4000 Builder.SetInsertPoint(Latch);
4001 Value *Next = Builder.CreateAdd(IndVarPHI, ConstantInt::get(IndVarTy, 1),
4002 "omp_" + Name + ".next", /*HasNUW=*/true);
4003 Builder.CreateBr(Header);
4004 IndVarPHI->addIncoming(Next, Latch);
4005
4006 Builder.SetInsertPoint(Exit);
4007 Builder.CreateBr(After);
4008
4009 // Remember and return the canonical control flow.
4010 LoopInfos.emplace_front();
4011 CanonicalLoopInfo *CL = &LoopInfos.front();
4012
4013 CL->Header = Header;
4014 CL->Cond = Cond;
4015 CL->Latch = Latch;
4016 CL->Exit = Exit;
4017
4018#ifndef NDEBUG
4019 CL->assertOK();
4020#endif
4021 return CL;
4022}
4023
4024 Expected<CanonicalLoopInfo *>
4025 OpenMPIRBuilder::createCanonicalLoop(const LocationDescription &Loc,
4026 LoopBodyGenCallbackTy BodyGenCB,
4027 Value *TripCount, const Twine &Name) {
4028 BasicBlock *BB = Loc.IP.getBlock();
4029 BasicBlock *NextBB = BB->getNextNode();
4030
4031 CanonicalLoopInfo *CL = createLoopSkeleton(Loc.DL, TripCount, BB->getParent(),
4032 NextBB, NextBB, Name);
4033 BasicBlock *After = CL->getAfter();
4034
4035 // If location is not set, don't connect the loop.
4036 if (updateToLocation(Loc)) {
4037 // Split the loop at the insertion point: Branch to the preheader and move
4038 // every following instruction to after the loop (the After BB). Also, the
4039 // new successor is the loop's after block.
4040 spliceBB(Builder, After, /*CreateBranch=*/false);
4041 Builder.CreateBr(CL->getPreheader());
4042 }
4043
4044 // Emit the body content. We do it after connecting the loop to the CFG to
4045 // avoid that the callback encounters degenerate BBs.
4046 if (Error Err = BodyGenCB(CL->getBodyIP(), CL->getIndVar()))
4047 return Err;
4048
4049#ifndef NDEBUG
4050 CL->assertOK();
4051#endif
4052 return CL;
4053}
4054
4055 Expected<CanonicalLoopInfo *> OpenMPIRBuilder::createCanonicalLoop(
4056 const LocationDescription &Loc, LoopBodyGenCallbackTy BodyGenCB,
4057 Value *Start, Value *Stop, Value *Step, bool IsSigned, bool InclusiveStop,
4058 InsertPointTy ComputeIP, const Twine &Name) {
4059
4060 // Consider the following difficulties (assuming 8-bit signed integers):
4061 // * Adding \p Step to the loop counter which passes \p Stop may overflow:
4062 // DO I = 1, 100, 50
4063 // * A \p Step of INT_MIN cannot be normalized to a positive direction:
4064 // DO I = 100, 0, -128
4065
4066 // Start, Stop and Step must be of the same integer type.
4067 auto *IndVarTy = cast<IntegerType>(Start->getType());
4068 assert(IndVarTy == Stop->getType() && "Stop type mismatch");
4069 assert(IndVarTy == Step->getType() && "Step type mismatch");
4070
4071 LocationDescription ComputeLoc =
4072 ComputeIP.isSet() ? LocationDescription(ComputeIP, Loc.DL) : Loc;
4073 updateToLocation(ComputeLoc);
4074
4075 ConstantInt *Zero = ConstantInt::get(IndVarTy, 0);
4076 ConstantInt *One = ConstantInt::get(IndVarTy, 1);
4077
4078 // Like Step, but always positive.
4079 Value *Incr = Step;
4080
4081 // Distance between Start and Stop; always positive.
4082 Value *Span;
4083
4084 // Condition whether no iterations are executed at all, e.g. because
4085 // UB < LB.
4086 Value *ZeroCmp;
4087
4088 if (IsSigned) {
4089 // Ensure that increment is positive. If not, negate and invert LB and UB.
4090 Value *IsNeg = Builder.CreateICmpSLT(Step, Zero);
4091 Incr = Builder.CreateSelect(IsNeg, Builder.CreateNeg(Step), Step);
4092 Value *LB = Builder.CreateSelect(IsNeg, Stop, Start);
4093 Value *UB = Builder.CreateSelect(IsNeg, Start, Stop);
4094 Span = Builder.CreateSub(UB, LB, "", false, true);
4095 ZeroCmp = Builder.CreateICmp(
4096 InclusiveStop ? CmpInst::ICMP_SLT : CmpInst::ICMP_SLE, UB, LB);
4097 } else {
4098 Span = Builder.CreateSub(Stop, Start, "", true);
4099 ZeroCmp = Builder.CreateICmp(
4100 InclusiveStop ? CmpInst::ICMP_ULT : CmpInst::ICMP_ULE, Stop, Start);
4101 }
4102
4103 Value *CountIfLooping;
4104 if (InclusiveStop) {
4105 CountIfLooping = Builder.CreateAdd(Builder.CreateUDiv(Span, Incr), One);
4106 } else {
4107 // Avoid incrementing past stop since it could overflow.
4108 Value *CountIfTwo = Builder.CreateAdd(
4109 Builder.CreateUDiv(Builder.CreateSub(Span, One), Incr), One);
4110 Value *OneCmp = Builder.CreateICmp(CmpInst::ICMP_ULE, Span, Incr);
4111 CountIfLooping = Builder.CreateSelect(OneCmp, One, CountIfTwo);
4112 }
4113 Value *TripCount = Builder.CreateSelect(ZeroCmp, Zero, CountIfLooping,
4114 "omp_" + Name + ".tripcount");
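// Worked example (unsigned, InclusiveStop=false): Start=1, Stop=100, Step=50
// gives Span=99 and CountIfTwo=(99-1)/50+1=2, so TripCount=2; the body then
// runs for I=1 and I=51 without the counter ever having to reach 101.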
4115
4116 auto BodyGen = [=](InsertPointTy CodeGenIP, Value *IV) {
4117 Builder.restoreIP(CodeGenIP);
4118 Value *Span = Builder.CreateMul(IV, Step);
4119 Value *IndVar = Builder.CreateAdd(Span, Start);
4120 return BodyGenCB(Builder.saveIP(), IndVar);
4121 };
4122 LocationDescription LoopLoc = ComputeIP.isSet() ? Loc.IP : Builder.saveIP();
4123 return createCanonicalLoop(LoopLoc, BodyGen, TripCount, Name);
4124}
4125
4126// Returns an LLVM function to call for initializing loop bounds using OpenMP
4127// static scheduling depending on `type`. Only i32 and i64 are supported by the
4128// runtime. Always interpret integers as unsigned similarly to
4129// CanonicalLoopInfo.
4130 static FunctionCallee getKmpcForStaticInitForType(Type *Ty, Module &M,
4131 OpenMPIRBuilder &OMPBuilder) {
4132 unsigned Bitwidth = Ty->getIntegerBitWidth();
4133 if (Bitwidth == 32)
4134 return OMPBuilder.getOrCreateRuntimeFunction(
4135 M, omp::RuntimeFunction::OMPRTL___kmpc_for_static_init_4u);
4136 if (Bitwidth == 64)
4137 return OMPBuilder.getOrCreateRuntimeFunction(
4138 M, omp::RuntimeFunction::OMPRTL___kmpc_for_static_init_8u);
4139 llvm_unreachable("unknown OpenMP loop iterator bitwidth");
4140}
4141
4142 OpenMPIRBuilder::InsertPointOrErrorTy
4143 OpenMPIRBuilder::applyStaticWorkshareLoop(DebugLoc DL, CanonicalLoopInfo *CLI,
4144 InsertPointTy AllocaIP,
4145 bool NeedsBarrier) {
4146 assert(CLI->isValid() && "Requires a valid canonical loop");
4147 assert(!isConflictIP(AllocaIP, CLI->getPreheaderIP()) &&
4148 "Require dedicated allocate IP");
4149
4150 // Set up the source location value for OpenMP runtime.
4151 Builder.restoreIP(CLI->getPreheaderIP());
4152 Builder.SetCurrentDebugLocation(DL);
4153
4154 uint32_t SrcLocStrSize;
4155 Constant *SrcLocStr = getOrCreateSrcLocStr(DL, SrcLocStrSize);
4156 Value *SrcLoc = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
4157
4158 // Declare useful OpenMP runtime functions.
4159 Value *IV = CLI->getIndVar();
4160 Type *IVTy = IV->getType();
4161 FunctionCallee StaticInit = getKmpcForStaticInitForType(IVTy, M, *this);
4162 FunctionCallee StaticFini =
4163 getOrCreateRuntimeFunction(M, omp::OMPRTL___kmpc_for_static_fini);
4164
4165 // Allocate space for computed loop bounds as expected by the "init" function.
4166 Builder.SetInsertPoint(AllocaIP.getBlock()->getFirstNonPHIOrDbgOrAlloca());
4167
4168 Type *I32Type = Type::getInt32Ty(M.getContext());
4169 Value *PLastIter = Builder.CreateAlloca(I32Type, nullptr, "p.lastiter");
4170 Value *PLowerBound = Builder.CreateAlloca(IVTy, nullptr, "p.lowerbound");
4171 Value *PUpperBound = Builder.CreateAlloca(IVTy, nullptr, "p.upperbound");
4172 Value *PStride = Builder.CreateAlloca(IVTy, nullptr, "p.stride");
4173
4174 // At the end of the preheader, prepare for calling the "init" function by
4175 // storing the current loop bounds into the allocated space. A canonical loop
4176 // always iterates from 0 to trip-count with step 1. Note that "init" expects
4177 // and produces an inclusive upper bound.
4178 Builder.SetInsertPoint(CLI->getPreheader()->getTerminator());
4179 Constant *Zero = ConstantInt::get(IVTy, 0);
4180 Constant *One = ConstantInt::get(IVTy, 1);
4181 Builder.CreateStore(Zero, PLowerBound);
4182 Value *UpperBound = Builder.CreateSub(CLI->getTripCount(), One);
4183 Builder.CreateStore(UpperBound, PUpperBound);
4184 Builder.CreateStore(One, PStride);
4185
4186 Value *ThreadNum = getOrCreateThreadID(SrcLoc);
4187
4188 Constant *SchedulingType = ConstantInt::get(
4189 I32Type, static_cast<int>(OMPScheduleType::UnorderedStatic));
4190
4191 // Call the "init" function and update the trip count of the loop with the
4192 // value it produced.
4193 Builder.CreateCall(StaticInit,
4194 {SrcLoc, ThreadNum, SchedulingType, PLastIter, PLowerBound,
4195 PUpperBound, PStride, One, Zero});
4196 Value *LowerBound = Builder.CreateLoad(IVTy, PLowerBound);
4197 Value *InclusiveUpperBound = Builder.CreateLoad(IVTy, PUpperBound);
4198 Value *TripCountMinusOne = Builder.CreateSub(InclusiveUpperBound, LowerBound);
4199 Value *TripCount = Builder.CreateAdd(TripCountMinusOne, One);
4200 CLI->setTripCount(TripCount);
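// For example, with a trip count of 8 shared by two threads, the runtime
// typically rewrites the inclusive bounds [0, 7] to [0, 3] on thread 0 and
// [4, 7] on thread 1; the computation above then re-derives each thread's
// local trip count (here 4) from those bounds.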
4201
4202 // Update all uses of the induction variable except the one in the condition
4203 // block that compares it with the actual upper bound, and the increment in
4204 // the latch block.
4205
4206 CLI->mapIndVar([&](Instruction *OldIV) -> Value * {
4207 Builder.SetInsertPoint(CLI->getBody(),
4208 CLI->getBody()->getFirstInsertionPt());
4209 Builder.SetCurrentDebugLocation(DL);
4210 return Builder.CreateAdd(OldIV, LowerBound);
4211 });
4212
4213 // In the "exit" block, call the "fini" function.
4214 Builder.SetInsertPoint(CLI->getExit(),
4215 CLI->getExit()->getTerminator()->getIterator());
4216 Builder.CreateCall(StaticFini, {SrcLoc, ThreadNum});
4217
4218 // Add the barrier if requested.
4219 if (NeedsBarrier) {
4220 InsertPointOrErrorTy BarrierIP =
4221 createBarrier(LocationDescription(Builder.saveIP(), DL),
4222 omp::Directive::OMPD_for, /* ForceSimpleCall */ false,
4223 /* CheckCancelFlag */ false);
4224 if (!BarrierIP)
4225 return BarrierIP.takeError();
4226 }
4227
4228 InsertPointTy AfterIP = CLI->getAfterIP();
4229 CLI->invalidate();
4230
4231 return AfterIP;
4232}
4233
4234 OpenMPIRBuilder::InsertPointOrErrorTy
4235 OpenMPIRBuilder::applyStaticChunkedWorkshareLoop(DebugLoc DL,
4236 CanonicalLoopInfo *CLI,
4237 InsertPointTy AllocaIP,
4238 bool NeedsBarrier,
4239 Value *ChunkSize) {
4240 assert(CLI->isValid() && "Requires a valid canonical loop");
4241 assert(ChunkSize && "Chunk size is required");
4242
4243 LLVMContext &Ctx = CLI->getFunction()->getContext();
4244 Value *IV = CLI->getIndVar();
4245 Value *OrigTripCount = CLI->getTripCount();
4246 Type *IVTy = IV->getType();
4247 assert(IVTy->getIntegerBitWidth() <= 64 &&
4248 "Max supported tripcount bitwidth is 64 bits");
4249 Type *InternalIVTy = IVTy->getIntegerBitWidth() <= 32 ? Type::getInt32Ty(Ctx)
4250 : Type::getInt64Ty(Ctx);
4251 Type *I32Type = Type::getInt32Ty(M.getContext());
4252 Constant *Zero = ConstantInt::get(InternalIVTy, 0);
4253 Constant *One = ConstantInt::get(InternalIVTy, 1);
4254
4255 // Declare useful OpenMP runtime functions.
4256 FunctionCallee StaticInit =
4257 getKmpcForStaticInitForType(InternalIVTy, M, *this);
4258 FunctionCallee StaticFini =
4259 getOrCreateRuntimeFunction(M, omp::OMPRTL___kmpc_for_static_fini);
4260
4261 // Allocate space for computed loop bounds as expected by the "init" function.
4262 Builder.restoreIP(AllocaIP);
4263 Builder.SetCurrentDebugLocation(DL);
4264 Value *PLastIter = Builder.CreateAlloca(I32Type, nullptr, "p.lastiter");
4265 Value *PLowerBound =
4266 Builder.CreateAlloca(InternalIVTy, nullptr, "p.lowerbound");
4267 Value *PUpperBound =
4268 Builder.CreateAlloca(InternalIVTy, nullptr, "p.upperbound");
4269 Value *PStride = Builder.CreateAlloca(InternalIVTy, nullptr, "p.stride");
4270
4271 // Set up the source location value for the OpenMP runtime.
4272 Builder.SetInsertPoint(CLI->getPreheader()->getTerminator());
4273 Builder.SetCurrentDebugLocation(DL);
4274
4275 // TODO: Detect overflow in ubsan or max-out with current tripcount.
4276 Value *CastedChunkSize =
4277 Builder.CreateZExtOrTrunc(ChunkSize, InternalIVTy, "chunksize");
4278 Value *CastedTripCount =
4279 Builder.CreateZExt(OrigTripCount, InternalIVTy, "tripcount");
4280
4281 Constant *SchedulingType = ConstantInt::get(
4282 I32Type, static_cast<int>(OMPScheduleType::UnorderedStaticChunked));
4283 Builder.CreateStore(Zero, PLowerBound);
4284 Value *OrigUpperBound = Builder.CreateSub(CastedTripCount, One);
4285 Builder.CreateStore(OrigUpperBound, PUpperBound);
4286 Builder.CreateStore(One, PStride);
4287
4288 // Call the "init" function and update the trip count of the loop with the
4289 // value it produced.
4290 uint32_t SrcLocStrSize;
4291 Constant *SrcLocStr = getOrCreateSrcLocStr(DL, SrcLocStrSize);
4292 Value *SrcLoc = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
4293 Value *ThreadNum = getOrCreateThreadID(SrcLoc);
4294 Builder.CreateCall(StaticInit,
4295 {/*loc=*/SrcLoc, /*global_tid=*/ThreadNum,
4296 /*schedtype=*/SchedulingType, /*plastiter=*/PLastIter,
4297 /*plower=*/PLowerBound, /*pupper=*/PUpperBound,
4298 /*pstride=*/PStride, /*incr=*/One,
4299 /*chunk=*/CastedChunkSize});
4300
4301 // Load values written by the "init" function.
4302 Value *FirstChunkStart =
4303 Builder.CreateLoad(InternalIVTy, PLowerBound, "omp_firstchunk.lb");
4304 Value *FirstChunkStop =
4305 Builder.CreateLoad(InternalIVTy, PUpperBound, "omp_firstchunk.ub");
4306 Value *FirstChunkEnd = Builder.CreateAdd(FirstChunkStop, One);
4307 Value *ChunkRange =
4308 Builder.CreateSub(FirstChunkEnd, FirstChunkStart, "omp_chunk.range");
4309 Value *NextChunkStride =
4310 Builder.CreateLoad(InternalIVTy, PStride, "omp_dispatch.stride");
4311
4312 // Create outer "dispatch" loop for enumerating the chunks.
4313 BasicBlock *DispatchEnter = splitBB(Builder, true);
4314 Value *DispatchCounter;
4315
4316 // It is safe to assume this didn't return an error because the callback
4317 // passed into createCanonicalLoop is the only possible error source, and it
4318 // always returns success.
4319 CanonicalLoopInfo *DispatchCLI = cantFail(createCanonicalLoop(
4320 {Builder.saveIP(), DL},
4321 [&](InsertPointTy BodyIP, Value *Counter) {
4322 DispatchCounter = Counter;
4323 return Error::success();
4324 },
4325 FirstChunkStart, CastedTripCount, NextChunkStride,
4326 /*IsSigned=*/false, /*InclusiveStop=*/false, /*ComputeIP=*/{},
4327 "dispatch"));
4328
4329 // Remember the BasicBlocks of the dispatch loop we need, then invalidate to
4330 // not have to preserve the canonical invariant.
4331 BasicBlock *DispatchBody = DispatchCLI->getBody();
4332 BasicBlock *DispatchLatch = DispatchCLI->getLatch();
4333 BasicBlock *DispatchExit = DispatchCLI->getExit();
4334 BasicBlock *DispatchAfter = DispatchCLI->getAfter();
4335 DispatchCLI->invalidate();
4336
4337 // Rewire the original loop to become the chunk loop inside the dispatch loop.
4338 redirectTo(DispatchAfter, CLI->getAfter(), DL);
4339 redirectTo(CLI->getExit(), DispatchLatch, DL);
4340 redirectTo(DispatchBody, DispatchEnter, DL);
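// After this rewiring, the overall structure is a loop nest of the form
// (sketch):
//   for (disp = lb; disp < tripcount; disp += stride)          // chunks
//     for (iv = 0; iv < min(chunk_range, tripcount - disp); ++iv)
//       body(disp + iv);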
4341
4342 // Prepare the prolog of the chunk loop.
4343 Builder.restoreIP(CLI->getPreheaderIP());
4344 Builder.SetCurrentDebugLocation(DL);
4345
4346 // Compute the number of iterations of the chunk loop.
4347 Builder.SetInsertPoint(CLI->getPreheader()->getTerminator());
4348 Value *ChunkEnd = Builder.CreateAdd(DispatchCounter, ChunkRange);
4349 Value *IsLastChunk =
4350 Builder.CreateICmpUGE(ChunkEnd, CastedTripCount, "omp_chunk.is_last");
4351 Value *CountUntilOrigTripCount =
4352 Builder.CreateSub(CastedTripCount, DispatchCounter);
4353 Value *ChunkTripCount = Builder.CreateSelect(
4354 IsLastChunk, CountUntilOrigTripCount, ChunkRange, "omp_chunk.tripcount");
4355 Value *BackcastedChunkTC =
4356 Builder.CreateTrunc(ChunkTripCount, IVTy, "omp_chunk.tripcount.trunc");
4357 CLI->setTripCount(BackcastedChunkTC);
4358
4359 // Update all uses of the induction variable except the one in the condition
4360 // block that compares it with the actual upper bound, and the increment in
4361 // the latch block.
4362 Value *BackcastedDispatchCounter =
4363 Builder.CreateTrunc(DispatchCounter, IVTy, "omp_dispatch.iv.trunc");
4364 CLI->mapIndVar([&](Instruction *) -> Value * {
4365 Builder.restoreIP(CLI->getBodyIP());
4366 return Builder.CreateAdd(IV, BackcastedDispatchCounter);
4367 });
4368
4369 // In the "exit" block, call the "fini" function.
4370 Builder.SetInsertPoint(DispatchExit, DispatchExit->getFirstInsertionPt());
4371 Builder.CreateCall(StaticFini, {SrcLoc, ThreadNum});
4372
4373 // Add the barrier if requested.
4374 if (NeedsBarrier) {
4375 InsertPointOrErrorTy AfterIP =
4376 createBarrier(LocationDescription(Builder.saveIP(), DL), OMPD_for,
4377 /*ForceSimpleCall=*/false, /*CheckCancelFlag=*/false);
4378 if (!AfterIP)
4379 return AfterIP.takeError();
4380 }
4381
4382#ifndef NDEBUG
4383 // Even though we currently do not support applying additional methods to it,
4384 // the chunk loop should remain a canonical loop.
4385 CLI->assertOK();
4386#endif
4387
4388 return InsertPointTy(DispatchAfter, DispatchAfter->getFirstInsertionPt());
4389}
4390
4391// Returns an LLVM function to call for executing an OpenMP static worksharing
4392// for loop depending on `type`. Only i32 and i64 are supported by the runtime.
4393// Always interpret integers as unsigned similarly to CanonicalLoopInfo.
4394static FunctionCallee
4395 getKmpcForStaticLoopForType(Type *Ty, OpenMPIRBuilder *OMPBuilder,
4396 WorksharingLoopType LoopType) {
4397 unsigned Bitwidth = Ty->getIntegerBitWidth();
4398 Module &M = OMPBuilder->M;
4399 switch (LoopType) {
4400 case WorksharingLoopType::ForStaticLoop:
4401 if (Bitwidth == 32)
4402 return OMPBuilder->getOrCreateRuntimeFunction(
4403 M, omp::RuntimeFunction::OMPRTL___kmpc_for_static_loop_4u);
4404 if (Bitwidth == 64)
4405 return OMPBuilder->getOrCreateRuntimeFunction(
4406 M, omp::RuntimeFunction::OMPRTL___kmpc_for_static_loop_8u);
4407 break;
4408 case WorksharingLoopType::DistributeStaticLoop:
4409 if (Bitwidth == 32)
4410 return OMPBuilder->getOrCreateRuntimeFunction(
4411 M, omp::RuntimeFunction::OMPRTL___kmpc_distribute_static_loop_4u);
4412 if (Bitwidth == 64)
4413 return OMPBuilder->getOrCreateRuntimeFunction(
4414 M, omp::RuntimeFunction::OMPRTL___kmpc_distribute_static_loop_8u);
4415 break;
4416 case WorksharingLoopType::DistributeForStaticLoop:
4417 if (Bitwidth == 32)
4418 return OMPBuilder->getOrCreateRuntimeFunction(
4419 M, omp::RuntimeFunction::OMPRTL___kmpc_distribute_for_static_loop_4u);
4420 if (Bitwidth == 64)
4421 return OMPBuilder->getOrCreateRuntimeFunction(
4422 M, omp::RuntimeFunction::OMPRTL___kmpc_distribute_for_static_loop_8u);
4423 break;
4424 }
4425 if (Bitwidth != 32 && Bitwidth != 64) {
4426 llvm_unreachable("Unknown OpenMP loop iterator bitwidth");
4427 }
4428 llvm_unreachable("Unknown type of OpenMP worksharing loop");
4429}
4430
4431// Inserts a call to proper OpenMP Device RTL function which handles
4432// loop worksharing.
4433 static void createTargetLoopWorkshareCall(
4434 OpenMPIRBuilder *OMPBuilder, WorksharingLoopType LoopType,
4435 BasicBlock *InsertBlock, Value *Ident, Value *LoopBodyArg,
4436 Type *ParallelTaskPtr, Value *TripCount, Function &LoopBodyFn) {
4437 Type *TripCountTy = TripCount->getType();
4438 Module &M = OMPBuilder->M;
4439 IRBuilder<> &Builder = OMPBuilder->Builder;
4440 FunctionCallee RTLFn =
4441 getKmpcForStaticLoopForType(TripCountTy, OMPBuilder, LoopType);
4442 SmallVector<Value *, 8> RealArgs;
4443 RealArgs.push_back(Ident);
4444 RealArgs.push_back(Builder.CreateBitCast(&LoopBodyFn, ParallelTaskPtr));
4445 RealArgs.push_back(LoopBodyArg);
4446 RealArgs.push_back(TripCount);
4447 if (LoopType == WorksharingLoopType::DistributeStaticLoop) {
4448 RealArgs.push_back(ConstantInt::get(TripCountTy, 0));
4449 Builder.CreateCall(RTLFn, RealArgs);
4450 return;
4451 }
4452 FunctionCallee RTLNumThreads = OMPBuilder->getOrCreateRuntimeFunction(
4453 M, omp::RuntimeFunction::OMPRTL_omp_get_num_threads);
4454 Builder.restoreIP({InsertBlock, std::prev(InsertBlock->end())});
4455 Value *NumThreads = Builder.CreateCall(RTLNumThreads, {});
4456
4457 RealArgs.push_back(
4458 Builder.CreateZExtOrTrunc(NumThreads, TripCountTy, "num.threads.cast"));
4459 RealArgs.push_back(ConstantInt::get(TripCountTy, 0));
4460 if (LoopType == WorksharingLoopType::DistributeForStaticLoop) {
4461 RealArgs.push_back(ConstantInt::get(TripCountTy, 0));
4462 }
4463
4464 Builder.CreateCall(RTLFn, RealArgs);
4465}
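// For example, for DistributeForStaticLoop with a 32-bit trip count the
// resulting call is expected to look like (argument names illustrative):
//   __kmpc_distribute_for_static_loop_4u(ident, body_fn, body_arg, tripcount,
//       num_threads, /*block_chunk=*/0, /*thread_chunk=*/0);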
4466
4467static void
4468 workshareLoopTargetCallback(OpenMPIRBuilder *OMPIRBuilder,
4469 CanonicalLoopInfo *CLI, Value *Ident,
4470 Function &OutlinedFn, Type *ParallelTaskPtr,
4471 const SmallVector<Instruction *, 4> &ToBeDeleted,
4472 WorksharingLoopType LoopType) {
4473 IRBuilder<> &Builder = OMPIRBuilder->Builder;
4474 BasicBlock *Preheader = CLI->getPreheader();
4475 Value *TripCount = CLI->getTripCount();
4476
4477 // After loop body outlining, the loop body contains only the setup of the
4478 // loop body argument structure and the call to the outlined loop body
4479 // function. First, we need to move the setup of the loop body args into the
4480 // loop preheader.
4481 Preheader->splice(std::prev(Preheader->end()), CLI->getBody(),
4482 CLI->getBody()->begin(), std::prev(CLI->getBody()->end()));
4483
4484 // The next step is to remove the whole loop. We do not need it anymore.
4485 // That's why we make an unconditional branch from the loop preheader to the
4486 // loop exit block.
4487 Builder.restoreIP({Preheader, Preheader->end()});
4488 Builder.SetCurrentDebugLocation(Preheader->getTerminator()->getDebugLoc());
4489 Preheader->getTerminator()->eraseFromParent();
4490 Builder.CreateBr(CLI->getExit());
4491
4492 // Delete dead loop blocks
4493 OpenMPIRBuilder::OutlineInfo CleanUpInfo;
4494 SmallPtrSet<BasicBlock *, 32> RegionBlockSet;
4495 SmallVector<BasicBlock *, 32> BlocksToBeRemoved;
4496 CleanUpInfo.EntryBB = CLI->getHeader();
4497 CleanUpInfo.ExitBB = CLI->getExit();
4498 CleanUpInfo.collectBlocks(RegionBlockSet, BlocksToBeRemoved);
4499 DeleteDeadBlocks(BlocksToBeRemoved);
4500
4501 // Find the instruction which corresponds to loop body argument structure
4502 // and remove the call to loop body function instruction.
4503 Value *LoopBodyArg;
4504 User *OutlinedFnUser = OutlinedFn.getUniqueUndroppableUser();
4505 assert(OutlinedFnUser &&
4506 "Expected unique undroppable user of outlined function");
4507 CallInst *OutlinedFnCallInstruction = dyn_cast<CallInst>(OutlinedFnUser);
4508 assert(OutlinedFnCallInstruction && "Expected outlined function call");
4509 assert((OutlinedFnCallInstruction->getParent() == Preheader) &&
4510 "Expected outlined function call to be located in loop preheader");
4511 // Check in case no argument structure has been passed.
4512 if (OutlinedFnCallInstruction->arg_size() > 1)
4513 LoopBodyArg = OutlinedFnCallInstruction->getArgOperand(1);
4514 else
4515 LoopBodyArg = Constant::getNullValue(Builder.getPtrTy());
4516 OutlinedFnCallInstruction->eraseFromParent();
4517
4518 createTargetLoopWorkshareCall(OMPIRBuilder, LoopType, Preheader, Ident,
4519 LoopBodyArg, ParallelTaskPtr, TripCount,
4520 OutlinedFn);
4521
4522 for (auto &ToBeDeletedItem : ToBeDeleted)
4523 ToBeDeletedItem->eraseFromParent();
4524 CLI->invalidate();
4525}
4526
4527 OpenMPIRBuilder::InsertPointTy
4528 OpenMPIRBuilder::applyWorkshareLoopTarget(DebugLoc DL, CanonicalLoopInfo *CLI,
4529 InsertPointTy AllocaIP,
4530 WorksharingLoopType LoopType) {
4531 uint32_t SrcLocStrSize;
4532 Constant *SrcLocStr = getOrCreateSrcLocStr(DL, SrcLocStrSize);
4533 Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
4534
4535 OutlineInfo OI;
4536 OI.OuterAllocaBB = CLI->getPreheader();
4537 Function *OuterFn = CLI->getPreheader()->getParent();
4538
4539 // Instructions which need to be deleted at the end of code generation
4540 SmallVector<Instruction *, 4> ToBeDeleted;
4541
4542 OI.OuterAllocaBB = AllocaIP.getBlock();
4543
4544 // Mark the body loop as region which needs to be extracted
4545 OI.EntryBB = CLI->getBody();
4546 OI.ExitBB = CLI->getLatch()->splitBasicBlock(CLI->getLatch()->begin(),
4547 "omp.prelatch", true);
4548
4549 // Prepare loop body for extraction
4550 Builder.restoreIP({CLI->getPreheader(), CLI->getPreheader()->begin()});
4551
4552 // Insert new loop counter variable which will be used only in loop
4553 // body.
4554 AllocaInst *NewLoopCnt = Builder.CreateAlloca(CLI->getIndVarType(), 0, "");
4555 Instruction *NewLoopCntLoad =
4556 Builder.CreateLoad(CLI->getIndVarType(), NewLoopCnt);
4557 // New loop counter instructions are redundant in the loop preheader when
4558 // code generation for the workshare loop is finished. That's why we mark
4559 // them as ready for deletion.
4560 ToBeDeleted.push_back(NewLoopCntLoad);
4561 ToBeDeleted.push_back(NewLoopCnt);
4562
4563 // Analyse loop body region. Find all input variables which are used inside
4564 // loop body region.
4565 SmallPtrSet<BasicBlock *, 32> ParallelRegionBlockSet;
4566 SmallVector<BasicBlock *, 32> Blocks;
4567 OI.collectBlocks(ParallelRegionBlockSet, Blocks);
4568 SmallVector<BasicBlock *, 32> BlocksT(ParallelRegionBlockSet.begin(),
4569 ParallelRegionBlockSet.end());
4570
4571 CodeExtractorAnalysisCache CEAC(*OuterFn);
4572 CodeExtractor Extractor(Blocks,
4573 /* DominatorTree */ nullptr,
4574 /* AggregateArgs */ true,
4575 /* BlockFrequencyInfo */ nullptr,
4576 /* BranchProbabilityInfo */ nullptr,
4577 /* AssumptionCache */ nullptr,
4578 /* AllowVarArgs */ true,
4579 /* AllowAlloca */ true,
4580 /* AllocationBlock */ CLI->getPreheader(),
4581 /* Suffix */ ".omp_wsloop",
4582 /* AggrArgsIn0AddrSpace */ true);
4583
4584 BasicBlock *CommonExit = nullptr;
4585 SetVector<Value *> Inputs, Outputs, SinkingCands, HoistingCands;
4586
4587 // Find allocas outside the loop body region which are used inside the loop
4588 // body.
4589 Extractor.findAllocas(CEAC, SinkingCands, HoistingCands, CommonExit);
4590
4591 // We need to model the loop body region as the function f(cnt, loop_arg).
4592 // That's why we replace the loop induction variable by the new counter
4593 // which will be one of the loop body function arguments.
4594 SmallVector<User *> Users(CLI->getIndVar()->user_begin(),
4595 CLI->getIndVar()->user_end());
4596 for (auto Use : Users) {
4597 if (Instruction *Inst = dyn_cast<Instruction>(Use)) {
4598 if (ParallelRegionBlockSet.count(Inst->getParent())) {
4599 Inst->replaceUsesOfWith(CLI->getIndVar(), NewLoopCntLoad);
4600 }
4601 }
4602 }
4603 // Make sure that the loop counter variable is not merged into the loop body
4604 // function argument structure and that it is passed as a separate variable.
4605 OI.ExcludeArgsFromAggregate.push_back(NewLoopCntLoad);
4606
4607 // The PostOutline callback is invoked once the loop body function has been
4608 // outlined and the loop body replaced by a call to that outlined function.
4609 // We then add a call to the OpenMP device runtime inside the loop preheader;
4610 // the OpenMP device runtime function will handle the loop control logic.
4611 //
4612 OI.PostOutlineCB = [=, ToBeDeletedVec =
4613 std::move(ToBeDeleted)](Function &OutlinedFn) {
4614 workshareLoopTargetCallback(this, CLI, Ident, OutlinedFn, ParallelTaskPtr,
4615 ToBeDeletedVec, LoopType);
4616 };
4617 addOutlineInfo(std::move(OI));
4618 return CLI->getAfterIP();
4619}
4620
4621OpenMPIRBuilder::InsertPointOrErrorTy OpenMPIRBuilder::applyWorkshareLoop(
4622 DebugLoc DL, CanonicalLoopInfo *CLI, InsertPointTy AllocaIP,
4623 bool NeedsBarrier, omp::ScheduleKind SchedKind, Value *ChunkSize,
4624 bool HasSimdModifier, bool HasMonotonicModifier,
4625 bool HasNonmonotonicModifier, bool HasOrderedClause,
4626 WorksharingLoopType LoopType) {
4627 if (Config.isTargetDevice())
4628 return applyWorkshareLoopTarget(DL, CLI, AllocaIP, LoopType);
4629 OMPScheduleType EffectiveScheduleType = computeOpenMPScheduleType(
4630 SchedKind, ChunkSize, HasSimdModifier, HasMonotonicModifier,
4631 HasNonmonotonicModifier, HasOrderedClause);
4632
4633 bool IsOrdered = (EffectiveScheduleType & OMPScheduleType::ModifierOrdered) ==
4634 OMPScheduleType::ModifierOrdered;
4635 switch (EffectiveScheduleType & ~OMPScheduleType::ModifierMask) {
4636 case OMPScheduleType::BaseStatic:
4637 assert(!ChunkSize && "No chunk size with non-chunked static schedule");
4638 if (IsOrdered)
4639 return applyDynamicWorkshareLoop(DL, CLI, AllocaIP, EffectiveScheduleType,
4640 NeedsBarrier, ChunkSize);
4641 // FIXME: Monotonicity ignored?
4642 return applyStaticWorkshareLoop(DL, CLI, AllocaIP, NeedsBarrier);
4643
4644 case OMPScheduleType::BaseStaticChunked:
4645 if (IsOrdered)
4646 return applyDynamicWorkshareLoop(DL, CLI, AllocaIP, EffectiveScheduleType,
4647 NeedsBarrier, ChunkSize);
4648 // FIXME: Monotonicity ignored?
4649 return applyStaticChunkedWorkshareLoop(DL, CLI, AllocaIP, NeedsBarrier,
4650 ChunkSize);
4651
4652 case OMPScheduleType::BaseRuntime:
4653 case OMPScheduleType::BaseAuto:
4654 case OMPScheduleType::BaseGreedy:
4655 case OMPScheduleType::BaseBalanced:
4656 case OMPScheduleType::BaseSteal:
4657 case OMPScheduleType::BaseGuidedSimd:
4658 case OMPScheduleType::BaseRuntimeSimd:
4659 assert(!ChunkSize &&
4660 "schedule type does not support user-defined chunk sizes");
4661 [[fallthrough]];
4662 case OMPScheduleType::BaseDynamicChunked:
4663 case OMPScheduleType::BaseGuidedChunked:
4664 case OMPScheduleType::BaseGuidedIterativeChunked:
4665 case OMPScheduleType::BaseGuidedAnalyticalChunked:
4666 case OMPScheduleType::BaseStaticBalancedChunked:
4667 return applyDynamicWorkshareLoop(DL, CLI, AllocaIP, EffectiveScheduleType,
4668 NeedsBarrier, ChunkSize);
4669
4670 default:
4671 llvm_unreachable("Unknown/unimplemented schedule kind");
4672 }
4673}
4674
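// For illustration: a host-side `#pragma omp for schedule(dynamic, 4)` computes
// an effective schedule of BaseDynamicChunked above and is lowered via
// applyDynamicWorkshareLoop to __kmpc_dispatch_init_*/__kmpc_dispatch_next_*
// calls with a chunk size of 4, whereas plain schedule(static) takes the
// applyStaticWorkshareLoop path and lowers to __kmpc_for_static_init_* instead.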
4675/// Returns an LLVM function to call for initializing loop bounds using OpenMP
4676/// dynamic scheduling, depending on `Ty`. Only i32 and i64 are supported by
4677/// the runtime. Always interpret integers as unsigned similarly to
4678/// CanonicalLoopInfo.
4679static FunctionCallee
4680getKmpcForDynamicInitForType(Type *Ty, Module &M, OpenMPIRBuilder &OMPBuilder) {
4681 unsigned Bitwidth = Ty->getIntegerBitWidth();
4682 if (Bitwidth == 32)
4683 return OMPBuilder.getOrCreateRuntimeFunction(
4684 M, omp::RuntimeFunction::OMPRTL___kmpc_dispatch_init_4u);
4685 if (Bitwidth == 64)
4686 return OMPBuilder.getOrCreateRuntimeFunction(
4687 M, omp::RuntimeFunction::OMPRTL___kmpc_dispatch_init_8u);
4688 llvm_unreachable("unknown OpenMP loop iterator bitwidth");
4689}
4690
4691/// Returns an LLVM function to call for fetching the next chunk of the loop
4692/// using OpenMP dynamic scheduling, depending on `Ty`. Only i32 and i64 are
4693/// supported by the runtime. Always interpret integers as unsigned similarly
4694/// to CanonicalLoopInfo.
4695static FunctionCallee
4696getKmpcForDynamicNextForType(Type *Ty, Module &M, OpenMPIRBuilder &OMPBuilder) {
4697 unsigned Bitwidth = Ty->getIntegerBitWidth();
4698 if (Bitwidth == 32)
4699 return OMPBuilder.getOrCreateRuntimeFunction(
4700 M, omp::RuntimeFunction::OMPRTL___kmpc_dispatch_next_4u);
4701 if (Bitwidth == 64)
4702 return OMPBuilder.getOrCreateRuntimeFunction(
4703 M, omp::RuntimeFunction::OMPRTL___kmpc_dispatch_next_8u);
4704 llvm_unreachable("unknown OpenMP loop iterator bitwidth");
4705}
4706
4707/// Returns an LLVM function to call for finalizing the dynamic loop,
4708/// depending on `Ty`. Only i32 and i64 are supported by the runtime. Always
4709/// interpret integers as unsigned similarly to CanonicalLoopInfo.
4710static FunctionCallee
4711getKmpcForDynamicFiniForType(Type *Ty, Module &M, OpenMPIRBuilder &OMPBuilder) {
4712 unsigned Bitwidth = Ty->getIntegerBitWidth();
4713 if (Bitwidth == 32)
4714 return OMPBuilder.getOrCreateRuntimeFunction(
4715 M, omp::RuntimeFunction::OMPRTL___kmpc_dispatch_fini_4u);
4716 if (Bitwidth == 64)
4717 return OMPBuilder.getOrCreateRuntimeFunction(
4718 M, omp::RuntimeFunction::OMPRTL___kmpc_dispatch_fini_8u);
4719 llvm_unreachable("unknown OpenMP loop iterator bitwidth");
4720}
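// For example, a loop whose induction variable is i32 selects the *_4u entry
// points (__kmpc_dispatch_init_4u, __kmpc_dispatch_next_4u,
// __kmpc_dispatch_fini_4u), while an i64 induction variable selects the *_8u
// variants; all other bitwidths are rejected above.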
4721
4722OpenMPIRBuilder::InsertPointOrErrorTy
4723OpenMPIRBuilder::applyDynamicWorkshareLoop(DebugLoc DL, CanonicalLoopInfo *CLI,
4724 InsertPointTy AllocaIP,
4725 OMPScheduleType SchedType,
4726 bool NeedsBarrier, Value *Chunk) {
4727 assert(CLI->isValid() && "Requires a valid canonical loop");
4728 assert(!isConflictIP(AllocaIP, CLI->getPreheaderIP()) &&
4729 "Require dedicated allocate IP");
4730 assert(isValidWorkshareLoopScheduleType(SchedType) &&
4731 "Require valid schedule type");
4732
4733 bool Ordered = (SchedType & OMPScheduleType::ModifierOrdered) ==
4734 OMPScheduleType::ModifierOrdered;
4735
4736 // Set up the source location value for OpenMP runtime.
4737 Builder.SetCurrentDebugLocation(DL);
4738
4739 uint32_t SrcLocStrSize;
4740 Constant *SrcLocStr = getOrCreateSrcLocStr(DL, SrcLocStrSize);
4741 Value *SrcLoc = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
4742
4743 // Declare useful OpenMP runtime functions.
4744 Value *IV = CLI->getIndVar();
4745 Type *IVTy = IV->getType();
4746 FunctionCallee DynamicInit = getKmpcForDynamicInitForType(IVTy, M, *this);
4747 FunctionCallee DynamicNext = getKmpcForDynamicNextForType(IVTy, M, *this);
4748
4749 // Allocate space for computed loop bounds as expected by the "init" function.
4750 Builder.SetInsertPoint(AllocaIP.getBlock()->getFirstNonPHIOrDbgOrAlloca());
4751 Type *I32Type = Type::getInt32Ty(M.getContext());
4752 Value *PLastIter = Builder.CreateAlloca(I32Type, nullptr, "p.lastiter");
4753 Value *PLowerBound = Builder.CreateAlloca(IVTy, nullptr, "p.lowerbound");
4754 Value *PUpperBound = Builder.CreateAlloca(IVTy, nullptr, "p.upperbound");
4755 Value *PStride = Builder.CreateAlloca(IVTy, nullptr, "p.stride");
4756
4757 // At the end of the preheader, prepare for calling the "init" function by
4758 // storing the current loop bounds into the allocated space. A canonical loop
4759 // always iterates from 0 to trip-count with step 1. Note that "init" expects
4760 // and produces an inclusive upper bound.
4761 BasicBlock *PreHeader = CLI->getPreheader();
4762 Builder.SetInsertPoint(PreHeader->getTerminator());
4763 Constant *One = ConstantInt::get(IVTy, 1);
4764 Builder.CreateStore(One, PLowerBound);
4765 Value *UpperBound = CLI->getTripCount();
4766 Builder.CreateStore(UpperBound, PUpperBound);
4767 Builder.CreateStore(One, PStride);
4768
4769 BasicBlock *Header = CLI->getHeader();
4770 BasicBlock *Exit = CLI->getExit();
4771 BasicBlock *Cond = CLI->getCond();
4772 BasicBlock *Latch = CLI->getLatch();
4773 InsertPointTy AfterIP = CLI->getAfterIP();
4774
4775 // The CLI will be "broken" in the code below, as the loop is no longer
4776 // a valid canonical loop.
4777
4778 if (!Chunk)
4779 Chunk = One;
4780
4781 Value *ThreadNum = getOrCreateThreadID(SrcLoc);
4782
4783 Constant *SchedulingType =
4784 ConstantInt::get(I32Type, static_cast<int>(SchedType));
4785
4786 // Call the "init" function.
4787 Builder.CreateCall(DynamicInit,
4788 {SrcLoc, ThreadNum, SchedulingType, /* LowerBound */ One,
4789 UpperBound, /* step */ One, Chunk});
4790
4791 // An outer loop around the existing one.
4792 BasicBlock *OuterCond = BasicBlock::Create(
4793 PreHeader->getContext(), Twine(PreHeader->getName()) + ".outer.cond",
4794 PreHeader->getParent());
4795 // The dispatch_next result is always 32-bit, so the IVTy constants above cannot be reused here.
4796 Builder.SetInsertPoint(OuterCond, OuterCond->getFirstInsertionPt());
4797 Value *Res =
4798 Builder.CreateCall(DynamicNext, {SrcLoc, ThreadNum, PLastIter,
4799 PLowerBound, PUpperBound, PStride});
4800 Constant *Zero32 = ConstantInt::get(I32Type, 0);
4801 Value *MoreWork = Builder.CreateCmp(CmpInst::ICMP_NE, Res, Zero32);
4802 Value *LowerBound =
4803 Builder.CreateSub(Builder.CreateLoad(IVTy, PLowerBound), One, "lb");
4804 Builder.CreateCondBr(MoreWork, Header, Exit);
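// At this point the dispatch skeleton looks like this (sketch, names
// illustrative):
//
//   preheader:  __kmpc_dispatch_init_*(loc, tid, sched, 1, tripcount, 1, chunk)
//   outer.cond: %more = __kmpc_dispatch_next_*(loc, tid, p.lastiter,
//                                              p.lowerbound, p.upperbound, p.stride)
//               %lb = load(p.lowerbound) - 1
//               br %more, header, exit
//
// The preheader branch is rewired to outer.cond below, so each successful
// dispatch_next call hands the inner loop one chunk [lb, ub] to execute.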
4805
4806 // Change PHI-node in loop header to use outer cond rather than preheader,
4807 // and set IV to the LowerBound.
4808 Instruction *Phi = &Header->front();
4809 auto *PI = cast<PHINode>(Phi);
4810 PI->setIncomingBlock(0, OuterCond);
4811 PI->setIncomingValue(0, LowerBound);
4812
4813 // Then set the pre-header to jump to the OuterCond
4814 Instruction *Term = PreHeader->getTerminator();
4815 auto *Br = cast<BranchInst>(Term);
4816 Br->setSuccessor(0, OuterCond);
4817
4818 // Modify the inner condition:
4819 // * Use the UpperBound returned from the DynamicNext call.
4820 // * Jump to the outer loop when done with one of the inner loops.
4821 Builder.SetInsertPoint(Cond, Cond->getFirstInsertionPt());
4822 UpperBound = Builder.CreateLoad(IVTy, PUpperBound, "ub");
4823 Instruction *Comp = &*Builder.GetInsertPoint();
4824 auto *CI = cast<CmpInst>(Comp);
4825 CI->setOperand(1, UpperBound);
4826 // Redirect the inner exit to branch to outer condition.
4827 Instruction *Branch = &Cond->back();
4828 auto *BI = cast<BranchInst>(Branch);
4829 assert(BI->getSuccessor(1) == Exit);
4830 BI->setSuccessor(1, OuterCond);
4831
4832 // Call the "fini" function if "ordered" is present in the wsloop directive.
4833 if (Ordered) {
4834 Builder.SetInsertPoint(&Latch->back());
4835 FunctionCallee DynamicFini = getKmpcForDynamicFiniForType(IVTy, M, *this);
4836 Builder.CreateCall(DynamicFini, {SrcLoc, ThreadNum});
4837 }
4838
4839 // Add the barrier if requested.
4840 if (NeedsBarrier) {
4841 Builder.SetInsertPoint(&Exit->back());
4842 InsertPointOrErrorTy BarrierIP =
4843 createBarrier(LocationDescription(Builder.saveIP(), DL),
4844 omp::Directive::OMPD_for, /* ForceSimpleCall */ false,
4845 /* CheckCancelFlag */ false);
4846 if (!BarrierIP)
4847 return BarrierIP.takeError();
4848 }
4849
4850 CLI->invalidate();
4851 return AfterIP;
4852}
4853
4854/// Redirect all edges that branch to \p OldTarget to \p NewTarget. That is,
4855/// after this \p OldTarget will be orphaned.
4856static void redirectAllPredecessorsTo(BasicBlock *OldTarget,
4857 BasicBlock *NewTarget, DebugLoc DL) {
4858 for (BasicBlock *Pred : make_early_inc_range(predecessors(OldTarget)))
4859 redirectTo(Pred, NewTarget, DL);
4860}
4861
4862/// Determine which blocks in \p BBs are reachable from outside and remove the
4863/// ones that are not, from their parent function.
4864static void removeUnusedBlocksFromParent(ArrayRef<BasicBlock *> BBs) {
4865 SmallPtrSet<BasicBlock *, 6> BBsToErase{BBs.begin(), BBs.end()};
4866 auto HasRemainingUses = [&BBsToErase](BasicBlock *BB) {
4867 for (Use &U : BB->uses()) {
4868 auto *UseInst = dyn_cast<Instruction>(U.getUser());
4869 if (!UseInst)
4870 continue;
4871 if (BBsToErase.count(UseInst->getParent()))
4872 continue;
4873 return true;
4874 }
4875 return false;
4876 };
4877
4878 while (BBsToErase.remove_if(HasRemainingUses)) {
4879 // Try again if anything was removed.
4880 }
4881
4882 SmallVector<BasicBlock *, 7> BBVec(BBsToErase.begin(), BBsToErase.end());
4883 DeleteDeadBlocks(BBVec);
4884}
4885
4886CanonicalLoopInfo *
4887OpenMPIRBuilder::collapseLoops(DebugLoc DL, ArrayRef<CanonicalLoopInfo *> Loops,
4888 InsertPointTy ComputeIP) {
4889 assert(Loops.size() >= 1 && "At least one loop required");
4890 size_t NumLoops = Loops.size();
4891
4892 // Nothing to do if there is already just one loop.
4893 if (NumLoops == 1)
4894 return Loops.front();
4895
4896 CanonicalLoopInfo *Outermost = Loops.front();
4897 CanonicalLoopInfo *Innermost = Loops.back();
4898 BasicBlock *OrigPreheader = Outermost->getPreheader();
4899 BasicBlock *OrigAfter = Outermost->getAfter();
4900 Function *F = OrigPreheader->getParent();
4901
4902 // Loop control blocks that may become orphaned later.
4903 SmallVector<BasicBlock *, 12> OldControlBBs;
4904 OldControlBBs.reserve(6 * Loops.size());
4905 for (CanonicalLoopInfo *Loop : Loops)
4906 Loop->collectControlBlocks(OldControlBBs);
4907
4908 // Setup the IRBuilder for inserting the trip count computation.
4909 Builder.SetCurrentDebugLocation(DL);
4910 if (ComputeIP.isSet())
4911 Builder.restoreIP(ComputeIP);
4912 else
4913 Builder.restoreIP(Outermost->getPreheaderIP());
4914
4915 // Derive the collapsed loop's trip count.
4916 // TODO: Find common/largest indvar type.
4917 Value *CollapsedTripCount = nullptr;
4918 for (CanonicalLoopInfo *L : Loops) {
4919 assert(L->isValid() &&
4920 "All loops to collapse must be valid canonical loops");
4921 Value *OrigTripCount = L->getTripCount();
4922 if (!CollapsedTripCount) {
4923 CollapsedTripCount = OrigTripCount;
4924 continue;
4925 }
4926
4927 // TODO: Enable UndefinedSanitizer to diagnose an overflow here.
4928 CollapsedTripCount = Builder.CreateMul(CollapsedTripCount, OrigTripCount,
4929 {}, /*HasNUW=*/true);
4930 }
4931
4932 // Create the collapsed loop control flow.
4933 CanonicalLoopInfo *Result =
4934 createLoopSkeleton(DL, CollapsedTripCount, F,
4935 OrigPreheader->getNextNode(), OrigAfter, "collapsed");
4936
4937 // Build the collapsed loop body code.
4938 // Start with deriving the input loop induction variables from the collapsed
4939 // one, using a divmod scheme. To preserve the original loops' order, the
4940 // innermost loop uses the least significant bits.
4941 Builder.restoreIP(Result->getBodyIP());
4942
4943 Value *Leftover = Result->getIndVar();
4944 SmallVector<Value *> NewIndVars;
4945 NewIndVars.resize(NumLoops);
4946 for (int i = NumLoops - 1; i >= 1; --i) {
4947 Value *OrigTripCount = Loops[i]->getTripCount();
4948
4949 Value *NewIndVar = Builder.CreateURem(Leftover, OrigTripCount);
4950 NewIndVars[i] = NewIndVar;
4951
4952 Leftover = Builder.CreateUDiv(Leftover, OrigTripCount);
4953 }
4954 // Outermost loop gets all the remaining bits.
4955 NewIndVars[0] = Leftover;
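// Worked example: collapsing a 2-deep nest with trip counts TC0 (outer) and
// TC1 (inner) yields a single loop of TC0*TC1 iterations whose indvar IV is
// decomposed as
//   NewIndVars[1] = IV % TC1   // innermost, least significant
//   NewIndVars[0] = IV / TC1   // outermost, remaining bits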
4956
4957 // Construct the loop body control flow.
4958 // We progressively construct the branch structure following the direction of
4959 // control flow: first the leading in-between code, then the loop nest body,
4960 // then the trailing in-between code, finally rejoining the collapsed loop's latch.
4961 // ContinueBlock and ContinuePred keep track of the source(s) of next edge. If
4962 // the ContinueBlock is set, continue with that block. If ContinuePred, use
4963 // its predecessors as sources.
4964 BasicBlock *ContinueBlock = Result->getBody();
4965 BasicBlock *ContinuePred = nullptr;
4966 auto ContinueWith = [&ContinueBlock, &ContinuePred, DL](BasicBlock *Dest,
4967 BasicBlock *NextSrc) {
4968 if (ContinueBlock)
4969 redirectTo(ContinueBlock, Dest, DL);
4970 else
4971 redirectAllPredecessorsTo(ContinuePred, Dest, DL);
4972
4973 ContinueBlock = nullptr;
4974 ContinuePred = NextSrc;
4975 };
4976
4977 // The code before the nested loop of each level.
4978 // Because we are sinking it into the nest, it will be executed more often
4979 // than the original loop. More sophisticated schemes could keep track of what
4980 // the in-between code is and instantiate it only once per thread.
4981 for (size_t i = 0; i < NumLoops - 1; ++i)
4982 ContinueWith(Loops[i]->getBody(), Loops[i + 1]->getHeader());
4983
4984 // Connect the loop nest body.
4985 ContinueWith(Innermost->getBody(), Innermost->getLatch());
4986
4987 // The code after the nested loop at each level.
4988 for (size_t i = NumLoops - 1; i > 0; --i)
4989 ContinueWith(Loops[i]->getAfter(), Loops[i - 1]->getLatch());
4990
4991 // Connect the finished loop to the collapsed loop latch.
4992 ContinueWith(Result->getLatch(), nullptr);
4993
4994 // Replace the input loops with the new collapsed loop.
4995 redirectTo(Outermost->getPreheader(), Result->getPreheader(), DL);
4996 redirectTo(Result->getAfter(), Outermost->getAfter(), DL);
4997
4998 // Replace the input loop indvars with the derived ones.
4999 for (size_t i = 0; i < NumLoops; ++i)
5000 Loops[i]->getIndVar()->replaceAllUsesWith(NewIndVars[i]);
5001
5002 // Remove unused parts of the input loops.
5003 removeUnusedBlocksFromParent(OldControlBBs);
5004
5005 for (CanonicalLoopInfo *L : Loops)
5006 L->invalidate();
5007
5008#ifndef NDEBUG
5009 Result->assertOK();
5010#endif
5011 return Result;
5012}
5013
5014std::vector<CanonicalLoopInfo *>
5015OpenMPIRBuilder::tileLoops(DebugLoc DL, ArrayRef<CanonicalLoopInfo *> Loops,
5016 ArrayRef<Value *> TileSizes) {
5017 assert(TileSizes.size() == Loops.size() &&
5018 "Must pass as many tile sizes as there are loops");
5019 int NumLoops = Loops.size();
5020 assert(NumLoops >= 1 && "At least one loop to tile required");
5021
5022 CanonicalLoopInfo *OutermostLoop = Loops.front();
5023 CanonicalLoopInfo *InnermostLoop = Loops.back();
5024 Function *F = OutermostLoop->getBody()->getParent();
5025 BasicBlock *InnerEnter = InnermostLoop->getBody();
5026 BasicBlock *InnerLatch = InnermostLoop->getLatch();
5027
5028 // Loop control blocks that may become orphaned later.
5029 SmallVector<BasicBlock *, 12> OldControlBBs;
5030 OldControlBBs.reserve(6 * Loops.size());
5031 for (CanonicalLoopInfo *Loop : Loops)
5032 Loop->collectControlBlocks(OldControlBBs);
5033
5034 // Collect original trip counts and induction variable to be accessible by
5035 // index. Also, the structure of the original loops is not preserved during
5036 // the construction of the tiled loops, so do it before we scavenge the BBs of
5037 // any original CanonicalLoopInfo.
5038 SmallVector<Value *, 4> OrigTripCounts, OrigIndVars;
5039 for (CanonicalLoopInfo *L : Loops) {
5040 assert(L->isValid() && "All input loops must be valid canonical loops");
5041 OrigTripCounts.push_back(L->getTripCount());
5042 OrigIndVars.push_back(L->getIndVar());
5043 }
5044
5045 // Collect the code between loop headers. These may contain SSA definitions
5046 // that are used in the loop nest body. To be usable within the innermost
5047 // body, these BasicBlocks will be sunk into the loop nest body. That is,
5048 // these instructions may be executed more often than before the tiling.
5049 // TODO: It would be sufficient to only sink them into body of the
5050 // corresponding tile loop.
5051 SmallVector<std::pair<BasicBlock *, BasicBlock *>, 4> InbetweenCode;
5052 for (int i = 0; i < NumLoops - 1; ++i) {
5053 CanonicalLoopInfo *Surrounding = Loops[i];
5054 CanonicalLoopInfo *Nested = Loops[i + 1];
5055
5056 BasicBlock *EnterBB = Surrounding->getBody();
5057 BasicBlock *ExitBB = Nested->getHeader();
5058 InbetweenCode.emplace_back(EnterBB, ExitBB);
5059 }
5060
5061 // Compute the trip counts of the floor loops.
5062 Builder.SetCurrentDebugLocation(DL);
5063 Builder.restoreIP(OutermostLoop->getPreheaderIP());
5064 SmallVector<Value *, 4> FloorCount, FloorRems;
5065 for (int i = 0; i < NumLoops; ++i) {
5066 Value *TileSize = TileSizes[i];
5067 Value *OrigTripCount = OrigTripCounts[i];
5068 Type *IVType = OrigTripCount->getType();
5069
5070 Value *FloorTripCount = Builder.CreateUDiv(OrigTripCount, TileSize);
5071 Value *FloorTripRem = Builder.CreateURem(OrigTripCount, TileSize);
5072
5073 // 0 if the tile size divides the trip count, 1 otherwise.
5074 // 1 means we need an additional iteration for a partial tile.
5075 //
5076 // Unfortunately we cannot just use the roundup-formula
5077 // (tripcount + tilesize - 1)/tilesize
5078 // because the summation might overflow. We do not want to introduce undefined
5079 // behavior when the untiled loop nest did not.
5080 Value *FloorTripOverflow =
5081 Builder.CreateICmpNE(FloorTripRem, ConstantInt::get(IVType, 0));
5082
5083 FloorTripOverflow = Builder.CreateZExt(FloorTripOverflow, IVType);
5084 FloorTripCount =
5085 Builder.CreateAdd(FloorTripCount, FloorTripOverflow,
5086 "omp_floor" + Twine(i) + ".tripcount", true);
5087
5088 // Remember some values for later use.
5089 FloorCount.push_back(FloorTripCount);
5090 FloorRems.push_back(FloorTripRem);
5091 }
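// Worked example: for a trip count of 10 and a tile size of 4, FloorTripCount
// is 10/4 + (10%4 != 0) = 3 and FloorTripRem is 2, i.e. two full tiles of 4
// iterations plus one partial (epilogue) tile of 2 iterations.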
5092
5093 // Generate the new loop nest, from the outermost to the innermost.
5094 std::vector<CanonicalLoopInfo *> Result;
5095 Result.reserve(NumLoops * 2);
5096
5097 // The basic block of the surrounding loop that enters the nest generated
5098 // loop.
5099 BasicBlock *Enter = OutermostLoop->getPreheader();
5100
5101 // The basic block of the surrounding loop where the inner code should
5102 // continue.
5103 BasicBlock *Continue = OutermostLoop->getAfter();
5104
5105 // Where the next loop basic block should be inserted.
5106 BasicBlock *OutroInsertBefore = InnermostLoop->getExit();
5107
5108 auto EmbeddNewLoop =
5109 [this, DL, F, InnerEnter, &Enter, &Continue, &OutroInsertBefore](
5110 Value *TripCount, const Twine &Name) -> CanonicalLoopInfo * {
5111 CanonicalLoopInfo *EmbeddedLoop = createLoopSkeleton(
5112 DL, TripCount, F, InnerEnter, OutroInsertBefore, Name);
5113 redirectTo(Enter, EmbeddedLoop->getPreheader(), DL);
5114 redirectTo(EmbeddedLoop->getAfter(), Continue, DL);
5115
5116 // Setup the position where the next embedded loop connects to this loop.
5117 Enter = EmbeddedLoop->getBody();
5118 Continue = EmbeddedLoop->getLatch();
5119 OutroInsertBefore = EmbeddedLoop->getLatch();
5120 return EmbeddedLoop;
5121 };
5122
5123 auto EmbeddNewLoops = [&Result, &EmbeddNewLoop](ArrayRef<Value *> TripCounts,
5124 const Twine &NameBase) {
5125 for (auto P : enumerate(TripCounts)) {
5126 CanonicalLoopInfo *EmbeddedLoop =
5127 EmbeddNewLoop(P.value(), NameBase + Twine(P.index()));
5128 Result.push_back(EmbeddedLoop);
5129 }
5130 };
5131
5132 EmbeddNewLoops(FloorCount, "floor");
5133
5134 // Within the innermost floor loop, emit the code that computes the tile
5135 // sizes.
5136 Builder.restoreIP(Result.back()->getBodyIP());
5137 SmallVector<Value *, 4> TileCounts;
5138 for (int i = 0; i < NumLoops; ++i) {
5139 CanonicalLoopInfo *FloorLoop = Result[i];
5140 Value *TileSize = TileSizes[i];
5141
5142 Value *FloorIsEpilogue =
5143 Builder.CreateICmpEQ(FloorLoop->getIndVar(), FloorCount[i]);
5144 Value *TileTripCount =
5145 Builder.CreateSelect(FloorIsEpilogue, FloorRems[i], TileSize);
5146
5147 TileCounts.push_back(TileTripCount);
5148 }
5149
5150 // Create the tile loops.
5151 EmbeddNewLoops(TileCounts, "tile");
5152
5153 // Insert the inbetween code into the body.
5154 BasicBlock *BodyEnter = Enter;
5155 BasicBlock *BodyEntered = nullptr;
5156 for (std::pair<BasicBlock *, BasicBlock *> P : InbetweenCode) {
5157 BasicBlock *EnterBB = P.first;
5158 BasicBlock *ExitBB = P.second;
5159
5160 if (BodyEnter)
5161 redirectTo(BodyEnter, EnterBB, DL);
5162 else
5163 redirectAllPredecessorsTo(BodyEntered, EnterBB, DL);
5164
5165 BodyEnter = nullptr;
5166 BodyEntered = ExitBB;
5167 }
5168
5169 // Append the original loop nest body into the generated loop nest body.
5170 if (BodyEnter)
5171 redirectTo(BodyEnter, InnerEnter, DL);
5172 else
5173 redirectAllPredecessorsTo(BodyEntered, InnerEnter, DL);
5174 redirectTo(InnerLatch, Result.back()->getLatch(), DL);
5175
5176 // Replace the original induction variable with an induction variable computed
5177 // from the tile and floor induction variables.
5178 Builder.restoreIP(Result.back()->getBodyIP());
5179 for (int i = 0; i < NumLoops; ++i) {
5180 CanonicalLoopInfo *FloorLoop = Result[i];
5181 CanonicalLoopInfo *TileLoop = Result[NumLoops + i];
5182 Value *OrigIndVar = OrigIndVars[i];
5183 Value *Size = TileSizes[i];
5184
5185 Value *Scale =
5186 Builder.CreateMul(Size, FloorLoop->getIndVar(), {}, /*HasNUW=*/true);
5187 Value *Shift =
5188 Builder.CreateAdd(Scale, TileLoop->getIndVar(), {}, /*HasNUW=*/true);
5189 OrigIndVar->replaceAllUsesWith(Shift);
5190 }
5191
5192 // Remove unused parts of the original loops.
5193 removeUnusedBlocksFromParent(OldControlBBs);
5194
5195 for (CanonicalLoopInfo *L : Loops)
5196 L->invalidate();
5197
5198#ifndef NDEBUG
5199 for (CanonicalLoopInfo *GenL : Result)
5200 GenL->assertOK();
5201#endif
5202 return Result;
5203}
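// For a 2-deep nest tiled with sizes S0 and S1, the returned vector is
// {floor0, floor1, tile0, tile1}; conceptually (sketch) the generated nest is
//
//   for (f0 = 0; f0 < ceil(TC0/S0); ++f0)
//     for (f1 = 0; f1 < ceil(TC1/S1); ++f1)
//       for (t0 = 0; t0 < tilecount0; ++t0)   // S0, or the remainder in the
//         for (t1 = 0; t1 < tilecount1; ++t1) // epilogue tile (see above)
//           body(f0*S0 + t0, f1*S1 + t1);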
5204
5205/// Attach metadata \p Properties to the basic block described by \p BB. If the
5206/// basic block already has metadata, the basic block properties are appended.
5207static void addBasicBlockMetadata(BasicBlock *BB,
5208 ArrayRef<Metadata *> Properties) {
5209 // Nothing to do if no property to attach.
5210 if (Properties.empty())
5211 return;
5212
5213 LLVMContext &Ctx = BB->getContext();
5214 SmallVector<Metadata *> NewProperties;
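// Loop-ID metadata must be self-referential: operand 0 of the MDNode points at
// the node itself so that each loop gets a distinct identity. Reserve that
// operand here; it is filled in by replaceOperandWith(0, ...) below.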
5215 NewProperties.push_back(nullptr);
5216
5217 // If the basic block already has metadata, prepend it to the new metadata.
5218 MDNode *Existing = BB->getTerminator()->getMetadata(LLVMContext::MD_loop);
5219 if (Existing)
5220 append_range(NewProperties, drop_begin(Existing->operands(), 1));
5221
5222 append_range(NewProperties, Properties);
5223 MDNode *BasicBlockID = MDNode::getDistinct(Ctx, NewProperties);
5224 BasicBlockID->replaceOperandWith(0, BasicBlockID);
5225
5226 BB->getTerminator()->setMetadata(LLVMContext::MD_loop, BasicBlockID);
5227}
5228
5229/// Attach loop metadata \p Properties to the loop described by \p Loop. If the
5230/// loop already has metadata, the loop properties are appended.
5231static void addLoopMetadata(CanonicalLoopInfo *Loop,
5232 ArrayRef<Metadata *> Properties) {
5233 assert(Loop->isValid() && "Expecting a valid CanonicalLoopInfo");
5234
5235 // Attach metadata to the loop's latch
5236 BasicBlock *Latch = Loop->getLatch();
5237 assert(Latch && "A valid CanonicalLoopInfo must have a unique latch");
5238 addBasicBlockMetadata(Latch, Properties);
5239}
5240
5241/// Attach llvm.access.group metadata to the memref instructions of \p Block
5242static void addSimdMetadata(BasicBlock *Block, MDNode *AccessGroup,
5243 LoopInfo &LI) {
5244 for (Instruction &I : *Block) {
5245 if (I.mayReadOrWriteMemory()) {
5246 // TODO: This instruction may already have access group from
5247 // other pragmas e.g. #pragma clang loop vectorize. Append
5248 // so that the existing metadata is not overwritten.
5249 I.setMetadata(LLVMContext::MD_access_group, AccessGroup);
5250 }
5251 }
5252}
5253
5254void OpenMPIRBuilder::unrollLoopFull(DebugLoc, CanonicalLoopInfo *Loop) {
5255 LLVMContext &Ctx = Builder.getContext();
5256 addLoopMetadata(
5257 Loop, {MDNode::get(Ctx, MDString::get(Ctx, "llvm.loop.unroll.enable")),
5258 MDNode::get(Ctx, MDString::get(Ctx, "llvm.loop.unroll.full"))});
5259}
5260
5261void OpenMPIRBuilder::unrollLoopHeuristic(DebugLoc, CanonicalLoopInfo *Loop) {
5262 LLVMContext &Ctx = Builder.getContext();
5263 addLoopMetadata(
5264 Loop, {
5265 MDNode::get(Ctx, MDString::get(Ctx, "llvm.loop.unroll.enable")),
5266 });
5267}
5268
5269void OpenMPIRBuilder::createIfVersion(CanonicalLoopInfo *CanonicalLoop,
5270 Value *IfCond, ValueToValueMapTy &VMap,
5271 const Twine &NamePrefix) {
5272 Function *F = CanonicalLoop->getFunction();
5273
5274 // Define where if branch should be inserted
5275 Instruction *SplitBefore = CanonicalLoop->getPreheader()->getTerminator();
5276
5277 // TODO: We should not rely on pass manager. Currently we use pass manager
5278 // only for getting llvm::Loop which corresponds to given CanonicalLoopInfo
5279 // object. We should have a method which returns all blocks between
5280 // CanonicalLoopInfo::getHeader() and CanonicalLoopInfo::getAfter()
5281 FunctionAnalysisManager FAM;
5282 FAM.registerPass([]() { return DominatorTreeAnalysis(); });
5283 FAM.registerPass([]() { return LoopAnalysis(); });
5284 FAM.registerPass([]() { return PassInstrumentationAnalysis(); });
5285
5286 // Get the loop which needs to be cloned
5287 LoopAnalysis LIA;
5288 LoopInfo &&LI = LIA.run(*F, FAM);
5289 Loop *L = LI.getLoopFor(CanonicalLoop->getHeader());
5290
5291 // Create additional blocks for the if statement
5292 BasicBlock *Head = SplitBefore->getParent();
5293 Instruction *HeadOldTerm = Head->getTerminator();
5294 llvm::LLVMContext &C = Head->getContext();
5295 BasicBlock *ThenBlock = BasicBlock::Create(
5296 C, NamePrefix + ".if.then", Head->getParent(), Head->getNextNode());
5297 BasicBlock *ElseBlock = BasicBlock::Create(
5298 C, NamePrefix + ".if.else", Head->getParent(), CanonicalLoop->getExit());
5299
5300 // Create if condition branch.
5301 Builder.SetInsertPoint(HeadOldTerm);
5302 Instruction *BrInstr =
5303 Builder.CreateCondBr(IfCond, ThenBlock, /*ifFalse*/ ElseBlock);
5304 InsertPointTy IP{BrInstr->getParent(), ++BrInstr->getIterator()};
5305 // Then block contains branch to omp loop which needs to be vectorized
5306 spliceBB(IP, ThenBlock, false);
5307 ThenBlock->replaceSuccessorsPhiUsesWith(Head, ThenBlock);
5308
5309 Builder.SetInsertPoint(ElseBlock);
5310
5311 // Clone loop for the else branch
5312 SmallVector<BasicBlock *, 8> NewBlocks;
5313
5314 VMap[CanonicalLoop->getPreheader()] = ElseBlock;
5315 for (BasicBlock *Block : L->getBlocks()) {
5316 BasicBlock *NewBB = CloneBasicBlock(Block, VMap, "", F);
5317 NewBB->moveBefore(CanonicalLoop->getExit());
5318 VMap[Block] = NewBB;
5319 NewBlocks.push_back(NewBB);
5320 }
5321 remapInstructionsInBlocks(NewBlocks, VMap);
5322 Builder.CreateBr(NewBlocks.front());
5323}
5324
5325unsigned
5327 const StringMap<bool> &Features) {
5328 if (TargetTriple.isX86()) {
5329 if (Features.lookup("avx512f"))
5330 return 512;
5331 else if (Features.lookup("avx"))
5332 return 256;
5333 return 128;
5334 }
5335 if (TargetTriple.isPPC())
5336 return 128;
5337 if (TargetTriple.isWasm())
5338 return 128;
5339 return 0;
5340}
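// For example, an x86 target whose feature map contains "avx512f" reports a
// default simd alignment of 512 bits; returning 0 means no target-specific
// default is known.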
5341
5342void OpenMPIRBuilder::applySimd(CanonicalLoopInfo *CanonicalLoop,
5343 MapVector<Value *, Value *> AlignedVars,
5344 Value *IfCond, OrderKind Order,
5345 ConstantInt *Simdlen, ConstantInt *Safelen) {
5346 LLVMContext &Ctx = Builder.getContext();
5347
5348 Function *F = CanonicalLoop->getFunction();
5349
5350 // TODO: We should not rely on pass manager. Currently we use pass manager
5351 // only for getting llvm::Loop which corresponds to given CanonicalLoopInfo
5352 // object. We should have a method which returns all blocks between
5353 // CanonicalLoopInfo::getHeader() and CanonicalLoopInfo::getAfter()
5354 FunctionAnalysisManager FAM;
5355 FAM.registerPass([]() { return DominatorTreeAnalysis(); });
5356 FAM.registerPass([]() { return LoopAnalysis(); });
5357 FAM.registerPass([]() { return PassInstrumentationAnalysis(); });
5358
5359 LoopAnalysis LIA;
5360 LoopInfo &&LI = LIA.run(*F, FAM);
5361
5362 Loop *L = LI.getLoopFor(CanonicalLoop->getHeader());
5363 if (AlignedVars.size()) {
5364 InsertPointTy IP = Builder.saveIP();
5365 for (auto &AlignedItem : AlignedVars) {
5366 Value *AlignedPtr = AlignedItem.first;
5367 Value *Alignment = AlignedItem.second;
5368 Instruction *loadInst = dyn_cast<Instruction>(AlignedPtr);
5369 Builder.SetInsertPoint(loadInst->getNextNode());
5370 Builder.CreateAlignmentAssumption(F->getDataLayout(), AlignedPtr,
5371 Alignment);
5372 }
5373 Builder.restoreIP(IP);
5374 }
5375
5376 if (IfCond) {
5377 ValueToValueMapTy VMap;
5378 createIfVersion(CanonicalLoop, IfCond, VMap, "simd");
5379 // Add metadata to the cloned loop which disables vectorization
5380 Value *MappedLatch = VMap.lookup(CanonicalLoop->getLatch());
5381 assert(MappedLatch &&
5382 "Cannot find value which corresponds to original loop latch");
5383 assert(isa<BasicBlock>(MappedLatch) &&
5384 "Cannot cast mapped latch block value to BasicBlock");
5385 BasicBlock *NewLatchBlock = dyn_cast<BasicBlock>(MappedLatch);
5386 ConstantAsMetadata *BoolConst =
5387 ConstantAsMetadata::get(ConstantInt::getFalse(Type::getInt1Ty(Ctx)));
5388 addBasicBlockMetadata(
5389 NewLatchBlock,
5390 {MDNode::get(Ctx, {MDString::get(Ctx, "llvm.loop.vectorize.enable"),
5391 BoolConst})});
5392 }
5393
5394 SmallSet<BasicBlock *, 8> Reachable;
5395
5396 // Get the basic blocks from the loop in which memref instructions
5397 // can be found.
5398 // TODO: Generalize getting all blocks inside a CanonicalLoopInfo,
5399 // preferably without running any passes.
5400 for (BasicBlock *Block : L->getBlocks()) {
5401 if (Block == CanonicalLoop->getCond() ||
5402 Block == CanonicalLoop->getHeader())
5403 continue;
5404 Reachable.insert(Block);
5405 }
5406
5407 SmallVector<Metadata *> LoopMDList;
5408
5409 // In presence of finite 'safelen', it may be unsafe to mark all
5410 // the memory instructions parallel, because loop-carried
5411 // dependences of 'safelen' iterations are possible.
5412 // If clause order(concurrent) is specified then the memory instructions
5413 // are marked parallel even if 'safelen' is finite.
5414 if ((Safelen == nullptr) || (Order == OrderKind::OMP_ORDER_concurrent)) {
5415 // Add access group metadata to memory-access instructions.
5416 MDNode *AccessGroup = MDNode::getDistinct(Ctx, {});
5417 for (BasicBlock *BB : Reachable)
5418 addSimdMetadata(BB, AccessGroup, LI);
5419 // TODO: If the loop has existing parallel access metadata, have
5420 // to combine two lists.
5421 LoopMDList.push_back(MDNode::get(
5422 Ctx, {MDString::get(Ctx, "llvm.loop.parallel_accesses"), AccessGroup}));
5423 }
5424
5425 // Use the above access group metadata to create loop level
5426 // metadata, which should be distinct for each loop.
5427 ConstantAsMetadata *BoolConst =
5428 ConstantAsMetadata::get(ConstantInt::getTrue(Type::getInt1Ty(Ctx)));
5429 LoopMDList.push_back(MDNode::get(
5430 Ctx, {MDString::get(Ctx, "llvm.loop.vectorize.enable"), BoolConst}));
5431
5432 if (Simdlen || Safelen) {
5433 // If both simdlen and safelen clauses are specified, the value of the
5434 // simdlen parameter must be less than or equal to the value of the safelen
5435 // parameter. Therefore, use safelen only in the absence of simdlen.
5436 ConstantInt *VectorizeWidth = Simdlen == nullptr ? Safelen : Simdlen;
5437 LoopMDList.push_back(
5438 MDNode::get(Ctx, {MDString::get(Ctx, "llvm.loop.vectorize.width"),
5439 ConstantAsMetadata::get(VectorizeWidth)}));
5440 }
5441
5442 addLoopMetadata(CanonicalLoop, LoopMDList);
5443}
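// For illustration, `#pragma omp simd simdlen(8)` without safelen or
// order(concurrent) clauses yields loop metadata roughly of the form (sketch;
// metadata numbering is illustrative):
//
//   br ... !llvm.loop !0
//   !0 = distinct !{!0, !1, !2, !3}
//   !1 = !{!"llvm.loop.parallel_accesses", !4} ; !4 = the access group above
//   !2 = !{!"llvm.loop.vectorize.enable", i1 true}
//   !3 = !{!"llvm.loop.vectorize.width", i32 8}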
5444
5445/// Create the TargetMachine object to query the backend for optimization
5446/// preferences.
5447///
5448/// Ideally, this would be passed from the front-end to the OpenMPBuilder, but
5449/// e.g. Clang does not pass it to its CodeGen layer and creates it only when
5450/// needed for the LLVM pass pipeline. We use some default options to avoid
5451/// having to pass too many settings from the frontend that probably do not
5452/// matter.
5453///
5454/// Currently, TargetMachine is only used sometimes by the unrollLoopPartial
5455/// method. If we are going to use TargetMachine for more purposes, especially
5456/// those that are sensitive to TargetOptions, RelocModel and CodeModel, it
5457/// might become worth requiring front-ends to pass on their TargetMachine,
5458/// or at least cache it between methods. Note that while frontends such as Clang
5459/// have just a single main TargetMachine per translation unit, "target-cpu" and
5460/// "target-features" that determine the TargetMachine are per-function and can
5461/// be overridden using __attribute__((target("OPTIONS"))).
5462static std::unique_ptr<TargetMachine>
5463createTargetMachine(Function *F, CodeGenOptLevel OptLevel) {
5464 Module *M = F->getParent();
5465
5466 StringRef CPU = F->getFnAttribute("target-cpu").getValueAsString();
5467 StringRef Features = F->getFnAttribute("target-features").getValueAsString();
5468 const std::string &Triple = M->getTargetTriple();
5469
5470 std::string Error;
5471 const Target *TheTarget = TargetRegistry::lookupTarget(Triple, Error);
5472 if (!TheTarget)
5473 return {};
5474
5475 TargetOptions Options;
5476 return std::unique_ptr<TargetMachine>(TheTarget->createTargetMachine(
5477 Triple, CPU, Features, Options, /*RelocModel=*/std::nullopt,
5478 /*CodeModel=*/std::nullopt, OptLevel));
5479}
5480
5481/// Heuristically determine the best-performant unroll factor for \p CLI. This
5482/// depends on the target processor. We are re-using the same heuristics as the
5483/// LoopUnrollPass.
5484static unsigned computeHeuristicUnrollFactor(CanonicalLoopInfo *CLI) {
5485 Function *F = CLI->getFunction();
5486
5487 // Assume the user requests the most aggressive unrolling, even if the rest of
5488 // the code is optimized using a lower setting.
5489 CodeGenOptLevel OptLevel = CodeGenOptLevel::Aggressive;
5490 std::unique_ptr<TargetMachine> TM = createTargetMachine(F, OptLevel);
5491
5492 FunctionAnalysisManager FAM;
5493 FAM.registerPass([]() { return TargetLibraryAnalysis(); });
5494 FAM.registerPass([]() { return AssumptionAnalysis(); });
5495 FAM.registerPass([]() { return DominatorTreeAnalysis(); });
5496 FAM.registerPass([]() { return LoopAnalysis(); });
5497 FAM.registerPass([]() { return ScalarEvolutionAnalysis(); });
5498 FAM.registerPass([]() { return PassInstrumentationAnalysis(); });
5499 TargetIRAnalysis TIRA;
5500 if (TM)
5501 TIRA = TargetIRAnalysis(
5502 [&](const Function &F) { return TM->getTargetTransformInfo(F); });
5503 FAM.registerPass([&]() { return TIRA; });
5504
5505 TargetIRAnalysis::Result &&TTI = TIRA.run(*F, FAM);
5506 ScalarEvolutionAnalysis SEA;
5507 ScalarEvolution &&SE = SEA.run(*F, FAM);
5508 DominatorTreeAnalysis DTA;
5509 DominatorTree &&DT = DTA.run(*F, FAM);
5510 LoopAnalysis LIA;
5511 LoopInfo &&LI = LIA.run(*F, FAM);
5512 AssumptionAnalysis ACT;
5513 AssumptionCache &&AC = ACT.run(*F, FAM);
5514 OptimizationRemarkEmitter ORE{F};
5515
5516 Loop *L = LI.getLoopFor(CLI->getHeader());
5517 assert(L && "Expecting CanonicalLoopInfo to be recognized as a loop");
5518
5519 TargetTransformInfo::UnrollingPreferences UP = gatherUnrollingPreferences(
5520 L, SE, TTI,
5521 /*BlockFrequencyInfo=*/nullptr,
5522 /*ProfileSummaryInfo=*/nullptr, ORE, static_cast<int>(OptLevel),
5523 /*UserThreshold=*/std::nullopt,
5524 /*UserCount=*/std::nullopt,
5525 /*UserAllowPartial=*/true,
5526 /*UserAllowRuntime=*/true,
5527 /*UserUpperBound=*/std::nullopt,
5528 /*UserFullUnrollMaxCount=*/std::nullopt);
5529
5530 UP.Force = true;
5531
5532 // Account for additional optimizations taking place before the LoopUnrollPass
5533 // would unroll the loop.
5534 UP.Threshold *= UnrollThresholdFactor;
5535 UP.PartialThreshold *= UnrollThresholdFactor;
5536
5537 // Use normal unroll factors even if the rest of the code is optimized for
5538 // size.
5539 UP.OptSizeThreshold = UP.Threshold;
5540 UP.PartialOptSizeThreshold = UP.PartialThreshold;
5541
5542 LLVM_DEBUG(dbgs() << "Unroll heuristic thresholds:\n"
5543 << " Threshold=" << UP.Threshold << "\n"
5544 << " PartialThreshold=" << UP.PartialThreshold << "\n"
5545 << " OptSizeThreshold=" << UP.OptSizeThreshold << "\n"
5546 << " PartialOptSizeThreshold="
5547 << UP.PartialOptSizeThreshold << "\n");
5548
5549 // Disable peeling.
5550 TargetTransformInfo::PeelingPreferences PP = gatherPeelingPreferences(
5551 L, SE, TTI,
5552 /*UserAllowPeeling=*/false,
5553 /*UserAllowProfileBasedPeeling=*/false,
5554 /*UnrollingSpecficValues=*/false);
5555
5557 CodeMetrics::collectEphemeralValues(L, &AC, EphValues);
5558
5559 // Assume that reads and writes to stack variables can be eliminated by
5560 // Mem2Reg, SROA or LICM. That is, don't count them towards the loop body's
5561 // size.
5562 for (BasicBlock *BB : L->blocks()) {
5563 for (Instruction &I : *BB) {
5564 Value *Ptr;
5565 if (auto *Load = dyn_cast<LoadInst>(&I)) {
5566 Ptr = Load->getPointerOperand();
5567 } else if (auto *Store = dyn_cast<StoreInst>(&I)) {
5568 Ptr = Store->getPointerOperand();
5569 } else
5570 continue;
5571
5572 Ptr = Ptr->stripPointerCasts();
5573
5574 if (auto *Alloca = dyn_cast<AllocaInst>(Ptr)) {
5575 if (Alloca->getParent() == &F->getEntryBlock())
5576 EphValues.insert(&I);
5577 }
5578 }
5579 }
5580
5581 UnrollCostEstimator UCE(L, TTI, EphValues, UP.BEInsns);
5582
5583 // Loop is not unrollable if the loop contains certain instructions.
5584 if (!UCE.canUnroll()) {
5585 LLVM_DEBUG(dbgs() << "Loop not considered unrollable\n");
5586 return 1;
5587 }
5588
5589 LLVM_DEBUG(dbgs() << "Estimated loop size is " << UCE.getRolledLoopSize()
5590 << "\n");
5591
5592 // TODO: Determine trip count of \p CLI if constant, computeUnrollCount might
5593 // be able to use it.
5594 int TripCount = 0;
5595 int MaxTripCount = 0;
5596 bool MaxOrZero = false;
5597 unsigned TripMultiple = 0;
5598
5599 bool UseUpperBound = false;
5600 computeUnrollCount(L, TTI, DT, &LI, &AC, SE, EphValues, &ORE, TripCount,
5601 MaxTripCount, MaxOrZero, TripMultiple, UCE, UP, PP,
5602 UseUpperBound);
5603 unsigned Factor = UP.Count;
5604 LLVM_DEBUG(dbgs() << "Suggesting unroll factor of " << Factor << "\n");
5605
5606 // This function returns 1 to signal that the loop should not be unrolled.
5607 if (Factor == 0)
5608 return 1;
5609 return Factor;
5610}
5611
5612void OpenMPIRBuilder::unrollLoopPartial(DebugLoc DL, CanonicalLoopInfo *Loop,
5613 int32_t Factor,
5614 CanonicalLoopInfo **UnrolledCLI) {
5615 assert(Factor >= 0 && "Unroll factor must not be negative");
5616
5617 Function *F = Loop->getFunction();
5618 LLVMContext &Ctx = F->getContext();
5619
5620 // If the unrolled loop is not used for another loop-associated directive, it
5621 // is sufficient to add metadata for the LoopUnrollPass.
5622 if (!UnrolledCLI) {
5623 SmallVector<Metadata *, 2> LoopMetadata;
5624 LoopMetadata.push_back(
5625 MDNode::get(Ctx, MDString::get(Ctx, "llvm.loop.unroll.enable")));
5626
5627 if (Factor >= 1) {
5628 ConstantAsMetadata *FactorConst = ConstantAsMetadata::get(
5629 ConstantInt::get(Type::getInt32Ty(Ctx), APInt(32, Factor)));
5630 LoopMetadata.push_back(MDNode::get(
5631 Ctx, {MDString::get(Ctx, "llvm.loop.unroll.count"), FactorConst}));
5632 }
5633
5634 addLoopMetadata(Loop, LoopMetadata);
5635 return;
5636 }
5637
5638 // Heuristically determine the unroll factor.
5639 if (Factor == 0)
5640 Factor = computeHeuristicUnrollFactor(Loop);
5641
5642 // No change required with unroll factor 1.
5643 if (Factor == 1) {
5644 *UnrolledCLI = Loop;
5645 return;
5646 }
5647
5648 assert(Factor >= 2 &&
5649 "unrolling only makes sense with a factor of 2 or larger");
5650
5651 Type *IndVarTy = Loop->getIndVarType();
5652
5653 // Apply partial unrolling by tiling the loop by the unroll-factor, then fully
5654 // unroll the inner loop.
5655 Value *FactorVal =
5656 ConstantInt::get(IndVarTy, APInt(IndVarTy->getIntegerBitWidth(), Factor,
5657 /*isSigned=*/false));
5658 std::vector<CanonicalLoopInfo *> LoopNest =
5659 tileLoops(DL, {Loop}, {FactorVal});
5660 assert(LoopNest.size() == 2 && "Expect 2 loops after tiling");
5661 *UnrolledCLI = LoopNest[0];
5662 CanonicalLoopInfo *InnerLoop = LoopNest[1];
5663
5664 // LoopUnrollPass can only fully unroll loops with constant trip count.
5665 // Unroll by the unroll factor with a fallback epilog for the remainder
5666 // iterations if necessary.
5667 ConstantAsMetadata *FactorConst = ConstantAsMetadata::get(
5668 ConstantInt::get(Type::getInt32Ty(Ctx), APInt(32, Factor)));
5669 addLoopMetadata(
5670 InnerLoop,
5671 {MDNode::get(Ctx, MDString::get(Ctx, "llvm.loop.unroll.enable")),
5672 MDNode::get(
5673 Ctx, {MDString::get(Ctx, "llvm.loop.unroll.count"), FactorConst})});
5674
5675#ifndef NDEBUG
5676 (*UnrolledCLI)->assertOK();
5677#endif
5678}
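// For example, unrollLoopPartial with Factor=4 tiles the loop into a {floor,
// tile} pair where the tile loop has a constant trip count of 4 and carries
// llvm.loop.unroll.enable plus llvm.loop.unroll.count 4 metadata, so the
// LoopUnrollPass can fully unroll it later; *UnrolledCLI is set to the floor
// loop, which stays a valid CanonicalLoopInfo for further directives.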
5679
5680OpenMPIRBuilder::InsertPointTy
5681OpenMPIRBuilder::createCopyPrivate(const LocationDescription &Loc,
5682 llvm::Value *BufSize, llvm::Value *CpyBuf,
5683 llvm::Value *CpyFn, llvm::Value *DidIt) {
5684 if (!updateToLocation(Loc))
5685 return Loc.IP;
5686
5687 uint32_t SrcLocStrSize;
5688 Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
5689 Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
5690 Value *ThreadId = getOrCreateThreadID(Ident);
5691
5692 llvm::Value *DidItLD = Builder.CreateLoad(Builder.getInt32Ty(), DidIt);
5693
5694 Value *Args[] = {Ident, ThreadId, BufSize, CpyBuf, CpyFn, DidItLD};
5695
5696 Function *Fn = getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_copyprivate);
5697 Builder.CreateCall(Fn, Args);
5698
5699 return Builder.saveIP();
5700}
5701
5702OpenMPIRBuilder::InsertPointOrErrorTy OpenMPIRBuilder::createSingle(
5703 const LocationDescription &Loc, BodyGenCallbackTy BodyGenCB,
5704 FinalizeCallbackTy FiniCB, bool IsNowait, ArrayRef<llvm::Value *> CPVars,
5705 ArrayRef<llvm::Value *> CPFuncs) {
5706
5707 if (!updateToLocation(Loc))
5708 return Loc.IP;
5709
5710 // If needed allocate and initialize `DidIt` with 0.
5711 // DidIt: flag variable: 1=single thread; 0=not single thread.
5712 llvm::Value *DidIt = nullptr;
5713 if (!CPVars.empty()) {
5714 DidIt = Builder.CreateAlloca(Builder.getInt32Ty());
5715 Builder.CreateStore(Builder.getInt32(0), DidIt);
5716 }
5717
5718 Directive OMPD = Directive::OMPD_single;
5719 uint32_t SrcLocStrSize;
5720 Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
5721 Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
5722 Value *ThreadId = getOrCreateThreadID(Ident);
5723 Value *Args[] = {Ident, ThreadId};
5724
5725 Function *EntryRTLFn = getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_single);
5726 Instruction *EntryCall = Builder.CreateCall(EntryRTLFn, Args);
5727
5728 Function *ExitRTLFn = getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_end_single);
5729 Instruction *ExitCall = Builder.CreateCall(ExitRTLFn, Args);
5730
5731 auto FiniCBWrapper = [&](InsertPointTy IP) -> Error {
5732 if (Error Err = FiniCB(IP))
5733 return Err;
5734
5735 // The thread that executes the single region must set `DidIt` to 1.
5736 // This is used by __kmpc_copyprivate, to know if the caller is the
5737 // single thread or not.
5738 if (DidIt)
5739 Builder.CreateStore(Builder.getInt32(1), DidIt);
5740
5741 return Error::success();
5742 };
5743
5744 // generates the following:
5745 // if (__kmpc_single()) {
5746 // .... single region ...
5747 // __kmpc_end_single
5748 // }
5749 // __kmpc_copyprivate
5750 // __kmpc_barrier
5751
5752 InsertPointOrErrorTy AfterIP =
5753 EmitOMPInlinedRegion(OMPD, EntryCall, ExitCall, BodyGenCB, FiniCBWrapper,
5754 /*Conditional*/ true,
5755 /*hasFinalize*/ true);
5756 if (!AfterIP)
5757 return AfterIP.takeError();
5758
5759 if (DidIt) {
5760 for (size_t I = 0, E = CPVars.size(); I < E; ++I)
5761 // NOTE BufSize is currently unused, so just pass 0.
5762 createCopyPrivate(LocationDescription(Builder.saveIP(), Loc.DL),
5763 /*BufSize=*/ConstantInt::get(Int64, 0), CPVars[I],
5764 CPFuncs[I], DidIt);
5765 // NOTE __kmpc_copyprivate already inserts a barrier
5766 } else if (!IsNowait) {
5767 InsertPointOrErrorTy AfterIP =
5768 createBarrier(LocationDescription(Builder.saveIP(), Loc.DL),
5769 omp::Directive::OMPD_unknown, /* ForceSimpleCall */ false,
5770 /* CheckCancelFlag */ false);
5771 if (!AfterIP)
5772 return AfterIP.takeError();
5773 }
5774 return Builder.saveIP();
5775}
5776
5777OpenMPIRBuilder::InsertPointOrErrorTy OpenMPIRBuilder::createCritical(
5778 const LocationDescription &Loc, BodyGenCallbackTy BodyGenCB,
5779 FinalizeCallbackTy FiniCB, StringRef CriticalName, Value *HintInst) {
5780
5781 if (!updateToLocation(Loc))
5782 return Loc.IP;
5783
5784 Directive OMPD = Directive::OMPD_critical;
5785 uint32_t SrcLocStrSize;
5786 Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
5787 Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
5788 Value *ThreadId = getOrCreateThreadID(Ident);
5789 Value *LockVar = getOMPCriticalRegionLock(CriticalName);
5790 Value *Args[] = {Ident, ThreadId, LockVar};
5791
5792 SmallVector<llvm::Value *, 4> EnterArgs(std::begin(Args), std::end(Args));
5793 Function *RTFn = nullptr;
5794 if (HintInst) {
5795 // Add Hint to entry Args and create call
5796 EnterArgs.push_back(HintInst);
5797 RTFn = getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_critical_with_hint);
5798 } else {
5799 RTFn = getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_critical);
5800 }
5801 Instruction *EntryCall = Builder.CreateCall(RTFn, EnterArgs);
5802
5803 Function *ExitRTLFn =
5804 getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_end_critical);
5805 Instruction *ExitCall = Builder.CreateCall(ExitRTLFn, Args);
5806
5807 return EmitOMPInlinedRegion(OMPD, EntryCall, ExitCall, BodyGenCB, FiniCB,
5808 /*Conditional*/ false, /*hasFinalize*/ true);
5809}
5810
5811OpenMPIRBuilder::InsertPointTy
5812OpenMPIRBuilder::createOrderedDepend(const LocationDescription &Loc,
5813 InsertPointTy AllocaIP, unsigned NumLoops,
5814 ArrayRef<llvm::Value *> StoreValues,
5815 const Twine &Name, bool IsDependSource) {
5816 assert(
5817 llvm::all_of(StoreValues,
5818 [](Value *SV) { return SV->getType()->isIntegerTy(64); }) &&
5819 "OpenMP runtime requires depend vec with i64 type");
5820
5821 if (!updateToLocation(Loc))
5822 return Loc.IP;
5823
5824 // Allocate space for vector and generate alloc instruction.
5825 auto *ArrI64Ty = ArrayType::get(Int64, NumLoops);
5826 Builder.restoreIP(AllocaIP);
5827 AllocaInst *ArgsBase = Builder.CreateAlloca(ArrI64Ty, nullptr, Name);
5828 ArgsBase->setAlignment(Align(8));
5829 Builder.restoreIP(Loc.IP);
5830
5831 // Store the index value with offset in depend vector.
5832 for (unsigned I = 0; I < NumLoops; ++I) {
5833 Value *DependAddrGEPIter = Builder.CreateInBoundsGEP(
5834 ArrI64Ty, ArgsBase, {Builder.getInt64(0), Builder.getInt64(I)});
5835 StoreInst *STInst = Builder.CreateStore(StoreValues[I], DependAddrGEPIter);
5836 STInst->setAlignment(Align(8));
5837 }
5838
5839 Value *DependBaseAddrGEP = Builder.CreateInBoundsGEP(
5840 ArrI64Ty, ArgsBase, {Builder.getInt64(0), Builder.getInt64(0)});
5841
5842 uint32_t SrcLocStrSize;
5843 Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
5844 Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
5845 Value *ThreadId = getOrCreateThreadID(Ident);
5846 Value *Args[] = {Ident, ThreadId, DependBaseAddrGEP};
5847
5848 Function *RTLFn = nullptr;
5849 if (IsDependSource)
5850 RTLFn = getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_doacross_post);
5851 else
5852 RTLFn = getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_doacross_wait);
5853 Builder.CreateCall(RTLFn, Args);
5854
5855 return Builder.saveIP();
5856}
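// For illustration (sketch): `#pragma omp ordered depend(source)` in a
// doacross nest with two associated loops reaches here with NumLoops == 2 and
// StoreValues holding the two i64 iteration numbers; the emitted code fills a
// [2 x i64] vector and calls __kmpc_doacross_post. A depend(sink : ...) clause
// takes the IsDependSource == false path and calls __kmpc_doacross_wait.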
5857
5858OpenMPIRBuilder::InsertPointOrErrorTy OpenMPIRBuilder::createOrderedThreadsSimd(
5859 const LocationDescription &Loc, BodyGenCallbackTy BodyGenCB,
5860 FinalizeCallbackTy FiniCB, bool IsThreads) {
5861 if (!updateToLocation(Loc))
5862 return Loc.IP;
5863
5864 Directive OMPD = Directive::OMPD_ordered;
5865 Instruction *EntryCall = nullptr;
5866 Instruction *ExitCall = nullptr;
5867
5868 if (IsThreads) {
5869 uint32_t SrcLocStrSize;
5870 Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
5871 Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
5872 Value *ThreadId = getOrCreateThreadID(Ident);
5873 Value *Args[] = {Ident, ThreadId};
5874
5875 Function *EntryRTLFn = getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_ordered);
5876 EntryCall = Builder.CreateCall(EntryRTLFn, Args);
5877
5878 Function *ExitRTLFn =
5879 getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_end_ordered);
5880 ExitCall = Builder.CreateCall(ExitRTLFn, Args);
5881 }
5882
5883 return EmitOMPInlinedRegion(OMPD, EntryCall, ExitCall, BodyGenCB, FiniCB,
5884 /*Conditional*/ false, /*hasFinalize*/ true);
5885}
5886
5887OpenMPIRBuilder::InsertPointOrErrorTy OpenMPIRBuilder::EmitOMPInlinedRegion(
5888 Directive OMPD, Instruction *EntryCall, Instruction *ExitCall,
5889 BodyGenCallbackTy BodyGenCB, FinalizeCallbackTy FiniCB, bool Conditional,
5890 bool HasFinalize, bool IsCancellable) {
5891
5892 if (HasFinalize)
5893 FinalizationStack.push_back({FiniCB, OMPD, IsCancellable});
5894
5895 // Create inlined region's entry and body blocks, in preparation
5896 // for conditional creation
5897 BasicBlock *EntryBB = Builder.GetInsertBlock();
5898 Instruction *SplitPos = EntryBB->getTerminator();
5899 if (!isa_and_nonnull<BranchInst>(SplitPos))
5900 SplitPos = new UnreachableInst(Builder.getContext(), EntryBB);
5901 BasicBlock *ExitBB = EntryBB->splitBasicBlock(SplitPos, "omp_region.end");
5902 BasicBlock *FiniBB =
5903 EntryBB->splitBasicBlock(EntryBB->getTerminator(), "omp_region.finalize");
5904
5905 Builder.SetInsertPoint(EntryBB->getTerminator());
5906 emitCommonDirectiveEntry(OMPD, EntryCall, ExitBB, Conditional);
5907
5908 // generate body
5909 if (Error Err = BodyGenCB(/* AllocaIP */ InsertPointTy(),
5910 /* CodeGenIP */ Builder.saveIP()))
5911 return Err;
5912
5913 // emit exit call and do any needed finalization.
5914 auto FinIP = InsertPointTy(FiniBB, FiniBB->getFirstInsertionPt());
5915 assert(FiniBB->getTerminator()->getNumSuccessors() == 1 &&
5916 FiniBB->getTerminator()->getSuccessor(0) == ExitBB &&
5917 "Unexpected control flow graph state!!");
5918 InsertPointOrErrorTy AfterIP =
5919 emitCommonDirectiveExit(OMPD, FinIP, ExitCall, HasFinalize);
5920 if (!AfterIP)
5921 return AfterIP.takeError();
5922 assert(FiniBB->getUniquePredecessor()->getUniqueSuccessor() == FiniBB &&
5923 "Unexpected Control Flow State!");
5924 MergeBlockIntoPredecessor(FiniBB);
5925
5926 // If we are skipping the region of a non-conditional, remove the exit
5927 // block and clear the builder's insertion point.
5928 assert(SplitPos->getParent() == ExitBB &&
5929 "Unexpected Insertion point location!");
5930 auto merged = MergeBlockIntoPredecessor(ExitBB);
5931 BasicBlock *ExitPredBB = SplitPos->getParent();
5932 auto InsertBB = merged ? ExitPredBB : ExitBB;
5933 if (!isa_and_nonnull<BranchInst>(SplitPos))
5934 SplitPos->eraseFromParent();
5935 Builder.SetInsertPoint(InsertBB);
5936
5937 return Builder.saveIP();
5938}
5939
5940OpenMPIRBuilder::InsertPointTy OpenMPIRBuilder::emitCommonDirectiveEntry(
5941 Directive OMPD, Value *EntryCall, BasicBlock *ExitBB, bool Conditional) {
5942 // If there is nothing to do, return the current insertion point.
5943 if (!Conditional || !EntryCall)
5944 return Builder.saveIP();
5945
5946 BasicBlock *EntryBB = Builder.GetInsertBlock();
5947 Value *CallBool = Builder.CreateIsNotNull(EntryCall);
5948 auto *ThenBB = BasicBlock::Create(M.getContext(), "omp_region.body");
5949 auto *UI = new UnreachableInst(Builder.getContext(), ThenBB);
5950
5951 // Emit thenBB and set the Builder's insertion point there for
5952 // body generation next. Place the block after the current block.
5953 Function *CurFn = EntryBB->getParent();
5954 CurFn->insert(std::next(EntryBB->getIterator()), ThenBB);
5955
5956 // Move Entry branch to end of ThenBB, and replace with conditional
5957 // branch (If-stmt)
5958 Instruction *EntryBBTI = EntryBB->getTerminator();
5959 Builder.CreateCondBr(CallBool, ThenBB, ExitBB);
5960 EntryBBTI->removeFromParent();
5961 Builder.SetInsertPoint(UI);
5962 Builder.Insert(EntryBBTI);
5963 UI->eraseFromParent();
5964 Builder.SetInsertPoint(ThenBB->getTerminator());
5965
5966 // return an insertion point to ExitBB.
5967 return IRBuilder<>::InsertPoint(ExitBB, ExitBB->getFirstInsertionPt());
5968}
5969
5970OpenMPIRBuilder::InsertPointOrErrorTy OpenMPIRBuilder::emitCommonDirectiveExit(
5971 omp::Directive OMPD, InsertPointTy FinIP, Instruction *ExitCall,
5972 bool HasFinalize) {
5973
5974 Builder.restoreIP(FinIP);
5975
5976 // If there is finalization to do, emit it before the exit call
5977 if (HasFinalize) {
5978 assert(!FinalizationStack.empty() &&
5979 "Unexpected finalization stack state!");
5980
5981 FinalizationInfo Fi = FinalizationStack.pop_back_val();
5982 assert(Fi.DK == OMPD && "Unexpected Directive for Finalization call!");
5983
5984 if (Error Err = Fi.FiniCB(FinIP))
5985 return Err;
5986
5987 BasicBlock *FiniBB = FinIP.getBlock();
5988 Instruction *FiniBBTI = FiniBB->getTerminator();
5989
5990 // set Builder IP for call creation
5991 Builder.SetInsertPoint(FiniBBTI);
5992 }
5993
5994 if (!ExitCall)
5995 return Builder.saveIP();
5996
5997 // Place the exit call as the last instruction before the finalization block's terminator.
5998 ExitCall->removeFromParent();
5999 Builder.Insert(ExitCall);
6000
6001 return IRBuilder<>::InsertPoint(ExitCall->getParent(),
6002 ExitCall->getIterator());
6003}
6004
6005OpenMPIRBuilder::InsertPointTy OpenMPIRBuilder::createCopyinClauseBlocks(
6006 InsertPointTy IP, Value *MasterAddr, Value *PrivateAddr,
6007 llvm::IntegerType *IntPtrTy, bool BranchtoEnd) {
6008 if (!IP.isSet())
6009 return IP;
6010
6011 IRBuilder<>::InsertPointGuard IPG(Builder);
6012
6013 // creates the following CFG structure
6014 // OMP_Entry : (MasterAddr != PrivateAddr)?
6015 // F T
6016 // | \
6017 // | copyin.not.master
6018 // | /
6019 // v /
6020 // copyin.not.master.end
6021 // |
6022 // v
6023 // OMP.Entry.Next
6024
6025 BasicBlock *OMP_Entry = IP.getBlock();
6026 Function *CurFn = OMP_Entry->getParent();
6027 BasicBlock *CopyBegin =
6028 BasicBlock::Create(M.getContext(), "copyin.not.master", CurFn);
6029 BasicBlock *CopyEnd = nullptr;
6030
6031 // If entry block is terminated, split to preserve the branch to following
6032 // basic block (i.e. OMP.Entry.Next), otherwise, leave everything as is.
6033 if (isa_and_nonnull<BranchInst>(OMP_Entry->getTerminator())) {
6034 CopyEnd = OMP_Entry->splitBasicBlock(OMP_Entry->getTerminator(),
6035 "copyin.not.master.end");
6036 OMP_Entry->getTerminator()->eraseFromParent();
6037 } else {
6038 CopyEnd =
6039 BasicBlock::Create(M.getContext(), "copyin.not.master.end", CurFn);
6040 }
6041
6042 Builder.SetInsertPoint(OMP_Entry);
6043 Value *MasterPtr = Builder.CreatePtrToInt(MasterAddr, IntPtrTy);
6044 Value *PrivatePtr = Builder.CreatePtrToInt(PrivateAddr, IntPtrTy);
6045 Value *cmp = Builder.CreateICmpNE(MasterPtr, PrivatePtr);
6046 Builder.CreateCondBr(cmp, CopyBegin, CopyEnd);
6047
6048 Builder.SetInsertPoint(CopyBegin);
6049 if (BranchtoEnd)
6050 Builder.CreateBr(CopyEnd);
6051
6052 return Builder.saveIP();
6053}
6054
6055CallInst *OpenMPIRBuilder::createOMPAlloc(const LocationDescription &Loc,
6056 Value *Size, Value *Allocator,
6057 std::string Name) {
6058 IRBuilder<>::InsertPointGuard IPG(Builder);
6059 updateToLocation(Loc);
6060
6061 uint32_t SrcLocStrSize;
6062 Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
6063 Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
6064 Value *ThreadId = getOrCreateThreadID(Ident);
6065 Value *Args[] = {ThreadId, Size, Allocator};
6066
6067 Function *Fn = getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_alloc);
6068
6069 return Builder.CreateCall(Fn, Args, Name);
6070}
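// The emitted allocation call looks like (sketch; %tid is the result of
// __kmpc_global_thread_num):
//   %ptr = call ptr @__kmpc_alloc(i32 %tid, i64 %size, ptr %allocator)
// and createOMPFree below emits the matching
//   call void @__kmpc_free(i32 %tid, ptr %addr, ptr %allocator)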
6071
6072CallInst *OpenMPIRBuilder::createOMPFree(const LocationDescription &Loc,
6073 Value *Addr, Value *Allocator,
6074 std::string Name) {
6075 IRBuilder<>::InsertPointGuard IPG(Builder);
6076 updateToLocation(Loc);
6077
6078 uint32_t SrcLocStrSize;
6079 Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
6080 Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
6081 Value *ThreadId = getOrCreateThreadID(Ident);
6082 Value *Args[] = {ThreadId, Addr, Allocator};
6083 Function *Fn = getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_free);
6084 return Builder.CreateCall(Fn, Args, Name);
6085}
6086
6087CallInst *OpenMPIRBuilder::createOMPInteropInit(
6088 const LocationDescription &Loc, Value *InteropVar,
6089 omp::OMPInteropType InteropType, Value *Device, Value *NumDependences,
6090 Value *DependenceAddress, bool HaveNowaitClause) {
6091 IRBuilder<>::InsertPointGuard IPG(Builder);
6092 updateToLocation(Loc);
6093
6094 uint32_t SrcLocStrSize;
6095 Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
6096 Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
6097 Value *ThreadId = getOrCreateThreadID(Ident);
6098 if (Device == nullptr)
6099 Device = ConstantInt::get(Int32, -1);
6100 Constant *InteropTypeVal = ConstantInt::get(Int32, (int)InteropType);
6101 if (NumDependences == nullptr) {
6102 NumDependences = ConstantInt::get(Int32, 0);
6103 PointerType *PointerTypeVar = PointerType::getUnqual(M.getContext());
6104 DependenceAddress = ConstantPointerNull::get(PointerTypeVar);
6105 }
6106 Value *HaveNowaitClauseVal = ConstantInt::get(Int32, HaveNowaitClause);
6107 Value *Args[] = {
6108 Ident, ThreadId, InteropVar, InteropTypeVal,
6109 Device, NumDependences, DependenceAddress, HaveNowaitClauseVal};
6110
6111 Function *Fn = getOrCreateRuntimeFunctionPtr(OMPRTL___tgt_interop_init);
6112
6113 return Builder.CreateCall(Fn, Args);
6114}
6115
6116 CallInst *OpenMPIRBuilder::createOMPInteropDestroy(
6117 const LocationDescription &Loc, Value *InteropVar, Value *Device,
6118 Value *NumDependences, Value *DependenceAddress, bool HaveNowaitClause) {
6119 IRBuilder<>::InsertPointGuard IPG(Builder);
6120 updateToLocation(Loc);
6121
6122 uint32_t SrcLocStrSize;
6123 Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
6124 Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
6125 Value *ThreadId = getOrCreateThreadID(Ident);
6126 if (Device == nullptr)
6127 Device = ConstantInt::get(Int32, -1);
6128 if (NumDependences == nullptr) {
6129 NumDependences = ConstantInt::get(Int32, 0);
6130 PointerType *PointerTypeVar = PointerType::getUnqual(M.getContext());
6131 DependenceAddress = ConstantPointerNull::get(PointerTypeVar);
6132 }
6133 Value *HaveNowaitClauseVal = ConstantInt::get(Int32, HaveNowaitClause);
6134 Value *Args[] = {
6135 Ident, ThreadId, InteropVar, Device,
6136 NumDependences, DependenceAddress, HaveNowaitClauseVal};
6137
6138 Function *Fn = getOrCreateRuntimeFunctionPtr(OMPRTL___tgt_interop_destroy);
6139
6140 return Builder.CreateCall(Fn, Args);
6141}
6142
6143 CallInst *OpenMPIRBuilder::createOMPInteropUse(const LocationDescription &Loc,
6144 Value *InteropVar, Value *Device,
6145 Value *NumDependences,
6146 Value *DependenceAddress,
6147 bool HaveNowaitClause) {
6148 IRBuilder<>::InsertPointGuard IPG(Builder);
6149 updateToLocation(Loc);
6150 uint32_t SrcLocStrSize;
6151 Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
6152 Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
6153 Value *ThreadId = getOrCreateThreadID(Ident);
6154 if (Device == nullptr)
6155 Device = ConstantInt::get(Int32, -1);
6156 if (NumDependences == nullptr) {
6157 NumDependences = ConstantInt::get(Int32, 0);
6158 PointerType *PointerTypeVar = PointerType::getUnqual(M.getContext());
6159 DependenceAddress = ConstantPointerNull::get(PointerTypeVar);
6160 }
6161 Value *HaveNowaitClauseVal = ConstantInt::get(Int32, HaveNowaitClause);
6162 Value *Args[] = {
6163 Ident, ThreadId, InteropVar, Device,
6164 NumDependences, DependenceAddress, HaveNowaitClauseVal};
6165
6166 Function *Fn = getOrCreateRuntimeFunctionPtr(OMPRTL___tgt_interop_use);
6167
6168 return Builder.CreateCall(Fn, Args);
6169}
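// All three interop helpers share this shape: default the device to -1 and
// the dependence list to empty, then call the corresponding __tgt_interop_*
// entry point. A sketch of an init call (constants illustrative):
//   call void @__tgt_interop_init(ptr @ident, i32 %gtid, ptr %interop,
//       i32 %interop_type, i32 -1, i32 0, ptr null, i32 0)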
6170
6171 CallInst *OpenMPIRBuilder::createCachedThreadPrivate(
6172 const LocationDescription &Loc, llvm::Value *Pointer,
6173 llvm::ConstantInt *Size, const llvm::Twine &Name) {
6174 IRBuilder<>::InsertPointGuard IPG(Builder);
6175 updateToLocation(Loc);
6176
6177 uint32_t SrcLocStrSize;
6178 Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
6179 Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
6180 Value *ThreadId = getOrCreateThreadID(Ident);
6181 Constant *ThreadPrivateCache =
6182 getOrCreateInternalVariable(Int8PtrPtr, Name.str());
6183 llvm::Value *Args[] = {Ident, ThreadId, Pointer, Size, ThreadPrivateCache};
6184
6185 Function *Fn =
6186 getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_threadprivate_cached);
6187
6188 return Builder.CreateCall(Fn, Args);
6189}
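// Sketch of the emitted call (names illustrative; the cache global is the
// internal variable created above from Name):
//   %priv = call ptr @__kmpc_threadprivate_cached(ptr @ident, i32 %gtid,
//       ptr %var, i64 %size, ptr @var.cache)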
6190
6191 OpenMPIRBuilder::InsertPointTy OpenMPIRBuilder::createTargetInit(
6192 const LocationDescription &Loc,
6193 const llvm::OpenMPIRBuilder::TargetKernelDefaultAttrs &Attrs) {
6194 assert(!Attrs.MaxThreads.empty() && !Attrs.MaxTeams.empty() &&
6195 "expected num_threads and num_teams to be specified");
6196
6197 if (!updateToLocation(Loc))
6198 return Loc.IP;
6199
6200 uint32_t SrcLocStrSize;
6201 Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
6202 Constant *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
6203 Constant *IsSPMDVal = ConstantInt::getSigned(Int8, Attrs.ExecFlags);
6204 Constant *UseGenericStateMachineVal = ConstantInt::getSigned(
6205 Int8, Attrs.ExecFlags != omp::OMP_TGT_EXEC_MODE_SPMD);
6206 Constant *MayUseNestedParallelismVal = ConstantInt::getSigned(Int8, true);
6207 Constant *DebugIndentionLevelVal = ConstantInt::getSigned(Int16, 0);
6208
6209 Function *DebugKernelWrapper = Builder.GetInsertBlock()->getParent();
6210 Function *Kernel = DebugKernelWrapper;
6211
6212 // We need to strip the debug prefix to get the correct kernel name.
6213 StringRef KernelName = Kernel->getName();
6214 const std::string DebugPrefix = "_debug__";
6215 if (KernelName.ends_with(DebugPrefix)) {
6216 KernelName = KernelName.drop_back(DebugPrefix.length());
6217 Kernel = M.getFunction(KernelName);
6218 assert(Kernel && "Expected the real kernel to exist");
6219 }
6220
6221 // Manifest the launch configuration in the metadata matching the kernel
6222 // environment.
6223 if (Attrs.MinTeams > 1 || Attrs.MaxTeams.front() > 0)
6224 writeTeamsForKernel(T, *Kernel, Attrs.MinTeams, Attrs.MaxTeams.front());
6225
6226 // If MaxThreads is not set, select the maximum between the default workgroup
6227 // size and the MinThreads value.
6228 int32_t MaxThreadsVal = Attrs.MaxThreads.front();
6229 if (MaxThreadsVal < 0)
6230 MaxThreadsVal = std::max(
6231 int32_t(getGridValue(T, Kernel).GV_Default_WG_Size), Attrs.MinThreads);
6232
6233 if (MaxThreadsVal > 0)
6234 writeThreadBoundsForKernel(T, *Kernel, Attrs.MinThreads, MaxThreadsVal);
6235
6236 Constant *MinThreads = ConstantInt::getSigned(Int32, Attrs.MinThreads);
6237 Constant *MaxThreads = ConstantInt::getSigned(Int32, MaxThreadsVal);
6238 Constant *MinTeams = ConstantInt::getSigned(Int32, Attrs.MinTeams);
6239 Constant *MaxTeams = ConstantInt::getSigned(Int32, Attrs.MaxTeams.front());
6240 Constant *ReductionDataSize = ConstantInt::getSigned(Int32, 0);
6241 Constant *ReductionBufferLength = ConstantInt::getSigned(Int32, 0);
6242
6243 Function *Fn = getOrCreateRuntimeFunctionPtr(
6244 omp::RuntimeFunction::OMPRTL___kmpc_target_init);
6245 const DataLayout &DL = Fn->getDataLayout();
6246
6247 Twine DynamicEnvironmentName = KernelName + "_dynamic_environment";
6248 Constant *DynamicEnvironmentInitializer =
6249 ConstantStruct::get(DynamicEnvironment, {DebugIndentionLevelVal});
6250 GlobalVariable *DynamicEnvironmentGV = new GlobalVariable(
6251 M, DynamicEnvironment, /*IsConstant=*/false, GlobalValue::WeakODRLinkage,
6252 DynamicEnvironmentInitializer, DynamicEnvironmentName,
6253 /*InsertBefore=*/nullptr, GlobalValue::NotThreadLocal,
6254 DL.getDefaultGlobalsAddressSpace());
6255 DynamicEnvironmentGV->setVisibility(GlobalValue::ProtectedVisibility);
6256
6257 Constant *DynamicEnvironment =
6258 DynamicEnvironmentGV->getType() == DynamicEnvironmentPtr
6259 ? DynamicEnvironmentGV
6260 : ConstantExpr::getAddrSpaceCast(DynamicEnvironmentGV,
6261 DynamicEnvironmentPtr);
6262
6263 Constant *ConfigurationEnvironmentInitializer = ConstantStruct::get(
6264 ConfigurationEnvironment, {
6265 UseGenericStateMachineVal,
6266 MayUseNestedParallelismVal,
6267 IsSPMDVal,
6268 MinThreads,
6269 MaxThreads,
6270 MinTeams,
6271 MaxTeams,
6272 ReductionDataSize,
6273 ReductionBufferLength,
6274 });
6275 Constant *KernelEnvironmentInitializer = ConstantStruct::get(
6276 KernelEnvironment, {
6277 ConfigurationEnvironmentInitializer,
6278 Ident,
6279 DynamicEnvironment,
6280 });
6281 std::string KernelEnvironmentName =
6282 (KernelName + "_kernel_environment").str();
6283 GlobalVariable *KernelEnvironmentGV = new GlobalVariable(
6284 M, KernelEnvironment, /*IsConstant=*/true, GlobalValue::WeakODRLinkage,
6285 KernelEnvironmentInitializer, KernelEnvironmentName,
6286 /*InsertBefore=*/nullptr, GlobalValue::NotThreadLocal,
6287 DL.getDefaultGlobalsAddressSpace());
6288 KernelEnvironmentGV->setVisibility(GlobalValue::ProtectedVisibility);
6289
6290 Constant *KernelEnvironment =
6291 KernelEnvironmentGV->getType() == KernelEnvironmentPtr
6292 ? KernelEnvironmentGV
6293 : ConstantExpr::getAddrSpaceCast(KernelEnvironmentGV,
6294 KernelEnvironmentPtr);
6295 Value *KernelLaunchEnvironment = DebugKernelWrapper->getArg(0);
6296 CallInst *ThreadKind =
6297 Builder.CreateCall(Fn, {KernelEnvironment, KernelLaunchEnvironment});
6298
6299 Value *ExecUserCode = Builder.CreateICmpEQ(
6300 ThreadKind, Constant::getAllOnesValue(ThreadKind->getType()),
6301 "exec_user_code");
6302
6303 // ThreadKind = __kmpc_target_init(...)
6304 // if (ThreadKind == -1)
6305 // user_code
6306 // else
6307 // return;
6308
6309 auto *UI = Builder.CreateUnreachable();
6310 BasicBlock *CheckBB = UI->getParent();
6311 BasicBlock *UserCodeEntryBB = CheckBB->splitBasicBlock(UI, "user_code.entry");
6312
6313 BasicBlock *WorkerExitBB = BasicBlock::Create(
6314 CheckBB->getContext(), "worker.exit", CheckBB->getParent());
6315 Builder.SetInsertPoint(WorkerExitBB);
6316 Builder.CreateRetVoid();
6317
6318 auto *CheckBBTI = CheckBB->getTerminator();
6319 Builder.SetInsertPoint(CheckBBTI);
6320 Builder.CreateCondBr(ExecUserCode, UI->getParent(), WorkerExitBB);
6321
6322 CheckBBTI->eraseFromParent();
6323 UI->eraseFromParent();
6324
6325 // Continue in the "user_code" block, see diagram above and in
6326 // openmp/libomptarget/deviceRTLs/common/include/target.h .
6327 return InsertPointTy(UserCodeEntryBB, UserCodeEntryBB->getFirstInsertionPt());
6328}
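// Putting the pieces together, the rewritten kernel entry looks roughly like
// this sketch (the environment globals are named after the kernel):
//   %tk = call i32 @__kmpc_target_init(ptr @<kernel>_kernel_environment,
//                                      ptr %dyn_ptr)
//   %exec_user_code = icmp eq i32 %tk, -1
//   br i1 %exec_user_code, label %user_code.entry, label %worker.exit
// worker.exit:
//   ret void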
6329
6330 void OpenMPIRBuilder::createTargetDeinit(const LocationDescription &Loc,
6331 int32_t TeamsReductionDataSize,
6332 int32_t TeamsReductionBufferLength) {
6333 if (!updateToLocation(Loc))
6334 return;
6335
6336 Function *Fn = getOrCreateRuntimeFunctionPtr(
6337 omp::RuntimeFunction::OMPRTL___kmpc_target_deinit);
6338
6339 Builder.CreateCall(Fn, {});
6340
6341 if (!TeamsReductionBufferLength || !TeamsReductionDataSize)
6342 return;
6343
6345 // We need to strip the debug prefix to get the correct kernel name.
6346 StringRef KernelName = Kernel->getName();
6347 const std::string DebugPrefix = "_debug__";
6348 if (KernelName.ends_with(DebugPrefix))
6349 KernelName = KernelName.drop_back(DebugPrefix.length());
6350 auto *KernelEnvironmentGV =
6351 M.getNamedGlobal((KernelName + "_kernel_environment").str());
6352 assert(KernelEnvironmentGV && "Expected kernel environment global\n");
6353 auto *KernelEnvironmentInitializer = KernelEnvironmentGV->getInitializer();
6354 auto *NewInitializer = ConstantFoldInsertValueInstruction(
6355 KernelEnvironmentInitializer,
6356 ConstantInt::get(Int32, TeamsReductionDataSize), {0, 7});
6357 NewInitializer = ConstantFoldInsertValueInstruction(
6358 NewInitializer, ConstantInt::get(Int32, TeamsReductionBufferLength),
6359 {0, 8});
6360 KernelEnvironmentGV->setInitializer(NewInitializer);
6361}
6362
6363 static MDNode *getNVPTXMDNode(Function &Kernel, StringRef Name) {
6364 Module &M = *Kernel.getParent();
6365 NamedMDNode *MD = M.getOrInsertNamedMetadata("nvvm.annotations");
6366 for (auto *Op : MD->operands()) {
6367 if (Op->getNumOperands() != 3)
6368 continue;
6369 auto *KernelOp = dyn_cast<ConstantAsMetadata>(Op->getOperand(0));
6370 if (!KernelOp || KernelOp->getValue() != &Kernel)
6371 continue;
6372 auto *Prop = dyn_cast<MDString>(Op->getOperand(1));
6373 if (!Prop || Prop->getString() != Name)
6374 continue;
6375 return Op;
6376 }
6377 return nullptr;
6378}
6379
6380 static void updateNVPTXMetadata(Function &Kernel, StringRef Name, int32_t Value,
6381 bool Min) {
6382 // Update the "maxntidx" metadata for NVIDIA, or add it.
6383 MDNode *ExistingOp = getNVPTXMDNode(Kernel, Name);
6384 if (ExistingOp) {
6385 auto *OldVal = cast<ConstantAsMetadata>(ExistingOp->getOperand(2));
6386 int32_t OldLimit = cast<ConstantInt>(OldVal->getValue())->getZExtValue();
6387 ExistingOp->replaceOperandWith(
6388 2, ConstantAsMetadata::get(ConstantInt::get(
6389 OldVal->getValue()->getType(),
6390 Min ? std::min(OldLimit, Value) : std::max(OldLimit, Value))));
6391 } else {
6392 LLVMContext &Ctx = Kernel.getContext();
6393 Metadata *MDVals[] = {ConstantAsMetadata::get(&Kernel),
6394 MDString::get(Ctx, Name),
6395 ConstantAsMetadata::get(
6396 ConstantInt::get(Type::getInt32Ty(Ctx), Value))};
6397 // Append metadata to nvvm.annotations
6398 Module &M = *Kernel.getParent();
6399 NamedMDNode *MD = M.getOrInsertNamedMetadata("nvvm.annotations");
6400 MD->addOperand(MDNode::get(Ctx, MDVals));
6401 }
6402}
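// The resulting NVPTX annotation is a (kernel, name, value) triple in the
// named metadata, e.g. (illustrative):
//   !nvvm.annotations = !{!0}
//   !0 = !{ptr @kernel, !"maxntidx", i32 128}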
6403
6404std::pair<int32_t, int32_t>
6405 OpenMPIRBuilder::readThreadBoundsForKernel(const Triple &T, Function &Kernel) {
6406 int32_t ThreadLimit =
6407 Kernel.getFnAttributeAsParsedInteger("omp_target_thread_limit");
6408
6409 if (T.isAMDGPU()) {
6410 const auto &Attr = Kernel.getFnAttribute("amdgpu-flat-work-group-size");
6411 if (!Attr.isValid() || !Attr.isStringAttribute())
6412 return {0, ThreadLimit};
6413 auto [LBStr, UBStr] = Attr.getValueAsString().split(',');
6414 int32_t LB, UB;
6415 if (!llvm::to_integer(UBStr, UB, 10))
6416 return {0, ThreadLimit};
6417 UB = ThreadLimit ? std::min(ThreadLimit, UB) : UB;
6418 if (!llvm::to_integer(LBStr, LB, 10))
6419 return {0, UB};
6420 return {LB, UB};
6421 }
6422
6423 if (MDNode *ExistingOp = getNVPTXMDNode(Kernel, "maxntidx")) {
6424 auto *OldVal = cast<ConstantAsMetadata>(ExistingOp->getOperand(2));
6425 int32_t UB = cast<ConstantInt>(OldVal->getValue())->getZExtValue();
6426 return {0, ThreadLimit ? std::min(ThreadLimit, UB) : UB};
6427 }
6428 return {0, ThreadLimit};
6429}
6430
6431 void OpenMPIRBuilder::writeThreadBoundsForKernel(const Triple &T,
6432 Function &Kernel, int32_t LB,
6433 int32_t UB) {
6434 Kernel.addFnAttr("omp_target_thread_limit", std::to_string(UB));
6435
6436 if (T.isAMDGPU()) {
6437 Kernel.addFnAttr("amdgpu-flat-work-group-size",
6438 llvm::utostr(LB) + "," + llvm::utostr(UB));
6439 return;
6440 }
6441
6442 updateNVPTXMetadata(Kernel, "maxntidx", UB, true);
6443}
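// Example effect (illustrative values): writeThreadBoundsForKernel(T, K, 1,
// 256) adds "omp_target_thread_limit"="256" on any target, and additionally
// "amdgpu-flat-work-group-size"="1,256" on AMDGPU or a "maxntidx" annotation
// clamped to 256 on NVPTX.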
6444
6445std::pair<int32_t, int32_t>
6446 OpenMPIRBuilder::readTeamBoundsForKernel(const Triple &, Function &Kernel) {
6447 // TODO: Read from backend annotations if available.
6448 return {0, Kernel.getFnAttributeAsParsedInteger("omp_target_num_teams")};
6449}
6450
6451 void OpenMPIRBuilder::writeTeamsForKernel(const Triple &T, Function &Kernel,
6452 int32_t LB, int32_t UB) {
6453 if (T.isNVPTX())
6454 if (UB > 0)
6455 updateNVPTXMetadata(Kernel, "maxclusterrank", UB, true);
6456 if (T.isAMDGPU())
6457 Kernel.addFnAttr("amdgpu-max-num-workgroups", llvm::utostr(LB) + ",1,1");
6458
6459 Kernel.addFnAttr("omp_target_num_teams", std::to_string(LB));
6460}
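// Example effect (illustrative values): writeTeamsForKernel(T, K, 4, 8) adds
// "omp_target_num_teams"="4" plus, per target, a "maxclusterrank" annotation
// of 8 on NVPTX or "amdgpu-max-num-workgroups"="4,1,1" on AMDGPU.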
6461
6462void OpenMPIRBuilder::setOutlinedTargetRegionFunctionAttributes(
6463 Function *OutlinedFn) {
6464 if (Config.isTargetDevice()) {
6465 OutlinedFn->setLinkage(GlobalValue::WeakODRLinkage);
6466 // TODO: Determine if DSO local can be set to true.
6467 OutlinedFn->setDSOLocal(false);
6468 OutlinedFn->setVisibility(GlobalValue::ProtectedVisibility);
6469 if (T.isAMDGCN())
6470 OutlinedFn->setCallingConv(CallingConv::AMDGPU_KERNEL);
6471 }
6472}
6473
6474Constant *OpenMPIRBuilder::createOutlinedFunctionID(Function *OutlinedFn,
6475 StringRef EntryFnIDName) {
6476 if (Config.isTargetDevice()) {
6477 assert(OutlinedFn && "The outlined function must exist if embedded");
6478 return OutlinedFn;
6479 }
6480
6481 return new GlobalVariable(
6482 M, Builder.getInt8Ty(), /*isConstant=*/true, GlobalValue::WeakAnyLinkage,
6483 Constant::getNullValue(Builder.getInt8Ty()), EntryFnIDName);
6484}
6485
6486Constant *OpenMPIRBuilder::createTargetRegionEntryAddr(Function *OutlinedFn,
6487 StringRef EntryFnName) {
6488 if (OutlinedFn)
6489 return OutlinedFn;
6490
6491 assert(!M.getGlobalVariable(EntryFnName, true) &&
6492 "Named kernel already exists?");
6493 return new GlobalVariable(
6494 M, Builder.getInt8Ty(), /*isConstant=*/true, GlobalValue::InternalLinkage,
6495 Constant::getNullValue(Builder.getInt8Ty()), EntryFnName);
6496}
6497
6498 Error OpenMPIRBuilder::emitTargetRegionFunction(
6499 TargetRegionEntryInfo &EntryInfo,
6500 FunctionGenCallback &GenerateFunctionCallback, bool IsOffloadEntry,
6501 Function *&OutlinedFn, Constant *&OutlinedFnID) {
6502
6503 SmallString<64> EntryFnName;
6504 OffloadInfoManager.getTargetRegionEntryFnName(EntryFnName, EntryInfo);
6505
6506 if (Config.isTargetDevice() || !Config.openMPOffloadMandatory()) {
6507 Expected<Function *> CBResult = GenerateFunctionCallback(EntryFnName);
6508 if (!CBResult)
6509 return CBResult.takeError();
6510 OutlinedFn = *CBResult;
6511 } else {
6512 OutlinedFn = nullptr;
6513 }
6514
6515 // If this target outline function is not an offload entry, we don't need to
6516 // register it. This may be the case for a false 'if' clause, or when there
6517 // are no OpenMP targets.
6518 if (!IsOffloadEntry)
6519 return Error::success();
6520
6521 std::string EntryFnIDName =
6522 Config.isTargetDevice()
6523 ? std::string(EntryFnName)
6524 : createPlatformSpecificName({EntryFnName, "region_id"});
6525
6526 OutlinedFnID = registerTargetRegionFunction(EntryInfo, OutlinedFn,
6527 EntryFnName, EntryFnIDName);
6528 return Error::success();
6529}
6530
6531 Constant *OpenMPIRBuilder::registerTargetRegionFunction(
6532 TargetRegionEntryInfo &EntryInfo, Function *OutlinedFn,
6533 StringRef EntryFnName, StringRef EntryFnIDName) {
6534 if (OutlinedFn)
6535 setOutlinedTargetRegionFunctionAttributes(OutlinedFn);
6536 auto OutlinedFnID = createOutlinedFunctionID(OutlinedFn, EntryFnIDName);
6537 auto EntryAddr = createTargetRegionEntryAddr(OutlinedFn, EntryFnName);
6538 OffloadInfoManager.registerTargetRegionEntryInfo(
6539 EntryInfo, EntryAddr, OutlinedFnID,
6540 OffloadEntriesInfoManager::OMPTargetRegionEntryTargetRegion);
6541 return OutlinedFnID;
6542}
6543
6544 OpenMPIRBuilder::InsertPointOrErrorTy OpenMPIRBuilder::createTargetData(
6545 const LocationDescription &Loc, InsertPointTy AllocaIP,
6546 InsertPointTy CodeGenIP, Value *DeviceID, Value *IfCond,
6547 TargetDataInfo &Info, GenMapInfoCallbackTy GenMapInfoCB,
6548 omp::RuntimeFunction *MapperFunc,
6549 function_ref<InsertPointOrErrorTy(InsertPointTy CodeGenIP,
6550 BodyGenTy BodyGenType)>
6551 BodyGenCB,
6552 function_ref<void(unsigned int, Value *)> DeviceAddrCB,
6553 function_ref<Value *(unsigned int)> CustomMapperCB, Value *SrcLocInfo) {
6554 if (!updateToLocation(Loc))
6555 return InsertPointTy();
6556
6557 Builder.restoreIP(CodeGenIP);
6558 // Disable TargetData CodeGen on Device pass.
6559 if (Config.IsTargetDevice.value_or(false)) {
6560 if (BodyGenCB) {
6561 InsertPointOrErrorTy AfterIP =
6562 BodyGenCB(Builder.saveIP(), BodyGenTy::NoPriv);
6563 if (!AfterIP)
6564 return AfterIP.takeError();
6565 Builder.restoreIP(*AfterIP);
6566 }
6567 return Builder.saveIP();
6568 }
6569
6570 bool IsStandAlone = !BodyGenCB;
6571 MapInfosTy *MapInfo;
6572 // Generate the code for the opening of the data environment. Capture all the
6573 // arguments of the runtime call by reference because they are used in the
6574 // closing of the region.
6575 auto BeginThenGen = [&](InsertPointTy AllocaIP,
6576 InsertPointTy CodeGenIP) -> Error {
6577 MapInfo = &GenMapInfoCB(Builder.saveIP());
6578 emitOffloadingArrays(AllocaIP, Builder.saveIP(), *MapInfo, Info,
6579 /*IsNonContiguous=*/true, DeviceAddrCB,
6580 CustomMapperCB);
6581
6582 TargetDataRTArgs RTArgs;
6583 emitOffloadingArraysArgument(Builder, RTArgs, Info);
6584
6585 // Emit the number of elements in the offloading arrays.
6586 Value *PointerNum = Builder.getInt32(Info.NumberOfPtrs);
6587
6588 // Source location for the ident struct
6589 if (!SrcLocInfo) {
6590 uint32_t SrcLocStrSize;
6591 Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
6592 SrcLocInfo = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
6593 }
6594
6595 SmallVector<llvm::Value *, 13> OffloadingArgs = {
6596 SrcLocInfo, DeviceID,
6597 PointerNum, RTArgs.BasePointersArray,
6598 RTArgs.PointersArray, RTArgs.SizesArray,
6599 RTArgs.MapTypesArray, RTArgs.MapNamesArray,
6600 RTArgs.MappersArray};
6601
6602 if (IsStandAlone) {
6603 assert(MapperFunc && "MapperFunc missing for standalone target data");
6604
6605 auto TaskBodyCB = [&](Value *, Value *,
6606 IRBuilderBase::InsertPoint) -> Error {
6607 if (Info.HasNoWait) {
6608 OffloadingArgs.append({llvm::Constant::getNullValue(Int32),
6609 llvm::Constant::getNullValue(VoidPtr),
6610 llvm::Constant::getNullValue(Int32),
6611 llvm::Constant::getNullValue(VoidPtr)});
6612 }
6613
6614 Builder.CreateCall(getOrCreateRuntimeFunctionPtr(*MapperFunc),
6615 OffloadingArgs);
6616
6617 if (Info.HasNoWait) {
6618 BasicBlock *OffloadContBlock =
6619 BasicBlock::Create(Builder.getContext(), "omp_offload.cont");
6620 Function *CurFn = Builder.GetInsertBlock()->getParent();
6621 emitBlock(OffloadContBlock, CurFn, /*IsFinished=*/true);
6623 }
6624 return Error::success();
6625 };
6626
6627 bool RequiresOuterTargetTask = Info.HasNoWait;
6628 if (!RequiresOuterTargetTask)
6629 cantFail(TaskBodyCB(/*DeviceID=*/nullptr, /*RTLoc=*/nullptr,
6630 /*TargetTaskAllocaIP=*/{}));
6631 else
6632 cantFail(emitTargetTask(TaskBodyCB, DeviceID, SrcLocInfo, AllocaIP,
6633 /*Dependencies=*/{}, Info.HasNoWait));
6634 } else {
6635 Function *BeginMapperFunc = getOrCreateRuntimeFunctionPtr(
6636 omp::OMPRTL___tgt_target_data_begin_mapper);
6637
6638 Builder.CreateCall(BeginMapperFunc, OffloadingArgs);
6639
6640 for (auto DeviceMap : Info.DevicePtrInfoMap) {
6641 if (isa<AllocaInst>(DeviceMap.second.second)) {
6642 auto *LI =
6643 Builder.CreateLoad(Builder.getPtrTy(), DeviceMap.second.first);
6644 Builder.CreateStore(LI, DeviceMap.second.second);
6645 }
6646 }
6647
6648 // If device pointer privatization is required, emit the body of the
6649 // region here. It will have to be duplicated: with and without
6650 // privatization.
6651 InsertPointOrErrorTy AfterIP =
6652 BodyGenCB(Builder.saveIP(), BodyGenTy::Priv);
6653 if (!AfterIP)
6654 return AfterIP.takeError();
6655 Builder.restoreIP(*AfterIP);
6656 }
6657 return Error::success();
6658 };
6659
6660 // If we need device pointer privatization, we need to emit the body of the
6661 // region with no privatization in the 'else' branch of the conditional.
6662 // Otherwise, we don't have to do anything.
6663 auto BeginElseGen = [&](InsertPointTy AllocaIP,
6664 InsertPointTy CodeGenIP) -> Error {
6665 InsertPointOrErrorTy AfterIP =
6666 BodyGenCB(Builder.saveIP(), BodyGenTy::DupNoPriv);
6667 if (!AfterIP)
6668 return AfterIP.takeError();
6669 Builder.restoreIP(*AfterIP);
6670 return Error::success();
6671 };
6672
6673 // Generate code for the closing of the data region.
6674 auto EndThenGen = [&](InsertPointTy AllocaIP, InsertPointTy CodeGenIP) {
6675 TargetDataRTArgs RTArgs;
6676 Info.EmitDebug = !MapInfo->Names.empty();
6677 emitOffloadingArraysArgument(Builder, RTArgs, Info, /*ForEndCall=*/true);
6678
6679 // Emit the number of elements in the offloading arrays.
6680 Value *PointerNum = Builder.getInt32(Info.NumberOfPtrs);
6681
6682 // Source location for the ident struct
6683 if (!SrcLocInfo) {
6684 uint32_t SrcLocStrSize;
6685 Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
6686 SrcLocInfo = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
6687 }
6688
6689 Value *OffloadingArgs[] = {SrcLocInfo, DeviceID,
6690 PointerNum, RTArgs.BasePointersArray,
6691 RTArgs.PointersArray, RTArgs.SizesArray,
6692 RTArgs.MapTypesArray, RTArgs.MapNamesArray,
6693 RTArgs.MappersArray};
6694 Function *EndMapperFunc =
6695 getOrCreateRuntimeFunctionPtr(omp::OMPRTL___tgt_target_data_end_mapper);
6696
6697 Builder.CreateCall(EndMapperFunc, OffloadingArgs);
6698 return Error::success();
6699 };
6700
6701 // We don't have to do anything to close the region if the if clause evaluates
6702 // to false.
6703 auto EndElseGen = [&](InsertPointTy AllocaIP, InsertPointTy CodeGenIP) {
6704 return Error::success();
6705 };
6706
6707 Error Err = [&]() -> Error {
6708 if (BodyGenCB) {
6709 Error Err = [&]() {
6710 if (IfCond)
6711 return emitIfClause(IfCond, BeginThenGen, BeginElseGen, AllocaIP);
6712 return BeginThenGen(AllocaIP, Builder.saveIP());
6713 }();
6714
6715 if (Err)
6716 return Err;
6717
6718 // If we don't require privatization of device pointers, we emit the body
6719 // in between the runtime calls. This avoids duplicating the body code.
6720 InsertPointOrErrorTy AfterIP =
6721 BodyGenCB(Builder.saveIP(), BodyGenTy::NoPriv);
6722 if (!AfterIP)
6723 return AfterIP.takeError();
6724 Builder.restoreIP(*AfterIP);
6725
6726 if (IfCond)
6727 return emitIfClause(IfCond, EndThenGen, EndElseGen, AllocaIP);
6728 return EndThenGen(AllocaIP, Builder.saveIP());
6729 }
6730 if (IfCond)
6731 return emitIfClause(IfCond, BeginThenGen, EndElseGen, AllocaIP);
6732 return BeginThenGen(AllocaIP, Builder.saveIP());
6733 }();
6734
6735 if (Err)
6736 return Err;
6737
6738 return Builder.saveIP();
6739}
6740
6741 Function *OpenMPIRBuilder::createForStaticInitFunction(unsigned IVSize,
6742 bool IVSigned,
6743 bool IsGPUDistribute) {
6744 assert((IVSize == 32 || IVSize == 64) &&
6745 "IV size is not compatible with the omp runtime");
6746 RuntimeFunction Name;
6747 if (IsGPUDistribute)
6748 Name = IVSize == 32
6749 ? (IVSigned ? omp::OMPRTL___kmpc_distribute_static_init_4
6750 : omp::OMPRTL___kmpc_distribute_static_init_4u)
6751 : (IVSigned ? omp::OMPRTL___kmpc_distribute_static_init_8
6752 : omp::OMPRTL___kmpc_distribute_static_init_8u);
6753 else
6754 Name = IVSize == 32 ? (IVSigned ? omp::OMPRTL___kmpc_for_static_init_4
6755 : omp::OMPRTL___kmpc_for_static_init_4u)
6756 : (IVSigned ? omp::OMPRTL___kmpc_for_static_init_8
6757 : omp::OMPRTL___kmpc_for_static_init_8u);
6758
6759 return getOrCreateRuntimeFunction(M, Name);
6760}
6761
6762 Function *OpenMPIRBuilder::createDispatchInitFunction(unsigned IVSize,
6763 bool IVSigned) {
6764 assert((IVSize == 32 || IVSize == 64) &&
6765 "IV size is not compatible with the omp runtime");
6766 RuntimeFunction Name = IVSize == 32
6767 ? (IVSigned ? omp::OMPRTL___kmpc_dispatch_init_4
6768 : omp::OMPRTL___kmpc_dispatch_init_4u)
6769 : (IVSigned ? omp::OMPRTL___kmpc_dispatch_init_8
6770 : omp::OMPRTL___kmpc_dispatch_init_8u);
6771
6772 return getOrCreateRuntimeFunction(M, Name);
6773}
6774
6775 Function *OpenMPIRBuilder::createDispatchNextFunction(unsigned IVSize,
6776 bool IVSigned) {
6777 assert((IVSize == 32 || IVSize == 64) &&
6778 "IV size is not compatible with the omp runtime");
6779 RuntimeFunction Name = IVSize == 32
6780 ? (IVSigned ? omp::OMPRTL___kmpc_dispatch_next_4
6781 : omp::OMPRTL___kmpc_dispatch_next_4u)
6782 : (IVSigned ? omp::OMPRTL___kmpc_dispatch_next_8
6783 : omp::OMPRTL___kmpc_dispatch_next_8u);
6784
6785 return getOrCreateRuntimeFunction(M, Name);
6786}
6787
6788 Function *OpenMPIRBuilder::createDispatchFiniFunction(unsigned IVSize,
6789 bool IVSigned) {
6790 assert((IVSize == 32 || IVSize == 64) &&
6791 "IV size is not compatible with the omp runtime");
6792 RuntimeFunction Name = IVSize == 32
6793 ? (IVSigned ? omp::OMPRTL___kmpc_dispatch_fini_4
6794 : omp::OMPRTL___kmpc_dispatch_fini_4u)
6795 : (IVSigned ? omp::OMPRTL___kmpc_dispatch_fini_8
6796 : omp::OMPRTL___kmpc_dispatch_fini_8u);
6797
6798 return getOrCreateRuntimeFunction(M, Name);
6799}
6800
6801 Function *OpenMPIRBuilder::createDispatchDeinitFunction() {
6802 return getOrCreateRuntimeFunction(M, omp::OMPRTL___kmpc_dispatch_deinit);
6803}
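// The dispatch/static-init helpers above select runtime entry points purely
// by IV width and signedness; for example (illustrative),
// createDispatchInitFunction(32, /*IVSigned=*/true) yields
// __kmpc_dispatch_init_4, while createDispatchInitFunction(64, false)
// yields __kmpc_dispatch_init_8u.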
6804
6805 static Expected<Function *> createOutlinedFunction(
6806 OpenMPIRBuilder &OMPBuilder, IRBuilderBase &Builder,
6807 const OpenMPIRBuilder::TargetKernelDefaultAttrs &DefaultAttrs,
6808 StringRef FuncName, SmallVectorImpl<Value *> &Inputs,
6809 OpenMPIRBuilder::TargetBodyGenCallbackTy &CBFunc,
6810 OpenMPIRBuilder::TargetGenArgAccessorsCallbackTy &ArgAccessorFuncCB) {
6811 SmallVector<Type *> ParameterTypes;
6812 if (OMPBuilder.Config.isTargetDevice()) {
6813 // Add the "implicit" runtime argument we use to provide launch specific
6814 // information for target devices.
6815 auto *Int8PtrTy = PointerType::getUnqual(Builder.getContext());
6816 ParameterTypes.push_back(Int8PtrTy);
6817
6818 // All parameters to target devices are passed as pointers
6819 // or i64. This assumes 64-bit address spaces/pointers.
6820 for (auto &Arg : Inputs)
6821 ParameterTypes.push_back(Arg->getType()->isPointerTy()
6822 ? Arg->getType()
6823 : Type::getInt64Ty(Builder.getContext()));
6824 } else {
6825 for (auto &Arg : Inputs)
6826 ParameterTypes.push_back(Arg->getType());
6827 }
6828
6829 auto BB = Builder.GetInsertBlock();
6830 auto M = BB->getModule();
6831 auto FuncType = FunctionType::get(Builder.getVoidTy(), ParameterTypes,
6832 /*isVarArg*/ false);
6833 auto Func =
6834 Function::Create(FuncType, GlobalValue::InternalLinkage, FuncName, M);
6835
6836 // Forward target-cpu and target-features function attributes from the
6837 // original function to the new outlined function.
6838 Function *ParentFn = Builder.GetInsertBlock()->getParent();
6839
6840 auto TargetCpuAttr = ParentFn->getFnAttribute("target-cpu");
6841 if (TargetCpuAttr.isStringAttribute())
6842 Func->addFnAttr(TargetCpuAttr);
6843
6844 auto TargetFeaturesAttr = ParentFn->getFnAttribute("target-features");
6845 if (TargetFeaturesAttr.isStringAttribute())
6846 Func->addFnAttr(TargetFeaturesAttr);
6847
6848 if (OMPBuilder.Config.isTargetDevice()) {
6849 Value *ExecMode =
6850 OMPBuilder.emitKernelExecutionMode(FuncName, DefaultAttrs.ExecFlags);
6851 OMPBuilder.emitUsed("llvm.compiler.used", {ExecMode});
6852 }
6853
6854 // Save insert point.
6855 IRBuilder<>::InsertPointGuard IPG(Builder);
6856 // If there's a DISubprogram associated with current function, then
6857 // generate one for the outlined function.
6858 if (Function *ParentFunc = BB->getParent()) {
6859 if (DISubprogram *SP = ParentFunc->getSubprogram()) {
6860 DICompileUnit *CU = SP->getUnit();
6861 DIBuilder DB(*M, true, CU);
6862 DebugLoc DL = Builder.getCurrentDebugLocation();
6863 if (DL) {
6864 // TODO: We are using nullopt for arguments at the moment. This will
6865 // need to be updated when debug data is being generated for variables.
6866 DISubroutineType *Ty =
6867 DB.createSubroutineType(DB.getOrCreateTypeArray({}));
6868 DISubprogram::DISPFlags SPFlags = DISubprogram::SPFlagDefinition |
6869 DISubprogram::SPFlagOptimized |
6870 DISubprogram::SPFlagLocalToUnit;
6871
6872 DISubprogram *OutlinedSP = DB.createFunction(
6873 CU, FuncName, FuncName, SP->getFile(), DL.getLine(), Ty,
6874 DL.getLine(), DINode::DIFlags::FlagArtificial, SPFlags);
6875
6876 // Attach subprogram to the function.
6877 Func->setSubprogram(OutlinedSP);
6878 // Update the CurrentDebugLocation in the builder so that right scope
6879 // is used for things inside outlined function.
6880 Builder.SetCurrentDebugLocation(
6881 DILocation::get(Func->getContext(), DL.getLine(), DL.getCol(),
6882 OutlinedSP, DL.getInlinedAt()));
6883 }
6884 }
6885 }
6886
6887 // Generate the region into the function.
6888 BasicBlock *EntryBB = BasicBlock::Create(Builder.getContext(), "entry", Func);
6889 Builder.SetInsertPoint(EntryBB);
6890
6891 // Insert target init call in the device compilation pass.
6892 if (OMPBuilder.Config.isTargetDevice())
6893 Builder.restoreIP(OMPBuilder.createTargetInit(Builder, DefaultAttrs));
6894
6895 BasicBlock *UserCodeEntryBB = Builder.GetInsertBlock();
6896
6897 // As we embed the user code in the middle of our target region after we
6898 // generate entry code, we must move what allocas we can into the entry
6899 // block to avoid possibly breaking optimisations for the device.
6900 if (OMPBuilder.Config.isTargetDevice())
6901 OMPBuilder.ConstantAllocaRaiseCandidates.emplace_back(Func);
6902
6903 // Insert target deinit call in the device compilation pass.
6904 BasicBlock *OutlinedBodyBB =
6905 splitBB(Builder, /*CreateBranch=*/true, "outlined.body");
6906 llvm::OpenMPIRBuilder::InsertPointOrErrorTy AfterIP = CBFunc(
6907 Builder.saveIP(),
6908 OpenMPIRBuilder::InsertPointTy(OutlinedBodyBB, OutlinedBodyBB->begin()));
6909 if (!AfterIP)
6910 return AfterIP.takeError();
6911 Builder.restoreIP(*AfterIP);
6912 if (OMPBuilder.Config.isTargetDevice())
6913 OMPBuilder.createTargetDeinit(Builder);
6914
6915 // Insert return instruction.
6916 Builder.CreateRetVoid();
6917
6918 // New Alloca IP at entry point of created device function.
6919 Builder.SetInsertPoint(EntryBB->getFirstNonPHI());
6920 auto AllocaIP = Builder.saveIP();
6921
6922 Builder.SetInsertPoint(UserCodeEntryBB->getFirstNonPHIOrDbg());
6923
6924 // Skip the artificial dyn_ptr on the device.
6925 const auto &ArgRange =
6926 OMPBuilder.Config.isTargetDevice()
6927 ? make_range(Func->arg_begin() + 1, Func->arg_end())
6928 : Func->args();
6929
6930 auto ReplaceValue = [](Value *Input, Value *InputCopy, Function *Func) {
6931 // Things like GEPs can come in the form of Constants. Constants and
6932 // ConstantExprs do not know what they are contained in, so we must dig
6933 // a little to find an instruction so we can tell whether they are used
6934 // inside of the function we are outlining. We also replace the original
6935 // constant expression with a new, equivalent instruction: an
6936 // instruction allows easy modification in the following loop, since we
6937 // now know the constant (as an instruction) is owned by our target
6938 // function and replaceUsesOfWith can be invoked on it (this does not
6939 // seem possible with constants). A brand new instruction also lets us
6940 // be cautious, as it is perhaps possible that the old expression was
6941 // used inside of the function but also exists and is used externally
6942 // (unlikely by the nature of a Constant, but still possible).
6943 // NOTE: We cannot remove dead constants that have been rewritten to
6944 // instructions at this stage, we run the risk of breaking later lowering
6945 // by doing so as we could still be in the process of lowering the module
6946 // from MLIR to LLVM-IR and the MLIR lowering may still require the original
6947 // constants we have created rewritten versions of.
6948 if (auto *Const = dyn_cast<Constant>(Input))
6949 convertUsersOfConstantsToInstructions(Const, Func, false);
6950
6951 // Collect all the instructions
6952 for (User *User : make_early_inc_range(Input->users()))
6953 if (auto *Instr = dyn_cast<Instruction>(User))
6954 if (Instr->getFunction() == Func)
6955 Instr->replaceUsesOfWith(Input, InputCopy);
6956 };
6957
6958 SmallVector<std::pair<Value *, Value *>> DeferredReplacement;
6959
6960 // Rewrite uses of input values to parameters.
6961 for (auto InArg : zip(Inputs, ArgRange)) {
6962 Value *Input = std::get<0>(InArg);
6963 Argument &Arg = std::get<1>(InArg);
6964 Value *InputCopy = nullptr;
6965
6966 llvm::OpenMPIRBuilder::InsertPointOrErrorTy AfterIP =
6967 ArgAccessorFuncCB(Arg, Input, InputCopy, AllocaIP, Builder.saveIP());
6968 if (!AfterIP)
6969 return AfterIP.takeError();
6970 Builder.restoreIP(*AfterIP);
6971
6972 // In certain cases a Global may be set up for replacement; however, this
6973 // Global may be used in multiple arguments to the kernel, just segmented
6974 // apart. For example, if we have a global array that is sectioned into
6975 // multiple mappings (technically not legal in OpenMP, but there is a case
6976 // in Fortran for Common Blocks where this is necessary), we will end up
6977 // with GEPs into this array inside the kernel that refer to the Global
6978 // but are, for all intents and purposes, separate arguments to the
6979 // kernel. If we have mapped a segment that requires a GEP into the 0-th
6980 // index, it will fold into a reference to the Global; if we then encounter
6981 // this folded GEP during replacement, all of the references to the
6982 // Global in the kernel will be replaced with the argument we have generated
6983 // that corresponds to it, including any other GEPs that refer to the
6984 // Global and that may be other arguments. This would invalidate all of the
6985 // other preceding mapped arguments that refer to the same global but are
6986 // separate segments. To prevent this, we defer global processing until all
6987 // other processing has been performed.
6988 if (llvm::isa<llvm::GlobalValue>(std::get<0>(InArg)) ||
6989 llvm::isa<llvm::GlobalObject>(std::get<0>(InArg)) ||
6990 llvm::isa<llvm::GlobalVariable>(std::get<0>(InArg))) {
6991 DeferredReplacement.push_back(std::make_pair(Input, InputCopy));
6992 continue;
6993 }
6994
6995 ReplaceValue(Input, InputCopy, Func);
6996 }
6997
6998 // Replace all of our deferred Input values, currently just Globals.
6999 for (auto Deferred : DeferredReplacement)
7000 ReplaceValue(std::get<0>(Deferred), std::get<1>(Deferred), Func);
7001
7002 return Func;
7003}
7004
7005 /// Create an entry point for a target task. It will have the following
7006 /// signature:
7007/// void @.omp_target_task_proxy_func(i32 %thread.id, ptr %task)
7008/// This function is called from emitTargetTask once the
7009/// code to launch the target kernel has been outlined already.
7010 static Function *emitTargetTaskProxyFunction(OpenMPIRBuilder &OMPBuilder,
7011 IRBuilderBase &Builder,
7012 CallInst *StaleCI) {
7013 Module &M = OMPBuilder.M;
7014 // KernelLaunchFunction is the target launch function, i.e.
7015 // the function that sets up kernel arguments and calls
7016 // __tgt_target_kernel to launch the kernel on the device.
7017 //
7018 Function *KernelLaunchFunction = StaleCI->getCalledFunction();
7019
7020 // StaleCI is the CallInst which is the call to the outlined
7021 // target kernel launch function. If there are values that the
7022 // outlined function uses then these are aggregated into a structure
7023 // which is passed as the second argument. If not, then there's
7024 // only one argument, the threadID. So, StaleCI can be
7025 //
7026 // %structArg = alloca { ptr, ptr }, align 8
7027 // %gep_ = getelementptr { ptr, ptr }, ptr %structArg, i32 0, i32 0
7028 // store ptr %20, ptr %gep_, align 8
7029 // %gep_8 = getelementptr { ptr, ptr }, ptr %structArg, i32 0, i32 1
7030 // store ptr %21, ptr %gep_8, align 8
7031 // call void @_QQmain..omp_par.1(i32 %global.tid.val6, ptr %structArg)
7032 //
7033 // OR
7034 //
7035 // call void @_QQmain..omp_par.1(i32 %global.tid.val6)
7036 OpenMPIRBuilder::InsertPointTy IP(StaleCI->getParent(),
7037 StaleCI->getIterator());
7038 LLVMContext &Ctx = StaleCI->getParent()->getContext();
7039 Type *ThreadIDTy = Type::getInt32Ty(Ctx);
7040 Type *TaskPtrTy = OMPBuilder.TaskPtr;
7041 Type *TaskTy = OMPBuilder.Task;
7042 auto ProxyFnTy =
7043 FunctionType::get(Builder.getVoidTy(), {ThreadIDTy, TaskPtrTy},
7044 /* isVarArg */ false);
7045 auto ProxyFn = Function::Create(ProxyFnTy, GlobalValue::InternalLinkage,
7046 ".omp_target_task_proxy_func",
7047 Builder.GetInsertBlock()->getModule());
7048 ProxyFn->getArg(0)->setName("thread.id");
7049 ProxyFn->getArg(1)->setName("task");
7050
7051 BasicBlock *EntryBB =
7052 BasicBlock::Create(Builder.getContext(), "entry", ProxyFn);
7053 Builder.SetInsertPoint(EntryBB);
7054
7055 bool HasShareds = StaleCI->arg_size() > 1;
7056 // TODO: This is a temporary assert to prove to ourselves that
7057 // the outlined target launch function is always going to have
7058 // at most two arguments if there is any data shared between
7059 // host and device.
7060 assert((!HasShareds || (StaleCI->arg_size() == 2)) &&
7061 "StaleCI with shareds should have exactly two arguments.");
7062 if (HasShareds) {
7063 auto *ArgStructAlloca = dyn_cast<AllocaInst>(StaleCI->getArgOperand(1));
7064 assert(ArgStructAlloca &&
7065 "Unable to find the alloca instruction corresponding to arguments "
7066 "for extracted function");
7067 auto *ArgStructType = cast<StructType>(ArgStructAlloca->getAllocatedType());
7068
7069 AllocaInst *NewArgStructAlloca =
7070 Builder.CreateAlloca(ArgStructType, nullptr, "structArg");
7071 Value *TaskT = ProxyFn->getArg(1);
7072 Value *ThreadId = ProxyFn->getArg(0);
7073 Value *SharedsSize =
7074 Builder.getInt64(M.getDataLayout().getTypeStoreSize(ArgStructType));
7075
7076 Value *Shareds = Builder.CreateStructGEP(TaskTy, TaskT, 0);
7077 LoadInst *LoadShared =
7078 Builder.CreateLoad(PointerType::getUnqual(Ctx), Shareds);
7079
7080 Builder.CreateMemCpy(
7081 NewArgStructAlloca, NewArgStructAlloca->getAlign(), LoadShared,
7082 LoadShared->getPointerAlignment(M.getDataLayout()), SharedsSize);
7083
7084 Builder.CreateCall(KernelLaunchFunction, {ThreadId, NewArgStructAlloca});
7085 }
7086 Builder.CreateRetVoid();
7087 return ProxyFn;
7088}
7089
7090 static Error emitTargetOutlinedFunction(
7091 OpenMPIRBuilder &OMPBuilder, IRBuilderBase &Builder, bool IsOffloadEntry,
7092 TargetRegionEntryInfo &EntryInfo,
7093 const OpenMPIRBuilder::TargetKernelDefaultAttrs &DefaultAttrs,
7094 Function *&OutlinedFn, Constant *&OutlinedFnID,
7095 SmallVectorImpl<Value *> &Inputs,
7096 OpenMPIRBuilder::TargetBodyGenCallbackTy &CBFunc,
7097 OpenMPIRBuilder::TargetGenArgAccessorsCallbackTy &ArgAccessorFuncCB) {
7098
7099 OpenMPIRBuilder::FunctionGenCallback &&GenerateOutlinedFunction =
7100 [&](StringRef EntryFnName) {
7101 return createOutlinedFunction(OMPBuilder, Builder, DefaultAttrs,
7102 EntryFnName, Inputs, CBFunc,
7103 ArgAccessorFuncCB);
7104 };
7105
7106 return OMPBuilder.emitTargetRegionFunction(
7107 EntryInfo, GenerateOutlinedFunction, IsOffloadEntry, OutlinedFn,
7108 OutlinedFnID);
7109}
7110
7111 OpenMPIRBuilder::InsertPointOrErrorTy OpenMPIRBuilder::emitTargetTask(
7112 TargetTaskBodyCallbackTy TaskBodyCB, Value *DeviceID, Value *RTLoc,
7113 OpenMPIRBuilder::InsertPointTy AllocaIP,
7114 const SmallVector<llvm::OpenMPIRBuilder::DependData> &Dependencies,
7115 bool HasNoWait) {
7116
7117 // The following explains the code-gen scenario for the `target` directive. A
7118 // similar scenario is followed for other device-related directives (e.g.
7119 // `target enter data`), since we only need to emit a task that
7120 // encapsulates the proper runtime call.
7121 //
7122 // When we arrive at this function, the target region itself has been
7123 // outlined into the function OutlinedFn.
7124 // So at this point, for
7125 // --------------------------------------------------
7126 // void user_code_that_offloads(...) {
7127 // omp target depend(..) map(from:a) map(to:b, c)
7128 // a = b + c
7129 // }
7130 //
7131 // --------------------------------------------------
7132 //
7133 // we have
7134 //
7135 // --------------------------------------------------
7136 //
7137 // void user_code_that_offloads(...) {
7138 // %.offload_baseptrs = alloca [3 x ptr], align 8
7139 // %.offload_ptrs = alloca [3 x ptr], align 8
7140 // %.offload_mappers = alloca [3 x ptr], align 8
7141 // ;; target region has been outlined and now we need to
7142 // ;; offload to it via a target task.
7143 // }
7144 // void outlined_device_function(ptr a, ptr b, ptr c) {
7145 // *a = *b + *c
7146 // }
7147 //
7148 // We have to now do the following
7149 // (i) Make an offloading call to outlined_device_function using the OpenMP
7150 // RTL. See 'kernel_launch_function' in the pseudo code below. This is
7151 // emitted by emitKernelLaunch
7152 // (ii) Create a task entry point function that calls kernel_launch_function
7153 // and is the entry point for the target task. See
7154 // '@.omp_target_task_proxy_func in the pseudocode below.
7155 // (iii) Create a task with the task entry point created in (ii)
7156 //
7157 // That is we create the following
7158 //
7159 // void user_code_that_offloads(...) {
7160 // %.offload_baseptrs = alloca [3 x ptr], align 8
7161 // %.offload_ptrs = alloca [3 x ptr], align 8
7162 // %.offload_mappers = alloca [3 x ptr], align 8
7163 //
7164 // %structArg = alloca { ptr, ptr, ptr }, align 8
7165 // %strucArg[0] = %.offload_baseptrs
7166 // %strucArg[1] = %.offload_ptrs
7167 // %strucArg[2] = %.offload_mappers
7168 // proxy_target_task = @__kmpc_omp_task_alloc(...,
7169 // @.omp_target_task_proxy_func)
7170 // memcpy(proxy_target_task->shareds, %structArg, sizeof(structArg))
7171 // dependencies_array = ...
7172 // ;; if nowait not present
7173 // call @__kmpc_omp_wait_deps(..., dependencies_array)
7174 // call @__kmpc_omp_task_begin_if0(...)
7175 // call @.omp_target_task_proxy_func(i32 thread_id, ptr %proxy_target_task)
7176 // call @__kmpc_omp_task_complete_if0(...)
7177 // }
7178 //
7179 // define internal void @.omp_target_task_proxy_func(i32 %thread.id,
7180 // ptr %task) {
7181 // %structArg = alloca {ptr, ptr, ptr}
7182 // %shared_data = load (getelementptr %task, 0, 0)
7183 // memcpy(%structArg, %shared_data, sizeof(structArg))
7184 // kernel_launch_function(%thread.id, %structArg)
7185 // }
7186 //
7187 // We need the proxy function because the signature of the task entry point
7188 // expected by kmpc_omp_task is always the same and will be different from
7189 // that of the kernel_launch function.
7190 //
7191 // kernel_launch_function is generated by emitKernelLaunch and has the
7192 // always_inline attribute.
7193 // void kernel_launch_function(thread_id,
7194 // structArg) alwaysinline {
7195 // %kernel_args = alloca %struct.__tgt_kernel_arguments, align 8
7196 // offload_baseptrs = load(getelementptr structArg, 0, 0)
7197 // offload_ptrs = load(getelementptr structArg, 0, 1)
7198 // offload_mappers = load(getelementptr structArg, 0, 2)
7199 // ; setup kernel_args using offload_baseptrs, offload_ptrs and
7200 // ; offload_mappers
7201 // call i32 @__tgt_target_kernel(...,
7202 // outlined_device_function,
7203 // ptr %kernel_args)
7204 // }
7205 // void outlined_device_function(ptr a, ptr b, ptr c) {
7206 // *a = *b + *c
7207 // }
7208 //
7209 BasicBlock *TargetTaskBodyBB =
7210 splitBB(Builder, /*CreateBranch=*/true, "target.task.body");
7211 BasicBlock *TargetTaskAllocaBB =
7212 splitBB(Builder, /*CreateBranch=*/true, "target.task.alloca");
7213
7214 InsertPointTy TargetTaskAllocaIP(TargetTaskAllocaBB,
7215 TargetTaskAllocaBB->begin());
7216 InsertPointTy TargetTaskBodyIP(TargetTaskBodyBB, TargetTaskBodyBB->begin());
7217
7218 OutlineInfo OI;
7219 OI.EntryBB = TargetTaskAllocaBB;
7220 OI.OuterAllocaBB = AllocaIP.getBlock();
7221
7222 // Add the thread ID argument.
7223 SmallVector<Instruction *, 4> ToBeDeleted;
7224 OI.ExcludeArgsFromAggregate.push_back(createFakeIntVal(
7225 Builder, AllocaIP, ToBeDeleted, TargetTaskAllocaIP, "global.tid", false));
7226
7227 Builder.restoreIP(TargetTaskBodyIP);
7228
7229 if (Error Err = TaskBodyCB(DeviceID, RTLoc, TargetTaskAllocaIP))
7230 return Err;
7231
7232 OI.ExitBB = Builder.saveIP().getBlock();
7233 OI.PostOutlineCB = [this, ToBeDeleted, Dependencies, HasNoWait,
7234 DeviceID](Function &OutlinedFn) mutable {
7235 assert(OutlinedFn.getNumUses() == 1 &&
7236 "there must be a single user for the outlined function");
7237
7238 CallInst *StaleCI = cast<CallInst>(OutlinedFn.user_back());
7239 bool HasShareds = StaleCI->arg_size() > 1;
7240
7241 Function *ProxyFn = emitTargetTaskProxyFunction(*this, Builder, StaleCI);
7242
7243 LLVM_DEBUG(dbgs() << "Proxy task entry function created: " << *ProxyFn
7244 << "\n");
7245
7246 Builder.SetInsertPoint(StaleCI);
7247
7248 // Gather the arguments for emitting the runtime call.
7249 uint32_t SrcLocStrSize;
7250 Constant *SrcLocStr =
7251 getOrCreateSrcLocStr(LocationDescription(Builder), SrcLocStrSize);
7252 Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
7253
7254 // @__kmpc_omp_task_alloc or @__kmpc_omp_target_task_alloc
7255 //
7256 // If `HasNoWait == true`, we call @__kmpc_omp_target_task_alloc to provide
7257 // the DeviceID to the deferred task and also since
7258 // @__kmpc_omp_target_task_alloc creates an untied/async task.
7259 Function *TaskAllocFn =
7260 !HasNoWait ? getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_omp_task_alloc)
7261 : getOrCreateRuntimeFunctionPtr(
7262 OMPRTL___kmpc_omp_target_task_alloc);
7263
7264 // Arguments - `loc_ref` (Ident) and `gtid` (ThreadID)
7265 // call.
7266 Value *ThreadID = getOrCreateThreadID(Ident);
7267
7268 // Argument - `sizeof_kmp_task_t` (TaskSize)
7269 // Tasksize refers to the size in bytes of kmp_task_t data structure
7270 // including private vars accessed in task.
7271 // TODO: add kmp_task_t_with_privates (privates)
7272 Value *TaskSize =
7273 Builder.getInt64(M.getDataLayout().getTypeStoreSize(Task));
7274
7275 // Argument - `sizeof_shareds` (SharedsSize)
7276 // SharedsSize refers to the shareds array size in the kmp_task_t data
7277 // structure.
7278 Value *SharedsSize = Builder.getInt64(0);
7279 if (HasShareds) {
7280 auto *ArgStructAlloca = dyn_cast<AllocaInst>(StaleCI->getArgOperand(1));
7281 assert(ArgStructAlloca &&
7282 "Unable to find the alloca instruction corresponding to arguments "
7283 "for extracted function");
7284 auto *ArgStructType =
7285 dyn_cast<StructType>(ArgStructAlloca->getAllocatedType());
7286 assert(ArgStructType && "Unable to find struct type corresponding to "
7287 "arguments for extracted function");
7288 SharedsSize =
7289 Builder.getInt64(M.getDataLayout().getTypeStoreSize(ArgStructType));
7290 }
7291
7292 // Argument - `flags`
7293 // Task is tied iff (Flags & 1) == 1.
7294 // Task is untied iff (Flags & 1) == 0.
7295 // Task is final iff (Flags & 2) == 2.
7296 // Task is not final iff (Flags & 2) == 0.
7297 // A target task is not final and is untied.
7298 Value *Flags = Builder.getInt32(0);
7299
7300 // Emit the @__kmpc_omp_task_alloc runtime call
7301 // The runtime call returns a pointer to an area where the task captured
7302 // variables must be copied before the task is run (TaskData)
7303 CallInst *TaskData = nullptr;
7304
7305 SmallVector<llvm::Value *> TaskAllocArgs = {
7306 /*loc_ref=*/Ident, /*gtid=*/ThreadID,
7307 /*flags=*/Flags,
7308 /*sizeof_task=*/TaskSize, /*sizeof_shared=*/SharedsSize,
7309 /*task_func=*/ProxyFn};
7310
7311 if (HasNoWait)
7312 TaskAllocArgs.push_back(DeviceID);
7313
7314 TaskData = Builder.CreateCall(TaskAllocFn, TaskAllocArgs);
7315
7316 if (HasShareds) {
7317 Value *Shareds = StaleCI->getArgOperand(1);
7318 Align Alignment = TaskData->getPointerAlignment(M.getDataLayout());
7319 Value *TaskShareds = Builder.CreateLoad(VoidPtr, TaskData);
7320 Builder.CreateMemCpy(TaskShareds, Alignment, Shareds, Alignment,
7321 SharedsSize);
7322 }
7323
7324 Value *DepArray = emitTaskDependencies(*this, Dependencies);
7325
7326 // ---------------------------------------------------------------
7327 // V5.2 13.8 target construct
7328 // If the nowait clause is present, execution of the target task
7329 // may be deferred. If the nowait clause is not present, the target task is
7330 // an included task.
7331 // ---------------------------------------------------------------
7332 // The above means that the lack of a nowait on the target construct
7333 // translates to '#pragma omp task if(0)'
7334 if (!HasNoWait) {
7335 if (DepArray) {
7336 Function *TaskWaitFn =
7337 getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_omp_wait_deps);
7338 Builder.CreateCall(
7339 TaskWaitFn,
7340 {/*loc_ref=*/Ident, /*gtid=*/ThreadID,
7341 /*ndeps=*/Builder.getInt32(Dependencies.size()),
7342 /*dep_list=*/DepArray,
7343 /*ndeps_noalias=*/ConstantInt::get(Builder.getInt32Ty(), 0),
7344 /*noalias_dep_list=*/
7345 ConstantPointerNull::get(PointerType::getUnqual(M.getContext()))});
7346 }
7347 // Included task.
7348 Function *TaskBeginFn =
7349 getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_omp_task_begin_if0);
7350 Function *TaskCompleteFn =
7351 getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_omp_task_complete_if0);
7352 Builder.CreateCall(TaskBeginFn, {Ident, ThreadID, TaskData});
7353 CallInst *CI = Builder.CreateCall(ProxyFn, {ThreadID, TaskData});
7354 CI->setDebugLoc(StaleCI->getDebugLoc());
7355 Builder.CreateCall(TaskCompleteFn, {Ident, ThreadID, TaskData});
7356 } else if (DepArray) {
7357 // HasNoWait - meaning the task may be deferred. Call
7358 // __kmpc_omp_task_with_deps if there are dependencies,
7359 // else call __kmpc_omp_task
7360 Function *TaskFn =
7361 getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_omp_task_with_deps);
7362 Builder.CreateCall(
7363 TaskFn,
7364 {Ident, ThreadID, TaskData, Builder.getInt32(Dependencies.size()),
7365 DepArray, ConstantInt::get(Builder.getInt32Ty(), 0),
7366 ConstantPointerNull::get(PointerType::getUnqual(M.getContext()))});
7367 } else {
7368 // Emit the @__kmpc_omp_task runtime call to spawn the task
7369 Function *TaskFn = getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_omp_task);
7370 Builder.CreateCall(TaskFn, {Ident, ThreadID, TaskData});
7371 }
7372
7373 StaleCI->eraseFromParent();
7374 for (Instruction *I : llvm::reverse(ToBeDeleted))
7375 I->eraseFromParent();
7376 };
7377 addOutlineInfo(std::move(OI));
7378
7379 LLVM_DEBUG(dbgs() << "Insert block after emitKernelLaunch = \n"
7380 << *(Builder.GetInsertBlock()) << "\n");
7381 LLVM_DEBUG(dbgs() << "Module after emitKernelLaunch = \n"
7382 << *(Builder.GetInsertBlock()->getParent()->getParent())
7383 << "\n");
7384 return Builder.saveIP();
7385}
7386
7387 void OpenMPIRBuilder::emitOffloadingArraysAndArgs(
7388 InsertPointTy AllocaIP, InsertPointTy CodeGenIP, TargetDataInfo &Info,
7389 TargetDataRTArgs &RTArgs, MapInfosTy &CombinedInfo, bool IsNonContiguous,
7390 bool ForEndCall, function_ref<void(unsigned int, Value *)> DeviceAddrCB,
7391 function_ref<Value *(unsigned int)> CustomMapperCB) {
7392 emitOffloadingArrays(AllocaIP, CodeGenIP, CombinedInfo, Info, IsNonContiguous,
7393 DeviceAddrCB, CustomMapperCB);
7394 emitOffloadingArraysArgument(Builder, RTArgs, Info, ForEndCall);
7395}
7396
7397static void
7398 emitTargetCall(OpenMPIRBuilder &OMPBuilder, IRBuilderBase &Builder,
7399 OpenMPIRBuilder::InsertPointTy AllocaIP,
7400 const OpenMPIRBuilder::TargetKernelDefaultAttrs &DefaultAttrs,
7401 const OpenMPIRBuilder::TargetKernelRuntimeAttrs &RuntimeAttrs,
7402 Value *IfCond, Function *OutlinedFn, Constant *OutlinedFnID,
7403 SmallVectorImpl<Value *> &Args,
7404 OpenMPIRBuilder::GenMapInfoCallbackTy GenMapInfoCB,
7405 SmallVector<llvm::OpenMPIRBuilder::DependData> Dependencies = {},
7406 bool HasNoWait = false) {
7407 // Generate a function call to the host fallback implementation of the target
7408 // region. This is called by the host when no offload entry was generated for
7409 // the target region and when the offloading call fails at runtime.
7410 auto &&EmitTargetCallFallbackCB = [&](OpenMPIRBuilder::InsertPointTy IP)
7411 -> OpenMPIRBuilder::InsertPointOrErrorTy {
7412 Builder.restoreIP(IP);
7413 Builder.CreateCall(OutlinedFn, Args);
7414 return Builder.saveIP();
7415 };
7416
7417 bool HasDependencies = Dependencies.size() > 0;
7418 bool RequiresOuterTargetTask = HasNoWait || HasDependencies;
7419
7420 OpenMPIRBuilder::TargetKernelArgs KArgs;
7421
7422 auto TaskBodyCB =
7423 [&](Value *DeviceID, Value *RTLoc,
7424 IRBuilderBase::InsertPoint TargetTaskAllocaIP) -> Error {
7425 // Assume no error was returned because EmitTargetCallFallbackCB doesn't
7426 // produce any.
7427 OpenMPIRBuilder::InsertPointTy AfterIP = cantFail([&]() {
7428 // emitKernelLaunch makes the necessary runtime call to offload the
7429 // kernel. We then outline all that code into a separate function
7430 // ('kernel_launch_function' in the pseudo code above). This function is
7431 // then called by the target task proxy function (see
7432 // '@.omp_target_task_proxy_func' in the pseudo code above)
7433 // "@.omp_target_task_proxy_func' is generated by
7434 // emitTargetTaskProxyFunction.
7435 if (OutlinedFnID)
7436 return OMPBuilder.emitKernelLaunch(Builder, OutlinedFnID,
7437 EmitTargetCallFallbackCB, KArgs,
7438 DeviceID, RTLoc, TargetTaskAllocaIP);
7439 // When OutlinedFnID is set to nullptr, then it's not an offloading call.
7440 // In this case, we execute the host implementation directly.
7441 return EmitTargetCallFallbackCB(OMPBuilder.Builder.saveIP());
7442 }());
7443
7444 OMPBuilder.Builder.restoreIP(AfterIP);
7445 return Error::success();
7446 };
7447
7448 auto &&EmitTargetCallElse =
7449 [&](OpenMPIRBuilder::InsertPointTy AllocaIP,
7450 OpenMPIRBuilder::InsertPointTy CodeGenIP) -> Error {
7451 // Assume no error was returned because EmitTargetCallFallbackCB doesn't
7452 // produce any.
7453 OpenMPIRBuilder::InsertPointTy AfterIP = cantFail([&]() {
7454 if (RequiresOuterTargetTask) {
7455 // Arguments that are intended to be directly forwarded to an
7456 // emitKernelLaunch call are passed as nullptr, since
7457 // OutlinedFnID=nullptr results in that call not being done.
7458 return OMPBuilder.emitTargetTask(TaskBodyCB, /*DeviceID=*/nullptr,
7459 /*RTLoc=*/nullptr, AllocaIP,
7460 Dependencies, HasNoWait);
7461 }
7462 return EmitTargetCallFallbackCB(Builder.saveIP());
7463 }());
7464
7465 Builder.restoreIP(AfterIP);
7466 return Error::success();
7467 };
7468
7469 auto &&EmitTargetCallThen =
7470 [&](OpenMPIRBuilder::InsertPointTy AllocaIP,
7471 OpenMPIRBuilder::InsertPointTy CodeGenIP) -> Error {
7472 OpenMPIRBuilder::TargetDataInfo Info(
7473 /*RequiresDevicePointerInfo=*/false,
7474 /*SeparateBeginEndCalls=*/true);
7475
7476 OpenMPIRBuilder::MapInfosTy &MapInfo = GenMapInfoCB(Builder.saveIP());
7477 OpenMPIRBuilder::TargetDataRTArgs RTArgs;
7478 OMPBuilder.emitOffloadingArraysAndArgs(AllocaIP, Builder.saveIP(), Info,
7479 RTArgs, MapInfo,
7480 /*IsNonContiguous=*/true,
7481 /*ForEndCall=*/false);
7482
7483 SmallVector<Value *, 3> NumTeamsC;
7484 for (auto [DefaultVal, RuntimeVal] :
7485 zip_equal(DefaultAttrs.MaxTeams, RuntimeAttrs.MaxTeams))
7486 NumTeamsC.push_back(RuntimeVal ? RuntimeVal
7487 : Builder.getInt32(DefaultVal));
7488
7489 // Calculate number of threads: 0 if no clauses specified, otherwise it is
7490 // the minimum between optional THREAD_LIMIT and NUM_THREADS clauses.
7491 auto InitMaxThreadsClause = [&Builder](Value *Clause) {
7492 if (Clause)
7493 Clause = Builder.CreateIntCast(Clause, Builder.getInt32Ty(),
7494 /*isSigned=*/false);
7495 return Clause;
7496 };
7497 auto CombineMaxThreadsClauses = [&Builder](Value *Clause, Value *&Result) {
7498 if (Clause)
7499 Result =
7500 Result ? Builder.CreateSelect(Builder.CreateICmpULT(Result, Clause),
7501 Result, Clause)
7502 : Clause;
7503 };
7504
7505 // If a multi-dimensional THREAD_LIMIT is set, it is the OMPX_BARE case, so
7506 // the NUM_THREADS clause is overridden by THREAD_LIMIT.
7507 SmallVector<Value *, 3> NumThreadsC;
7508 Value *MaxThreadsClause =
7509 RuntimeAttrs.TeamsThreadLimit.size() == 1
7510 ? InitMaxThreadsClause(RuntimeAttrs.MaxThreads)
7511 : nullptr;
7512
7513 for (auto [TeamsVal, TargetVal] : zip_equal(
7514 RuntimeAttrs.TeamsThreadLimit, RuntimeAttrs.TargetThreadLimit)) {
7515 Value *TeamsThreadLimitClause = InitMaxThreadsClause(TeamsVal);
7516 Value *NumThreads = InitMaxThreadsClause(TargetVal);
7517
7518 CombineMaxThreadsClauses(TeamsThreadLimitClause, NumThreads);
7519 CombineMaxThreadsClauses(MaxThreadsClause, NumThreads);
7520
7521 NumThreadsC.push_back(NumThreads ? NumThreads : Builder.getInt32(0));
7522 }
7523
7524 unsigned NumTargetItems = Info.NumberOfPtrs;
7525 // TODO: Use correct device ID
7526 Value *DeviceID = Builder.getInt64(OMP_DEVICEID_UNDEF);
7527 uint32_t SrcLocStrSize;
7528 Constant *SrcLocStr = OMPBuilder.getOrCreateDefaultSrcLocStr(SrcLocStrSize);
7529 Value *RTLoc = OMPBuilder.getOrCreateIdent(SrcLocStr, SrcLocStrSize,
7530 llvm::omp::IdentFlag(0), 0);
7531
7532 Value *TripCount = RuntimeAttrs.LoopTripCount
7533 ? Builder.CreateIntCast(RuntimeAttrs.LoopTripCount,
7534 Builder.getInt64Ty(),
7535 /*isSigned=*/false)
7536 : Builder.getInt64(0);
7537
7538 // TODO: Use correct DynCGGroupMem
7539 Value *DynCGGroupMem = Builder.getInt32(0);
7540
7541 KArgs = OpenMPIRBuilder::TargetKernelArgs(NumTargetItems, RTArgs, TripCount,
7542 NumTeamsC, NumThreadsC,
7543 DynCGGroupMem, HasNoWait);
7544
7545 // Assume no error was returned because TaskBodyCB and
7546 // EmitTargetCallFallbackCB don't produce any.
7547 OpenMPIRBuilder::InsertPointTy AfterIP = cantFail([&]() {
7548 // The presence of certain clauses on the target directive requires the
7549 // explicit generation of the target task.
7550 if (RequiresOuterTargetTask)
7551 return OMPBuilder.emitTargetTask(TaskBodyCB, DeviceID, RTLoc, AllocaIP,
7552 Dependencies, HasNoWait);
7553
7554 return OMPBuilder.emitKernelLaunch(Builder, OutlinedFnID,
7555 EmitTargetCallFallbackCB, KArgs,
7556 DeviceID, RTLoc, AllocaIP);
7557 }());
7558
7559 Builder.restoreIP(AfterIP);
7560 return Error::success();
7561 };
7562
7563 // If we don't have an ID for the target region, it means an offload entry
7564 // wasn't created. In this case we just run the host fallback directly and
7565 // ignore any potential 'if' clauses.
7566 if (!OutlinedFnID) {
7567 cantFail(EmitTargetCallElse(AllocaIP, Builder.saveIP()));
7568 return;
7569 }
7570
7571 // If there's no 'if' clause, only generate the kernel launch code path.
7572 if (!IfCond) {
7573 cantFail(EmitTargetCallThen(AllocaIP, Builder.saveIP()));
7574 return;
7575 }
7576
7577 cantFail(OMPBuilder.emitIfClause(IfCond, EmitTargetCallThen,
7578 EmitTargetCallElse, AllocaIP));
7579}
7580
7581 OpenMPIRBuilder::InsertPointOrErrorTy OpenMPIRBuilder::createTarget(
7582 const LocationDescription &Loc, bool IsOffloadEntry, InsertPointTy AllocaIP,
7583 InsertPointTy CodeGenIP, TargetRegionEntryInfo &EntryInfo,
7584 const TargetKernelDefaultAttrs &DefaultAttrs,
7585 const TargetKernelRuntimeAttrs &RuntimeAttrs, Value *IfCond,
7586 SmallVectorImpl<Value *> &Args, GenMapInfoCallbackTy GenMapInfoCB,
7587 OpenMPIRBuilder::TargetBodyGenCallbackTy CBFunc,
7588 OpenMPIRBuilder::TargetGenArgAccessorsCallbackTy ArgAccessorFuncCB,
7589 SmallVector<DependData> Dependencies, bool HasNowait) {
7590
7591 if (!updateToLocation(Loc))
7592 return InsertPointTy();
7593
7594 Builder.restoreIP(CodeGenIP);
7595
7596 Function *OutlinedFn;
7597 Constant *OutlinedFnID = nullptr;
7598 // The target region is outlined into its own function. The LLVM IR for
7599 // the target region itself is generated using the callbacks CBFunc
7600 // and ArgAccessorFuncCB.
7601 if (Error Err = emitTargetOutlinedFunction(
7602 *this, Builder, IsOffloadEntry, EntryInfo, DefaultAttrs, OutlinedFn,
7603 OutlinedFnID, Args, CBFunc, ArgAccessorFuncCB))
7604 return Err;
7605
7606 // If we are not on the target device, then we need to generate code
7607 // to make a remote call (offload) to the previously outlined function
7608 // that represents the target region. Do that now.
7609 if (!Config.isTargetDevice())
7610 emitTargetCall(*this, Builder, AllocaIP, DefaultAttrs, RuntimeAttrs, IfCond,
7611 OutlinedFn, OutlinedFnID, Args, GenMapInfoCB, Dependencies,
7612 HasNowait);
7613 return Builder.saveIP();
7614}
7615
7616std::string OpenMPIRBuilder::getNameWithSeparators(ArrayRef<StringRef> Parts,
7617 StringRef FirstSeparator,
7618 StringRef Separator) {
7619 SmallString<128> Buffer;
7620 llvm::raw_svector_ostream OS(Buffer);
7621 StringRef Sep = FirstSeparator;
7622 for (StringRef Part : Parts) {
7623 OS << Sep << Part;
7624 Sep = Separator;
7625 }
7626 return OS.str().str();
7627}
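// Example (illustrative): getNameWithSeparators({"x", "y"}, "$", ".")
// produces "$x.y"; the first separator is emitted before the first part and
// the regular separator between subsequent parts.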
7628
7629std::string
7630 OpenMPIRBuilder::createPlatformSpecificName(ArrayRef<StringRef> Parts) const {
7631 return OpenMPIRBuilder::getNameWithSeparators(Parts, Config.firstSeparator(),
7632 Config.separator());
7633}
7634
7635 GlobalVariable *
7636 OpenMPIRBuilder::getOrCreateInternalVariable(Type *Ty, const StringRef &Name,
7637 unsigned AddressSpace) {
7638 auto &Elem = *InternalVars.try_emplace(Name, nullptr).first;
7639 if (Elem.second) {
7640 assert(Elem.second->getValueType() == Ty &&
7641 "OMP internal variable has different type than requested");
7642 } else {
7643 // TODO: investigate the appropriate linkage type used for the global
7644 // variable for possibly changing that to internal or private, or maybe
7645 // create different versions of the function for different OMP internal
7646 // variables.
7647 auto Linkage = this->M.getTargetTriple().rfind("wasm32") == 0
7648 ? GlobalValue::ExternalLinkage
7649 : GlobalValue::CommonLinkage;
7650 auto *GV = new GlobalVariable(M, Ty, /*IsConstant=*/false, Linkage,
7651 Constant::getNullValue(Ty), Elem.first(),
7652 /*InsertBefore=*/nullptr,
7653 GlobalValue::NotThreadLocal, AddressSpace);
7654 const DataLayout &DL = M.getDataLayout();
7655 const llvm::Align TypeAlign = DL.getABITypeAlign(Ty);
7656 const llvm::Align PtrAlign = DL.getPointerABIAlignment(AddressSpace);
7657 GV->setAlignment(std::max(TypeAlign, PtrAlign));
7658 Elem.second = GV;
7659 }
7660
7661 return Elem.second;
7662}
7663
7664Value *OpenMPIRBuilder::getOMPCriticalRegionLock(StringRef CriticalName) {
7665 std::string Prefix = Twine("gomp_critical_user_", CriticalName).str();
7666 std::string Name = getNameWithSeparators({Prefix, "var"}, ".", ".");
7667 return getOrCreateInternalVariable(KmpCriticalNameTy, Name);
7668}
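// For example, '#pragma omp critical(foo)' yields the internal variable
// ".gomp_critical_user_foo.var" of type KmpCriticalNameTy, matching the
// lock-naming convention used by the OpenMP runtime.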
7669
7670 Value *OpenMPIRBuilder::getSizeInBytes(Value *BasePtr) {
7671 LLVMContext &Ctx = M.getContext();
7672 Value *Null =
7673 Constant::getNullValue(PointerType::getUnqual(BasePtr->getContext()));
7674 Value *SizeGep =
7675 Builder.CreateGEP(BasePtr->getType(), Null, Builder.getInt32(1));
7676 Value *SizePtrToInt = Builder.CreatePtrToInt(SizeGep, Type::getInt64Ty(Ctx));
7677 return SizePtrToInt;
7678}
7679
7680 GlobalVariable *
7681 OpenMPIRBuilder::createOffloadMaptypes(SmallVectorImpl<uint64_t> &Mappings,
7682 std::string VarName) {
7683 llvm::Constant *MaptypesArrayInit =
7684 llvm::ConstantDataArray::get(M.getContext(), Mappings);
7685 auto *MaptypesArrayGlobal = new llvm::GlobalVariable(
7686 M, MaptypesArrayInit->getType(),
7687 /*isConstant=*/true, llvm::GlobalValue::PrivateLinkage, MaptypesArrayInit,
7688 VarName);
7689 MaptypesArrayGlobal->setUnnamedAddr(llvm::GlobalValue::UnnamedAddr::Global);
7690 return MaptypesArrayGlobal;
7691}
7692
7693 void OpenMPIRBuilder::createMapperAllocas(const LocationDescription &Loc,
7694 InsertPointTy AllocaIP,
7695 unsigned NumOperands,
7696 struct MapperAllocas &MapperAllocas) {
7697 if (!updateToLocation(Loc))
7698 return;
7699
7700 auto *ArrI8PtrTy = ArrayType::get(Int8Ptr, NumOperands);
7701 auto *ArrI64Ty = ArrayType::get(Int64, NumOperands);
7702 Builder.restoreIP(AllocaIP);
7703 AllocaInst *ArgsBase = Builder.CreateAlloca(
7704 ArrI8PtrTy, /* ArraySize = */ nullptr, ".offload_baseptrs");
7705 AllocaInst *Args = Builder.CreateAlloca(ArrI8PtrTy, /* ArraySize = */ nullptr,
7706 ".offload_ptrs");
7707 AllocaInst *ArgSizes = Builder.CreateAlloca(
7708 ArrI64Ty, /* ArraySize = */ nullptr, ".offload_sizes");
7709 Builder.restoreIP(Loc.IP);
7710 MapperAllocas.ArgsBase = ArgsBase;
7711 MapperAllocas.Args = Args;
7712 MapperAllocas.ArgSizes = ArgSizes;
7713}
7714
7715 void OpenMPIRBuilder::emitMapperCall(const LocationDescription &Loc,
7716 Function *MapperFunc, Value *SrcLocInfo,
7717 Value *MaptypesArg, Value *MapnamesArg,
7718 struct MapperAllocas &MapperAllocas,
7719 int64_t DeviceID, unsigned NumOperands) {
7720 if (!updateToLocation(Loc))
7721 return;
7722
7723 auto *ArrI8PtrTy = ArrayType::get(Int8Ptr, NumOperands);
7724 auto *ArrI64Ty = ArrayType::get(Int64, NumOperands);
7725 Value *ArgsBaseGEP =
7726 Builder.CreateInBoundsGEP(ArrI8PtrTy, MapperAllocas.ArgsBase,
7727 {Builder.getInt32(0), Builder.getInt32(0)});
7728 Value *ArgsGEP =
7729 Builder.CreateInBoundsGEP(ArrI8PtrTy, MapperAllocas.Args,
7730 {Builder.getInt32(0), Builder.getInt32(0)});
7731 Value *ArgSizesGEP =
7732 Builder.CreateInBoundsGEP(ArrI64Ty, MapperAllocas.ArgSizes,
7733 {Builder.getInt32(0), Builder.getInt32(0)});
7734 Value *NullPtr =
7735 Constant::getNullValue(PointerType::getUnqual(Int8Ptr->getContext()));
7736 Builder.CreateCall(MapperFunc,
7737 {SrcLocInfo, Builder.getInt64(DeviceID),
7738 Builder.getInt32(NumOperands), ArgsBaseGEP, ArgsGEP,
7739 ArgSizesGEP, MaptypesArg, MapnamesArg, NullPtr});
7740}
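// For illustration, with MapperFunc = __tgt_target_data_begin_mapper the
// emitted call has roughly this shape (a sketch; value names are invented):
//   call void @__tgt_target_data_begin_mapper(ptr %srcloc, i64 %device_id,
//       i32 %num_args, ptr %.offload_baseptrs, ptr %.offload_ptrs,
//       ptr %.offload_sizes, ptr %maptypes, ptr %mapnames, ptr null)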
7741
7742 void OpenMPIRBuilder::emitOffloadingArraysArgument(IRBuilderBase &Builder,
7743 TargetDataRTArgs &RTArgs,
7744 TargetDataInfo &Info,
7745 bool ForEndCall) {
7746 assert((!ForEndCall || Info.separateBeginEndCalls()) &&
7747 "expected region end call to runtime only when end call is separate");
7748 auto UnqualPtrTy = PointerType::getUnqual(M.getContext());
7749 auto VoidPtrTy = UnqualPtrTy;
7750 auto VoidPtrPtrTy = UnqualPtrTy;
7751 auto Int64Ty = Type::getInt64Ty(M.getContext());
7752 auto Int64PtrTy = UnqualPtrTy;
7753
7754 if (!Info.NumberOfPtrs) {
7755 RTArgs.BasePointersArray = ConstantPointerNull::get(VoidPtrPtrTy);
7756 RTArgs.PointersArray = ConstantPointerNull::get(VoidPtrPtrTy);
7757 RTArgs.SizesArray = ConstantPointerNull::get(Int64PtrTy);
7758 RTArgs.MapTypesArray = ConstantPointerNull::get(Int64PtrTy);
7759 RTArgs.MapNamesArray = ConstantPointerNull::get(VoidPtrPtrTy);
7760 RTArgs.MappersArray = ConstantPointerNull::get(VoidPtrPtrTy);
7761 return;
7762 }
7763
7764 RTArgs.BasePointersArray = Builder.CreateConstInBoundsGEP2_32(
7765 ArrayType::get(VoidPtrTy, Info.NumberOfPtrs),
7766 Info.RTArgs.BasePointersArray,
7767 /*Idx0=*/0, /*Idx1=*/0);
7768 RTArgs.PointersArray = Builder.CreateConstInBoundsGEP2_32(
7769 ArrayType::get(VoidPtrTy, Info.NumberOfPtrs), Info.RTArgs.PointersArray,
7770 /*Idx0=*/0,
7771 /*Idx1=*/0);
7772 RTArgs.SizesArray = Builder.CreateConstInBoundsGEP2_32(
7773 ArrayType::get(Int64Ty, Info.NumberOfPtrs), Info.RTArgs.SizesArray,
7774 /*Idx0=*/0, /*Idx1=*/0);
7775 RTArgs.MapTypesArray = Builder.CreateConstInBoundsGEP2_32(
7776 ArrayType::get(Int64Ty, Info.NumberOfPtrs),
7777 ForEndCall && Info.RTArgs.MapTypesArrayEnd ? Info.RTArgs.MapTypesArrayEnd
7778 : Info.RTArgs.MapTypesArray,
7779 /*Idx0=*/0,
7780 /*Idx1=*/0);
7781
7782 // Only emit the mapper information arrays if debug information is
7783 // requested.
7784 if (!Info.EmitDebug)
7785 RTArgs.MapNamesArray = ConstantPointerNull::get(VoidPtrPtrTy);
7786 else
7787 RTArgs.MapNamesArray = Builder.CreateConstInBoundsGEP2_32(
7788 ArrayType::get(VoidPtrTy, Info.NumberOfPtrs), Info.RTArgs.MapNamesArray,
7789 /*Idx0=*/0,
7790 /*Idx1=*/0);
7791 // If there is no user-defined mapper, set the mapper array to nullptr to
7792 // avoid an unnecessary data privatization
7793 if (!Info.HasMapper)
7794 RTArgs.MappersArray = ConstantPointerNull::get(VoidPtrPtrTy);
7795 else
7796 RTArgs.MappersArray =
7797 Builder.CreatePointerCast(Info.RTArgs.MappersArray, VoidPtrPtrTy);
7798}
7799
7800 void OpenMPIRBuilder::emitNonContiguousDescriptor(InsertPointTy AllocaIP,
7801 InsertPointTy CodeGenIP,
7802 MapInfosTy &CombinedInfo,
7803 TargetDataInfo &Info) {
7804 MapInfosTy::StructNonContiguousInfo &NonContigInfo =
7805 CombinedInfo.NonContigInfo;
7806
7807 // Build an array of struct descriptor_dim and then assign it to
7808 // offload_args.
7809 //
7810 // struct descriptor_dim {
7811 // uint64_t offset;
7812 // uint64_t count;
7813 // uint64_t stride;
7814 // };
7815 Type *Int64Ty = Builder.getInt64Ty();
7816 StructType *DimTy = StructType::create(
7817 M.getContext(), ArrayRef<Type *>({Int64Ty, Int64Ty, Int64Ty}),
7818 "struct.descriptor_dim");
7819
7820 enum { OffsetFD = 0, CountFD, StrideFD };
7821 // We need two index variables here since the size of "Dims" is the same as
7822 // the size of Components; however, the sizes of offset, count, and stride
7823 // equal the number of non-contiguous base declarations.
7824 for (unsigned I = 0, L = 0, E = NonContigInfo.Dims.size(); I < E; ++I) {
7825 // Skip emitting IR if the dimension size is 1, since a one-element
7826 // dimension cannot be non-contiguous.
7827 if (NonContigInfo.Dims[I] == 1)
7828 continue;
7829 Builder.restoreIP(AllocaIP);
7830 ArrayType *ArrayTy = ArrayType::get(DimTy, NonContigInfo.Dims[I]);
7831 AllocaInst *DimsAddr =
7832 Builder.CreateAlloca(ArrayTy, /* ArraySize = */ nullptr, "dims");
7833 Builder.restoreIP(CodeGenIP);
7834 for (unsigned II = 0, EE = NonContigInfo.Dims[I]; II < EE; ++II) {
7835 unsigned RevIdx = EE - II - 1;
7836 Value *DimsLVal = Builder.CreateInBoundsGEP(
7837 DimsAddr->getAllocatedType(), DimsAddr,
7838 {Builder.getInt64(0), Builder.getInt64(II)});
7839 // Offset
7840 Value *OffsetLVal = Builder.CreateStructGEP(DimTy, DimsLVal, OffsetFD);
7841 Builder.CreateAlignedStore(
7842 NonContigInfo.Offsets[L][RevIdx], OffsetLVal,
7843 M.getDataLayout().getPrefTypeAlign(OffsetLVal->getType()));
7844 // Count
7845 Value *CountLVal = Builder.CreateStructGEP(DimTy, DimsLVal, CountFD);
7846 Builder.CreateAlignedStore(
7847 NonContigInfo.Counts[L][RevIdx], CountLVal,
7848 M.getDataLayout().getPrefTypeAlign(CountLVal->getType()));
7849 // Stride
7850 Value *StrideLVal = Builder.CreateStructGEP(DimTy, DimsLVal, StrideFD);
7851 Builder.CreateAlignedStore(
7852 NonContigInfo.Strides[L][RevIdx], StrideLVal,
7853 M.getDataLayout().getPrefTypeAlign(CountLVal->getType()));
7854 }
7855 // args[I] = &dims
7856 Builder.restoreIP(CodeGenIP);
7857 Value *DAddr = Builder.CreatePointerBitCastOrAddrSpaceCast(
7858 DimsAddr, Builder.getPtrTy());
7859 Value *P = Builder.CreateConstInBoundsGEP2_32(
7860 ArrayType::get(Builder.getPtrTy(), Info.NumberOfPtrs),
7861 Info.RTArgs.PointersArray, 0, I);
7862 Builder.CreateAlignedStore(
7863 DAddr, P, M.getDataLayout().getPrefTypeAlign(Builder.getPtrTy()));
7864 ++L;
7865 }
7866}
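// For example, a non-contiguous 3-D section produces an alloca of
// [3 x %struct.descriptor_dim] whose entries are filled in reverse dimension
// order (RevIdx above) and whose address overwrites the corresponding
// offload_ptrs slot. (Illustrative; the offsets, counts, and strides come
// from the map clause.)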
7867
7868void OpenMPIRBuilder::emitUDMapperArrayInitOrDel(
7869 Function *MapperFn, Value *MapperHandle, Value *Base, Value *Begin,
7870 Value *Size, Value *MapType, Value *MapName, TypeSize ElementSize,
7871 BasicBlock *ExitBB, bool IsInit) {
7872 StringRef Prefix = IsInit ? ".init" : ".del";
7873
7874 // Evaluate if this is an array section.
7875 BasicBlock *BodyBB = BasicBlock::Create(
7876 M.getContext(), createPlatformSpecificName({"omp.array", Prefix}));
7877 Value *IsArray =
7878 Builder.CreateICmpSGT(Size, Builder.getInt64(1), "omp.arrayinit.isarray");
7879 Value *DeleteBit = Builder.CreateAnd(
7880 MapType,
7881 Builder.getInt64(
7882 static_cast<std::underlying_type_t<OpenMPOffloadMappingFlags>>(
7883 OpenMPOffloadMappingFlags::OMP_MAP_DELETE)));
7884 Value *DeleteCond;
7885 Value *Cond;
7886 if (IsInit) {
7887 // base != begin?
7888 Value *BaseIsBegin = Builder.CreateICmpNE(Base, Begin);
7889 // IsPtrAndObj?
7890 Value *PtrAndObjBit = Builder.CreateAnd(
7891 MapType,
7892 Builder.getInt64(
7893 static_cast<std::underlying_type_t<OpenMPOffloadMappingFlags>>(
7894 OpenMPOffloadMappingFlags::OMP_MAP_PTR_AND_OBJ)));
7895 PtrAndObjBit = Builder.CreateIsNotNull(PtrAndObjBit);
7896 BaseIsBegin = Builder.CreateAnd(BaseIsBegin, PtrAndObjBit);
7897 Cond = Builder.CreateOr(IsArray, BaseIsBegin);
7898 DeleteCond = Builder.CreateIsNull(
7899 DeleteBit,
7900 createPlatformSpecificName({"omp.array", Prefix, ".delete"}));
7901 } else {
7902 Cond = IsArray;
7903 DeleteCond = Builder.CreateIsNotNull(
7904 DeleteBit,
7905 createPlatformSpecificName({"omp.array", Prefix, ".delete"}));
7906 }
7907 Cond = Builder.CreateAnd(Cond, DeleteCond);
7908 Builder.CreateCondBr(Cond, BodyBB, ExitBB);
7909
7910 emitBlock(BodyBB, MapperFn);
7911 // Get the array size by multiplying element size and element number (i.e., \p
7912 // Size).
7913 Value *ArraySize = Builder.CreateNUWMul(Size, Builder.getInt64(ElementSize));
7914 // Remove OMP_MAP_TO and OMP_MAP_FROM from the map type, so that it achieves
7915 // memory allocation/deletion purpose only.
7916 Value *MapTypeArg = Builder.CreateAnd(
7917 MapType,
7918 Builder.getInt64(
7919 ~static_cast<std::underlying_type_t<OpenMPOffloadMappingFlags>>(
7920 OpenMPOffloadMappingFlags::OMP_MAP_TO |
7921 OpenMPOffloadMappingFlags::OMP_MAP_FROM)));
7922 MapTypeArg = Builder.CreateOr(
7923 MapTypeArg,
7924 Builder.getInt64(
7925 static_cast<std::underlying_type_t<OpenMPOffloadMappingFlags>>(
7926 OpenMPOffloadMappingFlags::OMP_MAP_IMPLICIT)));
7927
7928 // Call the runtime API __tgt_push_mapper_component to fill up the runtime
7929 // data structure.
7930 Value *OffloadingArgs[] = {MapperHandle, Base, Begin,
7931 ArraySize, MapTypeArg, MapName};
7932 Builder.CreateCall(
7933 getOrCreateRuntimeFunction(M, OMPRTL___tgt_push_mapper_component),
7934 OffloadingArgs);
7935}
7936
7937 Expected<Function *> OpenMPIRBuilder::emitUserDefinedMapper(
7938 function_ref<MapInfosTy &(InsertPointTy CodeGenIP, llvm::Value *PtrPHI,
7939 llvm::Value *BeginArg)>
7940 GenMapInfoCB,
7941 Type *ElemTy, StringRef FuncName,
7942 function_ref<bool(unsigned int, Function **)> CustomMapperCB) {
7943 SmallVector<Type *> Params;
7944 Params.emplace_back(Builder.getPtrTy());
7945 Params.emplace_back(Builder.getPtrTy());
7946 Params.emplace_back(Builder.getPtrTy());
7947 Params.emplace_back(Builder.getInt64Ty());
7948 Params.emplace_back(Builder.getInt64Ty());
7949 Params.emplace_back(Builder.getPtrTy());
7950
7951 auto *FnTy =
7952 FunctionType::get(Builder.getVoidTy(), Params, /* IsVarArg */ false);
7953
7954 SmallString<64> TyStr;
7955 raw_svector_ostream Out(TyStr);
7956 Function *MapperFn =
7957 Function::Create(FnTy, GlobalValue::InternalLinkage, FuncName, M);
7958 MapperFn->addFnAttr(Attribute::NoInline);
7959 MapperFn->addFnAttr(Attribute::NoUnwind);
7960 MapperFn->addParamAttr(0, Attribute::NoUndef);
7961 MapperFn->addParamAttr(1, Attribute::NoUndef);
7962 MapperFn->addParamAttr(2, Attribute::NoUndef);
7963 MapperFn->addParamAttr(3, Attribute::NoUndef);
7964 MapperFn->addParamAttr(4, Attribute::NoUndef);
7965 MapperFn->addParamAttr(5, Attribute::NoUndef);
7966
7967 // Start the mapper function code generation.
7968 BasicBlock *EntryBB = BasicBlock::Create(M.getContext(), "entry", MapperFn);
7969 auto SavedIP = Builder.saveIP();
7970 Builder.SetInsertPoint(EntryBB);
7971
7972 Value *MapperHandle = MapperFn->getArg(0);
7973 Value *BaseIn = MapperFn->getArg(1);
7974 Value *BeginIn = MapperFn->getArg(2);
7975 Value *Size = MapperFn->getArg(3);
7976 Value *MapType = MapperFn->getArg(4);
7977 Value *MapName = MapperFn->getArg(5);
7978
7979 // Compute the starting and end addresses of array elements.
7980 // Prepare common arguments for array initiation and deletion.
7981 // Convert the size in bytes into the number of array elements.
7982 TypeSize ElementSize = M.getDataLayout().getTypeStoreSize(ElemTy);
7983 Size = Builder.CreateExactUDiv(Size, Builder.getInt64(ElementSize));
7984 Value *PtrBegin = Builder.CreateBitCast(BeginIn, Builder.getPtrTy());
7985 Value *PtrEnd = Builder.CreateGEP(ElemTy, PtrBegin, Size);
7986
7987 // Emit array initiation if this is an array section and \p MapType indicates
7988 // that memory allocation is required.
7989 BasicBlock *HeadBB = BasicBlock::Create(M.getContext(), "omp.arraymap.head");
7990 emitUDMapperArrayInitOrDel(MapperFn, MapperHandle, BaseIn, BeginIn, Size,
7991 MapType, MapName, ElementSize, HeadBB,
7992 /*IsInit=*/true);
7993
7994 // Emit a for loop to iterate through \p Size elements and map all of them.
7995
7996 // Emit the loop header block.
7997 emitBlock(HeadBB, MapperFn);
7998 BasicBlock *BodyBB = BasicBlock::Create(M.getContext(), "omp.arraymap.body");
7999 BasicBlock *DoneBB = BasicBlock::Create(M.getContext(), "omp.done");
8000 // Evaluate whether the initial condition is satisfied.
8001 Value *IsEmpty =
8002 Builder.CreateICmpEQ(PtrBegin, PtrEnd, "omp.arraymap.isempty");
8003 Builder.CreateCondBr(IsEmpty, DoneBB, BodyBB);
8004
8005 // Emit the loop body block.
8006 emitBlock(BodyBB, MapperFn);
8007 BasicBlock *LastBB = BodyBB;
8008 PHINode *PtrPHI =
8009 Builder.CreatePHI(PtrBegin->getType(), 2, "omp.arraymap.ptrcurrent");
8010 PtrPHI->addIncoming(PtrBegin, HeadBB);
8011
8012 // Get map clause information. Fill up the arrays with all mapped variables.
8013 MapInfosTy &Info = GenMapInfoCB(Builder.saveIP(), PtrPHI, BeginIn);
8014
8015 // Call the runtime API __tgt_mapper_num_components to get the number of
8016 // pre-existing components.
8017 Value *OffloadingArgs[] = {MapperHandle};
8018 Value *PreviousSize = Builder.CreateCall(
8019 getOrCreateRuntimeFunction(M, OMPRTL___tgt_mapper_num_components),
8020 OffloadingArgs);
8021 Value *ShiftedPreviousSize =
8022 Builder.CreateShl(PreviousSize, Builder.getInt64(getFlagMemberOffset()));
8023
8024 // Fill up the runtime mapper handle for all components.
8025 for (unsigned I = 0; I < Info.BasePointers.size(); ++I) {
8026 Value *CurBaseArg =
8027 Builder.CreateBitCast(Info.BasePointers[I], Builder.getPtrTy());
8028 Value *CurBeginArg =
8029 Builder.CreateBitCast(Info.Pointers[I], Builder.getPtrTy());
8030 Value *CurSizeArg = Info.Sizes[I];
8031 Value *CurNameArg = Info.Names.size()
8032 ? Info.Names[I]
8033 : Constant::getNullValue(Builder.getPtrTy());
8034
8035 // Extract the MEMBER_OF field from the map type.
8036 Value *OriMapType = Builder.getInt64(
8037 static_cast<std::underlying_type_t<OpenMPOffloadMappingFlags>>(
8038 Info.Types[I]));
8039 Value *MemberMapType =
8040 Builder.CreateNUWAdd(OriMapType, ShiftedPreviousSize);
8041
8042 // Combine the map type inherited from user-defined mapper with that
8043 // specified in the program. According to the OMP_MAP_TO and OMP_MAP_FROM
8044 // bits of the \a MapType, which is the input argument of the mapper
8045 // function, the following code will set the OMP_MAP_TO and OMP_MAP_FROM
8046 // bits of MemberMapType.
8047 // [OpenMP 5.0], 1.2.6. map-type decay.
8048 // | alloc | to | from | tofrom | release | delete
8049 // ----------------------------------------------------------
8050 // alloc | alloc | alloc | alloc | alloc | release | delete
8051 // to | alloc | to | alloc | to | release | delete
8052 // from | alloc | alloc | from | from | release | delete
8053 // tofrom | alloc | to | from | tofrom | release | delete
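// For example, per this table a member mapped 'to' by this mapper decays to
// 'to' when the mapper is invoked from a 'tofrom' construct, and to 'alloc'
// when invoked from an 'alloc' construct; the blocks below implement this by
// clearing the inapplicable TO/FROM bits.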
8054 Value *LeftToFrom = Builder.CreateAnd(
8055 MapType,
8056 Builder.getInt64(
8057 static_cast<std::underlying_type_t<OpenMPOffloadMappingFlags>>(
8058 OpenMPOffloadMappingFlags::OMP_MAP_TO |
8059 OpenMPOffloadMappingFlags::OMP_MAP_FROM)));
8060 BasicBlock *AllocBB = BasicBlock::Create(M.getContext(), "omp.type.alloc");
8061 BasicBlock *AllocElseBB =
8062 BasicBlock::Create(M.getContext(), "omp.type.alloc.else");
8063 BasicBlock *ToBB = BasicBlock::Create(M.getContext(), "omp.type.to");
8064 BasicBlock *ToElseBB =
8065 BasicBlock::Create(M.getContext(), "omp.type.to.else");
8066 BasicBlock *FromBB = BasicBlock::Create(M.getContext(), "omp.type.from");
8067 BasicBlock *EndBB = BasicBlock::Create(M.getContext(), "omp.type.end");
8068 Value *IsAlloc = Builder.CreateIsNull(LeftToFrom);
8069 Builder.CreateCondBr(IsAlloc, AllocBB, AllocElseBB);
8070 // In case of alloc, clear OMP_MAP_TO and OMP_MAP_FROM.
8071 emitBlock(AllocBB, MapperFn);
8072 Value *AllocMapType = Builder.CreateAnd(
8073 MemberMapType,
8074 Builder.getInt64(
8075 ~static_cast<std::underlying_type_t<OpenMPOffloadMappingFlags>>(
8076 OpenMPOffloadMappingFlags::OMP_MAP_TO |
8077 OpenMPOffloadMappingFlags::OMP_MAP_FROM)));
8078 Builder.CreateBr(EndBB);
8079 emitBlock(AllocElseBB, MapperFn);
8080 Value *IsTo = Builder.CreateICmpEQ(
8081 LeftToFrom,
8082 Builder.getInt64(
8083 static_cast<std::underlying_type_t<OpenMPOffloadMappingFlags>>(
8084 OpenMPOffloadMappingFlags::OMP_MAP_TO)));
8085 Builder.CreateCondBr(IsTo, ToBB, ToElseBB);
8086 // In case of to, clear OMP_MAP_FROM.
8087 emitBlock(ToBB, MapperFn);
8088 Value *ToMapType = Builder.CreateAnd(
8089 MemberMapType,
8090 Builder.getInt64(
8091 ~static_cast<std::underlying_type_t<OpenMPOffloadMappingFlags>>(
8092 OpenMPOffloadMappingFlags::OMP_MAP_FROM)));
8093 Builder.CreateBr(EndBB);
8094 emitBlock(ToElseBB, MapperFn);
8095 Value *IsFrom = Builder.CreateICmpEQ(
8096 LeftToFrom,
8097 Builder.getInt64(
8098 static_cast<std::underlying_type_t<OpenMPOffloadMappingFlags>>(
8099 OpenMPOffloadMappingFlags::OMP_MAP_FROM)));
8100 Builder.CreateCondBr(IsFrom, FromBB, EndBB);
8101 // In case of from, clear OMP_MAP_TO.
8102 emitBlock(FromBB, MapperFn);
8103 Value *FromMapType = Builder.CreateAnd(
8104 MemberMapType,
8105 Builder.getInt64(
8106 ~static_cast<std::underlying_type_t<OpenMPOffloadMappingFlags>>(
8107 OpenMPOffloadMappingFlags::OMP_MAP_TO)));
8108 // In case of tofrom, do nothing.
8109 emitBlock(EndBB, MapperFn);
8110 LastBB = EndBB;
8111 PHINode *CurMapType =
8112 Builder.CreatePHI(Builder.getInt64Ty(), 4, "omp.maptype");
8113 CurMapType->addIncoming(AllocMapType, AllocBB);
8114 CurMapType->addIncoming(ToMapType, ToBB);
8115 CurMapType->addIncoming(FromMapType, FromBB);
8116 CurMapType->addIncoming(MemberMapType, ToElseBB);
8117
8118 Value *OffloadingArgs[] = {MapperHandle, CurBaseArg, CurBeginArg,
8119 CurSizeArg, CurMapType, CurNameArg};
8120 Function *ChildMapperFn = nullptr;
8121 if (CustomMapperCB && CustomMapperCB(I, &ChildMapperFn)) {
8122 // Call the corresponding mapper function.
8123 Builder.CreateCall(ChildMapperFn, OffloadingArgs)->setDoesNotThrow();
8124 } else {
8125 // Call the runtime API __tgt_push_mapper_component to fill up the runtime
8126 // data structure.
8127 Builder.CreateCall(
8128 getOrCreateRuntimeFunction(M, OMPRTL___tgt_push_mapper_component),
8129 OffloadingArgs);
8130 }
8131 }
8132
8133 // Update the pointer to point to the next element that needs to be mapped,
8134 // and check whether we have mapped all elements.
8135 Value *PtrNext = Builder.CreateConstGEP1_32(ElemTy, PtrPHI, /*Idx0=*/1,
8136 "omp.arraymap.next");
8137 PtrPHI->addIncoming(PtrNext, LastBB);
8138 Value *IsDone = Builder.CreateICmpEQ(PtrNext, PtrEnd, "omp.arraymap.isdone");
8139 BasicBlock *ExitBB = BasicBlock::Create(M.getContext(), "omp.arraymap.exit");
8140 Builder.CreateCondBr(IsDone, ExitBB, BodyBB);
8141
8142 emitBlock(ExitBB, MapperFn);
8143 // Emit array deletion if this is an array section and \p MapType indicates
8144 // that deletion is required.
8145 emitUDMapperArrayInitOrDel(MapperFn, MapperHandle, BaseIn, BeginIn, Size,
8146 MapType, MapName, ElementSize, DoneBB,
8147 /*IsInit=*/false);
8148
8149 // Emit the function exit block.
8150 emitBlock(DoneBB, MapperFn, /*IsFinished=*/true);
8151
8153 Builder.restoreIP(SavedIP);
8154 return MapperFn;
8155}
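// The generated mapper function has roughly the following skeleton (a
// sketch, using the block names created above):
// ```
// define internal void @.omp_mapper(ptr %handle, ptr %base, ptr %begin,
//                                   i64 %size, i64 %type, ptr %name) {
// entry:              ; array-section allocation (init path)
// omp.arraymap.head:  ; empty-section check
// omp.arraymap.body:  ; per-element map-type decay and push/child-mapper call
// omp.arraymap.exit:  ; array-section deletion
// omp.done:           ; return
// }
// ```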
8156
8157 void OpenMPIRBuilder::emitOffloadingArrays(
8158 InsertPointTy AllocaIP, InsertPointTy CodeGenIP, MapInfosTy &CombinedInfo,
8159 TargetDataInfo &Info, bool IsNonContiguous,
8160 function_ref<void(unsigned int, Value *)> DeviceAddrCB,
8161 function_ref<Value *(unsigned int)> CustomMapperCB) {
8162
8163 // Reset the array information.
8164 Info.clearArrayInfo();
8165 Info.NumberOfPtrs = CombinedInfo.BasePointers.size();
8166
8167 if (Info.NumberOfPtrs == 0)
8168 return;
8169
8170 Builder.restoreIP(AllocaIP);
8171 // Detect whether any captured size requires runtime evaluation, so that a
8172 // constant array can be used wherever possible.
8173 ArrayType *PointerArrayType =
8174 ArrayType::get(Builder.getPtrTy(), Info.NumberOfPtrs);
8175
8176 Info.RTArgs.BasePointersArray = Builder.CreateAlloca(
8177 PointerArrayType, /* ArraySize = */ nullptr, ".offload_baseptrs");
8178
8179 Info.RTArgs.PointersArray = Builder.CreateAlloca(
8180 PointerArrayType, /* ArraySize = */ nullptr, ".offload_ptrs");
8181 AllocaInst *MappersArray = Builder.CreateAlloca(
8182 PointerArrayType, /* ArraySize = */ nullptr, ".offload_mappers");
8183 Info.RTArgs.MappersArray = MappersArray;
8184
8185 // If we don't have any VLA types or other types that require runtime
8186 // evaluation, we can use a constant array for the map sizes, otherwise we
8187 // need to fill up the arrays as we do for the pointers.
8188 Type *Int64Ty = Builder.getInt64Ty();
8189 SmallVector<Constant *> ConstSizes(CombinedInfo.Sizes.size(),
8190 ConstantInt::get(Int64Ty, 0));
8191 SmallBitVector RuntimeSizes(CombinedInfo.Sizes.size());
8192 for (unsigned I = 0, E = CombinedInfo.Sizes.size(); I < E; ++I) {
8193 if (auto *CI = dyn_cast<Constant>(CombinedInfo.Sizes[I])) {
8194 if (!isa<ConstantExpr>(CI) && !isa<GlobalValue>(CI)) {
8195 if (IsNonContiguous &&
8196 static_cast<std::underlying_type_t<OpenMPOffloadMappingFlags>>(
8197 CombinedInfo.Types[I] &
8198 OpenMPOffloadMappingFlags::OMP_MAP_NON_CONTIG))
8199 ConstSizes[I] =
8200 ConstantInt::get(Int64Ty, CombinedInfo.NonContigInfo.Dims[I]);
8201 else
8202 ConstSizes[I] = CI;
8203 continue;
8204 }
8205 }
8206 RuntimeSizes.set(I);
8207 }
8208
8209 if (RuntimeSizes.all()) {
8210 ArrayType *SizeArrayType = ArrayType::get(Int64Ty, Info.NumberOfPtrs);
8211 Info.RTArgs.SizesArray = Builder.CreateAlloca(
8212 SizeArrayType, /* ArraySize = */ nullptr, ".offload_sizes");
8213 Builder.restoreIP(CodeGenIP);
8214 } else {
8215 auto *SizesArrayInit = ConstantArray::get(
8216 ArrayType::get(Int64Ty, ConstSizes.size()), ConstSizes);
8217 std::string Name = createPlatformSpecificName({"offload_sizes"});
8218 auto *SizesArrayGbl =
8219 new GlobalVariable(M, SizesArrayInit->getType(), /*isConstant=*/true,
8220 GlobalValue::PrivateLinkage, SizesArrayInit, Name);
8221 SizesArrayGbl->setUnnamedAddr(GlobalValue::UnnamedAddr::Global);
8222
8223 if (!RuntimeSizes.any()) {
8224 Info.RTArgs.SizesArray = SizesArrayGbl;
8225 } else {
8226 unsigned IndexSize = M.getDataLayout().getIndexSizeInBits(0);
8227 Align OffloadSizeAlign = M.getDataLayout().getABIIntegerTypeAlignment(64);
8228 ArrayType *SizeArrayType = ArrayType::get(Int64Ty, Info.NumberOfPtrs);
8229 AllocaInst *Buffer = Builder.CreateAlloca(
8230 SizeArrayType, /* ArraySize = */ nullptr, ".offload_sizes");
8231 Buffer->setAlignment(OffloadSizeAlign);
8232 Builder.restoreIP(CodeGenIP);
8233 Builder.CreateMemCpy(
8234 Buffer, M.getDataLayout().getPrefTypeAlign(Buffer->getType()),
8235 SizesArrayGbl, OffloadSizeAlign,
8236 Builder.getIntN(
8237 IndexSize,
8238 Buffer->getAllocationSize(M.getDataLayout())->getFixedValue()));
8239
8240 Info.RTArgs.SizesArray = Buffer;
8241 }
8242 Builder.restoreIP(CodeGenIP);
8243 }
8244
8245 // The map types are always constant so we don't need to generate code to
8246 // fill arrays. Instead, we create an array constant.
8247 SmallVector<uint64_t, 4> Mapping;
8248 for (auto mapFlag : CombinedInfo.Types)
8249 Mapping.push_back(
8250 static_cast<std::underlying_type_t<OpenMPOffloadMappingFlags>>(
8251 mapFlag));
8252 std::string MaptypesName = createPlatformSpecificName({"offload_maptypes"});
8253 auto *MapTypesArrayGbl = createOffloadMaptypes(Mapping, MaptypesName);
8254 Info.RTArgs.MapTypesArray = MapTypesArrayGbl;
8255
8256 // The information types are only built if provided.
8257 if (!CombinedInfo.Names.empty()) {
8258 std::string MapnamesName = createPlatformSpecificName({"offload_mapnames"});
8259 auto *MapNamesArrayGbl =
8260 createOffloadMapnames(CombinedInfo.Names, MapnamesName);
8261 Info.RTArgs.MapNamesArray = MapNamesArrayGbl;
8262 Info.EmitDebug = true;
8263 } else {
8264 Info.RTArgs.MapNamesArray =
8265 Constant::getNullValue(PointerType::getUnqual(Builder.getContext()));
8266 Info.EmitDebug = false;
8267 }
8268
8269 // If there's a present map type modifier, it must not be applied to the end
8270 // of a region, so generate a separate map type array in that case.
8271 if (Info.separateBeginEndCalls()) {
8272 bool EndMapTypesDiffer = false;
8273 for (uint64_t &Type : Mapping) {
8274 if (Type & static_cast<std::underlying_type_t<OpenMPOffloadMappingFlags>>(
8275 OpenMPOffloadMappingFlags::OMP_MAP_PRESENT)) {
8276 Type &= ~static_cast<std::underlying_type_t<OpenMPOffloadMappingFlags>>(
8277 OpenMPOffloadMappingFlags::OMP_MAP_PRESENT);
8278 EndMapTypesDiffer = true;
8279 }
8280 }
8281 if (EndMapTypesDiffer) {
8282 MapTypesArrayGbl = createOffloadMaptypes(Mapping, MaptypesName);
8283 Info.RTArgs.MapTypesArrayEnd = MapTypesArrayGbl;
8284 }
8285 }
8286
8287 PointerType *PtrTy = Builder.getPtrTy();
8288 for (unsigned I = 0; I < Info.NumberOfPtrs; ++I) {
8289 Value *BPVal = CombinedInfo.BasePointers[I];
8290 Value *BP = Builder.CreateConstInBoundsGEP2_32(
8291 ArrayType::get(PtrTy, Info.NumberOfPtrs), Info.RTArgs.BasePointersArray,
8292 0, I);
8293 Builder.CreateAlignedStore(BPVal, BP,
8294 M.getDataLayout().getPrefTypeAlign(PtrTy));
8295
8296 if (Info.requiresDevicePointerInfo()) {
8297 if (CombinedInfo.DevicePointers[I] == DeviceInfoTy::Pointer) {
8298 CodeGenIP = Builder.saveIP();
8299 Builder.restoreIP(AllocaIP);
8300 Info.DevicePtrInfoMap[BPVal] = {BP, Builder.CreateAlloca(PtrTy)};
8301 Builder.restoreIP(CodeGenIP);
8302 if (DeviceAddrCB)
8303 DeviceAddrCB(I, Info.DevicePtrInfoMap[BPVal].second);
8304 } else if (CombinedInfo.DevicePointers[I] == DeviceInfoTy::Address) {
8305 Info.DevicePtrInfoMap[BPVal] = {BP, BP};
8306 if (DeviceAddrCB)
8307 DeviceAddrCB(I, BP);
8308 }
8309 }
8310
8311 Value *PVal = CombinedInfo.Pointers[I];
8312 Value *P = Builder.CreateConstInBoundsGEP2_32(
8313 ArrayType::get(PtrTy, Info.NumberOfPtrs), Info.RTArgs.PointersArray, 0,
8314 I);
8315 // TODO: Check that the alignment is correct.
8316 Builder.CreateAlignedStore(PVal, P,
8317 M.getDataLayout().getPrefTypeAlign(PtrTy));
8318
8319 if (RuntimeSizes.test(I)) {
8320 Value *S = Builder.CreateConstInBoundsGEP2_32(
8321 ArrayType::get(Int64Ty, Info.NumberOfPtrs), Info.RTArgs.SizesArray,
8322 /*Idx0=*/0,
8323 /*Idx1=*/I);
8324 Builder.CreateAlignedStore(Builder.CreateIntCast(CombinedInfo.Sizes[I],
8325 Int64Ty,
8326 /*isSigned=*/true),
8327 S, M.getDataLayout().getPrefTypeAlign(PtrTy));
8328 }
8329 // Fill up the mapper array.
8330 unsigned IndexSize = M.getDataLayout().getIndexSizeInBits(0);
8331 Value *MFunc = ConstantPointerNull::get(PtrTy);
8332 if (CustomMapperCB)
8333 if (Value *CustomMFunc = CustomMapperCB(I))
8334 MFunc = Builder.CreatePointerCast(CustomMFunc, PtrTy);
8335 Value *MAddr = Builder.CreateInBoundsGEP(
8336 MappersArray->getAllocatedType(), MappersArray,
8337 {Builder.getIntN(IndexSize, 0), Builder.getIntN(IndexSize, I)});
8338 Builder.CreateAlignedStore(
8339 MFunc, MAddr, M.getDataLayout().getPrefTypeAlign(MAddr->getType()));
8340 }
8341
8342 if (!IsNonContiguous || CombinedInfo.NonContigInfo.Offsets.empty() ||
8343 Info.NumberOfPtrs == 0)
8344 return;
8345 emitNonContiguousDescriptor(AllocaIP, CodeGenIP, CombinedInfo, Info);
8346}
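// For a region with N mapped values the net effect is (sketch):
//   %.offload_baseptrs = alloca [N x ptr]
//   %.offload_ptrs     = alloca [N x ptr]
//   %.offload_mappers  = alloca [N x ptr]
//   %.offload_sizes    = alloca [N x i64]   ; or a private constant global
// plus constant @.offload_maptypes (and @.offload_mapnames when debug
// information is requested) globals.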
8347
8348 void OpenMPIRBuilder::emitBranch(BasicBlock *Target) {
8349 BasicBlock *CurBB = Builder.GetInsertBlock();
8350
8351 if (!CurBB || CurBB->getTerminator()) {
8352 // If there is no insert point or the previous block is already
8353 // terminated, don't touch it.
8354 } else {
8355 // Otherwise, create a fall-through branch.
8356 Builder.CreateBr(Target);
8357 }
8358
8359 Builder.ClearInsertionPoint();
8360}
8361
8362 void OpenMPIRBuilder::emitBlock(BasicBlock *BB, Function *CurFn,
8363 bool IsFinished) {
8364 BasicBlock *CurBB = Builder.GetInsertBlock();
8365
8366 // Fall out of the current block (if necessary).
8367 emitBranch(BB);
8368
8369 if (IsFinished && BB->use_empty()) {
8370 BB->eraseFromParent();
8371 return;
8372 }
8373
8374 // Place the block after the current block, if possible, or else at
8375 // the end of the function.
8376 if (CurBB && CurBB->getParent())
8377 CurFn->insert(std::next(CurBB->getIterator()), BB);
8378 else
8379 CurFn->insert(CurFn->end(), BB);
8380 Builder.SetInsertPoint(BB);
8381}
8382
8383 Error OpenMPIRBuilder::emitIfClause(Value *Cond, BodyGenCallbackTy ThenGen,
8384 BodyGenCallbackTy ElseGen,
8385 InsertPointTy AllocaIP) {
8386 // If the condition constant folds and can be elided, try to avoid emitting
8387 // the condition and the dead arm of the if/else.
8388 if (auto *CI = dyn_cast<ConstantInt>(Cond)) {
8389 auto CondConstant = CI->getSExtValue();
8390 if (CondConstant)
8391 return ThenGen(AllocaIP, Builder.saveIP());
8392
8393 return ElseGen(AllocaIP, Builder.saveIP());
8394 }
8395
8396 Function *CurFn = Builder.GetInsertBlock()->getParent();
8397
8398 // Otherwise, the condition did not fold, or we couldn't elide it. Just
8399 // emit the conditional branch.
8400 BasicBlock *ThenBlock = BasicBlock::Create(M.getContext(), "omp_if.then");
8401 BasicBlock *ElseBlock = BasicBlock::Create(M.getContext(), "omp_if.else");
8402 BasicBlock *ContBlock = BasicBlock::Create(M.getContext(), "omp_if.end");
8403 Builder.CreateCondBr(Cond, ThenBlock, ElseBlock);
8404 // Emit the 'then' code.
8405 emitBlock(ThenBlock, CurFn);
8406 if (Error Err = ThenGen(AllocaIP, Builder.saveIP()))
8407 return Err;
8408 emitBranch(ContBlock);
8409 // Emit the 'else' code if present.
8410 // There is no need to emit line number for unconditional branch.
8411 emitBlock(ElseBlock, CurFn);
8412 if (Error Err = ElseGen(AllocaIP, Builder.saveIP()))
8413 return Err;
8414 // There is no need to emit line number for unconditional branch.
8415 emitBranch(ContBlock);
8416 // Emit the continuation block for code after the if.
8417 emitBlock(ContBlock, CurFn, /*IsFinished=*/true);
8418 return Error::success();
8419}
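// For a non-constant condition the emitted skeleton is (sketch):
//   br i1 %cond, label %omp_if.then, label %omp_if.else
// omp_if.then:   ; ThenGen output ... br label %omp_if.end
// omp_if.else:   ; ElseGen output ... br label %omp_if.end
// omp_if.end: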
8420
8421bool OpenMPIRBuilder::checkAndEmitFlushAfterAtomic(
8422 const LocationDescription &Loc, llvm::AtomicOrdering AO, AtomicKind AK) {
8423 assert(!(AO == AtomicOrdering::NotAtomic ||
8424 AO == AtomicOrdering::Unordered) &&
8425 "Unexpected Atomic Ordering.");
8426
8427 bool Flush = false;
8428 llvm::AtomicOrdering FlushAO = llvm::AtomicOrdering::Monotonic;
8429
8430 switch (AK) {
8431 case Read:
8432 if (AO == AtomicOrdering::Acquire || AO == AtomicOrdering::AcquireRelease ||
8433 AO == AtomicOrdering::SequentiallyConsistent) {
8434 FlushAO = AtomicOrdering::Acquire;
8435 Flush = true;
8436 }
8437 break;
8438 case Write:
8439 case Compare:
8440 case Update:
8441 if (AO == AtomicOrdering::Release || AO == AtomicOrdering::AcquireRelease ||
8442 AO == AtomicOrdering::SequentiallyConsistent) {
8443 FlushAO = AtomicOrdering::Release;
8444 Flush = true;
8445 }
8446 break;
8447 case Capture:
8448 switch (AO) {
8449 case AtomicOrdering::Acquire:
8450 FlushAO = AtomicOrdering::Acquire;
8451 Flush = true;
8452 break;
8453 case AtomicOrdering::Release:
8454 FlushAO = AtomicOrdering::Release;
8455 Flush = true;
8456 break;
8457 case AtomicOrdering::AcquireRelease:
8458 case AtomicOrdering::SequentiallyConsistent:
8459 FlushAO = AtomicOrdering::AcquireRelease;
8460 Flush = true;
8461 break;
8462 default:
8463 // do nothing - leave silently.
8464 break;
8465 }
8466 }
8467
8468 if (Flush) {
8469 // The Flush RT call does not yet take a memory ordering, so this resolves
8470 // which atomic ordering to use for when it does, but for now issues the
8471 // flush call without it.
8472 // TODO: pass `FlushAO` after memory ordering support is added
8473 (void)FlushAO;
8474 emitFlush(Loc);
8475 }
8476
8477 // For AO == AtomicOrdering::Monotonic and all other case combinations,
8478 // do nothing.
8479 return Flush;
8480}
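// For example, '#pragma omp atomic read seq_cst' reaches this function with
// AK == Read and a sequentially consistent AO, so a flush is emitted after
// the atomic, whereas a relaxed (monotonic) atomic emits no flush at all.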
8481
8482 OpenMPIRBuilder::InsertPointTy
8483 OpenMPIRBuilder::createAtomicRead(const LocationDescription &Loc,
8484 AtomicOpValue &X, AtomicOpValue &V,
8485 AtomicOrdering AO) {
8486 if (!updateToLocation(Loc))
8487 return Loc.IP;
8488
8489 assert(X.Var->getType()->isPointerTy() &&
8490 "OMP Atomic expects a pointer to target memory");
8491 Type *XElemTy = X.ElemTy;
8492 assert((XElemTy->isFloatingPointTy() || XElemTy->isIntegerTy() ||
8493 XElemTy->isPointerTy() || XElemTy->isStructTy()) &&
8494 "OMP atomic read expected a scalar type");
8495
8496 Value *XRead = nullptr;
8497
8498 if (XElemTy->isIntegerTy()) {
8499 LoadInst *XLD =
8500 Builder.CreateLoad(XElemTy, X.Var, X.IsVolatile, "omp.atomic.read");
8501 XLD->setAtomic(AO);
8502 XRead = cast<Value>(XLD);
8503 } else if (XElemTy->isStructTy()) {
8504 // FIXME: Add checks to ensure __atomic_load is emitted iff the
8505 // target does not support `atomicrmw` of the size of the struct
8506 LoadInst *OldVal = Builder.CreateLoad(XElemTy, X.Var, "omp.atomic.read");
8507 OldVal->setAtomic(AO);
8508 const DataLayout &LoadDL = OldVal->getModule()->getDataLayout();
8509 unsigned LoadSize =
8510 LoadDL.getTypeStoreSize(OldVal->getPointerOperand()->getType());
8511 OpenMPIRBuilder::AtomicInfo atomicInfo(
8512 &Builder, XElemTy, LoadSize * 8, LoadSize * 8, OldVal->getAlign(),
8513 OldVal->getAlign(), true /* UseLibcall */, X.Var);
8514 auto AtomicLoadRes = atomicInfo.EmitAtomicLoadLibcall(AO);
8515 XRead = AtomicLoadRes.first;
8516 OldVal->eraseFromParent();
8517 } else {
8518 // We need to perform atomic op as integer
8519 IntegerType *IntCastTy =
8520 IntegerType::get(M.getContext(), XElemTy->getScalarSizeInBits());
8521 LoadInst *XLoad =
8522 Builder.CreateLoad(IntCastTy, X.Var, X.IsVolatile, "omp.atomic.load");
8523 XLoad->setAtomic(AO);
8524 if (XElemTy->isFloatingPointTy()) {
8525 XRead = Builder.CreateBitCast(XLoad, XElemTy, "atomic.flt.cast");
8526 } else {
8527 XRead = Builder.CreateIntToPtr(XLoad, XElemTy, "atomic.ptr.cast");
8528 }
8529 }
8530 checkAndEmitFlushAfterAtomic(Loc, AO, AtomicKind::Read);
8531 if (XRead->getType() != V.Var->getType())
8532 XRead = emitImplicitCast(Builder, XRead, V.Var);
8533 Builder.CreateStore(XRead, V.Var, V.IsVolatile);
8534 return Builder.saveIP();
8535}
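// For illustration, an atomic read of a 'float' takes the integer path above
// and emits roughly (a sketch, assuming acquire ordering):
//   %x.load = load atomic i32, ptr %x acquire, align 4
//   %v.val  = bitcast i32 %x.load to float     ; "atomic.flt.cast"
//   store float %v.val, ptr %v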
8536
8537 OpenMPIRBuilder::InsertPointTy
8538 OpenMPIRBuilder::createAtomicWrite(const LocationDescription &Loc,
8539 AtomicOpValue &X, Value *Expr,
8540 AtomicOrdering AO) {
8541 if (!updateToLocation(Loc))
8542 return Loc.IP;
8543
8544 assert(X.Var->getType()->isPointerTy() &&
8545 "OMP Atomic expects a pointer to target memory");
8546 Type *XElemTy = X.ElemTy;
8547 assert((XElemTy->isFloatingPointTy() || XElemTy->isIntegerTy() ||
8548 XElemTy->isPointerTy()) &&
8549 "OMP atomic write expected a scalar type");
8550
8551 if (XElemTy->isIntegerTy()) {
8552 StoreInst *XSt = Builder.CreateStore(Expr, X.Var, X.IsVolatile);
8553 XSt->setAtomic(AO);
8554 } else {
8555 // We need to bitcast and perform atomic op as integers
8556 IntegerType *IntCastTy =
8557 IntegerType::get(M.getContext(), XElemTy->getScalarSizeInBits());
8558 Value *ExprCast =
8559 Builder.CreateBitCast(Expr, IntCastTy, "atomic.src.int.cast");
8560 StoreInst *XSt = Builder.CreateStore(ExprCast, X.Var, X.IsVolatile);
8561 XSt->setAtomic(AO);
8562 }
8563
8564 checkAndEmitFlushAfterAtomic(Loc, AO, AtomicKind::Write);
8565 return Builder.saveIP();
8566}
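// The write side mirrors the read: a non-integer payload is first bitcast to
// an integer of equal width ("atomic.src.int.cast"), roughly (sketch):
//   %expr.int = bitcast float %expr to i32
//   store atomic i32 %expr.int, ptr %x release, align 4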
8567
8568 OpenMPIRBuilder::InsertPointOrErrorTy OpenMPIRBuilder::createAtomicUpdate(
8569 const LocationDescription &Loc, InsertPointTy AllocaIP, AtomicOpValue &X,
8570 Value *Expr, AtomicOrdering AO, AtomicRMWInst::BinOp RMWOp,
8571 AtomicUpdateCallbackTy &UpdateOp, bool IsXBinopExpr) {
8572 assert(!isConflictIP(Loc.IP, AllocaIP) && "IPs must not be ambiguous");
8573 if (!updateToLocation(Loc))
8574 return Loc.IP;
8575
8576 LLVM_DEBUG({
8577 Type *XTy = X.Var->getType();
8578 assert(XTy->isPointerTy() &&
8579 "OMP Atomic expects a pointer to target memory");
8580 Type *XElemTy = X.ElemTy;
8581 assert((XElemTy->isFloatingPointTy() || XElemTy->isIntegerTy() ||
8582 XElemTy->isPointerTy()) &&
8583 "OMP atomic update expected a scalar type");
8584 assert((RMWOp != AtomicRMWInst::Max) && (RMWOp != AtomicRMWInst::Min) &&
8585 (RMWOp != AtomicRMWInst::UMax) && (RMWOp != AtomicRMWInst::UMin) &&
8586 "OpenMP atomic does not support LT or GT operations");
8587 });
8588
8589 Expected<std::pair<Value *, Value *>> AtomicResult =
8590 emitAtomicUpdate(AllocaIP, X.Var, X.ElemTy, Expr, AO, RMWOp, UpdateOp,
8591 X.IsVolatile, IsXBinopExpr);
8592 if (!AtomicResult)
8593 return AtomicResult.takeError();
8594 checkAndEmitFlushAfterAtomic(Loc, AO, AtomicKind::Update);
8595 return Builder.saveIP();
8596}
8597
8598// FIXME: Duplicating AtomicExpand
8599Value *OpenMPIRBuilder::emitRMWOpAsInstruction(Value *Src1, Value *Src2,
8600 AtomicRMWInst::BinOp RMWOp) {
8601 switch (RMWOp) {
8602 case AtomicRMWInst::Add:
8603 return Builder.CreateAdd(Src1, Src2);
8604 case AtomicRMWInst::Sub:
8605 return Builder.CreateSub(Src1, Src2);
8606 case AtomicRMWInst::And:
8607 return Builder.CreateAnd(Src1, Src2);
8608 case AtomicRMWInst::Nand: // Nand computes ~(Src1 & Src2).
8609 return Builder.CreateNot(Builder.CreateAnd(Src1, Src2));
8610 case AtomicRMWInst::Or:
8611 return Builder.CreateOr(Src1, Src2);
8612 case AtomicRMWInst::Xor:
8613 return Builder.CreateXor(Src1, Src2);
8614 case AtomicRMWInst::Xchg:
8615 case AtomicRMWInst::FAdd:
8616 case AtomicRMWInst::FSub:
8617 case AtomicRMWInst::BAD_BINOP:
8618 case AtomicRMWInst::Max:
8619 case AtomicRMWInst::Min:
8620 case AtomicRMWInst::UMax:
8621 case AtomicRMWInst::UMin:
8622 case AtomicRMWInst::FMax:
8623 case AtomicRMWInst::FMin:
8624 case AtomicRMWInst::UIncWrap:
8625 case AtomicRMWInst::UDecWrap:
8626 case AtomicRMWInst::USubCond:
8627 case AtomicRMWInst::USubSat:
8628 llvm_unreachable("Unsupported atomic update operation");
8629 }
8630 llvm_unreachable("Unsupported atomic update operation");
8631}
8632
8633Expected<std::pair<Value *, Value *>> OpenMPIRBuilder::emitAtomicUpdate(
8634 InsertPointTy AllocaIP, Value *X, Type *XElemTy, Value *Expr,
8635 AtomicOrdering AO, AtomicRMWInst::BinOp RMWOp,
8636 AtomicUpdateCallbackTy &UpdateOp, bool VolatileX, bool IsXBinopExpr) {
8637 // TODO: handle the case where XElemTy is not byte-sized or not a power of 2
8638 // or a complex datatype.
8639 bool emitRMWOp = false;
8640 switch (RMWOp) {
8641 case AtomicRMWInst::Add:
8642 case AtomicRMWInst::And:
8643 case AtomicRMWInst::Nand:
8644 case AtomicRMWInst::Or:
8645 case AtomicRMWInst::Xor:
8646 case AtomicRMWInst::Xchg:
8647 emitRMWOp = XElemTy;
8648 break;
8649 case AtomicRMWInst::Sub:
8650 emitRMWOp = (IsXBinopExpr && XElemTy);
8651 break;
8652 default:
8653 emitRMWOp = false;
8654 }
8655 emitRMWOp &= XElemTy->isIntegerTy();
8656
8657 std::pair<Value *, Value *> Res;
8658 if (emitRMWOp) {
8659 Res.first = Builder.CreateAtomicRMW(RMWOp, X, Expr, llvm::MaybeAlign(), AO);
8660 // Not needed except in case of postfix captures. Generated anyway for
8661 // consistency with the else part; it will be removed by any DCE pass.
8662 // AtomicRMWInst::Xchg does not have a corresponding instruction.
8663 if (RMWOp == AtomicRMWInst::Xchg)
8664 Res.second = Res.first;
8665 else
8666 Res.second = emitRMWOpAsInstruction(Res.first, Expr, RMWOp);
8667 } else if (RMWOp == llvm::AtomicRMWInst::BinOp::BAD_BINOP &&
8668 XElemTy->isStructTy()) {
8669 LoadInst *OldVal =
8670 Builder.CreateLoad(XElemTy, X, X->getName() + ".atomic.load");
8671 OldVal->setAtomic(AO);
8672 const DataLayout &LoadDL = OldVal->getModule()->getDataLayout();
8673 unsigned LoadSize =
8674 LoadDL.getTypeStoreSize(OldVal->getPointerOperand()->getType());
8675
8676 OpenMPIRBuilder::AtomicInfo atomicInfo(
8677 &Builder, XElemTy, LoadSize * 8, LoadSize * 8, OldVal->getAlign(),
8678 OldVal->getAlign(), true /* UseLibcall */, X);
8679 auto AtomicLoadRes = atomicInfo.EmitAtomicLoadLibcall(AO);
8680 BasicBlock *CurBB = Builder.GetInsertBlock();
8681 Instruction *CurBBTI = CurBB->getTerminator();
8682 CurBBTI = CurBBTI ? CurBBTI : Builder.CreateUnreachable();
8683 BasicBlock *ExitBB =
8684 CurBB->splitBasicBlock(CurBBTI, X->getName() + ".atomic.exit");
8685 BasicBlock *ContBB = CurBB->splitBasicBlock(CurBB->getTerminator(),
8686 X->getName() + ".atomic.cont");
8687 ContBB->getTerminator()->eraseFromParent();
8688 Builder.restoreIP(AllocaIP);
8689 AllocaInst *NewAtomicAddr = Builder.CreateAlloca(XElemTy);
8690 NewAtomicAddr->setName(X->getName() + "x.new.val");
8691 Builder.SetInsertPoint(ContBB);
8692 llvm::PHINode *PHI = Builder.CreatePHI(OldVal->getType(), 2);
8693 PHI->addIncoming(AtomicLoadRes.first, CurBB);
8694 Value *OldExprVal = PHI;
8695 Expected<Value *> CBResult = UpdateOp(OldExprVal, Builder);
8696 if (!CBResult)
8697 return CBResult.takeError();
8698 Value *Upd = *CBResult;
8699 Builder.CreateStore(Upd, NewAtomicAddr);
8700 AtomicOrdering Failure =
8701 llvm::AtomicCmpXchgInst::getStrongestFailureOrdering(AO);
8702 auto Result = atomicInfo.EmitAtomicCompareExchangeLibcall(
8703 AtomicLoadRes.second, NewAtomicAddr, AO, Failure);
8704 LoadInst *PHILoad = Builder.CreateLoad(XElemTy, Result.first);
8705 PHI->addIncoming(PHILoad, Builder.GetInsertBlock());
8706 Builder.CreateCondBr(Result.second, ExitBB, ContBB);
8707 OldVal->eraseFromParent();
8708 Res.first = OldExprVal;
8709 Res.second = Upd;
8710
8711 if (UnreachableInst *ExitTI =
8712 dyn_cast<UnreachableInst>(ExitBB->getTerminator())) {
8713 CurBBTI->eraseFromParent();
8714 Builder.SetInsertPoint(ExitBB);
8715 } else {
8716 Builder.SetInsertPoint(ExitTI);
8717 }
8718 } else {
8719 IntegerType *IntCastTy =
8720 IntegerType::get(M.getContext(), XElemTy->getScalarSizeInBits());
8721 LoadInst *OldVal =
8722 Builder.CreateLoad(IntCastTy, X, X->getName() + ".atomic.load");
8723 OldVal->setAtomic(AO);
8724 // CurBB
8725 // | /---\
8726 // ContBB |
8727 // | \---/
8728 // ExitBB
8729 BasicBlock *CurBB = Builder.GetInsertBlock();
8730 Instruction *CurBBTI = CurBB->getTerminator();
8731 CurBBTI = CurBBTI ? CurBBTI : Builder.CreateUnreachable();
8732 BasicBlock *ExitBB =
8733 CurBB->splitBasicBlock(CurBBTI, X->getName() + ".atomic.exit");
8734 BasicBlock *ContBB = CurBB->splitBasicBlock(CurBB->getTerminator(),
8735 X->getName() + ".atomic.cont");
8736 ContBB->getTerminator()->eraseFromParent();
8737 Builder.restoreIP(AllocaIP);
8738 AllocaInst *NewAtomicAddr = Builder.CreateAlloca(XElemTy);
8739 NewAtomicAddr->setName(X->getName() + "x.new.val");
8740 Builder.SetInsertPoint(ContBB);
8741 llvm::PHINode *PHI = Builder.CreatePHI(OldVal->getType(), 2);
8742 PHI->addIncoming(OldVal, CurBB);
8743 bool IsIntTy = XElemTy->isIntegerTy();
8744 Value *OldExprVal = PHI;
8745 if (!IsIntTy) {
8746 if (XElemTy->isFloatingPointTy()) {
8747 OldExprVal = Builder.CreateBitCast(PHI, XElemTy,
8748 X->getName() + ".atomic.fltCast");
8749 } else {
8750 OldExprVal = Builder.CreateIntToPtr(PHI, XElemTy,
8751 X->getName() + ".atomic.ptrCast");
8752 }
8753 }
8754
8755 Expected<Value *> CBResult = UpdateOp(OldExprVal, Builder);
8756 if (!CBResult)
8757 return CBResult.takeError();
8758 Value *Upd = *CBResult;
8759 Builder.CreateStore(Upd, NewAtomicAddr);
8760 LoadInst *DesiredVal = Builder.CreateLoad(IntCastTy, NewAtomicAddr);
8761 AtomicOrdering Failure =
8762 llvm::AtomicCmpXchgInst::getStrongestFailureOrdering(AO);
8763 AtomicCmpXchgInst *Result = Builder.CreateAtomicCmpXchg(
8764 X, PHI, DesiredVal, llvm::MaybeAlign(), AO, Failure);
8765 Result->setVolatile(VolatileX);
8766 Value *PreviousVal = Builder.CreateExtractValue(Result, /*Idxs=*/0);
8767 Value *SuccessFailureVal = Builder.CreateExtractValue(Result, /*Idxs=*/1);
8768 PHI->addIncoming(PreviousVal, Builder.GetInsertBlock());
8769 Builder.CreateCondBr(SuccessFailureVal, ExitBB, ContBB);
8770
8771 Res.first = OldExprVal;
8772 Res.second = Upd;
8773
8774 // set Insertion point in exit block
8775 if (UnreachableInst *ExitTI =
8776 dyn_cast<UnreachableInst>(ExitBB->getTerminator())) {
8777 CurBBTI->eraseFromParent();
8778 Builder.SetInsertPoint(ExitBB);
8779 } else {
8780 Builder.SetInsertPoint(ExitTI);
8781 }
8782 }
8783
8784 return Res;
8785}
8786
8787 OpenMPIRBuilder::InsertPointOrErrorTy OpenMPIRBuilder::createAtomicCapture(
8788 const LocationDescription &Loc, InsertPointTy AllocaIP, AtomicOpValue &X,
8789 AtomicOpValue &V, Value *Expr, AtomicOrdering AO,
8790 AtomicRMWInst::BinOp RMWOp, AtomicUpdateCallbackTy &UpdateOp,
8791 bool UpdateExpr, bool IsPostfixUpdate, bool IsXBinopExpr) {
8792 if (!updateToLocation(Loc))
8793 return Loc.IP;
8794
8795 LLVM_DEBUG({
8796 Type *XTy = X.Var->getType();
8797 assert(XTy->isPointerTy() &&
8798 "OMP Atomic expects a pointer to target memory");
8799 Type *XElemTy = X.ElemTy;
8800 assert((XElemTy->isFloatingPointTy() || XElemTy->isIntegerTy() ||
8801 XElemTy->isPointerTy()) &&
8802 "OMP atomic capture expected a scalar type");
8803 assert((RMWOp != AtomicRMWInst::Max) && (RMWOp != AtomicRMWInst::Min) &&
8804 "OpenMP atomic does not support LT or GT operations");
8805 });
8806
8807 // If UpdateExpr is 'x' updated with some `expr` not based on 'x',
8808 // 'x' is simply atomically rewritten with 'expr'.
8809 AtomicRMWInst::BinOp AtomicOp = (UpdateExpr ? RMWOp : AtomicRMWInst::Xchg);
8810 Expected<std::pair<Value *, Value *>> AtomicResult =
8811 emitAtomicUpdate(AllocaIP, X.Var, X.ElemTy, Expr, AO, AtomicOp, UpdateOp,
8812 X.IsVolatile, IsXBinopExpr);
8813 if (!AtomicResult)
8814 return AtomicResult.takeError();
8815 Value *CapturedVal =
8816 (IsPostfixUpdate ? AtomicResult->first : AtomicResult->second);
8817 if (CapturedVal->getType() != V.Var->getType())
8818 CapturedVal = emitImplicitCast(Builder, CapturedVal, V.Var);
8819 Builder.CreateStore(CapturedVal, V.Var, V.IsVolatile);
8820
8821 checkAndEmitFlushAfterAtomic(Loc, AO, AtomicKind::Capture);
8822 return Builder.saveIP();
8823}
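// For example, for the postfix form '{ v = x; x = x + expr; }' the old value
// (AtomicResult->first) is stored into 'v', while the prefix form
// '{ x = x + expr; v = x; }' stores the updated value (AtomicResult->second).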
8824
8825 OpenMPIRBuilder::InsertPointTy OpenMPIRBuilder::createAtomicCompare(
8826 const LocationDescription &Loc, AtomicOpValue &X, AtomicOpValue &V,
8827 AtomicOpValue &R, Value *E, Value *D, AtomicOrdering AO,
8828 omp::OMPAtomicCompareOp Op, bool IsXBinopExpr, bool IsPostfixUpdate,
8829 bool IsFailOnly) {
8830
8831 AtomicOrdering Failure = AtomicCmpXchgInst::getStrongestFailureOrdering(AO);
8832 return createAtomicCompare(Loc, X, V, R, E, D, AO, Op, IsXBinopExpr,
8833 IsPostfixUpdate, IsFailOnly, Failure);
8834}
8835
8836 OpenMPIRBuilder::InsertPointTy OpenMPIRBuilder::createAtomicCompare(
8837 const LocationDescription &Loc, AtomicOpValue &X, AtomicOpValue &V,
8838 AtomicOpValue &R, Value *E, Value *D, AtomicOrdering AO,
8839 omp::OMPAtomicCompareOp Op, bool IsXBinopExpr, bool IsPostfixUpdate,
8840 bool IsFailOnly, AtomicOrdering Failure) {
8841
8842 if (!updateToLocation(Loc))
8843 return Loc.IP;
8844
8845 assert(X.Var->getType()->isPointerTy() &&
8846 "OMP atomic expects a pointer to target memory");
8847 // compare capture
8848 if (V.Var) {
8849 assert(V.Var->getType()->isPointerTy() && "v.var must be of pointer type");
8850 assert(V.ElemTy == X.ElemTy && "x and v must be of same type");
8851 }
8852
8853 bool IsInteger = E->getType()->isIntegerTy();
8854
8855 if (Op == OMPAtomicCompareOp::EQ) {
8856 AtomicCmpXchgInst *Result = nullptr;
8857 if (!IsInteger) {
8858 IntegerType *IntCastTy =
8859 IntegerType::get(M.getContext(), X.ElemTy->getScalarSizeInBits());
8860 Value *EBCast = Builder.CreateBitCast(E, IntCastTy);
8861 Value *DBCast = Builder.CreateBitCast(D, IntCastTy);
8862 Result = Builder.CreateAtomicCmpXchg(X.Var, EBCast, DBCast, MaybeAlign(),
8863 AO, Failure);
8864 } else {
8865 Result =
8866 Builder.CreateAtomicCmpXchg(X.Var, E, D, MaybeAlign(), AO, Failure);
8867 }
8868
8869 if (V.Var) {
8870 Value *OldValue = Builder.CreateExtractValue(Result, /*Idxs=*/0);
8871 if (!IsInteger)
8872 OldValue = Builder.CreateBitCast(OldValue, X.ElemTy);
8873 assert(OldValue->getType() == V.ElemTy &&
8874 "OldValue and V must be of same type");
8875 if (IsPostfixUpdate) {
8876 Builder.CreateStore(OldValue, V.Var, V.IsVolatile);
8877 } else {
8878 Value *SuccessOrFail = Builder.CreateExtractValue(Result, /*Idxs=*/1);
8879 if (IsFailOnly) {
8880 // CurBB----
8881 // | |
8882 // v |
8883 // ContBB |
8884 // | |
8885 // v |
8886 // ExitBB <-
8887 //
8888 // where ContBB only contains the store of old value to 'v'.
8889 BasicBlock *CurBB = Builder.GetInsertBlock();
8890 Instruction *CurBBTI = CurBB->getTerminator();
8891 CurBBTI = CurBBTI ? CurBBTI : Builder.CreateUnreachable();
8892 BasicBlock *ExitBB = CurBB->splitBasicBlock(
8893 CurBBTI, X.Var->getName() + ".atomic.exit");
8894 BasicBlock *ContBB = CurBB->splitBasicBlock(
8895 CurBB->getTerminator(), X.Var->getName() + ".atomic.cont");
8896 ContBB->getTerminator()->eraseFromParent();
8897 CurBB->getTerminator()->eraseFromParent();
8898
8899 Builder.CreateCondBr(SuccessOrFail, ExitBB, ContBB);
8900
8901 Builder.SetInsertPoint(ContBB);
8902 Builder.CreateStore(OldValue, V.Var);
8903 Builder.CreateBr(ExitBB);
8904
8905 if (UnreachableInst *ExitTI =
8906 dyn_cast<UnreachableInst>(ExitBB->getTerminator())) {
8907 CurBBTI->eraseFromParent();
8908 Builder.SetInsertPoint(ExitBB);
8909 } else {
8910 Builder.SetInsertPoint(ExitTI);
8911 }
8912 } else {
8913 Value *CapturedValue =
8914 Builder.CreateSelect(SuccessOrFail, E, OldValue);
8915 Builder.CreateStore(CapturedValue, V.Var, V.IsVolatile);
8916 }
8917 }
8918 }
8919 // The comparison result has to be stored.
8920 if (R.Var) {
8921 assert(R.Var->getType()->isPointerTy() &&
8922 "r.var must be of pointer type");
8923 assert(R.ElemTy->isIntegerTy() && "r must be of integral type");
8924
8925 Value *SuccessFailureVal = Builder.CreateExtractValue(Result, /*Idxs=*/1);
8926 Value *ResultCast = R.IsSigned
8927 ? Builder.CreateSExt(SuccessFailureVal, R.ElemTy)
8928 : Builder.CreateZExt(SuccessFailureVal, R.ElemTy);
8929 Builder.CreateStore(ResultCast, R.Var, R.IsVolatile);
8930 }
8931 } else {
8932 assert((Op == OMPAtomicCompareOp::MAX || Op == OMPAtomicCompareOp::MIN) &&
8933 "Op should be either max or min at this point");
8934 assert(!IsFailOnly && "IsFailOnly is only valid when the comparison is ==");
8935
8936 // Reverse the ordering op, as the OpenMP forms are different from the
8937 // LLVM forms. Let's take max as an example.
8938 // OpenMP form:
8939 // x = x > expr ? expr : x;
8940 // LLVM form:
8941 // *ptr = *ptr > val ? *ptr : val;
8942 // We need to transform to LLVM form.
8943 // x = x <= expr ? x : expr;
8944 AtomicRMWInst::BinOp NewOp;
8945 if (IsXBinopExpr) {
8946 if (IsInteger) {
8947 if (X.IsSigned)
8948 NewOp = Op == OMPAtomicCompareOp::MAX ? AtomicRMWInst::Min
8949 : AtomicRMWInst::Max;
8950 else
8951 NewOp = Op == OMPAtomicCompareOp::MAX ? AtomicRMWInst::UMin
8952 : AtomicRMWInst::UMax;
8953 } else {
8954 NewOp = Op == OMPAtomicCompareOp::MAX ? AtomicRMWInst::FMin
8955 : AtomicRMWInst::FMax;
8956 }
8957 } else {
8958 if (IsInteger) {
8959 if (X.IsSigned)
8960 NewOp = Op == OMPAtomicCompareOp::MAX ? AtomicRMWInst::Max
8961 : AtomicRMWInst::Min;
8962 else
8963 NewOp = Op == OMPAtomicCompareOp::MAX ? AtomicRMWInst::UMax
8964 : AtomicRMWInst::UMin;
8965 } else {
8966 NewOp = Op == OMPAtomicCompareOp::MAX ? AtomicRMWInst::FMax
8967 : AtomicRMWInst::FMin;
8968 }
8969 }
8970
8971 AtomicRMWInst *OldValue =
8972 Builder.CreateAtomicRMW(NewOp, X.Var, E, MaybeAlign(), AO);
8973 if (V.Var) {
8974 Value *CapturedValue = nullptr;
8975 if (IsPostfixUpdate) {
8976 CapturedValue = OldValue;
8977 } else {
8978 CmpInst::Predicate Pred;
8979 switch (NewOp) {
8980 case AtomicRMWInst::Max:
8981 Pred = CmpInst::ICMP_SGT;
8982 break;
8983 case AtomicRMWInst::UMax:
8984 Pred = CmpInst::ICMP_UGT;
8985 break;
8986 case AtomicRMWInst::FMax:
8987 Pred = CmpInst::FCMP_OGT;
8988 break;
8989 case AtomicRMWInst::Min:
8990 Pred = CmpInst::ICMP_SLT;
8991 break;
8992 case AtomicRMWInst::UMin:
8993 Pred = CmpInst::ICMP_ULT;
8994 break;
8995 case AtomicRMWInst::FMin:
8996 Pred = CmpInst::FCMP_OLT;
8997 break;
8998 default:
8999 llvm_unreachable("unexpected comparison op");
9000 }
9001 Value *NonAtomicCmp = Builder.CreateCmp(Pred, OldValue, E);
9002 CapturedValue = Builder.CreateSelect(NonAtomicCmp, E, OldValue);
9003 }
9004 Builder.CreateStore(CapturedValue, V.Var, V.IsVolatile);
9005 }
9006 }
9007
9008 checkAndEmitFlushAfterAtomic(Loc, AO, AtomicKind::Compare);
9009
9010 return Builder.saveIP();
9011}
9012
9013 OpenMPIRBuilder::InsertPointOrErrorTy
9014 OpenMPIRBuilder::createTeams(const LocationDescription &Loc,
9015 BodyGenCallbackTy BodyGenCB, Value *NumTeamsLower,
9016 Value *NumTeamsUpper, Value *ThreadLimit,
9017 Value *IfExpr) {
9018 if (!updateToLocation(Loc))
9019 return InsertPointTy();
9020
9021 uint32_t SrcLocStrSize;
9022 Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
9023 Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
9024 Function *CurrentFunction = Builder.GetInsertBlock()->getParent();
9025
9026 // The outer allocation basic block is the entry block of the current function.
9027 BasicBlock &OuterAllocaBB = CurrentFunction->getEntryBlock();
9028 if (&OuterAllocaBB == Builder.GetInsertBlock()) {
9029 BasicBlock *BodyBB = splitBB(Builder, /*CreateBranch=*/true, "teams.entry");
9030 Builder.SetInsertPoint(BodyBB, BodyBB->begin());
9031 }
9032
9033 // The current basic block is split into four basic blocks. After outlining,
9034 // they will be mapped as follows:
9035 // ```
9036 // def current_fn() {
9037 // current_basic_block:
9038 // br label %teams.exit
9039 // teams.exit:
9040 // ; instructions after teams
9041 // }
9042 //
9043 // def outlined_fn() {
9044 // teams.alloca:
9045 // br label %teams.body
9046 // teams.body:
9047 // ; instructions within teams body
9048 // }
9049 // ```
9050 BasicBlock *ExitBB = splitBB(Builder, /*CreateBranch=*/true, "teams.exit");
9051 BasicBlock *BodyBB = splitBB(Builder, /*CreateBranch=*/true, "teams.body");
9052 BasicBlock *AllocaBB =
9053 splitBB(Builder, /*CreateBranch=*/true, "teams.alloca");
9054
9055 bool SubClausesPresent =
9056 (NumTeamsLower || NumTeamsUpper || ThreadLimit || IfExpr);
9057 // Push num_teams
9058 if (!Config.isTargetDevice() && SubClausesPresent) {
9059 assert((NumTeamsLower == nullptr || NumTeamsUpper != nullptr) &&
9060 "if lowerbound is non-null, then upperbound must also be non-null "
9061 "for bounds on num_teams");
9062
9063 if (NumTeamsUpper == nullptr)
9064 NumTeamsUpper = Builder.getInt32(0);
9065
9066 if (NumTeamsLower == nullptr)
9067 NumTeamsLower = NumTeamsUpper;
9068
9069 if (IfExpr) {
9070 assert(IfExpr->getType()->isIntegerTy() &&
9071 "argument to if clause must be an integer value");
9072
9073 // upper = ifexpr ? upper : 1
9074 if (IfExpr->getType() != Int1)
9075 IfExpr = Builder.CreateICmpNE(IfExpr,
9076 ConstantInt::get(IfExpr->getType(), 0));
9077 NumTeamsUpper = Builder.CreateSelect(
9078 IfExpr, NumTeamsUpper, Builder.getInt32(1), "numTeamsUpper");
9079
9080 // lower = ifexpr ? lower : 1
9081 NumTeamsLower = Builder.CreateSelect(
9082 IfExpr, NumTeamsLower, Builder.getInt32(1), "numTeamsLower");
9083 }
9084
9085 if (ThreadLimit == nullptr)
9086 ThreadLimit = Builder.getInt32(0);
9087
9088 Value *ThreadNum = getOrCreateThreadID(Ident);
9089 Builder.CreateCall(
9090 getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_push_num_teams_51),
9091 {Ident, ThreadNum, NumTeamsLower, NumTeamsUpper, ThreadLimit});
9092 }
9093 // Generate the body of teams.
9094 InsertPointTy AllocaIP(AllocaBB, AllocaBB->begin());
9095 InsertPointTy CodeGenIP(BodyBB, BodyBB->begin());
9096 if (Error Err = BodyGenCB(AllocaIP, CodeGenIP))
9097 return Err;
9098
9099 OutlineInfo OI;
9100 OI.EntryBB = AllocaBB;
9101 OI.ExitBB = ExitBB;
9102 OI.OuterAllocaBB = &OuterAllocaBB;
9103
9104 // Insert fake values for global tid and bound tid.
9105 SmallVector<Instruction *, 8> ToBeDeleted;
9106 InsertPointTy OuterAllocaIP(&OuterAllocaBB, OuterAllocaBB.begin());
9107 OI.ExcludeArgsFromAggregate.push_back(createFakeIntVal(
9108 Builder, OuterAllocaIP, ToBeDeleted, AllocaIP, "gid", true));
9109 OI.ExcludeArgsFromAggregate.push_back(createFakeIntVal(
9110 Builder, OuterAllocaIP, ToBeDeleted, AllocaIP, "tid", true));
9111
9112 auto HostPostOutlineCB = [this, Ident,
9113 ToBeDeleted](Function &OutlinedFn) mutable {
9114 // The stale call instruction will be replaced with a new call instruction
9115 // for runtime call with the outlined function.
9116
9117 assert(OutlinedFn.getNumUses() == 1 &&
9118 "there must be a single user for the outlined function");
9119 CallInst *StaleCI = cast<CallInst>(OutlinedFn.user_back());
9120 ToBeDeleted.push_back(StaleCI);
9121
9122 assert((OutlinedFn.arg_size() == 2 || OutlinedFn.arg_size() == 3) &&
9123 "Outlined function must have two or three arguments only");
9124
9125 bool HasShared = OutlinedFn.arg_size() == 3;
9126
9127 OutlinedFn.getArg(0)->setName("global.tid.ptr");
9128 OutlinedFn.getArg(1)->setName("bound.tid.ptr");
9129 if (HasShared)
9130 OutlinedFn.getArg(2)->setName("data");
9131
9132 // Call to the runtime function for teams in the current function.
9133 assert(StaleCI && "Error while outlining - no CallInst user found for the "
9134 "outlined function.");
9135 Builder.SetInsertPoint(StaleCI);
9136 SmallVector<Value *> Args = {
9137 Ident, Builder.getInt32(StaleCI->arg_size() - 2), &OutlinedFn};
9138 if (HasShared)
9139 Args.push_back(StaleCI->getArgOperand(2));
9140 Builder.CreateCall(getOrCreateRuntimeFunctionPtr(
9141 omp::RuntimeFunction::OMPRTL___kmpc_fork_teams),
9142 Args);
9143
9144 for (Instruction *I : llvm::reverse(ToBeDeleted))
9145 I->eraseFromParent();
9146 };
9147
9148 if (!Config.isTargetDevice())
9149 OI.PostOutlineCB = HostPostOutlineCB;
9150
9151 addOutlineInfo(std::move(OI));
9152
9153 Builder.SetInsertPoint(ExitBB, ExitBB->begin());
9154
9155 return Builder.saveIP();
9156}
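// On the host, the stale call is ultimately rewritten to (sketch):
//   call void @__kmpc_fork_teams(ptr @ident, i32 <nargs>, ptr @outlined.fn
//                                [, ptr %data])
// preceded by a __kmpc_push_num_teams_51 call when any of the num_teams,
// thread_limit, or if clauses is present.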
9157
9158 GlobalVariable *
9159 OpenMPIRBuilder::createOffloadMapnames(SmallVectorImpl<llvm::Constant *> &Names,
9160 std::string VarName) {
9161 llvm::Constant *MapNamesArrayInit = llvm::ConstantArray::get(
9162 llvm::ArrayType::get(llvm::PointerType::getUnqual(M.getContext()),
9163 Names.size()),
9164 Names);
9165 auto *MapNamesArrayGlobal = new llvm::GlobalVariable(
9166 M, MapNamesArrayInit->getType(),
9167 /*isConstant=*/true, llvm::GlobalValue::PrivateLinkage, MapNamesArrayInit,
9168 VarName);
9169 return MapNamesArrayGlobal;
9170}
9171
9172 // Create all simple and struct types exposed by the runtime and remember
9173 // their llvm::PointerTypes for easy access later.
9174void OpenMPIRBuilder::initializeTypes(Module &M) {
9175 LLVMContext &Ctx = M.getContext();
9176 StructType *T;
9177#define OMP_TYPE(VarName, InitValue) VarName = InitValue;
9178#define OMP_ARRAY_TYPE(VarName, ElemTy, ArraySize) \
9179 VarName##Ty = ArrayType::get(ElemTy, ArraySize); \
9180 VarName##PtrTy = PointerType::getUnqual(VarName##Ty);
9181#define OMP_FUNCTION_TYPE(VarName, IsVarArg, ReturnType, ...) \
9182 VarName = FunctionType::get(ReturnType, {__VA_ARGS__}, IsVarArg); \
9183 VarName##Ptr = PointerType::getUnqual(VarName);
9184#define OMP_STRUCT_TYPE(VarName, StructName, Packed, ...) \
9185 T = StructType::getTypeByName(Ctx, StructName); \
9186 if (!T) \
9187 T = StructType::create(Ctx, {__VA_ARGS__}, StructName, Packed); \
9188 VarName = T; \
9189 VarName##Ptr = PointerType::getUnqual(T);
9190#include "llvm/Frontend/OpenMP/OMPKinds.def"
9191}
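// For illustration, an OMP_STRUCT_TYPE entry such as the one for ident_t
// expands here to: look up "struct.ident_t" in the context, create the
// struct type if it does not exist yet, and cache both Ident and IdentPtr.
// (A sketch; the authoritative entries live in OMPKinds.def.)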
9192
9193 void OpenMPIRBuilder::OutlineInfo::collectBlocks(
9194 SmallPtrSetImpl<BasicBlock *> &BlockSet,
9195 SmallVectorImpl<BasicBlock *> &BlockVector) {
9196 SmallVector<BasicBlock *, 32> Worklist;
9197 BlockSet.insert(EntryBB);
9198 BlockSet.insert(ExitBB);
9199
9200 Worklist.push_back(EntryBB);
9201 while (!Worklist.empty()) {
9202 BasicBlock *BB = Worklist.pop_back_val();
9203 BlockVector.push_back(BB);
9204 for (BasicBlock *SuccBB : successors(BB))
9205 if (BlockSet.insert(SuccBB).second)
9206 Worklist.push_back(SuccBB);
9207 }
9208}
9209
9210 void OpenMPIRBuilder::createOffloadEntry(Constant *ID, Constant *Addr,
9211 uint64_t Size, int32_t Flags,
9212 GlobalValue::LinkageTypes,
9213 StringRef Name) {
9214 if (!Config.isGPU()) {
9215 offloading::emitOffloadingEntry(
9216 M, ID, Name.empty() ? Addr->getName() : Name, Size, Flags, /*Data=*/0,
9217 "omp_offloading_entries");
9218 return;
9219 }
9220 // TODO: Add support for global variables on the device after declare target
9221 // support.
9222 Function *Fn = dyn_cast<Function>(Addr);
9223 if (!Fn)
9224 return;
9225
9226 Module &M = *(Fn->getParent());
9227 LLVMContext &Ctx = M.getContext();
9228
9229 // Get "nvvm.annotations" metadata node.
9230 NamedMDNode *MD = M.getOrInsertNamedMetadata("nvvm.annotations");
9231
9232 Metadata *MDVals[] = {
9233 ConstantAsMetadata::get(Fn), MDString::get(Ctx, "kernel"),
9234 ConstantAsMetadata::get(ConstantInt::get(Type::getInt32Ty(Ctx), 1))};
9235 // Append metadata to nvvm.annotations.
9236 MD->addOperand(MDNode::get(Ctx, MDVals));
9237
9238 // Add a function attribute for the kernel.
9239 Fn->addFnAttr(Attribute::get(Ctx, "kernel"));
9240 if (T.isAMDGCN())
9241 Fn->addFnAttr("uniform-work-group-size", "true");
9242 Fn->addFnAttr(Attribute::MustProgress);
9243}
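// On a GPU target the net effect on the module is roughly (sketch):
//   !nvvm.annotations = !{!0}
//   !0 = !{ptr @<kernel>, !"kernel", i32 1}
// together with the "kernel" function attribute (and, for AMDGCN,
// "uniform-work-group-size"="true").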
9244
9245 // We only generate metadata for functions that contain target regions.
9246 void OpenMPIRBuilder::createOffloadEntriesAndInfoMetadata(
9247 EmitMetadataErrorReportFunctionTy &ErrorFn) {
9248
9249 // If there are no entries, we don't need to do anything.
9250 if (OffloadInfoManager.empty())
9251 return;
9252
9253 LLVMContext &C = M.getContext();
9254 SmallVector<std::pair<const OffloadEntriesInfoManager::OffloadEntryInfo *,
9255 TargetRegionEntryInfo>,
9256 16>
9257 OrderedEntries(OffloadInfoManager.size());
9258
9259 // Auxiliary methods to create metadata values and strings.
9260 auto &&GetMDInt = [this](unsigned V) {
9261 return ConstantAsMetadata::get(ConstantInt::get(Builder.getInt32Ty(), V));
9262 };
9263
9264 auto &&GetMDString = [&C](StringRef V) { return MDString::get(C, V); };
9265
9266 // Create the offloading info metadata node.
9267 NamedMDNode *MD = M.getOrInsertNamedMetadata("omp_offload.info");
9268 auto &&TargetRegionMetadataEmitter =
9269 [&C, MD, &OrderedEntries, &GetMDInt, &GetMDString](
9270          const TargetRegionEntryInfo &EntryInfo,
9271          const OffloadEntriesInfoManager::OffloadEntryInfoTargetRegion &E) {
9272 // Generate metadata for target regions. Each entry of this metadata
9273 // contains:
9274 // - Entry 0 -> Kind of this type of metadata (0).
9275 // - Entry 1 -> Device ID of the file where the entry was identified.
9276 // - Entry 2 -> File ID of the file where the entry was identified.
9277 // - Entry 3 -> Mangled name of the function where the entry was
9278 // identified.
9279 // - Entry 4 -> Line in the file where the entry was identified.
9280 // - Entry 5 -> Count of regions at this DeviceID/FilesID/Line.
9281 // - Entry 6 -> Order the entry was created.
9282 // The first element of the metadata node is the kind.
9283 Metadata *Ops[] = {
9284 GetMDInt(E.getKind()), GetMDInt(EntryInfo.DeviceID),
9285 GetMDInt(EntryInfo.FileID), GetMDString(EntryInfo.ParentName),
9286 GetMDInt(EntryInfo.Line), GetMDInt(EntryInfo.Count),
9287 GetMDInt(E.getOrder())};
9288
9289 // Save this entry in the right position of the ordered entries array.
9290 OrderedEntries[E.getOrder()] = std::make_pair(&E, EntryInfo);
9291
9292 // Add metadata to the named metadata node.
9293 MD->addOperand(MDNode::get(C, Ops));
9294 };
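// [Editorial illustration, not part of the source] A target region entry in a
// function "foo" at line 42 would thus be recorded as one operand of the
// !omp_offload.info node, along the lines of
//
//   !{i32 0, i32 <DeviceID>, i32 <FileID>, !"foo", i32 42, i32 <Count>, i32 <Order>}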
9295
9296 OffloadInfoManager.actOnTargetRegionEntriesInfo(TargetRegionMetadataEmitter);
9297
9298  // Create a function that emits metadata for each device global variable entry.
9299 auto &&DeviceGlobalVarMetadataEmitter =
9300 [&C, &OrderedEntries, &GetMDInt, &GetMDString, MD](
9301        StringRef MangledName,
9302        const OffloadEntriesInfoManager::OffloadEntryInfoDeviceGlobalVar &E) {
9303 // Generate metadata for global variables. Each entry of this metadata
9304 // contains:
9305 // - Entry 0 -> Kind of this type of metadata (1).
9306 // - Entry 1 -> Mangled name of the variable.
9307 // - Entry 2 -> Declare target kind.
9308 // - Entry 3 -> Order the entry was created.
9309 // The first element of the metadata node is the kind.
9310 Metadata *Ops[] = {GetMDInt(E.getKind()), GetMDString(MangledName),
9311 GetMDInt(E.getFlags()), GetMDInt(E.getOrder())};
9312
9313 // Save this entry in the right position of the ordered entries array.
9314 TargetRegionEntryInfo varInfo(MangledName, 0, 0, 0);
9315 OrderedEntries[E.getOrder()] = std::make_pair(&E, varInfo);
9316
9317 // Add metadata to the named metadata node.
9318 MD->addOperand(MDNode::get(C, Ops));
9319 };
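// [Editorial illustration, not part of the source] A declare target variable
// "bar" (a placeholder name) would be recorded as
//
//   !{i32 1, !"bar", i32 <DeclareTargetKind>, i32 <Order>}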
9320
9321  OffloadInfoManager.actOnDeviceGlobalVarEntriesInfo(
9322      DeviceGlobalVarMetadataEmitter);
9323
9324 for (const auto &E : OrderedEntries) {
9325 assert(E.first && "All ordered entries must exist!");
9326 if (const auto *CE =
9327 dyn_cast<OffloadEntriesInfoManager::OffloadEntryInfoTargetRegion>(
9328 E.first)) {
9329 if (!CE->getID() || !CE->getAddress()) {
9330        // Do not blame the entry if the parent function is not emitted.
9331 TargetRegionEntryInfo EntryInfo = E.second;
9332 StringRef FnName = EntryInfo.ParentName;
9333 if (!M.getNamedValue(FnName))
9334 continue;
9335 ErrorFn(EMIT_MD_TARGET_REGION_ERROR, EntryInfo);
9336 continue;
9337 }
9338      createOffloadEntry(CE->getID(), CE->getAddress(),
9339                         /*Size=*/0, CE->getFlags(),
9340                         GlobalValue::WeakAnyLinkage);
9341    } else if (const auto *CE = dyn_cast<
9342                   OffloadEntriesInfoManager::OffloadEntryInfoDeviceGlobalVar>(
9343                   E.first)) {
9344      OffloadEntriesInfoManager::OMPTargetGlobalVarEntryKind Flags =
9345          static_cast<OffloadEntriesInfoManager::OMPTargetGlobalVarEntryKind>(
9346              CE->getFlags());
9347      switch (Flags) {
9348      case OffloadEntriesInfoManager::OMPTargetGlobalVarEntryEnter:
9349      case OffloadEntriesInfoManager::OMPTargetGlobalVarEntryTo:
9350        if (Config.isTargetDevice() && Config.hasRequiresUnifiedSharedMemory())
9351          continue;
9352 if (!CE->getAddress()) {
9353 ErrorFn(EMIT_MD_DECLARE_TARGET_ERROR, E.second);
9354 continue;
9355 }
9356        // The variable has no definition - no need to add the entry.
9357 if (CE->getVarSize() == 0)
9358 continue;
9359        break;
9360      case OffloadEntriesInfoManager::OMPTargetGlobalVarEntryLink:
9361        assert(((Config.isTargetDevice() && !CE->getAddress()) ||
9362 (!Config.isTargetDevice() && CE->getAddress())) &&
9363               "Declare target link address is set.");
9364 if (Config.isTargetDevice())
9365 continue;
9366        if (!CE->getAddress()) {
9367          ErrorFn(EMIT_MD_GLOBAL_VAR_LINK_ERROR, TargetRegionEntryInfo());
9368          continue;
9369 }
9370 break;
9371 default:
9372 break;
9373 }
9374
9375 // Hidden or internal symbols on the device are not externally visible.
9376 // We should not attempt to register them by creating an offloading
9377 // entry. Indirect variables are handled separately on the device.
9378 if (auto *GV = dyn_cast<GlobalValue>(CE->getAddress()))
9379        if ((GV->hasLocalLinkage() || GV->hasHiddenVisibility()) &&
9380            Flags != OffloadEntriesInfoManager::OMPTargetGlobalVarEntryIndirect)
9381          continue;
9382
9383 // Indirect globals need to use a special name that doesn't match the name
9384 // of the associated host global.
9385      if (Flags == OffloadEntriesInfoManager::OMPTargetGlobalVarEntryIndirect)
9386        createOffloadEntry(CE->getAddress(), CE->getAddress(), CE->getVarSize(),
9387 Flags, CE->getLinkage(), CE->getVarName());
9388 else
9389 createOffloadEntry(CE->getAddress(), CE->getAddress(), CE->getVarSize(),
9390 Flags, CE->getLinkage());
9391
9392 } else {
9393 llvm_unreachable("Unsupported entry kind.");
9394 }
9395 }
9396
9397 // Emit requires directive globals to a special entry so the runtime can
9398 // register them when the device image is loaded.
9399 // TODO: This reduces the offloading entries to a 32-bit integer. Offloading
9400 // entries should be redesigned to better suit this use-case.
9401  if (Config.hasRequiresFlags() && !Config.isTargetDevice())
9402    offloading::emitOffloadingEntry(
9403        M, Constant::getNullValue(PointerType::getUnqual(M.getContext())),
9404        /*Name=*/"",
9405        /*Size=*/0, OffloadEntriesInfoManager::OMPTargetGlobalRegisterRequires,
9406        Config.getRequiresFlags(), "omp_offloading_entries");
9407}
9408
9409void TargetRegionEntryInfo::getTargetRegionEntryFnName(
9410    SmallVectorImpl<char> &Name, StringRef ParentName, unsigned DeviceID,
9411    unsigned FileID, unsigned Line, unsigned Count) {
9412  raw_svector_ostream OS(Name);
9413  OS << KernelNamePrefix << llvm::format("%x", DeviceID)
9414 << llvm::format("_%x_", FileID) << ParentName << "_l" << Line;
9415 if (Count)
9416 OS << "_" << Count;
9417}
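// [Editorial illustration, not part of the source] Assuming the usual
// "__omp_offloading_" kernel name prefix, DeviceID 0x10302, FileID 0xabcdef,
// ParentName "foo" and Line 42, the generated entry name would be
// "__omp_offloading_10302_abcdef_foo_l42"; a non-zero Count appends "_<Count>".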
9418
9419void OpenMPIRBuilder::getTargetRegionEntryFnName(
9420    SmallVectorImpl<char> &Name, const TargetRegionEntryInfo &EntryInfo) {
9421  unsigned NewCount = getTargetRegionEntryInfoCount(EntryInfo);
9422  TargetRegionEntryInfo::getTargetRegionEntryFnName(
9423      Name, EntryInfo.ParentName, EntryInfo.DeviceID, EntryInfo.FileID,
9424      EntryInfo.Line, NewCount);
9425}
9426
9427TargetRegionEntryInfo
9428OpenMPIRBuilder::getTargetEntryUniqueInfo(FileIdentifierInfoCallbackTy CallBack,
9429                                          StringRef ParentName) {
9430  sys::fs::UniqueID ID;
9431  auto FileIDInfo = CallBack();
9432 if (auto EC = sys::fs::getUniqueID(std::get<0>(FileIDInfo), ID)) {
9433 report_fatal_error(("Unable to get unique ID for file, during "
9434 "getTargetEntryUniqueInfo, error message: " +
9435 EC.message())
9436 .c_str());
9437 }
9438
9439 return TargetRegionEntryInfo(ParentName, ID.getDevice(), ID.getFile(),
9440 std::get<1>(FileIDInfo));
9441}
9442
9443unsigned OpenMPIRBuilder::getFlagMemberOffset() {
9444  unsigned Offset = 0;
9445  for (uint64_t Remain =
9446           static_cast<std::underlying_type_t<omp::OpenMPOffloadMappingFlags>>(
9447               omp::OpenMPOffloadMappingFlags::OMP_MAP_MEMBER_OF);
9448       !(Remain & 1); Remain = Remain >> 1)
9449 Offset++;
9450 return Offset;
9451}
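// [Editorial note] Since OMP_MAP_MEMBER_OF is a contiguous run of high bits in
// the mapping-flag bitmask, the loop above simply counts its trailing zero
// bits. Assuming the usual 16-bit MEMBER_OF field in bits 48-63, the returned
// offset would be 48.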
9452
9453omp::OpenMPOffloadMappingFlags
9454OpenMPIRBuilder::getMemberOfFlag(unsigned Position) {
9455  // Rotate by getFlagMemberOffset() bits.
9456 return static_cast<omp::OpenMPOffloadMappingFlags>(((uint64_t)Position + 1)
9457 << getFlagMemberOffset());
9458}
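// [Editorial worked example, assuming a MEMBER_OF offset of 48] Position 0
// (the first component of the parent) maps to the flag value 1ULL << 48,
// i.e. a MEMBER_OF field of 1; Position N in general maps to (N + 1) << 48.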
9459
9460void OpenMPIRBuilder::setCorrectMemberOfFlag(
9461    omp::OpenMPOffloadMappingFlags &Flags,
9462    omp::OpenMPOffloadMappingFlags MemberOfFlag) {
9463 // If the entry is PTR_AND_OBJ but has not been marked with the special
9464 // placeholder value 0xFFFF in the MEMBER_OF field, then it should not be
9465 // marked as MEMBER_OF.
9466  if (static_cast<std::underlying_type_t<omp::OpenMPOffloadMappingFlags>>(
9467          Flags & omp::OpenMPOffloadMappingFlags::OMP_MAP_PTR_AND_OBJ) &&
9468      static_cast<std::underlying_type_t<omp::OpenMPOffloadMappingFlags>>(
9469          (Flags & omp::OpenMPOffloadMappingFlags::OMP_MAP_MEMBER_OF) !=
9470          omp::OpenMPOffloadMappingFlags::OMP_MAP_MEMBER_OF))
9471 return;
9472
9473 // Reset the placeholder value to prepare the flag for the assignment of the
9474 // proper MEMBER_OF value.
9475 Flags &= ~omp::OpenMPOffloadMappingFlags::OMP_MAP_MEMBER_OF;
9476 Flags |= MemberOfFlag;
9477}
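// [Editorial worked example, assuming the layout sketched above] For an entry
// whose MEMBER_OF field holds the 0xFFFF placeholder and
// MemberOfFlag = getMemberOfFlag(2), the field is cleared and reassigned to 3,
// marking the entry as a member of the third component of its parent.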
9478
9479Constant *OpenMPIRBuilder::getAddrOfDeclareTargetVar(
9480    OffloadEntriesInfoManager::OMPTargetGlobalVarEntryKind CaptureClause,
9481    OffloadEntriesInfoManager::OMPTargetDeviceClauseKind DeviceClause,
9482    bool IsDeclaration, bool IsExternallyVisible,
9483 TargetRegionEntryInfo EntryInfo, StringRef MangledName,
9484 std::vector<GlobalVariable *> &GeneratedRefs, bool OpenMPSIMD,
9485 std::vector<Triple> TargetTriple, Type *LlvmPtrTy,
9486 std::function<Constant *()> GlobalInitializer,
9487 std::function<GlobalValue::LinkageTypes()> VariableLinkage) {
9488 // TODO: convert this to utilise the IRBuilder Config rather than
9489 // a passed down argument.
9490 if (OpenMPSIMD)
9491 return nullptr;
9492
9493  if (CaptureClause == OffloadEntriesInfoManager::OMPTargetGlobalVarEntryLink ||
9494      ((CaptureClause == OffloadEntriesInfoManager::OMPTargetGlobalVarEntryTo ||
9495        CaptureClause ==
9496            OffloadEntriesInfoManager::OMPTargetGlobalVarEntryEnter) &&
9497       Config.hasRequiresUnifiedSharedMemory())) {
9498    SmallString<64> PtrName;
9499 {
9500 raw_svector_ostream OS(PtrName);
9501 OS << MangledName;
9502 if (!IsExternallyVisible)
9503 OS << format("_%x", EntryInfo.FileID);
9504 OS << "_decl_tgt_ref_ptr";
9505 }
9506
9507 Value *Ptr = M.getNamedValue(PtrName);
9508
9509 if (!Ptr) {
9510 GlobalValue *GlobalValue = M.getNamedValue(MangledName);
9511 Ptr = getOrCreateInternalVariable(LlvmPtrTy, PtrName);
9512
9513 auto *GV = cast<GlobalVariable>(Ptr);
9514 GV->setLinkage(GlobalValue::WeakAnyLinkage);
9515
9516 if (!Config.isTargetDevice()) {
9517 if (GlobalInitializer)
9518 GV->setInitializer(GlobalInitializer());
9519 else
9520 GV->setInitializer(GlobalValue);
9521 }
9522
9523    registerTargetGlobalVariable(
9524        CaptureClause, DeviceClause, IsDeclaration, IsExternallyVisible,
9525 EntryInfo, MangledName, GeneratedRefs, OpenMPSIMD, TargetTriple,
9526 GlobalInitializer, VariableLinkage, LlvmPtrTy, cast<Constant>(Ptr));
9527 }
9528
9529 return cast<Constant>(Ptr);
9530 }
9531
9532 return nullptr;
9533}
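// [Editorial illustration, not part of the source] For a link-clause variable
// @bar (a placeholder name) on the host, the code above creates a weak
// pointer indirection roughly equivalent to
//
//   @bar_decl_tgt_ref_ptr = weak global ptr @bar
//
// with the FileID additionally embedded in the name when the symbol is not
// externally visible.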
9534
9535void OpenMPIRBuilder::registerTargetGlobalVariable(
9536    OffloadEntriesInfoManager::OMPTargetGlobalVarEntryKind CaptureClause,
9537    OffloadEntriesInfoManager::OMPTargetDeviceClauseKind DeviceClause,
9538    bool IsDeclaration, bool IsExternallyVisible,
9539 TargetRegionEntryInfo EntryInfo, StringRef MangledName,
9540 std::vector<GlobalVariable *> &GeneratedRefs, bool OpenMPSIMD,
9541 std::vector<Triple> TargetTriple,
9542 std::function<Constant *()> GlobalInitializer,
9543 std::function<GlobalValue::LinkageTypes()> VariableLinkage, Type *LlvmPtrTy,
9544 Constant *Addr) {
9545  if (DeviceClause != OffloadEntriesInfoManager::OMPTargetDeviceClauseAny ||
9546      (TargetTriple.empty() && !Config.isTargetDevice()))
9547 return;
9548
9549  OffloadEntriesInfoManager::OMPTargetGlobalVarEntryKind Flags;
9550  StringRef VarName;
9551  int64_t VarSize;
9552  GlobalValue::LinkageTypes Linkage;
9553
9554  if ((CaptureClause == OffloadEntriesInfoManager::OMPTargetGlobalVarEntryTo ||
9555       CaptureClause ==
9556           OffloadEntriesInfoManager::OMPTargetGlobalVarEntryEnter) &&
9557      !Config.hasRequiresUnifiedSharedMemory()) {
9558    Flags = OffloadEntriesInfoManager::OMPTargetGlobalVarEntryTo;
9559    VarName = MangledName;
9560 GlobalValue *LlvmVal = M.getNamedValue(VarName);
9561
9562 if (!IsDeclaration)
9563      VarSize = divideCeil(
9564          M.getDataLayout().getTypeSizeInBits(LlvmVal->getValueType()), 8);
9565    else
9566 VarSize = 0;
9567 Linkage = (VariableLinkage) ? VariableLinkage() : LlvmVal->getLinkage();
9568
9569 // This is a workaround carried over from Clang which prevents undesired
9570 // optimisation of internal variables.
9571 if (Config.isTargetDevice() &&
9572 (!IsExternallyVisible || Linkage == GlobalValue::LinkOnceODRLinkage)) {
9573 // Do not create a "ref-variable" if the original is not also available
9574 // on the host.
9575      if (!OffloadInfoManager.hasDeviceGlobalVarEntryInfo(VarName))
9576        return;
9577
9578 std::string RefName = createPlatformSpecificName({VarName, "ref"});
9579
9580 if (!M.getNamedValue(RefName)) {
9581 Constant *AddrRef =
9582 getOrCreateInternalVariable(Addr->getType(), RefName);
9583 auto *GvAddrRef = cast<GlobalVariable>(AddrRef);
9584 GvAddrRef->setConstant(true);
9585 GvAddrRef->setLinkage(GlobalValue::InternalLinkage);
9586 GvAddrRef->setInitializer(Addr);
9587 GeneratedRefs.push_back(GvAddrRef);
9588 }
9589 }
9590 } else {
9591    if (Config.hasRequiresUnifiedSharedMemory())
9592      Flags = OffloadEntriesInfoManager::OMPTargetGlobalVarEntryTo;
9593    else
9594      Flags = OffloadEntriesInfoManager::OMPTargetGlobalVarEntryLink;
9595
9596 if (Config.isTargetDevice()) {
9597 VarName = (Addr) ? Addr->getName() : "";
9598 Addr = nullptr;
9599 } else {
9600      Addr = getAddrOfDeclareTargetVar(
9601          CaptureClause, DeviceClause, IsDeclaration, IsExternallyVisible,
9602 EntryInfo, MangledName, GeneratedRefs, OpenMPSIMD, TargetTriple,
9603 LlvmPtrTy, GlobalInitializer, VariableLinkage);
9604 VarName = (Addr) ? Addr->getName() : "";
9605 }
9606 VarSize = M.getDataLayout().getPointerSize();
9607    Linkage = GlobalValue::WeakAnyLinkage;
9608  }
9609
9610  OffloadInfoManager.registerDeviceGlobalVarEntryInfo(VarName, Addr, VarSize,
9611                                                      Flags, Linkage);
9612}
9613
9614/// Loads all the offload entry information from the host IR
9615/// metadata.
9616void OpenMPIRBuilder::loadOffloadInfoMetadata(Module &M) {
9617  // If we are in target mode, load the metadata from the host IR. This code has
9618  // to match the metadata creation in createOffloadEntriesAndInfoMetadata().
9619
9620  NamedMDNode *MD = M.getNamedMetadata(ompOffloadInfoName);
9621  if (!MD)
9622 return;
9623
9624 for (MDNode *MN : MD->operands()) {
9625 auto &&GetMDInt = [MN](unsigned Idx) {
9626 auto *V = cast<ConstantAsMetadata>(MN->getOperand(Idx));
9627 return cast<ConstantInt>(V->getValue())->getZExtValue();
9628 };
9629
9630 auto &&GetMDString = [MN](unsigned Idx) {
9631 auto *V = cast<MDString>(MN->getOperand(Idx));
9632 return V->getString();
9633 };
9634
9635 switch (GetMDInt(0)) {
9636 default:
9637 llvm_unreachable("Unexpected metadata!");
9638      break;
9639    case OffloadEntriesInfoManager::OffloadEntryInfo::
9640        OffloadingEntryInfoTargetRegion: {
9641      TargetRegionEntryInfo EntryInfo(/*ParentName=*/GetMDString(3),
9642 /*DeviceID=*/GetMDInt(1),
9643 /*FileID=*/GetMDInt(2),
9644 /*Line=*/GetMDInt(4),
9645 /*Count=*/GetMDInt(5));
9646      OffloadInfoManager.initializeTargetRegionEntryInfo(EntryInfo,
9647                                                         /*Order=*/GetMDInt(6));
9648 break;
9649    }
9650    case OffloadEntriesInfoManager::OffloadEntryInfo::
9651        OffloadingEntryInfoDeviceGlobalVar:
9652      OffloadInfoManager.initializeDeviceGlobalVarEntryInfo(
9653          /*MangledName=*/GetMDString(1),
9654          static_cast<OffloadEntriesInfoManager::OMPTargetGlobalVarEntryKind>(
9655              /*Flags=*/GetMDInt(2)),
9656 /*Order=*/GetMDInt(3));
9657 break;
9658 }
9659 }
9660}
9661
9662void OpenMPIRBuilder::loadOffloadInfoMetadata(StringRef HostFilePath) {
9663  if (HostFilePath.empty())
9664 return;
9665
9666 auto Buf = MemoryBuffer::getFile(HostFilePath);
9667 if (std::error_code Err = Buf.getError()) {
9668 report_fatal_error(("error opening host file from host file path inside of "
9669 "OpenMPIRBuilder: " +
9670 Err.message())
9671 .c_str());
9672 }
9673
9674 LLVMContext Ctx;
9675  auto M = expectedToErrorOrAndEmitErrors(
9676      Ctx, parseBitcodeFile(Buf.get()->getMemBufferRef(), Ctx));
9677 if (std::error_code Err = M.getError()) {
9679 ("error parsing host file inside of OpenMPIRBuilder: " + Err.message())
9680 .c_str());
9681 }
9682
9683 loadOffloadInfoMetadata(*M.get());
9684}
9685
9686//===----------------------------------------------------------------------===//
9687// OffloadEntriesInfoManager
9688//===----------------------------------------------------------------------===//
9689
9690bool OffloadEntriesInfoManager::empty() const {
9691  return OffloadEntriesTargetRegion.empty() &&
9692 OffloadEntriesDeviceGlobalVar.empty();
9693}
9694
9695unsigned OffloadEntriesInfoManager::getTargetRegionEntryInfoCount(
9696 const TargetRegionEntryInfo &EntryInfo) const {
9697 auto It = OffloadEntriesTargetRegionCount.find(
9698 getTargetRegionEntryCountKey(EntryInfo));
9699 if (It == OffloadEntriesTargetRegionCount.end())
9700 return 0;
9701 return It->second;
9702}
9703
9704void OffloadEntriesInfoManager::incrementTargetRegionEntryInfoCount(
9705 const TargetRegionEntryInfo &EntryInfo) {
9706 OffloadEntriesTargetRegionCount[getTargetRegionEntryCountKey(EntryInfo)] =
9707 EntryInfo.Count + 1;
9708}
9709
9710/// Initialize target region entry.
9711void OffloadEntriesInfoManager::initializeTargetRegionEntryInfo(
9712    const TargetRegionEntryInfo &EntryInfo, unsigned Order) {
9713 OffloadEntriesTargetRegion[EntryInfo] =
9714 OffloadEntryInfoTargetRegion(Order, /*Addr=*/nullptr, /*ID=*/nullptr,
9715 OMPTargetRegionEntryTargetRegion);
9716 ++OffloadingEntriesNum;
9717}
9718
9719void OffloadEntriesInfoManager::registerTargetRegionEntryInfo(
9720    TargetRegionEntryInfo EntryInfo, Constant *Addr, Constant *ID,
9721    OMPTargetRegionEntryKind Flags) {
9722  assert(EntryInfo.Count == 0 && "expected default EntryInfo");
9723
9724 // Update the EntryInfo with the next available count for this location.
9725 EntryInfo.Count = getTargetRegionEntryInfoCount(EntryInfo);
9726
9727 // If we are emitting code for a target, the entry is already initialized,
9728  // and only needs to be registered.
9729 if (OMPBuilder->Config.isTargetDevice()) {
9730 // This could happen if the device compilation is invoked standalone.
9731 if (!hasTargetRegionEntryInfo(EntryInfo)) {
9732 return;
9733 }
9734 auto &Entry = OffloadEntriesTargetRegion[EntryInfo];
9735 Entry.setAddress(Addr);
9736 Entry.setID(ID);
9737 Entry.setFlags(Flags);
9738 } else {
9739    if (Flags == OffloadEntriesInfoManager::OMPTargetRegionEntryTargetRegion &&
9740        hasTargetRegionEntryInfo(EntryInfo, /*IgnoreAddressId*/ true))
9741 return;
9742 assert(!hasTargetRegionEntryInfo(EntryInfo) &&
9743 "Target region entry already registered!");
9744 OffloadEntryInfoTargetRegion Entry(OffloadingEntriesNum, Addr, ID, Flags);
9745 OffloadEntriesTargetRegion[EntryInfo] = Entry;
9746 ++OffloadingEntriesNum;
9747 }
9748 incrementTargetRegionEntryInfoCount(EntryInfo);
9749}
9750
9751bool OffloadEntriesInfoManager::hasTargetRegionEntryInfo(
9752    TargetRegionEntryInfo EntryInfo, bool IgnoreAddressId) const {
9753
9754 // Update the EntryInfo with the next available count for this location.
9755 EntryInfo.Count = getTargetRegionEntryInfoCount(EntryInfo);
9756
9757 auto It = OffloadEntriesTargetRegion.find(EntryInfo);
9758 if (It == OffloadEntriesTargetRegion.end()) {
9759 return false;
9760 }
9761 // Fail if this entry is already registered.
9762 if (!IgnoreAddressId && (It->second.getAddress() || It->second.getID()))
9763 return false;
9764 return true;
9765}
9766
9767void OffloadEntriesInfoManager::actOnTargetRegionEntriesInfo(
9768    const OffloadTargetRegionEntryInfoActTy &Action) {
9769 // Scan all target region entries and perform the provided action.
9770 for (const auto &It : OffloadEntriesTargetRegion) {
9771 Action(It.first, It.second);
9772 }
9773}
9774
9775void OffloadEntriesInfoManager::initializeDeviceGlobalVarEntryInfo(
9776    StringRef Name, OMPTargetGlobalVarEntryKind Flags, unsigned Order) {
9777 OffloadEntriesDeviceGlobalVar.try_emplace(Name, Order, Flags);
9778 ++OffloadingEntriesNum;
9779}
9780
9781void OffloadEntriesInfoManager::registerDeviceGlobalVarEntryInfo(
9782    StringRef VarName, Constant *Addr, int64_t VarSize,
9783    OMPTargetGlobalVarEntryKind Flags, GlobalValue::LinkageTypes Linkage) {
9784  if (OMPBuilder->Config.isTargetDevice()) {
9785 // This could happen if the device compilation is invoked standalone.
9786 if (!hasDeviceGlobalVarEntryInfo(VarName))
9787 return;
9788 auto &Entry = OffloadEntriesDeviceGlobalVar[VarName];
9789 if (Entry.getAddress() && hasDeviceGlobalVarEntryInfo(VarName)) {
9790 if (Entry.getVarSize() == 0) {
9791 Entry.setVarSize(VarSize);
9792 Entry.setLinkage(Linkage);
9793 }
9794 return;
9795 }
9796 Entry.setVarSize(VarSize);
9797 Entry.setLinkage(Linkage);
9798 Entry.setAddress(Addr);
9799 } else {
9800 if (hasDeviceGlobalVarEntryInfo(VarName)) {
9801 auto &Entry = OffloadEntriesDeviceGlobalVar[VarName];
9802 assert(Entry.isValid() && Entry.getFlags() == Flags &&
9803 "Entry not initialized!");
9804 if (Entry.getVarSize() == 0) {
9805 Entry.setVarSize(VarSize);
9806 Entry.setLinkage(Linkage);
9807 }
9808 return;
9809 }
9810    if (Flags == OMPTargetGlobalVarEntryIndirect)
9811      OffloadEntriesDeviceGlobalVar.try_emplace(VarName, OffloadingEntriesNum,
9812 Addr, VarSize, Flags, Linkage,
9813 VarName.str());
9814 else
9815 OffloadEntriesDeviceGlobalVar.try_emplace(
9816 VarName, OffloadingEntriesNum, Addr, VarSize, Flags, Linkage, "");
9817 ++OffloadingEntriesNum;
9818 }
9819}
9820
9821void OffloadEntriesInfoManager::actOnDeviceGlobalVarEntriesInfo(
9822    const OffloadDeviceGlobalVarEntryInfoActTy &Action) {
9823  // Scan all device global variable entries and perform the provided action.
9824 for (const auto &E : OffloadEntriesDeviceGlobalVar)
9825 Action(E.getKey(), E.getValue());
9826}
9827
9828//===----------------------------------------------------------------------===//
9829// CanonicalLoopInfo
9830//===----------------------------------------------------------------------===//
9831
9832void CanonicalLoopInfo::collectControlBlocks(
9833    SmallVectorImpl<BasicBlock *> &BBs) {
9834  // We only count those BBs as control blocks for which we do not need to
9835 // reverse the CFG, i.e. not the loop body which can contain arbitrary control
9836 // flow. For consistency, this also means we do not add the Body block, which
9837 // is just the entry to the body code.
9838 BBs.reserve(BBs.size() + 6);
9839 BBs.append({getPreheader(), Header, Cond, Latch, Exit, getAfter()});
9840}
9841
9842BasicBlock *CanonicalLoopInfo::getPreheader() const {
9843  assert(isValid() && "Requires a valid canonical loop");
9844 for (BasicBlock *Pred : predecessors(Header)) {
9845 if (Pred != Latch)
9846 return Pred;
9847 }
9848 llvm_unreachable("Missing preheader");
9849}
9850
9851void CanonicalLoopInfo::setTripCount(Value *TripCount) {
9852 assert(isValid() && "Requires a valid canonical loop");
9853
9854 Instruction *CmpI = &getCond()->front();
9855 assert(isa<CmpInst>(CmpI) && "First inst must compare IV with TripCount");
9856 CmpI->setOperand(1, TripCount);
9857
9858#ifndef NDEBUG
9859 assertOK();
9860#endif
9861}
9862
9863void CanonicalLoopInfo::mapIndVar(
9864 llvm::function_ref<Value *(Instruction *)> Updater) {
9865 assert(isValid() && "Requires a valid canonical loop");
9866
9867 Instruction *OldIV = getIndVar();
9868
9869 // Record all uses excluding those introduced by the updater. Uses by the
9870 // CanonicalLoopInfo itself to keep track of the number of iterations are
9871 // excluded.
9872 SmallVector<Use *> ReplacableUses;
9873 for (Use &U : OldIV->uses()) {
9874 auto *User = dyn_cast<Instruction>(U.getUser());
9875 if (!User)
9876 continue;
9877 if (User->getParent() == getCond())
9878 continue;
9879 if (User->getParent() == getLatch())
9880 continue;
9881 ReplacableUses.push_back(&U);
9882 }
9883
9884 // Run the updater that may introduce new uses
9885 Value *NewIV = Updater(OldIV);
9886
9887 // Replace the old uses with the value returned by the updater.
9888 for (Use *U : ReplacableUses)
9889 U->set(NewIV);
9890
9891#ifndef NDEBUG
9892 assertOK();
9893#endif
9894}
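// [Editorial usage sketch, not part of the source] mapIndVar can be used to
// rewrite every user of the canonical IV outside the loop-control blocks,
// e.g. to apply a stride and offset; Builder, Step and Start are assumed to
// exist in the caller:
//
//   CLI->mapIndVar([&](Instruction *OldIV) -> Value * {
//     Builder.SetInsertPoint(CLI->getBody(),
//                            CLI->getBody()->getFirstInsertionPt());
//     return Builder.CreateAdd(Builder.CreateMul(OldIV, Step), Start);
//   });
//
// Uses inside the Cond and Latch blocks intentionally keep the original IV.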
9895
9896void CanonicalLoopInfo::assertOK() const {
9897#ifndef NDEBUG
9898 // No constraints if this object currently does not describe a loop.
9899 if (!isValid())
9900 return;
9901
9902 BasicBlock *Preheader = getPreheader();
9903 BasicBlock *Body = getBody();
9904 BasicBlock *After = getAfter();
9905
9906 // Verify standard control-flow we use for OpenMP loops.
9907 assert(Preheader);
9908 assert(isa<BranchInst>(Preheader->getTerminator()) &&
9909 "Preheader must terminate with unconditional branch");
9910 assert(Preheader->getSingleSuccessor() == Header &&
9911 "Preheader must jump to header");
9912
9913 assert(Header);
9914 assert(isa<BranchInst>(Header->getTerminator()) &&
9915 "Header must terminate with unconditional branch");
9916 assert(Header->getSingleSuccessor() == Cond &&
9917 "Header must jump to exiting block");
9918
9919 assert(Cond);
9920 assert(Cond->getSinglePredecessor() == Header &&
9921 "Exiting block only reachable from header");
9922
9923 assert(isa<BranchInst>(Cond->getTerminator()) &&
9924 "Exiting block must terminate with conditional branch");
9925 assert(size(successors(Cond)) == 2 &&
9926 "Exiting block must have two successors");
9927 assert(cast<BranchInst>(Cond->getTerminator())->getSuccessor(0) == Body &&
9928         "Exiting block's first successor must jump to the body");
9929 assert(cast<BranchInst>(Cond->getTerminator())->getSuccessor(1) == Exit &&
9930 "Exiting block's second successor must exit the loop");
9931
9932 assert(Body);
9933 assert(Body->getSinglePredecessor() == Cond &&
9934 "Body only reachable from exiting block");
9935 assert(!isa<PHINode>(Body->front()));
9936
9937 assert(Latch);
9938 assert(isa<BranchInst>(Latch->getTerminator()) &&
9939 "Latch must terminate with unconditional branch");
9940 assert(Latch->getSingleSuccessor() == Header && "Latch must jump to header");
9941  // TODO: To support simple redirecting of the end of the body code when it has
9942  // multiple predecessors, introduce another auxiliary basic block like preheader and after.
9943 assert(Latch->getSinglePredecessor() != nullptr);
9944 assert(!isa<PHINode>(Latch->front()));
9945
9946 assert(Exit);
9947 assert(isa<BranchInst>(Exit->getTerminator()) &&
9948 "Exit block must terminate with unconditional branch");
9949 assert(Exit->getSingleSuccessor() == After &&
9950 "Exit block must jump to after block");
9951
9952 assert(After);
9953 assert(After->getSinglePredecessor() == Exit &&
9954 "After block only reachable from exit block");
9955 assert(After->empty() || !isa<PHINode>(After->front()));
9956
9957 Instruction *IndVar = getIndVar();
9958 assert(IndVar && "Canonical induction variable not found?");
9959 assert(isa<IntegerType>(IndVar->getType()) &&
9960 "Induction variable must be an integer");
9961 assert(cast<PHINode>(IndVar)->getParent() == Header &&
9962 "Induction variable must be a PHI in the loop header");
9963 assert(cast<PHINode>(IndVar)->getIncomingBlock(0) == Preheader);
9964 assert(
9965 cast<ConstantInt>(cast<PHINode>(IndVar)->getIncomingValue(0))->isZero());
9966 assert(cast<PHINode>(IndVar)->getIncomingBlock(1) == Latch);
9967
9968 auto *NextIndVar = cast<PHINode>(IndVar)->getIncomingValue(1);
9969 assert(cast<Instruction>(NextIndVar)->getParent() == Latch);
9970 assert(cast<BinaryOperator>(NextIndVar)->getOpcode() == BinaryOperator::Add);
9971 assert(cast<BinaryOperator>(NextIndVar)->getOperand(0) == IndVar);
9972 assert(cast<ConstantInt>(cast<BinaryOperator>(NextIndVar)->getOperand(1))
9973 ->isOne());
9974
9975 Value *TripCount = getTripCount();
9976 assert(TripCount && "Loop trip count not found?");
9977 assert(IndVar->getType() == TripCount->getType() &&
9978 "Trip count and induction variable must have the same type");
9979
9980 auto *CmpI = cast<CmpInst>(&Cond->front());
9981 assert(CmpI->getPredicate() == CmpInst::ICMP_ULT &&
9982         "Exit condition must be an unsigned less-than comparison");
9983 assert(CmpI->getOperand(0) == IndVar &&
9984 "Exit condition must compare the induction variable");
9985 assert(CmpI->getOperand(1) == TripCount &&
9986 "Exit condition must compare with the trip count");
9987#endif
9988}
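// [Editorial summary, not part of the source] The canonical loop shape
// verified above, with the exit condition IV u< TripCount:
//
//   preheader -> header -> cond --true--> body -> ... -> latch
//                  ^         |                              |
//                  |         +--false--> exit -> after      |
//                  +-----------------------------------------+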
9989
9990void CanonicalLoopInfo::invalidate() {
9991  Header = nullptr;
9992 Cond = nullptr;
9993 Latch = nullptr;
9994 Exit = nullptr;
9995}
Definition: IRBuilder.h:516
BranchInst * CreateCondBr(Value *Cond, BasicBlock *True, BasicBlock *False, MDNode *BranchWeights=nullptr, MDNode *Unpredictable=nullptr)
Create a conditional 'br Cond, TrueDest, FalseDest' instruction.
Definition: IRBuilder.h:1164
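A short sketch of the usual pattern, assuming a Function *F, its LLVMContext Ctx, and an i1 value Cond already in scope:
  BasicBlock *ThenBB = BasicBlock::Create(Ctx, "then", F);
  BasicBlock *ElseBB = BasicBlock::Create(Ctx, "else", F);
  Builder.CreateCondBr(Cond, ThenBB, ElseBB); // terminates the current block
  Builder.SetInsertPoint(ThenBB);             // continue emitting the 'then' side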
LoadInst * CreateLoad(Type *Ty, Value *Ptr, const char *Name)
Provided to resolve 'CreateLoad(Ty, Ptr, "...")' correctly, instead of converting the string to 'bool...
Definition: IRBuilder.h:1798
Value * CreateShl(Value *LHS, Value *RHS, const Twine &Name="", bool HasNUW=false, bool HasNSW=false)
Definition: IRBuilder.h:1459
Value * CreateZExt(Value *V, Type *DestTy, const Twine &Name="", bool IsNonNeg=false)
Definition: IRBuilder.h:2033
LLVMContext & getContext() const
Definition: IRBuilder.h:195
Value * CreateAnd(Value *LHS, Value *RHS, const Twine &Name="")
Definition: IRBuilder.h:1518
ReturnInst * CreateRetVoid()
Create a 'ret void' instruction.
Definition: IRBuilder.h:1134
Value * CreateConstInBoundsGEP2_32(Type *Ty, Value *Ptr, unsigned Idx0, unsigned Idx1, const Twine &Name="")
Definition: IRBuilder.h:1921
Value * CreateConstInBoundsGEP2_64(Type *Ty, Value *Ptr, uint64_t Idx0, uint64_t Idx1, const Twine &Name="")
Definition: IRBuilder.h:1967
StoreInst * CreateStore(Value *Val, Value *Ptr, bool isVolatile=false)
Definition: IRBuilder.h:1811
Value * CreateAdd(Value *LHS, Value *RHS, const Twine &Name="", bool HasNUW=false, bool HasNSW=false)
Definition: IRBuilder.h:1370
Value * CreatePtrToInt(Value *V, Type *DestTy, const Twine &Name="")
Definition: IRBuilder.h:2142
Value * CreateExactUDiv(Value *LHS, Value *RHS, const Twine &Name="")
Definition: IRBuilder.h:1430
Value * CreateIsNotNull(Value *Arg, const Twine &Name="")
Return a boolean value testing if Arg != 0.
Definition: IRBuilder.h:2588
CallInst * CreateCall(FunctionType *FTy, Value *Callee, ArrayRef< Value * > Args={}, const Twine &Name="", MDNode *FPMathTag=nullptr)
Definition: IRBuilder.h:2449
AtomicRMWInst * CreateAtomicRMW(AtomicRMWInst::BinOp Op, Value *Ptr, Value *Val, MaybeAlign Align, AtomicOrdering Ordering, SyncScope::ID SSID=SyncScope::System)
Definition: IRBuilder.h:1862
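For example, a monotonic fetch-and-add of 1 to an i32 slot might look like this (Ptr is an assumed pointer value in scope):
  AtomicRMWInst *Old = Builder.CreateAtomicRMW(
      AtomicRMWInst::Add, Ptr, Builder.getInt32(1), MaybeAlign(4),
      AtomicOrdering::Monotonic); // returns the value previously stored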
Value * CreateTrunc(Value *V, Type *DestTy, const Twine &Name="", bool IsNUW=false, bool IsNSW=false)
Definition: IRBuilder.h:2019
Value * CreateOr(Value *LHS, Value *RHS, const Twine &Name="")
Definition: IRBuilder.h:1540
PointerType * getPtrTy(unsigned AddrSpace=0)
Fetch the type representing a pointer.
Definition: IRBuilder.h:588
BranchInst * CreateBr(BasicBlock *Dest)
Create an unconditional 'br label X' instruction.
Definition: IRBuilder.h:1158
void ClearInsertionPoint()
Clear the insertion point: created instructions will not be inserted into a block.
Definition: IRBuilder.h:188
Value * CreateICmpSLT(Value *LHS, Value *RHS, const Twine &Name="")
Definition: IRBuilder.h:2302
ConstantInt * getInt16(uint16_t C)
Get a constant 16-bit value.
Definition: IRBuilder.h:500
Value * CreateICmpUGE(Value *LHS, Value *RHS, const Twine &Name="")
Definition: IRBuilder.h:2282
Value * CreateIntCast(Value *V, Type *DestTy, bool isSigned, const Twine &Name="")
Definition: IRBuilder.h:2225
void restoreIP(InsertPoint IP)
Sets the current insert point to a previously-saved location.
Definition: IRBuilder.h:308
Value * CreateIsNull(Value *Arg, const Twine &Name="")
Return a boolean value testing if Arg == 0.
Definition: IRBuilder.h:2583
void SetInsertPoint(BasicBlock *TheBB)
This specifies that created instructions should be appended to the end of the specified block.
Definition: IRBuilder.h:199
Type * getVoidTy()
Fetch the type representing void.
Definition: IRBuilder.h:583
StoreInst * CreateAlignedStore(Value *Val, Value *Ptr, MaybeAlign Align, bool isVolatile=false)
Definition: IRBuilder.h:1834
Value * CreateAShr(Value *LHS, Value *RHS, const Twine &Name="", bool isExact=false)
Definition: IRBuilder.h:1499
Value * CreateXor(Value *LHS, Value *RHS, const Twine &Name="")
Definition: IRBuilder.h:1562
Value * CreateICmp(CmpInst::Predicate P, Value *LHS, Value *RHS, const Twine &Name="")
Definition: IRBuilder.h:2380
IntegerType * getInt8Ty()
Fetch the type representing an 8-bit integer.
Definition: IRBuilder.h:535
Value * CreateURem(Value *LHS, Value *RHS, const Twine &Name="")
Definition: IRBuilder.h:1447
CallInst * CreateMemCpy(Value *Dst, MaybeAlign DstAlign, Value *Src, MaybeAlign SrcAlign, uint64_t Size, bool isVolatile=false, MDNode *TBAATag=nullptr, MDNode *TBAAStructTag=nullptr, MDNode *ScopeTag=nullptr, MDNode *NoAliasTag=nullptr)
Create and insert a memcpy between the specified pointers.
Definition: IRBuilder.h:677
Value * CreateSExtOrTrunc(Value *V, Type *DestTy, const Twine &Name="")
Create a SExt or Trunc from the integer value V to DestTy.
Definition: IRBuilder.h:2066
Value * CreateAddrSpaceCast(Value *V, Type *DestTy, const Twine &Name="")
Definition: IRBuilder.h:2157
Value * CreateMul(Value *LHS, Value *RHS, const Twine &Name="", bool HasNUW=false, bool HasNSW=false)
Definition: IRBuilder.h:1404
GlobalVariable * CreateGlobalString(StringRef Str, const Twine &Name="", unsigned AddressSpace=0, Module *M=nullptr, bool AddNull=true)
Make a new global variable holding the string Str (as an i8 array initializer).
Definition: IRBuilder.cpp:44
Value * CreateFPToSI(Value *V, Type *DestTy, const Twine &Name="")
Definition: IRBuilder.h:2086
This provides a uniform API for creating instructions and inserting them into a basic block: either a...
Definition: IRBuilder.h:2705
void removeFromParent()
This method unlinks 'this' from the containing basic block, but does not delete it.
Definition: Instruction.cpp:80
void moveBeforePreserving(Instruction *MovePos)
Perform a moveBefore operation, while signalling that the caller intends to preserve the original ord...
unsigned getNumSuccessors() const LLVM_READONLY
Return the number of successors that this instruction has.
const DebugLoc & getDebugLoc() const
Return the debug location for this node as a DebugLoc.
Definition: Instruction.h:475
const Module * getModule() const
Return the module owning the function this instruction belongs to, or nullptr if the function does not...
Definition: Instruction.cpp:68
InstListType::iterator eraseFromParent()
This method unlinks 'this' from the containing basic block and deletes it.
Definition: Instruction.cpp:94
MDNode * getMetadata(unsigned KindID) const
Get the metadata of given kind attached to this Instruction.
Definition: Instruction.h:390
BasicBlock * getSuccessor(unsigned Idx) const LLVM_READONLY
Return the specified successor. This instruction must be a terminator.
void setMetadata(unsigned KindID, MDNode *Node)
Set the metadata of the specified kind to the specified node.
Definition: Metadata.cpp:1679
void setDebugLoc(DebugLoc Loc)
Set the debug location information for this instruction.
Definition: Instruction.h:472
void insertAfter(Instruction *InsertPos)
Insert an unlinked instruction into a basic block immediately after the specified instruction.
Class to represent integer types.
Definition: DerivedTypes.h:42
static IntegerType * get(LLVMContext &C, unsigned NumBits)
This static method is the primary way of constructing an IntegerType.
Definition: Type.cpp:311
This is an important class for using LLVM in a threaded context.
Definition: LLVMContext.h:67
An instruction for reading from memory.
Definition: Instructions.h:176
Value * getPointerOperand()
Definition: Instructions.h:255
void setAtomic(AtomicOrdering Ordering, SyncScope::ID SSID=SyncScope::System)
Sets the ordering constraint and the synchronization scope ID of this load instruction.
Definition: Instructions.h:241
Align getAlign() const
Return the alignment of the access that is being performed.
Definition: Instructions.h:211
Analysis pass that exposes the LoopInfo for a function.
Definition: LoopInfo.h:566
LoopInfo run(Function &F, FunctionAnalysisManager &AM)
Definition: LoopInfo.cpp:969
LoopT * getLoopFor(const BlockT *BB) const
Return the inner most loop that BB lives in.
This class represents a loop nest and can be used to query its properties.
Represents a single loop in the control flow graph.
Definition: LoopInfo.h:39
MDNode * createCallbackEncoding(unsigned CalleeArgNo, ArrayRef< int > Arguments, bool VarArgsArePassed)
Return metadata describing a callback (see llvm::AbstractCallSite).
Definition: MDBuilder.cpp:118
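A sketch of encoding callback metadata, with made-up argument positions: the callee is passed as call argument 2 and receives call arguments 3 and 4 as payload:
  MDBuilder MDB(Ctx); // Ctx: an LLVMContext assumed in scope
  MDNode *Enc = MDB.createCallbackEncoding(/*CalleeArgNo=*/2, {3, 4},
                                           /*VarArgsArePassed=*/false);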
Metadata node.
Definition: Metadata.h:1073
void replaceOperandWith(unsigned I, Metadata *New)
Replace a specific operand.
Definition: Metadata.cpp:1077
static MDTuple * getDistinct(LLVMContext &Context, ArrayRef< Metadata * > MDs)
Definition: Metadata.h:1557
const MDOperand & getOperand(unsigned I) const
Definition: Metadata.h:1434
ArrayRef< MDOperand > operands() const
Definition: Metadata.h:1432
static MDTuple * get(LLVMContext &Context, ArrayRef< Metadata * > MDs)
Definition: Metadata.h:1549
static MDString * get(LLVMContext &Context, StringRef Str)
Definition: Metadata.cpp:606
This class implements a map that also provides access to all stored values in a deterministic order.
Definition: MapVector.h:36
size_type size() const
Definition: MapVector.h:60
static ErrorOr< std::unique_ptr< MemoryBuffer > > getFile(const Twine &Filename, bool IsText=false, bool RequiresNullTerminator=true, bool IsVolatile=false, std::optional< Align > Alignment=std::nullopt)
Open the specified file as a MemoryBuffer, returning a new MemoryBuffer if successful,...
Root of the metadata hierarchy.
Definition: Metadata.h:62
A Module instance is used to store all the information related to an LLVM module.
Definition: Module.h:65
NamedMDNode * getNamedMetadata(StringRef Name) const
Return the first NamedMDNode in the module with the specified name.
Definition: Module.cpp:297
LLVMContext & getContext() const
Get the global data context.
Definition: Module.h:302
Function * getFunction(StringRef Name) const
Look up the specified function in the module symbol table.
Definition: Module.cpp:228
StringRef getName() const
Get a short "name" for the module.
Definition: Module.h:285
const std::string & getTargetTriple() const
Get the target triple which is a string describing the target host.
Definition: Module.h:298
iterator_range< global_iterator > globals()
Definition: Module.h:702
const FunctionListType & getFunctionList() const
Get the Module's list of functions (constant).
Definition: Module.h:614
GlobalVariable * getGlobalVariable(StringRef Name) const
Look up the specified global variable in the module symbol table.
Definition: Module.h:447
GlobalValue * getNamedValue(StringRef Name) const
Return the global value in the module with the specified name, of arbitrary type.
Definition: Module.cpp:170
NamedMDNode * getOrInsertNamedMetadata(StringRef Name)
Return the named MDNode in the module with the specified name.
Definition: Module.cpp:304
const GlobalVariable * getNamedGlobal(StringRef Name) const
Return the global variable in the module with the specified name, of arbitrary type.
Definition: Module.h:462
const DataLayout & getDataLayout() const
Get the data layout for the module's target platform.
Definition: Module.h:294
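The Module lookup APIs above compose naturally; a small sketch over an assumed Module M, with a hypothetical global name:
  if (Function *Fn = M.getFunction("__kmpc_barrier"))
    errs() << "runtime stub present: " << Fn->getName() << "\n";
  if (GlobalVariable *GV = M.getGlobalVariable("my_flag")) // hypothetical name
    GV->setConstant(true);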
A tuple of MDNodes.
Definition: Metadata.h:1737
iterator_range< op_iterator > operands()
Definition: Metadata.h:1833
void addOperand(MDNode *M)
Definition: Metadata.cpp:1431
@ OffloadingEntryInfoTargetRegion
Entry is a target region.
Definition: OMPIRBuilder.h:244
@ OffloadingEntryInfoDeviceGlobalVar
Entry is a declare target variable.
Definition: OMPIRBuilder.h:246
OMPTargetDeviceClauseKind
Kind of device clause for declare target variables and functions. NOTE: Currently not used as a part o...
Definition: OMPIRBuilder.h:377
@ OMPTargetDeviceClauseAny
The target is marked for all devices.
Definition: OMPIRBuilder.h:379
void registerDeviceGlobalVarEntryInfo(StringRef VarName, Constant *Addr, int64_t VarSize, OMPTargetGlobalVarEntryKind Flags, GlobalValue::LinkageTypes Linkage)
Register device global variable entry.
void initializeDeviceGlobalVarEntryInfo(StringRef Name, OMPTargetGlobalVarEntryKind Flags, unsigned Order)
Initialize device global variable entry.
void actOnDeviceGlobalVarEntriesInfo(const OffloadDeviceGlobalVarEntryInfoActTy &Action)
OMPTargetRegionEntryKind
Kind of the target registry entry.
Definition: OMPIRBuilder.h:297
@ OMPTargetRegionEntryTargetRegion
Mark the entry as target region.
Definition: OMPIRBuilder.h:299
void getTargetRegionEntryFnName(SmallVectorImpl< char > &Name, const TargetRegionEntryInfo &EntryInfo)
bool hasTargetRegionEntryInfo(TargetRegionEntryInfo EntryInfo, bool IgnoreAddressId=false) const
Return true if a target region entry with the provided information exists.
void registerTargetRegionEntryInfo(TargetRegionEntryInfo EntryInfo, Constant *Addr, Constant *ID, OMPTargetRegionEntryKind Flags)
Register target region entry.
void actOnTargetRegionEntriesInfo(const OffloadTargetRegionEntryInfoActTy &Action)
unsigned size() const
Return number of entries defined so far.
Definition: OMPIRBuilder.h:288
void initializeTargetRegionEntryInfo(const TargetRegionEntryInfo &EntryInfo, unsigned Order)
Initialize target region entry.
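A hedged sketch of the query-then-register pattern for target regions; the TargetRegionEntryInfo fields (parent name, device ID, file ID, line) and the Addr/ID constants are assumptions for illustration:
  TargetRegionEntryInfo EntryInfo("tgt_parent", /*DeviceID=*/0,
                                  /*FileID=*/1, /*Line=*/42); // made-up values
  if (!OffloadInfoManager.hasTargetRegionEntryInfo(EntryInfo))
    OffloadInfoManager.registerTargetRegionEntryInfo(
        EntryInfo, Addr, ID,
        OffloadEntriesInfoManager::OMPTargetRegionEntryTargetRegion);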
OMPTargetGlobalVarEntryKind
Kind of the global variable entry.
Definition: OMPIRBuilder.h:357
@ OMPTargetGlobalVarEntryEnter
Mark the entry as a declare target enter.
Definition: OMPIRBuilder.h:363
@ OMPTargetGlobalRegisterRequires
Mark the entry as a register requires global.
Definition: OMPIRBuilder.h:369
@ OMPTargetGlobalVarEntryIndirect
Mark the entry as a declare target indirect global.
Definition: OMPIRBuilder.h:367
@ OMPTargetGlobalVarEntryLink
Mark the entry as a declare target 'link'.
Definition: OMPIRBuilder.h:361
@ OMPTargetGlobalVarEntryTo
Mark the entry as a declare target 'to'.
Definition: OMPIRBuilder.h:359
bool hasDeviceGlobalVarEntryInfo(StringRef VarName) const
Checks if the variable with the given name has been registered already.
Definition: OMPIRBuilder.h:433
bool empty() const
Return true if there are no entries defined.
std::optional< bool > IsTargetDevice
Flag to define whether to generate code for the role of the OpenMP host (if set to false) or device (...
Definition: OMPIRBuilder.h:93
void setGridValue(omp::GV G)
Definition: OMPIRBuilder.h:189
StringRef separator() const
Definition: OMPIRBuilder.h:175
int64_t getRequiresFlags() const
Returns requires directive clauses as flags compatible with those expected by libomptarget.
StringRef firstSeparator() const
Definition: OMPIRBuilder.h:165
std::optional< bool > EmitLLVMUsedMetaInfo
Flag for specifying if LLVMUsed information should be emitted.
Definition: OMPIRBuilder.h:106
omp::GV getGridValue() const
Definition: OMPIRBuilder.h:148
void setHasRequiresReverseOffload(bool Value)
bool hasRequiresUnifiedSharedMemory() const
void setHasRequiresUnifiedSharedMemory(bool Value)
bool hasRequiresDynamicAllocators() const
bool openMPOffloadMandatory() const
Definition: OMPIRBuilder.h:142
void setHasRequiresUnifiedAddress(bool Value)
void setHasRequiresDynamicAllocators(bool Value)
void setEmitLLVMUsed(bool Value=true)
Definition: OMPIRBuilder.h:185
bool hasRequiresReverseOffload() const
bool hasRequiresUnifiedAddress() const
Struct that keeps the information that should be kept throughout a 'target data' region.
An interface to create LLVM-IR for OpenMP directives.
Definition: OMPIRBuilder.h:474
InsertPointOrErrorTy createOrderedThreadsSimd(const LocationDescription &Loc, BodyGenCallbackTy BodyGenCB, FinalizeCallbackTy FiniCB, bool IsThreads)
Generator for '#omp ordered [threads | simd]'.
Constant * getOrCreateIdent(Constant *SrcLocStr, uint32_t SrcLocStrSize, omp::IdentFlag Flags=omp::IdentFlag(0), unsigned Reserve2Flags=0)
Return an ident_t* encoding the source location SrcLocStr and Flags.
FunctionCallee getOrCreateRuntimeFunction(Module &M, omp::RuntimeFunction FnID)
Return the function declaration for the runtime function with FnID.
InsertPointOrErrorTy createCancel(const LocationDescription &Loc, Value *IfCondition, omp::Directive CanceledDirective)
Generator for '#omp cancel'.
ReductionGenCBKind
Enum class for the ReductionGen callback type to be used.
CanonicalLoopInfo * collapseLoops(DebugLoc DL, ArrayRef< CanonicalLoopInfo * > Loops, InsertPointTy ComputeIP)
Collapse a loop nest into a single loop.
InsertPointOrErrorTy createTask(const LocationDescription &Loc, InsertPointTy AllocaIP, BodyGenCallbackTy BodyGenCB, bool Tied=true, Value *Final=nullptr, Value *IfCondition=nullptr, SmallVector< DependData > Dependencies={}, bool Mergeable=false, Value *EventHandle=nullptr, Value *Priority=nullptr)
Generator for '#omp task'.
void createTaskyield(const LocationDescription &Loc)
Generator for '#omp taskyield'.
std::function< Error(InsertPointTy CodeGenIP)> FinalizeCallbackTy
Callback type for variable finalization (think destructors).
Definition: OMPIRBuilder.h:543
void emitBranch(BasicBlock *Target)
InsertPointTy createAtomicWrite(const LocationDescription &Loc, AtomicOpValue &X, Value *Expr, AtomicOrdering AO)
Emit atomic write for: X = Expr. Only scalar data types.
static void writeThreadBoundsForKernel(const Triple &T, Function &Kernel, int32_t LB, int32_t UB)
static TargetRegionEntryInfo getTargetEntryUniqueInfo(FileIdentifierInfoCallbackTy CallBack, StringRef ParentName="")
Creates a unique info for a target entry when provided a filename and line number from.
void emitTaskwaitImpl(const LocationDescription &Loc)
Generate a taskwait runtime call.
Constant * registerTargetRegionFunction(TargetRegionEntryInfo &EntryInfo, Function *OutlinedFunction, StringRef EntryFnName, StringRef EntryFnIDName)
Registers the given function and sets up the attributes of the function. Returns the FunctionID.
GlobalVariable * emitKernelExecutionMode(StringRef KernelName, omp::OMPTgtExecModeFlags Mode)
Emit the kernel execution mode.
void initialize()
Initialize the internal state, this will put structures types and potentially other helpers into the ...
void createTargetDeinit(const LocationDescription &Loc, int32_t TeamsReductionDataSize=0, int32_t TeamsReductionBufferLength=1024)
Create a runtime call for kmpc_target_deinit.
InsertPointOrErrorTy createTaskgroup(const LocationDescription &Loc, InsertPointTy AllocaIP, BodyGenCallbackTy BodyGenCB)
Generator for the taskgroup construct.
void loadOffloadInfoMetadata(Module &M)
Loads all the offload entries information from the host IR metadata.
InsertPointOrErrorTy emitTargetTask(TargetTaskBodyCallbackTy TaskBodyCB, Value *DeviceID, Value *RTLoc, OpenMPIRBuilder::InsertPointTy AllocaIP, const SmallVector< llvm::OpenMPIRBuilder::DependData > &Dependencies, bool HasNoWait)
Generate a target-task for the target construct.
void unrollLoopFull(DebugLoc DL, CanonicalLoopInfo *Loop)
Fully unroll a loop.
void emitFlush(const LocationDescription &Loc)
Generate a flush runtime call.
static std::pair< int32_t, int32_t > readThreadBoundsForKernel(const Triple &T, Function &Kernel)
OpenMPIRBuilderConfig Config
The OpenMPIRBuilder Configuration.
CallInst * createOMPInteropDestroy(const LocationDescription &Loc, Value *InteropVar, Value *Device, Value *NumDependences, Value *DependenceAddress, bool HaveNowaitClause)
Create a runtime call for __tgt_interop_destroy.
InsertPointTy createAtomicRead(const LocationDescription &Loc, AtomicOpValue &X, AtomicOpValue &V, AtomicOrdering AO)
Emit atomic read for: V = X. Only scalar data types.
Error emitIfClause(Value *Cond, BodyGenCallbackTy ThenGen, BodyGenCallbackTy ElseGen, InsertPointTy AllocaIP={})
Emits code for OpenMP 'if' clause using specified BodyGenCallbackTy Here is the logic: if (Cond) { Th...
std::function< void(EmitMetadataErrorKind, TargetRegionEntryInfo)> EmitMetadataErrorReportFunctionTy
Callback function type.
void emitUsed(StringRef Name, ArrayRef< llvm::WeakTrackingVH > List)
Emit the llvm.used metadata.
InsertPointOrErrorTy createSingle(const LocationDescription &Loc, BodyGenCallbackTy BodyGenCB, FinalizeCallbackTy FiniCB, bool IsNowait, ArrayRef< llvm::Value * > CPVars={}, ArrayRef< llvm::Function * > CPFuncs={})
Generator for '#omp single'.
InsertPointOrErrorTy createTeams(const LocationDescription &Loc, BodyGenCallbackTy BodyGenCB, Value *NumTeamsLower=nullptr, Value *NumTeamsUpper=nullptr, Value *ThreadLimit=nullptr, Value *IfExpr=nullptr)
Generator for '#omp teams'.
std::forward_list< CanonicalLoopInfo > LoopInfos
Collection of owned canonical loop objects that eventually need to be freed.
void createTaskwait(const LocationDescription &Loc)
Generator for '#omp taskwait'.
CanonicalLoopInfo * createLoopSkeleton(DebugLoc DL, Value *TripCount, Function *F, BasicBlock *PreInsertBefore, BasicBlock *PostInsertBefore, const Twine &Name={})
Create the control flow structure of a canonical OpenMP loop.
std::string createPlatformSpecificName(ArrayRef< StringRef > Parts) const
Create a name using the platform-specific separators.
FunctionCallee createDispatchNextFunction(unsigned IVSize, bool IVSigned)
Returns __kmpc_dispatch_next_* runtime function for the specified size IVSize and sign IVSigned.
static void getKernelArgsVector(TargetKernelArgs &KernelArgs, IRBuilderBase &Builder, SmallVector< Value * > &ArgsVector)
Create the kernel args vector used by emitTargetKernel.
void unrollLoopHeuristic(DebugLoc DL, CanonicalLoopInfo *Loop)
Fully or partially unroll a loop.
InsertPointOrErrorTy createParallel(const LocationDescription &Loc, InsertPointTy AllocaIP, BodyGenCallbackTy BodyGenCB, PrivatizeCallbackTy PrivCB, FinalizeCallbackTy FiniCB, Value *IfCondition, Value *NumThreads, omp::ProcBindKind ProcBind, bool IsCancellable)
Generator for '#omp parallel'.
omp::OpenMPOffloadMappingFlags getMemberOfFlag(unsigned Position)
Get OMP_MAP_MEMBER_OF flag with extra bits reserved based on the position given.
void addAttributes(omp::RuntimeFunction FnID, Function &Fn)
Add attributes known for FnID to Fn.
Module & M
The underlying LLVM-IR module.
StringMap< Constant * > SrcLocStrMap
Map to remember source location strings.
void createMapperAllocas(const LocationDescription &Loc, InsertPointTy AllocaIP, unsigned NumOperands, struct MapperAllocas &MapperAllocas)
Create the allocas instruction used in call to mapper functions.
Constant * getOrCreateSrcLocStr(StringRef LocStr, uint32_t &SrcLocStrSize)
Return the (LLVM-IR) string describing the source location LocStr.
void addOutlineInfo(OutlineInfo &&OI)
Add a new region that will be outlined later.
Error emitTargetRegionFunction(TargetRegionEntryInfo &EntryInfo, FunctionGenCallback &GenerateFunctionCallback, bool IsOffloadEntry, Function *&OutlinedFn, Constant *&OutlinedFnID)
Create a unique name for the entry function using the source location information of the current targ...
FunctionCallee createDispatchFiniFunction(unsigned IVSize, bool IVSigned)
Returns __kmpc_dispatch_fini_* runtime function for the specified size IVSize and sign IVSigned.
void unrollLoopPartial(DebugLoc DL, CanonicalLoopInfo *Loop, int32_t Factor, CanonicalLoopInfo **UnrolledCLI)
Partially unroll a loop.
void emitTaskyieldImpl(const LocationDescription &Loc)
Generate a taskyield runtime call.
void emitMapperCall(const LocationDescription &Loc, Function *MapperFunc, Value *SrcLocInfo, Value *MaptypesArg, Value *MapnamesArg, struct MapperAllocas &MapperAllocas, int64_t DeviceID, unsigned NumOperands)
Create the call for the target mapper function.
InsertPointTy createAtomicCompare(const LocationDescription &Loc, AtomicOpValue &X, AtomicOpValue &V, AtomicOpValue &R, Value *E, Value *D, AtomicOrdering AO, omp::OMPAtomicCompareOp Op, bool IsXBinopExpr, bool IsPostfixUpdate, bool IsFailOnly)
Emit atomic compare for constructs (only scalar data types): cond-expr-stmt: x = x ordop expr ?...
InsertPointOrErrorTy createAtomicCapture(const LocationDescription &Loc, InsertPointTy AllocaIP, AtomicOpValue &X, AtomicOpValue &V, Value *Expr, AtomicOrdering AO, AtomicRMWInst::BinOp RMWOp, AtomicUpdateCallbackTy &UpdateOp, bool UpdateExpr, bool IsPostfixUpdate, bool IsXBinopExpr)
Emit atomic update for constructs (only scalar data types): V = X; X = X BinOp Expr,...
InsertPointTy createOrderedDepend(const LocationDescription &Loc, InsertPointTy AllocaIP, unsigned NumLoops, ArrayRef< llvm::Value * > StoreValues, const Twine &Name, bool IsDependSource)
Generator for '#omp ordered depend (source | sink)'.
InsertPointTy createCopyinClauseBlocks(InsertPointTy IP, Value *MasterAddr, Value *PrivateAddr, llvm::IntegerType *IntPtrTy, bool BranchtoEnd=true)
Generate conditional branch and relevant BasicBlocks through which private threads copy the 'copyin' ...
void emitOffloadingArrays(InsertPointTy AllocaIP, InsertPointTy CodeGenIP, MapInfosTy &CombinedInfo, TargetDataInfo &Info, bool IsNonContiguous=false, function_ref< void(unsigned int, Value *)> DeviceAddrCB=nullptr, function_ref< Value *(unsigned int)> CustomMapperCB=nullptr)
Emit the arrays used to pass the captures and map information to the offloading runtime library.
SmallVector< FinalizationInfo, 8 > FinalizationStack
The finalization stack made up of finalize callbacks currently in-flight, wrapped into FinalizationIn...
std::vector< CanonicalLoopInfo * > tileLoops(DebugLoc DL, ArrayRef< CanonicalLoopInfo * > Loops, ArrayRef< Value * > TileSizes)
Tile a loop nest.
CallInst * createOMPInteropInit(const LocationDescription &Loc, Value *InteropVar, omp::OMPInteropType InteropType, Value *Device, Value *NumDependences, Value *DependenceAddress, bool HaveNowaitClause)
Create a runtime call for __tgt_interop_init.
void finalize(Function *Fn=nullptr)
Finalize the underlying module, e.g., by outlining regions.
SmallVector< OutlineInfo, 16 > OutlineInfos
Collection of regions that need to be outlined during finalization.
Function * getOrCreateRuntimeFunctionPtr(omp::RuntimeFunction FnID)
InsertPointTy createTargetInit(const LocationDescription &Loc, const llvm::OpenMPIRBuilder::TargetKernelDefaultAttrs &Attrs)
The omp target interface.
const Triple T
The target triple of the underlying module.
DenseMap< std::pair< Constant *, uint64_t >, Constant * > IdentMap
Map to remember existing ident_t*.
CallInst * createOMPFree(const LocationDescription &Loc, Value *Addr, Value *Allocator, std::string Name="")
Create a runtime call for kmpc_free.
FunctionCallee createForStaticInitFunction(unsigned IVSize, bool IVSigned, bool IsGPUDistribute)
Returns __kmpc_for_static_init_* runtime function for the specified size IVSize and sign IVSigned.
CallInst * createOMPAlloc(const LocationDescription &Loc, Value *Size, Value *Allocator, std::string Name="")
Create a runtime call for kmpc_alloc.
void emitNonContiguousDescriptor(InsertPointTy AllocaIP, InsertPointTy CodeGenIP, MapInfosTy &CombinedInfo, TargetDataInfo &Info)
Emit an array of struct descriptors to be assigned to the offload args.
InsertPointOrErrorTy createSection(const LocationDescription &Loc, BodyGenCallbackTy BodyGenCB, FinalizeCallbackTy FiniCB)
Generator for '#omp section'.
void emitBlock(BasicBlock *BB, Function *CurFn, bool IsFinished=false)
Value * getOrCreateThreadID(Value *Ident)
Return the current thread ID.
void emitOffloadingArraysAndArgs(InsertPointTy AllocaIP, InsertPointTy CodeGenIP, TargetDataInfo &Info, TargetDataRTArgs &RTArgs, MapInfosTy &CombinedInfo, bool IsNonContiguous=false, bool ForEndCall=false, function_ref< void(unsigned int, Value *)> DeviceAddrCB=nullptr, function_ref< Value *(unsigned int)> CustomMapperCB=nullptr)
Allocates memory for and populates the arrays required for offloading (offload_{baseptrs|ptrs|mappers...
InsertPointOrErrorTy createMaster(const LocationDescription &Loc, BodyGenCallbackTy BodyGenCB, FinalizeCallbackTy FiniCB)
Generator for '#omp master'.
Error emitCancelationCheckImpl(Value *CancelFlag, omp::Directive CanceledDirective, FinalizeCallbackTy ExitCB={})
Generate control flow and cleanup for cancellation.
InsertPointOrErrorTy emitKernelLaunch(const LocationDescription &Loc, Value *OutlinedFnID, EmitFallbackCallbackTy EmitTargetCallFallbackCB, TargetKernelArgs &Args, Value *DeviceID, Value *RTLoc, InsertPointTy AllocaIP)
Generate a target region entry call and host fallback call.
InsertPointOrErrorTy createTarget(const LocationDescription &Loc, bool IsOffloadEntry, OpenMPIRBuilder::InsertPointTy AllocaIP, OpenMPIRBuilder::InsertPointTy CodeGenIP, TargetRegionEntryInfo &EntryInfo, const TargetKernelDefaultAttrs &DefaultAttrs, const TargetKernelRuntimeAttrs &RuntimeAttrs, Value *IfCond, SmallVectorImpl< Value * > &Inputs, GenMapInfoCallbackTy GenMapInfoCB, TargetBodyGenCallbackTy BodyGenCB, TargetGenArgAccessorsCallbackTy ArgAccessorFuncCB, SmallVector< DependData > Dependencies={}, bool HasNowait=false)
Generator for '#omp target'.
StringMap< GlobalVariable *, BumpPtrAllocator > InternalVars
An ordered map of auto-generated variables to their unique names.
GlobalVariable * getOrCreateInternalVariable(Type *Ty, const StringRef &Name, unsigned AddressSpace=0)
Gets (if a variable with the given name already exists) or creates an internal global variable with the spe...
InsertPointOrErrorTy createReductionsGPU(const LocationDescription &Loc, InsertPointTy AllocaIP, InsertPointTy CodeGenIP, ArrayRef< ReductionInfo > ReductionInfos, bool IsNoWait=false, bool IsTeamsReduction=false, bool HasDistribute=false, ReductionGenCBKind ReductionGenCBKind=ReductionGenCBKind::MLIR, std::optional< omp::GV > GridValue={}, unsigned ReductionBufNum=1024, Value *SrcLocInfo=nullptr)
Design of OpenMP reductions on the GPU.
FunctionCallee createDispatchInitFunction(unsigned IVSize, bool IVSigned)
Returns __kmpc_dispatch_init_* runtime function for the specified size IVSize and sign IVSigned.
Function * emitUserDefinedMapper(function_ref< MapInfosTy &(InsertPointTy CodeGenIP, llvm::Value *PtrPHI, llvm::Value *BeginArg)> PrivAndGenMapInfoCB, llvm::Type *ElemTy, StringRef FuncName, function_ref< bool(unsigned int, Function **)> CustomMapperCB=nullptr)
Emit the user-defined mapper function.
CallInst * createOMPInteropUse(const LocationDescription &Loc, Value *InteropVar, Value *Device, Value *NumDependences, Value *DependenceAddress, bool HaveNowaitClause)
Create a runtime call for __tgt_interop_use.
IRBuilder<>::InsertPoint InsertPointTy
Type used throughout for insertion points.
Definition: OMPIRBuilder.h:520
InsertPointOrErrorTy createReductions(const LocationDescription &Loc, InsertPointTy AllocaIP, ArrayRef< ReductionInfo > ReductionInfos, ArrayRef< bool > IsByRef, bool IsNoWait=false)
Generator for '#omp reduction'.
GlobalVariable * createOffloadMapnames(SmallVectorImpl< llvm::Constant * > &Names, std::string VarName)
Create the global variable holding the offload names information.
std::function< Expected< Function * >(StringRef FunctionName)> FunctionGenCallback
Functions used to generate a function with the given name.
static void writeTeamsForKernel(const Triple &T, Function &Kernel, int32_t LB, int32_t UB)
InsertPointOrErrorTy createBarrier(const LocationDescription &Loc, omp::Directive Kind, bool ForceSimpleCall=false, bool CheckCancelFlag=true)
Emitter methods for OpenMP directives.
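A minimal driver sketch (names such as M, Builder, and DL are assumed; error handling reduced to cantFail): construct the builder over a module, initialize it, emit a directive at the current position, then finalize to run outlining:
  OpenMPIRBuilder OMPBuilder(M);
  OMPBuilder.initialize();
  OpenMPIRBuilder::LocationDescription Loc(Builder.saveIP(), DL);
  cantFail(OMPBuilder.createBarrier(Loc, omp::Directive::OMPD_barrier)
               .takeError());
  OMPBuilder.finalize(); // outlines any pending regions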
void setCorrectMemberOfFlag(omp::OpenMPOffloadMappingFlags &Flags, omp::OpenMPOffloadMappingFlags MemberOfFlag)
Given an initial flag set, this function modifies it to contain the passed in MemberOfFlag generated ...
Constant * getOrCreateDefaultSrcLocStr(uint32_t &SrcLocStrSize)
Return the (LLVM-IR) string describing the default source location.
InsertPointOrErrorTy createCritical(const LocationDescription &Loc, BodyGenCallbackTy BodyGenCB, FinalizeCallbackTy FiniCB, StringRef CriticalName, Value *HintInst)
Generator for '#omp critical'.
void createOffloadEntry(Constant *ID, Constant *Addr, uint64_t Size, int32_t Flags, GlobalValue::LinkageTypes, StringRef Name="")
Creates offloading entry for the provided entry ID ID, address Addr, size Size, and flags Flags.
static unsigned getOpenMPDefaultSimdAlign(const Triple &TargetTriple, const StringMap< bool > &Features)
Get the default alignment value for given target.
unsigned getFlagMemberOffset()
Get the offset of the OMP_MAP_MEMBER_OF field.
void createOffloadEntriesAndInfoMetadata(EmitMetadataErrorReportFunctionTy &ErrorReportFunction)
void applySimd(CanonicalLoopInfo *Loop, MapVector< Value *, Value * > AlignedVars, Value *IfCond, omp::OrderKind Order, ConstantInt *Simdlen, ConstantInt *Safelen)
Add metadata to simd-ize a loop.
bool isLastFinalizationInfoCancellable(omp::Directive DK)
Return true if the last entry in the finalization stack is of kind DK and cancellable.
InsertPointTy emitTargetKernel(const LocationDescription &Loc, InsertPointTy AllocaIP, Value *&Return, Value *Ident, Value *DeviceID, Value *NumTeams, Value *NumThreads, Value *HostPtr, ArrayRef< Value * > KernelArgs)
Generate a target region entry call.
GlobalVariable * createOffloadMaptypes(SmallVectorImpl< uint64_t > &Mappings, std::string VarName)
Create the global variable holding the offload mappings information.
CallInst * createCachedThreadPrivate(const LocationDescription &Loc, llvm::Value *Pointer, llvm::ConstantInt *Size, const llvm::Twine &Name=Twine(""))
Create a runtime call for kmpc_threadprivate_cached.
IRBuilder Builder
The LLVM-IR Builder used to create IR.
GlobalValue * createGlobalFlag(unsigned Value, StringRef Name)
Create a hidden global flag Name in the module with initial value Value.
InsertPointOrErrorTy applyWorkshareLoop(DebugLoc DL, CanonicalLoopInfo *CLI, InsertPointTy AllocaIP, bool NeedsBarrier, llvm::omp::ScheduleKind SchedKind=llvm::omp::OMP_SCHEDULE_Default, Value *ChunkSize=nullptr, bool HasSimdModifier=false, bool HasMonotonicModifier=false, bool HasNonmonotonicModifier=false, bool HasOrderedClause=false, omp::WorksharingLoopType LoopType=omp::WorksharingLoopType::ForStaticLoop)
Modifies the canonical loop to be a workshare loop.
void emitOffloadingArraysArgument(IRBuilderBase &Builder, OpenMPIRBuilder::TargetDataRTArgs &RTArgs, OpenMPIRBuilder::TargetDataInfo &Info, bool ForEndCall=false)
Emit the arguments to be passed to the runtime library based on the arrays of base pointers,...
InsertPointOrErrorTy createMasked(const LocationDescription &Loc, BodyGenCallbackTy BodyGenCB, FinalizeCallbackTy FiniCB, Value *Filter)
Generator for '#omp masked'.
Expected< CanonicalLoopInfo * > createCanonicalLoop(const LocationDescription &Loc, LoopBodyGenCallbackTy BodyGenCB, Value *TripCount, const Twine &Name="loop")
Generator for the control flow structure of an OpenMP canonical loop.
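A sketch of the callback-driven loop creation, assuming OMPBuilder, Loc, a Value *TripCount, and a hypothetical pointer SomePtr; the body callback receives the insertion point and the induction variable:
  auto BodyGen = [&](OpenMPIRBuilder::InsertPointTy CodeGenIP,
                     Value *IV) -> Error {
    Builder.restoreIP(CodeGenIP);
    Builder.CreateStore(IV, SomePtr); // loop body: store the IV
    return Error::success();
  };
  Expected<CanonicalLoopInfo *> CLI =
      OMPBuilder.createCanonicalLoop(Loc, BodyGen, TripCount);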
Value * getSizeInBytes(Value *BasePtr)
Computes the size of type in bytes.
FunctionCallee createDispatchDeinitFunction()
Returns __kmpc_dispatch_deinit runtime function.
void registerTargetGlobalVariable(OffloadEntriesInfoManager::OMPTargetGlobalVarEntryKind CaptureClause, OffloadEntriesInfoManager::OMPTargetDeviceClauseKind DeviceClause, bool IsDeclaration, bool IsExternallyVisible, TargetRegionEntryInfo EntryInfo, StringRef MangledName, std::vector< GlobalVariable * > &GeneratedRefs, bool OpenMPSIMD, std::vector< Triple > TargetTriple, std::function< Constant *()> GlobalInitializer, std::function< GlobalValue::LinkageTypes()> VariableLinkage, Type *LlvmPtrTy, Constant *Addr)
Registers a target variable for device or host.
InsertPointOrErrorTy createTargetData(const LocationDescription &Loc, InsertPointTy AllocaIP, InsertPointTy CodeGenIP, Value *DeviceID, Value *IfCond, TargetDataInfo &Info, GenMapInfoCallbackTy GenMapInfoCB, omp::RuntimeFunction *MapperFunc=nullptr, function_ref< InsertPointOrErrorTy(InsertPointTy CodeGenIP, BodyGenTy BodyGenType)> BodyGenCB=nullptr, function_ref< void(unsigned int, Value *)> DeviceAddrCB=nullptr, function_ref< Value *(unsigned int)> CustomMapperCB=nullptr, Value *SrcLocInfo=nullptr)
Generator for '#omp target data'.
BodyGenTy
Type of BodyGen to use for region codegen.
InsertPointOrErrorTy createAtomicUpdate(const LocationDescription &Loc, InsertPointTy AllocaIP, AtomicOpValue &X, Value *Expr, AtomicOrdering AO, AtomicRMWInst::BinOp RMWOp, AtomicUpdateCallbackTy &UpdateOp, bool IsXBinopExpr)
Emit atomic update for constructs: X = X BinOp Expr, or X = Expr BinOp X. For complex Operations: X = ...
SmallVector< llvm::Function *, 16 > ConstantAllocaRaiseCandidates
A collection of candidate target functions whose constant allocas we will attempt to raise on a cal...
OffloadEntriesInfoManager OffloadInfoManager
Info manager to keep track of target regions.
static std::pair< int32_t, int32_t > readTeamBoundsForKernel(const Triple &T, Function &Kernel)
Read/write the bounds on teams for Kernel.
std::function< std::tuple< std::string, uint64_t >()> FileIdentifierInfoCallbackTy
const std::string ompOffloadInfoName
OMP Offload Info Metadata name string.
Expected< InsertPointTy > InsertPointOrErrorTy
Type used to represent an insertion point or an error value.
Definition: OMPIRBuilder.h:523
InsertPointTy createCopyPrivate(const LocationDescription &Loc, llvm::Value *BufSize, llvm::Value *CpyBuf, llvm::Value *CpyFn, llvm::Value *DidIt)
Generator for __kmpc_copyprivate.
InsertPointOrErrorTy createSections(const LocationDescription &Loc, InsertPointTy AllocaIP, ArrayRef< StorableBodyGenCallbackTy > SectionCBs, PrivatizeCallbackTy PrivCB, FinalizeCallbackTy FiniCB, bool IsCancellable, bool IsNowait)
Generator for '#omp sections'.
bool updateToLocation(const LocationDescription &Loc)
Update the internal location to Loc.
void createFlush(const LocationDescription &Loc)
Generator for '#omp flush'.
Constant * getAddrOfDeclareTargetVar(OffloadEntriesInfoManager::OMPTargetGlobalVarEntryKind CaptureClause, OffloadEntriesInfoManager::OMPTargetDeviceClauseKind DeviceClause, bool IsDeclaration, bool IsExternallyVisible, TargetRegionEntryInfo EntryInfo, StringRef MangledName, std::vector< GlobalVariable * > &GeneratedRefs, bool OpenMPSIMD, std::vector< Triple > TargetTriple, Type *LlvmPtrTy, std::function< Constant *()> GlobalInitializer, std::function< GlobalValue::LinkageTypes()> VariableLinkage)
Retrieve (or create if non-existent) the address of a declare target variable, used in conjunction wi...
EmitMetadataErrorKind
The kind of errors that can occur when emitting the offload entries and metadata.
The optimization diagnostic interface.
void addIncoming(Value *V, BasicBlock *BB)
Add an incoming value to the end of the PHI list.
Pseudo-analysis pass that exposes the PassInstrumentation to pass managers.
Class to represent pointers.
Definition: DerivedTypes.h:670
static PointerType * getUnqual(Type *ElementType)
This constructs a pointer to an object of the specified type in the default address space (address sp...
Definition: DerivedTypes.h:686
Analysis pass that exposes the ScalarEvolution for a function.
ScalarEvolution run(Function &F, FunctionAnalysisManager &AM)
The main scalar evolution driver.
A vector that has set insertion semantics.
Definition: SetVector.h:57
bool remove_if(UnaryPredicate P)
Remove items from the set vector based on a predicate function.
Definition: SetVector.h:237
bool empty() const
Determine if the SetVector is empty or not.
Definition: SetVector.h:93
This is a 'bitvector' (really, a variable-sized bit array), optimized for the case when the array is ...
SmallBitVector & set()
bool test(unsigned Idx) const
bool all() const
Returns true if all bits are set.
bool any() const
Returns true if any bit is set.
A templated base class for SmallPtrSet which provides the typesafe interface that is common across al...
Definition: SmallPtrSet.h:363
size_type count(ConstPtrType Ptr) const
count - Return 1 if the specified pointer is in the set, 0 otherwise.
Definition: SmallPtrSet.h:452
iterator end() const
Definition: SmallPtrSet.h:477
std::pair< iterator, bool > insert(PtrType Ptr)
Inserts Ptr if and only if there is no element in the container equal to Ptr.
Definition: SmallPtrSet.h:384
iterator begin() const
Definition: SmallPtrSet.h:472
SmallPtrSet - This class implements a set which is optimized for holding SmallSize or less elements.
Definition: SmallPtrSet.h:519
SmallSet - This maintains a set of unique values, optimizing for the case when the set is small (less...
Definition: SmallSet.h:132
std::pair< const_iterator, bool > insert(const T &V)
insert - Insert an element into the set if it isn't already there.
Definition: SmallSet.h:181
SmallString - A SmallString is just a SmallVector with methods and accessors that make it work better...
Definition: SmallString.h:26
void append(StringRef RHS)
Append from a StringRef.
Definition: SmallString.h:68
StringRef str() const
Explicit conversion to StringRef.
Definition: SmallString.h:254
bool empty() const
Definition: SmallVector.h:81
size_t size() const
Definition: SmallVector.h:78
This class consists of common code factored out of the SmallVector class to reduce code duplication b...
Definition: SmallVector.h:573
reference emplace_back(ArgTypes &&... Args)
Definition: SmallVector.h:937
void reserve(size_type N)
Definition: SmallVector.h:663
void append(ItTy in_start, ItTy in_end)
Add the specified range to the end of the SmallVector.
Definition: SmallVector.h:683
void resize(size_type N)
Definition: SmallVector.h:638
void push_back(const T &Elt)
Definition: SmallVector.h:413
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small.
Definition: SmallVector.h:1196
An instruction for storing to memory.
Definition: Instructions.h:292
void setAlignment(Align Align)
Definition: Instructions.h:337
void setAtomic(AtomicOrdering Ordering, SyncScope::ID SSID=SyncScope::System)
Sets the ordering constraint and the synchronization scope ID of this store instruction.
Definition: Instructions.h:364
StringMap - This is an unconventional map that is specialized for handling keys that are "strings",...
Definition: StringMap.h:128
ValueTy lookup(StringRef Key) const
lookup - Return the entry for the specified key, or a default constructed value if no such entry exis...
Definition: StringMap.h:253
StringRef - Represent a constant reference to a string, i.e.
Definition: StringRef.h:51
std::pair< StringRef, StringRef > split(char Separator) const
Split into two substrings around the first occurrence of a separator character.
Definition: StringRef.h:700
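For instance, splitting a key=value pair at the first '=':
  auto [Key, Val] = StringRef("opt=level").split('=');
  // Key == "opt", Val == "level"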
constexpr bool empty() const
empty - Check if the string is empty.
Definition: StringRef.h:147
constexpr size_t size() const
size - Get the string size.
Definition: StringRef.h:150
size_t count(char C) const
Return the number of occurrences of C in the string.
Definition: StringRef.h:451
bool ends_with(StringRef Suffix) const
Check if this string ends with the given Suffix.
Definition: StringRef.h:277
StringRef drop_back(size_t N=1) const
Return a StringRef equal to 'this' but with the last N elements dropped.
Definition: StringRef.h:616
Class to represent struct types.
Definition: DerivedTypes.h:218
static StructType * get(LLVMContext &Context, ArrayRef< Type * > Elements, bool isPacked=false)
This static method is the primary way to create a literal StructType.
Definition: Type.cpp:406
static StructType * create(LLVMContext &Context, StringRef Name)
This creates an identified struct.
Definition: Type.cpp:612
Multiway switch.
void addCase(ConstantInt *OnVal, BasicBlock *Dest)
Add an entry to the switch instruction.
Analysis pass providing the TargetTransformInfo.
Result run(const Function &F, FunctionAnalysisManager &)
Analysis pass providing the TargetLibraryInfo.
This pass provides access to the codegen interfaces that are needed for IR-level transformations.
Target - Wrapper for Target specific information.
TargetMachine * createTargetMachine(StringRef TT, StringRef CPU, StringRef Features, const TargetOptions &Options, std::optional< Reloc::Model > RM, std::optional< CodeModel::Model > CM=std::nullopt, CodeGenOptLevel OL=CodeGenOptLevel::Default, bool JIT=false) const
createTargetMachine - Create a target specific machine implementation for the specified Triple.
Triple - Helper class for working with autoconf configuration names.
Definition: Triple.h:44
bool isPPC() const
Tests whether the target is PowerPC (32- or 64-bit LE or BE).
Definition: Triple.h:990
bool isX86() const
Tests whether the target is x86 (32- or 64-bit).
Definition: Triple.h:1048
bool isWasm() const
Tests whether the target is wasm (32- and 64-bit).
Definition: Triple.h:1058
Twine - A lightweight data structure for efficiently representing the concatenation of temporary valu...
Definition: Twine.h:81
std::string str() const
Return the twine contents as a std::string.
Definition: Twine.cpp:17
The instances of the Type class are immutable: once they are created, they are never changed.
Definition: Type.h:45
unsigned getIntegerBitWidth() const
Type * getStructElementType(unsigned N) const
bool isPointerTy() const
True if this is an instance of PointerType.
Definition: Type.h:264
static IntegerType * getInt1Ty(LLVMContext &C)
unsigned getScalarSizeInBits() const LLVM_READONLY
If this is a vector type, return the getPrimitiveSizeInBits value for the element type.
static Type * getVoidTy(LLVMContext &C)
bool isStructTy() const
True if this is an instance of StructType.
Definition: Type.h:258
bool isFloatingPointTy() const
Return true if this is one of the floating-point types.
Definition: Type.h:184
static IntegerType * getInt32Ty(LLVMContext &C)
static IntegerType * getInt64Ty(LLVMContext &C)
bool isIntegerTy() const
True if this is an instance of IntegerType.
Definition: Type.h:237
bool isVoidTy() const
Return true if this is 'void'.
Definition: Type.h:139
static UndefValue * get(Type *T)
Static factory methods - Return an 'undef' object of the specified type.
Definition: Constants.cpp:1859
This function has undefined behavior.
Produce an estimate of the unrolled cost of the specified loop.
Definition: UnrollLoop.h:128
bool canUnroll() const
Whether it is legal to unroll this loop.
uint64_t getRolledLoopSize() const
Definition: UnrollLoop.h:144
A Use represents the edge between a Value definition and its users.
Definition: Use.h:43
void setOperand(unsigned i, Value *Val)
Definition: User.h:233
Value * getOperand(unsigned i) const
Definition: User.h:228
ValueT lookup(const KeyT &Val) const
lookup - Return the entry for the specified key, or a default constructed value if no such entry exis...
Definition: ValueMap.h:164
LLVM Value Representation.
Definition: Value.h:74
Type * getType() const
All values are typed, get the type of this value.
Definition: Value.h:255
user_iterator user_begin()
Definition: Value.h:397
void setName(const Twine &Name)
Change the name of the value.
Definition: Value.cpp:377
void replaceAllUsesWith(Value *V)
Change all uses of this to point to a new Value.
Definition: Value.cpp:534
iterator_range< user_iterator > users()
Definition: Value.h:421
User * user_back()
Definition: Value.h:407
Align getPointerAlignment(const DataLayout &DL) const
Returns an alignment of the pointer value.
Definition: Value.cpp:927
void replaceUsesWithIf(Value *New, llvm::function_ref< bool(Use &U)> ShouldReplace)
Go through the uses list for this definition and make each use point to "V" if the callback ShouldRep...
Definition: Value.cpp:542
User * getUniqueUndroppableUser()
Return the unique user of this value that cannot be dropped, if there is exactly one (that user can h...
Definition: Value.cpp:179
bool use_empty() const
Definition: Value.h:344
user_iterator user_end()
Definition: Value.h:405
unsigned getNumUses() const
This method computes the number of uses of this Value.
Definition: Value.cpp:255
iterator_range< use_iterator > uses()
Definition: Value.h:376
StringRef getName() const
Return a constant reference to the value's name.
Definition: Value.cpp:309
An efficient, type-erasing, non-owning reference to a callable.
const ParentTy * getParent() const
Definition: ilist_node.h:32
self_iterator getIterator()
Definition: ilist_node.h:132
NodeTy * getNextNode()
Get the next node, or nullptr for the list tail.
Definition: ilist_node.h:353
iterator insertAfter(iterator where, pointer New)
Definition: ilist.h:174
A raw_ostream that writes to an SmallVector or SmallString.
Definition: raw_ostream.h:691
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
@ Exit
Definition: COFF.h:845
@ AMDGPU_KERNEL
Used for AMDGPU code object kernels.
Definition: CallingConv.h:200
@ C
The default llvm calling convention, compatible with C.
Definition: CallingConv.h:34
unsigned ID
LLVM IR allows the use of arbitrary numbers as calling convention identifiers.
Definition: CallingConv.h:24
initializer< Ty > init(const Ty &Val)
Definition: CommandLine.h:443
void emitOffloadingEntry(Module &M, Constant *Addr, StringRef Name, uint64_t Size, int32_t Flags, int32_t Data, StringRef SectionName)
Create an offloading section struct used to register this global at runtime.
Definition: Utility.cpp:77
OpenMPOffloadMappingFlags
Values for bit flags used to specify the mapping type for offloading.
Definition: OMPConstants.h:195
@ OMP_MAP_PTR_AND_OBJ
The element being mapped is a pointer-pointee pair; both the pointer and the pointee should be mapped...
@ OMP_MAP_MEMBER_OF
The 16 MSBs of the flags indicate whether the entry is member of some struct/class.
@ OMP_DEVICEID_UNDEF
Device ID if the device was not defined, runtime should get it from environment variables in the spec...
Definition: OMPConstants.h:252
IdentFlag
IDs for all omp runtime library ident_t flag encodings (see their definition in openmp/runtime/src/kmp...
Definition: OMPConstants.h:65
RuntimeFunction
IDs for all omp runtime library (RTL) functions.
Definition: OMPConstants.h:45
static constexpr GV NVPTXGridValues
For Nvidia GPUs.
WorksharingLoopType
A type of worksharing loop construct.
Definition: OMPConstants.h:283
OMPAtomicCompareOp
Atomic compare operations. Currently OpenMP only supports ==, >, and <.
Definition: OMPConstants.h:267
NodeAddr< PhiNode * > Phi
Definition: RDFGraph.h:390
std::error_code getUniqueID(const Twine Path, UniqueID &Result)
Definition: Path.cpp:787
This is an optimization pass for GlobalISel generic memory operations.
Definition: AddressRanges.h:18
auto drop_begin(T &&RangeOrContainer, size_t N=1)
Return a range covering RangeOrContainer with the first N elements excluded.
Definition: STLExtras.h:329
BasicBlock * splitBBWithSuffix(IRBuilderBase &Builder, bool CreateBranch, llvm::Twine Suffix=".split")
Like splitBB, but reuses the current block's name for the new name.
@ Offset
Definition: DWP.cpp:480
detail::zippy< detail::zip_shortest, T, U, Args... > zip(T &&t, U &&u, Args &&...args)
zip iterator for two or more iteratable types.
Definition: STLExtras.h:854
bool all_of(R &&range, UnaryPredicate P)
Provide wrappers to std::all_of which take ranges instead of having to pass begin/end explicitly.
Definition: STLExtras.h:1739
auto size(R &&Range, std::enable_if_t< std::is_base_of< std::random_access_iterator_tag, typename std::iterator_traits< decltype(Range.begin())>::iterator_category >::value, void > *=nullptr)
Get the size of a range.
Definition: STLExtras.h:1697
Expected< std::unique_ptr< Module > > parseBitcodeFile(MemoryBufferRef Buffer, LLVMContext &Context, ParserCallbacks Callbacks={})
Read the specified bitcode file, returning the module.
detail::zippy< detail::zip_first, T, U, Args... > zip_equal(T &&t, U &&u, Args &&...args)
zip iterator that assumes that all iteratees have the same length.
Definition: STLExtras.h:864
auto enumerate(FirstRange &&First, RestRanges &&...Rest)
Given two or more input ranges, returns a new range whose values are tuples (A, B,...
Definition: STLExtras.h:2448
decltype(auto) dyn_cast(const From &Val)
dyn_cast<X> - Return the argument parameter cast to the specified type.
Definition: Casting.h:649
auto successors(const MachineBasicBlock *BB)
testing::Matcher< const detail::ErrorHolder & > Failed()
Definition: Error.h:198
iterator_range< T > make_range(T x, T y)
Convenience function for iterating over sub-ranges.
void append_range(Container &C, Range &&R)
Wrapper function to append range R to container C.
Definition: STLExtras.h:2115
iterator_range< early_inc_iterator_impl< detail::IterOfRange< RangeT > > > make_early_inc_range(RangeT &&Range)
Make a range that does early increment to allow mutation of the underlying range without disrupting i...
Definition: STLExtras.h:657
ErrorOr< T > expectedToErrorOrAndEmitErrors(LLVMContext &Ctx, Expected< T > Val)
Definition: BitcodeReader.h:66
bool convertUsersOfConstantsToInstructions(ArrayRef< Constant * > Consts, Function *RestrictToFunc=nullptr, bool RemoveDeadConstants=true, bool IncludeSelf=false)
Replace constant expression users of the given constants with instructions.
unsigned Log2_32(uint32_t Value)
Return the floor log base 2 of the specified value, -1 if the value is zero.
Definition: MathExtras.h:342
auto reverse(ContainerTy &&C)
Definition: STLExtras.h:420
TargetTransformInfo::PeelingPreferences gatherPeelingPreferences(Loop *L, ScalarEvolution &SE, const TargetTransformInfo &TTI, std::optional< bool > UserAllowPeeling, std::optional< bool > UserAllowProfileBasedPeeling, bool UnrollingSpecficValues=false)
Definition: LoopPeel.cpp:870
void spliceBB(IRBuilderBase::InsertPoint IP, BasicBlock *New, bool CreateBranch)
Move the instruction after an InsertPoint to the beginning of another BasicBlock.
void SplitBlockAndInsertIfThenElse(Value *Cond, BasicBlock::iterator SplitBefore, Instruction **ThenTerm, Instruction **ElseTerm, MDNode *BranchWeights=nullptr, DomTreeUpdater *DTU=nullptr, LoopInfo *LI=nullptr)
SplitBlockAndInsertIfThenElse is similar to SplitBlockAndInsertIfThen, but also creates the ElseBlock...
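A minimal sketch matching the signature above (the wrapper is hypothetical): the utility splits the block before SplitBefore and wires up an if/then/else diamond, handing back the new blocks' terminators so the caller can emit code before them.
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/Instruction.h"
#include "llvm/Transforms/Utils/BasicBlockUtils.h"

static void emitDiamond(llvm::Value *Cond, llvm::Instruction *SplitBefore) {
  llvm::Instruction *ThenTerm = nullptr, *ElseTerm = nullptr;
  llvm::SplitBlockAndInsertIfThenElse(Cond, SplitBefore->getIterator(),
                                      &ThenTerm, &ElseTerm);
  // Emit the "then" side; repeat with ElseTerm for the "else" side.
  llvm::IRBuilder<> Builder(ThenTerm);
  // ... Builder.Create...(...) ...
}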
raw_ostream & dbgs()
dbgs() - This returns a reference to a raw_ostream for debugging messages.
Definition: Debug.cpp:163
bool none_of(R &&Range, UnaryPredicate P)
Provide wrappers to std::none_of which take ranges instead of having to pass begin/end explicitly.
Definition: STLExtras.h:1753
void report_fatal_error(Error Err, bool gen_crash_diag=true)
Report a serious error, calling any installed error handler.
Definition: Error.cpp:167
BasicBlock * splitBB(IRBuilderBase::InsertPoint IP, bool CreateBranch, llvm::Twine Name={})
Split a BasicBlock at an InsertPoint, even if the block is degenerate (missing the terminator).
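A minimal sketch, assuming the OMPIRBuilder.h declaration above (the wrapper name is hypothetical):
#include "llvm/Frontend/OpenMP/OMPIRBuilder.h"
#include "llvm/IR/IRBuilder.h"

// Split at the builder's current insert point; with CreateBranch set, the
// old block ends with an unconditional branch to the new one.
static llvm::BasicBlock *splitHere(llvm::IRBuilderBase &Builder) {
  return llvm::splitBB(Builder.saveIP(), /*CreateBranch=*/true, "split");
}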
CodeGenOptLevel
Code generation optimization level.
Definition: CodeGen.h:54
bool computeUnrollCount(Loop *L, const TargetTransformInfo &TTI, DominatorTree &DT, LoopInfo *LI, AssumptionCache *AC, ScalarEvolution &SE, const SmallPtrSetImpl< const Value * > &EphValues, OptimizationRemarkEmitter *ORE, unsigned TripCount, unsigned MaxTripCount, bool MaxOrZero, unsigned TripMultiple, const UnrollCostEstimator &UCE, TargetTransformInfo::UnrollingPreferences &UP, TargetTransformInfo::PeelingPreferences &PP, bool &UseUpperBound)
format_object< Ts... > format(const char *Fmt, const Ts &... Vals)
These are helper functions used to produce formatted output.
Definition: Format.h:125
raw_fd_ostream & errs()
This returns a reference to a raw_ostream for standard error.
AtomicOrdering
Atomic ordering for LLVM's memory model.
constexpr T divideCeil(U Numerator, V Denominator)
Returns the integer ceil(Numerator / Denominator).
Definition: MathExtras.h:405
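A minimal sketch (hypothetical names), e.g. the number of chunks needed to cover a trip count:
#include "llvm/Support/MathExtras.h"
#include <cstdint>

static uint64_t numChunks(uint64_t TripCount, uint64_t ChunkSize) {
  // divideCeil(10, 4) == 3: two full chunks plus one partial chunk.
  return llvm::divideCeil(TripCount, ChunkSize);
}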
BasicBlock * CloneBasicBlock(const BasicBlock *BB, ValueToValueMapTy &VMap, const Twine &NameSuffix="", Function *F=nullptr, ClonedCodeInfo *CodeInfo=nullptr)
Return a copy of the specified basic block, but without embedding the block into a particular functio...
void cantFail(Error Err, const char *Msg=nullptr)
Report a fatal error if Err is a failure value.
Definition: Error.h:756
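A minimal sketch (hypothetical call site): cantFail documents, and enforces at runtime, that a fallible API cannot fail here; there is also an Expected<T> overload that unwraps the value.
#include "llvm/Support/Error.h"
#include <utility>

static void mustSucceed(llvm::Error Err) {
  // Aborts with the message if Err is a failure value; otherwise a no-op.
  llvm::cantFail(std::move(Err), "operation unexpectedly failed");
}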
bool MergeBlockIntoPredecessor(BasicBlock *BB, DomTreeUpdater *DTU=nullptr, LoopInfo *LI=nullptr, MemorySSAUpdater *MSSAU=nullptr, MemoryDependenceResults *MemDep=nullptr, bool PredecessorWithTwoSuccessors=false, DominatorTree *DT=nullptr)
Attempts to merge a block into its predecessor, if possible.
DWARFExpression::Operation Op
void remapInstructionsInBlocks(ArrayRef< BasicBlock * > Blocks, ValueToValueMapTy &VMap)
Remaps instructions in Blocks using the mapping in VMap.
TargetTransformInfo::UnrollingPreferences gatherUnrollingPreferences(Loop *L, ScalarEvolution &SE, const TargetTransformInfo &TTI, BlockFrequencyInfo *BFI, ProfileSummaryInfo *PSI, llvm::OptimizationRemarkEmitter &ORE, int OptLevel, std::optional< unsigned > UserThreshold, std::optional< unsigned > UserCount, std::optional< bool > UserAllowPartial, std::optional< bool > UserRuntime, std::optional< bool > UserUpperBound, std::optional< unsigned > UserFullUnrollMaxCount)
Gather the various unrolling parameters based on the defaults, compiler flags, TTI overrides and user...
BasicBlock * SplitBlock(BasicBlock *Old, BasicBlock::iterator SplitPt, DominatorTree *DT, LoopInfo *LI=nullptr, MemorySSAUpdater *MSSAU=nullptr, const Twine &BBName="", bool Before=false)
Split the specified block at the specified instruction.
auto predecessors(const MachineBasicBlock *BB)
Constant * ConstantFoldInsertValueInstruction(Constant *Agg, Constant *Val, ArrayRef< unsigned > Idxs)
ConstantFoldInsertValueInstruction - Attempt to constant fold an insertvalue instruction with the spe...
@ Continue
Definition: DWP.h:21
void DeleteDeadBlocks(ArrayRef< BasicBlock * > BBs, DomTreeUpdater *DTU=nullptr, bool KeepOneInputPHIs=false)
Delete the specified dead blocks from their parent function.
This struct is a compact representation of a valid (non-zero power of two) alignment.
Definition: Alignment.h:39
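A minimal sketch of working with Align (the values are illustrative): the struct stores the log2 of the alignment, so only non-zero powers of two are representable.
#include "llvm/Support/Alignment.h"
#include <cstdint>

static uint64_t padTo8(uint64_t Size) {
  llvm::Align Eight(8);              // asserts if not a power of two
  return llvm::alignTo(Size, Eight); // alignTo(13, Align(8)) == 16
}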
static void collectEphemeralValues(const Loop *L, AssumptionCache *AC, SmallPtrSetImpl< const Value * > &EphValues)
Collect a loop's ephemeral values (those used only by an assume or similar intrinsics in the loop).
Definition: CodeMetrics.cpp:71
This struct is a compact representation of a valid (power of two) or undefined (0) alignment.
Definition: Alignment.h:117
A struct to pack relevant information while generating atomic Ops.
A struct to pack the relevant information for an OpenMP depend clause.
Description of an LLVM-IR insertion point (IP) and a debug/source location (filename,...
Definition: OMPIRBuilder.h:645
This structure contains combined information generated for mappable clauses, including base pointers,...
MapDeviceInfoArrayTy DevicePointers
StructNonContiguousInfo NonContigInfo
Helper that contains information about regions we need to outline during finalization.
void collectBlocks(SmallPtrSetImpl< BasicBlock * > &BlockSet, SmallVectorImpl< BasicBlock * > &BlockVector)
Collect all blocks in between EntryBB and ExitBB in both the given vector and set.
SmallVector< Value *, 2 > ExcludeArgsFromAggregate
Information about an OpenMP reduction.
EvalKind EvaluationKind
Reduction evaluation kind - scalar, complex or aggregate.
ReductionGenAtomicCBTy AtomicReductionGen
Callback for generating the atomic reduction body, may be null.
ReductionGenCBTy ReductionGen
Callback for generating the reduction body.
Value * Variable
Reduction variable of pointer type.
Value * PrivateVariable
Thread-private partial reduction variable.
ReductionGenClangCBTy ReductionGenClang
Clang callback for generating the reduction body.
Type * ElementType
Reduction element type, must match pointee type of variable.
Container for the arguments used to pass data to the runtime library.
Value * SizesArray
The array of sizes passed to the runtime library.
Value * PointersArray
The array of section pointers passed to the runtime library.
Value * MappersArray
The array of user-defined mappers passed to the runtime library.
Value * BasePointersArray
The array of base pointers passed to the runtime library.
Value * MapTypesArray
The array of map types passed to the runtime library for the beginning of the region or for the entir...
Value * MapNamesArray
The array of original declaration names of mapped pointers sent to the runtime library for debugging.
Data structure that contains the needed information to construct the kernel args vector.
Value * DynCGGroupMem
The size of the dynamic shared memory.
ArrayRef< Value * > NumThreads
The number of threads.
TargetDataRTArgs RTArgs
Arguments passed to the runtime library.
Value * NumIterations
The number of iterations.
unsigned NumTargetItems
Number of arguments passed to the runtime library.
bool HasNoWait
True if the kernel has the 'nowait' clause.
ArrayRef< Value * > NumTeams
The number of teams.
Container to pass the default attributes with which a kernel must be launched, used to set kernel att...
Container to pass LLVM IR runtime values or constants related to the number of teams and threads with...
Value * MaxThreads
'parallel' construct 'num_threads' clause value, if present and it is an SPMD kernel.
Value * LoopTripCount
Total number of iterations of the SPMD or Generic-SPMD kernel or null if it is a generic kernel.
Data structure to contain the information needed to uniquely identify a target entry.
Definition: OMPIRBuilder.h:203
static void getTargetRegionEntryFnName(SmallVectorImpl< char > &Name, StringRef ParentName, unsigned DeviceID, unsigned FileID, unsigned Line, unsigned Count)
static const Target * lookupTarget(StringRef Triple, std::string &Error)
lookupTarget - Lookup a target based on a target triple.
Parameters that control the generic loop unrolling transformation.
unsigned Count
A forced unrolling factor (the number of concatenated bodies of the original loop in the unrolled loo...
unsigned Threshold
The cost threshold for the unrolled loop.
bool Force
Apply loop unrolling to any kind of loop (mainly to loops that fail runtime unrolling).
unsigned PartialOptSizeThreshold
The cost threshold for the unrolled loop when optimizing for size, like OptSizeThreshold,...
unsigned PartialThreshold
The cost threshold for the unrolled loop, like Threshold, but used for partial/runtime unrolling (set...
unsigned OptSizeThreshold
The cost threshold for the unrolled loop when optimizing for size (set to UINT_MAX to disable).
Defines various target-specific GPU grid values that must be consistent between host RTL (plugin),...
Definition: OMPGridValues.h:57
unsigned GV_Warp_Size
The default value of the maximum number of threads in a worker warp.
Definition: OMPGridValues.h:61