//===- OpenMPIRBuilder.cpp - Builder for LLVM-IR for OpenMP directives ----===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
/// \file
///
/// This file implements the OpenMPIRBuilder class, which is used as a
/// convenient way to create LLVM instructions for OpenMP directives.
///
//===----------------------------------------------------------------------===//

#include "llvm/Frontend/OpenMP/OMPIRBuilder.h"
#include "llvm/ADT/SmallSet.h"
#include "llvm/ADT/StringRef.h"
#include "llvm/Analysis/TargetLibraryInfo.h"
#include "llvm/Frontend/OpenMP/OMPGridValues.h"
#include "llvm/IR/Attributes.h"
#include "llvm/IR/BasicBlock.h"
#include "llvm/IR/CFG.h"
#include "llvm/IR/CallingConv.h"
#include "llvm/IR/Constant.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/DIBuilder.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/LLVMContext.h"
#include "llvm/IR/MDBuilder.h"
#include "llvm/IR/Metadata.h"
#include "llvm/IR/PassManager.h"
#include "llvm/IR/Value.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/TargetParser/Triple.h"
#include "llvm/Transforms/Utils/BasicBlockUtils.h"
#include "llvm/Transforms/Utils/CodeExtractor.h"

#include <cstdint>
#include <optional>

#define DEBUG_TYPE "openmp-ir-builder"

using namespace llvm;
using namespace omp;

static cl::opt<bool>
    OptimisticAttributes("openmp-ir-builder-optimistic-attributes", cl::Hidden,
                         cl::desc("Use optimistic attributes describing "
                                  "'as-if' properties of runtime calls."),
                         cl::init(false));

static cl::opt<double> UnrollThresholdFactor(
    "openmp-ir-builder-unroll-threshold-factor", cl::Hidden,
    cl::desc("Factor for the unroll threshold to account for code "
             "simplifications still taking place"),
    cl::init(1.5));

#ifndef NDEBUG
/// Return whether IP1 and IP2 are ambiguous, i.e. that inserting instructions
/// at position IP1 may change the meaning of IP2 or vice-versa. This is
/// because an InsertPoint stores the instruction before something is inserted.
/// For instance, if both point to the same instruction, two IRBuilders
/// alternating creating instructions will cause the instructions to be
/// interleaved.
static bool isConflictIP(IRBuilder<>::InsertPoint IP1,
                         IRBuilder<>::InsertPoint IP2) {
  if (!IP1.isSet() || !IP2.isSet())
    return false;
  return IP1.getBlock() == IP2.getBlock() && IP1.getPoint() == IP2.getPoint();
}
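
// For illustration (a sketch, not part of the upstream source): two insert
// points saved at the same position conflict, because inserting through one
// changes what the other's saved point refers to.
//
//   IRBuilder<>::InsertPoint IP1 = Builder.saveIP();
//   IRBuilder<>::InsertPoint IP2 = Builder.saveIP();
//   assert(isConflictIP(IP1, IP2) && "same block, same point");
//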

static bool isValidWorkshareLoopScheduleType(OMPScheduleType SchedType) {
  // Valid ordered/unordered and base algorithm combinations.
  switch (SchedType & ~OMPScheduleType::MonotonicityMask) {
  case OMPScheduleType::UnorderedStaticChunked:
  case OMPScheduleType::UnorderedStatic:
  case OMPScheduleType::UnorderedDynamicChunked:
  case OMPScheduleType::UnorderedGuidedChunked:
  case OMPScheduleType::UnorderedRuntime:
  case OMPScheduleType::UnorderedAuto:
  case OMPScheduleType::UnorderedTrapezoidal:
  case OMPScheduleType::UnorderedGreedy:
  case OMPScheduleType::UnorderedBalanced:
  case OMPScheduleType::UnorderedGuidedIterativeChunked:
  case OMPScheduleType::UnorderedGuidedAnalyticalChunked:
  case OMPScheduleType::UnorderedSteal:
  case OMPScheduleType::UnorderedStaticBalancedChunked:
  case OMPScheduleType::UnorderedGuidedSimd:
  case OMPScheduleType::UnorderedRuntimeSimd:
  case OMPScheduleType::OrderedStaticChunked:
  case OMPScheduleType::OrderedStatic:
  case OMPScheduleType::OrderedDynamicChunked:
  case OMPScheduleType::OrderedGuidedChunked:
  case OMPScheduleType::OrderedRuntime:
  case OMPScheduleType::OrderedAuto:
  case OMPScheduleType::OrderdTrapezoidal:
  case OMPScheduleType::NomergeUnorderedStaticChunked:
  case OMPScheduleType::NomergeUnorderedStatic:
  case OMPScheduleType::NomergeUnorderedDynamicChunked:
  case OMPScheduleType::NomergeUnorderedGuidedChunked:
  case OMPScheduleType::NomergeUnorderedRuntime:
  case OMPScheduleType::NomergeUnorderedAuto:
  case OMPScheduleType::NomergeUnorderedTrapezoidal:
  case OMPScheduleType::NomergeUnorderedGreedy:
  case OMPScheduleType::NomergeUnorderedBalanced:
  case OMPScheduleType::NomergeUnorderedGuidedIterativeChunked:
  case OMPScheduleType::NomergeUnorderedGuidedAnalyticalChunked:
  case OMPScheduleType::NomergeUnorderedSteal:
  case OMPScheduleType::NomergeOrderedStaticChunked:
  case OMPScheduleType::NomergeOrderedStatic:
  case OMPScheduleType::NomergeOrderedDynamicChunked:
  case OMPScheduleType::NomergeOrderedGuidedChunked:
  case OMPScheduleType::NomergeOrderedRuntime:
  case OMPScheduleType::NomergeOrderedAuto:
  case OMPScheduleType::NomergeOrderedTrapezoidal:
    break;
  default:
    return false;
  }

  // Must not set both monotonicity modifiers at the same time.
  OMPScheduleType MonotonicityFlags =
      SchedType & OMPScheduleType::MonotonicityMask;
  if (MonotonicityFlags == OMPScheduleType::MonotonicityMask)
    return false;

  return true;
}
#endif
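
// For illustration (a sketch, not part of the upstream source): the
// OMPScheduleType values enumerated above are composed bitwise in
// llvm/Frontend/OpenMP/OMPConstants.h, roughly as
//
//   UnorderedDynamicChunked = BaseDynamicChunked | ModifierUnordered;
//   OrderedStatic           = BaseStatic         | ModifierOrdered;
//   NomergeUnorderedSteal   = BaseSteal | ModifierUnordered | ModifierNomerge;
//
// so masking out MonotonicityMask leaves exactly the base-algorithm and
// ordering bits that the switch checks.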

static const omp::GV &getGridValue(const Triple &T, Function *Kernel) {
  if (T.isAMDGPU()) {
    StringRef Features =
        Kernel->getFnAttribute("target-features").getValueAsString();
    if (Features.count("+wavefrontsize64"))
      return omp::getAMDGPUGridValues<64>();
    return omp::getAMDGPUGridValues<32>();
  }
  if (T.isNVPTX())
    return omp::NVPTXGridValues;
  llvm_unreachable("No grid value available for this architecture!");
}

/// Determine which scheduling algorithm to use from the schedule clause
/// arguments.
static OMPScheduleType
getOpenMPBaseScheduleType(llvm::omp::ScheduleKind ClauseKind, bool HasChunks,
                          bool HasSimdModifier) {
  // Currently, the default schedule is static.
  switch (ClauseKind) {
  case OMP_SCHEDULE_Default:
  case OMP_SCHEDULE_Static:
    return HasChunks ? OMPScheduleType::BaseStaticChunked
                     : OMPScheduleType::BaseStatic;
  case OMP_SCHEDULE_Dynamic:
    return OMPScheduleType::BaseDynamicChunked;
  case OMP_SCHEDULE_Guided:
    return HasSimdModifier ? OMPScheduleType::BaseGuidedSimd
                           : OMPScheduleType::BaseGuidedChunked;
  case OMP_SCHEDULE_Auto:
    return OMPScheduleType::BaseAuto;
  case OMP_SCHEDULE_Runtime:
    return HasSimdModifier ? OMPScheduleType::BaseRuntimeSimd
                           : OMPScheduleType::BaseRuntime;
  }
  llvm_unreachable("unhandled schedule clause argument");
}
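
// For illustration (a sketch, not part of the upstream source): the mapping
// above means, for example,
//
//   #pragma omp for schedule(static, 4)   -> BaseStaticChunked
//   #pragma omp for schedule(static)      -> BaseStatic
//   #pragma omp for simd schedule(guided) -> BaseGuidedSimd
//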

/// Adds ordering modifier flags to schedule type.
static OMPScheduleType
getOpenMPOrderingScheduleType(OMPScheduleType BaseScheduleType,
                              bool HasOrderedClause) {
  assert((BaseScheduleType & OMPScheduleType::ModifierMask) ==
             OMPScheduleType::None &&
         "Must not have ordering nor monotonicity flags already set");

  OMPScheduleType OrderingModifier = HasOrderedClause
                                         ? OMPScheduleType::ModifierOrdered
                                         : OMPScheduleType::ModifierUnordered;
  OMPScheduleType OrderingScheduleType = BaseScheduleType | OrderingModifier;

  // Unsupported combinations are mapped to the closest supported schedule.
  if (OrderingScheduleType ==
      (OMPScheduleType::BaseGuidedSimd | OMPScheduleType::ModifierOrdered))
    return OMPScheduleType::OrderedGuidedChunked;
  else if (OrderingScheduleType == (OMPScheduleType::BaseRuntimeSimd |
                                    OMPScheduleType::ModifierOrdered))
    return OMPScheduleType::OrderedRuntime;

  return OrderingScheduleType;
}

/// Adds monotonicity modifier flags to schedule type.
static OMPScheduleType
getOpenMPMonotonicityScheduleType(OMPScheduleType ScheduleType,
                                  bool HasSimdModifier, bool HasMonotonic,
                                  bool HasNonmonotonic, bool HasOrderedClause) {
  assert((ScheduleType & OMPScheduleType::MonotonicityMask) ==
             OMPScheduleType::None &&
         "Must not have monotonicity flags already set");
  assert((!HasMonotonic || !HasNonmonotonic) &&
         "Monotonic and Nonmonotonic modifiers contradict each other");

  if (HasMonotonic) {
    return ScheduleType | OMPScheduleType::ModifierMonotonic;
  } else if (HasNonmonotonic) {
    return ScheduleType | OMPScheduleType::ModifierNonmonotonic;
  } else {
    // OpenMP 5.1, 2.11.4 Worksharing-Loop Construct, Description.
    // If the static schedule kind is specified or if the ordered clause is
    // specified, and if the nonmonotonic modifier is not specified, the
    // effect is as if the monotonic modifier is specified. Otherwise, unless
    // the monotonic modifier is specified, the effect is as if the
    // nonmonotonic modifier is specified.
    OMPScheduleType BaseScheduleType =
        ScheduleType & ~OMPScheduleType::ModifierMask;
    if ((BaseScheduleType == OMPScheduleType::BaseStatic) ||
        (BaseScheduleType == OMPScheduleType::BaseStaticChunked) ||
        HasOrderedClause) {
      // The OpenMP runtime applies monotonic behavior by default, so there is
      // no need to set the flag explicitly.
      return ScheduleType;
    } else {
      return ScheduleType | OMPScheduleType::ModifierNonmonotonic;
    }
  }
}
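
// For illustration (a sketch, not part of the upstream source): with no
// explicit modifier, the OpenMP 5.1 default rule implemented above gives
//
//   static schedule, or any schedule + ordered -> unchanged (monotonic is
//                                                 the runtime default)
//   dynamic/guided/... without ordered         -> | ModifierNonmonotonic
//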

/// Determine the schedule type using schedule and ordering clause arguments.
static OMPScheduleType
computeOpenMPScheduleType(ScheduleKind ClauseKind, bool HasChunks,
                          bool HasSimdModifier, bool HasMonotonicModifier,
                          bool HasNonmonotonicModifier, bool HasOrderedClause) {
  OMPScheduleType BaseSchedule =
      getOpenMPBaseScheduleType(ClauseKind, HasChunks, HasSimdModifier);
  OMPScheduleType OrderedSchedule =
      getOpenMPOrderingScheduleType(BaseSchedule, HasOrderedClause);
  OMPScheduleType Result = getOpenMPMonotonicityScheduleType(
      OrderedSchedule, HasSimdModifier, HasMonotonicModifier,
      HasNonmonotonicModifier, HasOrderedClause);

  return Result;
}
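
// For illustration (a sketch, not part of the upstream source): for
// "#pragma omp for schedule(dynamic)" without ordered or monotonicity
// modifiers, the three steps above compose as
//
//   getOpenMPBaseScheduleType(...)         -> BaseDynamicChunked
//   getOpenMPOrderingScheduleType(...)     -> UnorderedDynamicChunked
//   getOpenMPMonotonicityScheduleType(...) -> UnorderedDynamicChunked
//                                             | ModifierNonmonotonic
//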

/// Make \p Source branch to \p Target.
///
/// Handles two situations:
/// * \p Source already has an unconditional branch.
/// * \p Source is a degenerate block (no terminator because the BB is
///   the current head of the IR construction).
static void redirectTo(BasicBlock *Source, BasicBlock *Target, DebugLoc DL) {
  if (Instruction *Term = Source->getTerminator()) {
    auto *Br = cast<BranchInst>(Term);
    assert(!Br->isConditional() &&
           "BB's terminator must be an unconditional branch (or degenerate)");
    BasicBlock *Succ = Br->getSuccessor(0);
    Succ->removePredecessor(Source, /*KeepOneInputPHIs=*/true);
    Br->setSuccessor(0, Target);
    return;
  }

  auto *NewBr = BranchInst::Create(Target, Source);
  NewBr->setDebugLoc(DL);
}

void llvm::spliceBB(IRBuilderBase::InsertPoint IP, BasicBlock *New,
                    bool CreateBranch) {
  assert(New->getFirstInsertionPt() == New->begin() &&
         "Target BB must not have PHI nodes");

  // Move instructions to new block.
  BasicBlock *Old = IP.getBlock();
  New->splice(New->begin(), Old, IP.getPoint(), Old->end());

  if (CreateBranch)
    BranchInst::Create(New, Old);
}

void llvm::spliceBB(IRBuilder<> &Builder, BasicBlock *New, bool CreateBranch) {
  DebugLoc DebugLoc = Builder.getCurrentDebugLocation();
  BasicBlock *Old = Builder.GetInsertBlock();

  spliceBB(Builder.saveIP(), New, CreateBranch);
  if (CreateBranch)
    Builder.SetInsertPoint(Old->getTerminator());
  else
    Builder.SetInsertPoint(Old);

  // SetInsertPoint also updates the Builder's debug location, but we want to
  // keep the one the Builder was configured to use.
  Builder.SetCurrentDebugLocation(DebugLoc);
}

BasicBlock *llvm::splitBB(IRBuilderBase::InsertPoint IP, bool CreateBranch,
                          llvm::Twine Name) {
  BasicBlock *Old = IP.getBlock();
  BasicBlock *New = BasicBlock::Create(
      Old->getContext(), Name.isTriviallyEmpty() ? Old->getName() : Name,
      Old->getParent(), Old->getNextNode());
  spliceBB(IP, New, CreateBranch);
  New->replaceSuccessorsPhiUsesWith(Old, New);
  return New;
}

BasicBlock *llvm::splitBB(IRBuilderBase &Builder, bool CreateBranch,
                          llvm::Twine Name) {
  DebugLoc DebugLoc = Builder.getCurrentDebugLocation();
  BasicBlock *New = splitBB(Builder.saveIP(), CreateBranch, Name);
  if (CreateBranch)
    Builder.SetInsertPoint(Builder.GetInsertBlock()->getTerminator());
  else
    Builder.SetInsertPoint(Builder.GetInsertBlock());
  // SetInsertPoint also updates the Builder's debug location, but we want to
  // keep the one the Builder was configured to use.
  Builder.SetCurrentDebugLocation(DebugLoc);
  return New;
}

BasicBlock *llvm::splitBB(IRBuilder<> &Builder, bool CreateBranch,
                          llvm::Twine Name) {
  DebugLoc DebugLoc = Builder.getCurrentDebugLocation();
  BasicBlock *New = splitBB(Builder.saveIP(), CreateBranch, Name);
  if (CreateBranch)
    Builder.SetInsertPoint(Builder.GetInsertBlock()->getTerminator());
  else
    Builder.SetInsertPoint(Builder.GetInsertBlock());
  // SetInsertPoint also updates the Builder's debug location, but we want to
  // keep the one the Builder was configured to use.
  Builder.SetCurrentDebugLocation(DebugLoc);
  return New;
}

BasicBlock *llvm::splitBBWithSuffix(IRBuilderBase &Builder, bool CreateBranch,
                                    llvm::Twine Suffix) {
  BasicBlock *Old = Builder.GetInsertBlock();
  return splitBB(Builder, CreateBranch, Old->getName() + Suffix);
}
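
// For illustration (a sketch, not part of the upstream source): a typical use
// while emitting a region.
//
//   // Split at the current insert point; the builder keeps inserting in the
//   // old block, in front of the new terminator that branches to the
//   // ".after" block.
//   BasicBlock *After = splitBBWithSuffix(Builder, /*CreateBranch=*/true,
//                                         ".after");
//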

// This function creates a fake integer value and a fake use for the integer
// value. It returns the fake value created. This is useful in modeling the
// extra arguments to the outlined functions.
static Value *createFakeIntVal(IRBuilderBase &Builder,
                               OpenMPIRBuilder::InsertPointTy OuterAllocaIP,
                               llvm::SmallVectorImpl<Instruction *> &ToBeDeleted,
                               OpenMPIRBuilder::InsertPointTy InnerAllocaIP,
                               const Twine &Name = "", bool AsPtr = true) {
  Builder.restoreIP(OuterAllocaIP);
  Instruction *FakeVal;
  AllocaInst *FakeValAddr =
      Builder.CreateAlloca(Builder.getInt32Ty(), nullptr, Name + ".addr");
  ToBeDeleted.push_back(FakeValAddr);

  if (AsPtr) {
    FakeVal = FakeValAddr;
  } else {
    FakeVal =
        Builder.CreateLoad(Builder.getInt32Ty(), FakeValAddr, Name + ".val");
    ToBeDeleted.push_back(FakeVal);
  }

  // Generate a fake use of this value.
  Builder.restoreIP(InnerAllocaIP);
  Instruction *UseFakeVal;
  if (AsPtr) {
    UseFakeVal =
        Builder.CreateLoad(Builder.getInt32Ty(), FakeVal, Name + ".use");
  } else {
    UseFakeVal =
        cast<BinaryOperator>(Builder.CreateAdd(FakeVal, Builder.getInt32(10)));
  }
  ToBeDeleted.push_back(UseFakeVal);
  return FakeVal;
}
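
// For illustration (a sketch, not part of the upstream source): with
// AsPtr == false and Name == "tc", the helper materializes roughly
//
//   %tc.addr = alloca i32             ; at OuterAllocaIP, deleted later
//   %tc.val  = load i32, ptr %tc.addr
//   %0       = add i32 %tc.val, 10    ; fake use at InnerAllocaIP
//
// so the outliner sees a live value crossing into the region.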

//===----------------------------------------------------------------------===//
// OpenMPIRBuilderConfig
//===----------------------------------------------------------------------===//

namespace {
/// Values for bit flags for marking which requires clauses have been used.
enum OpenMPOffloadingRequiresDirFlags {
  /// flag undefined.
  OMP_REQ_UNDEFINED = 0x000,
  /// no requires directive present.
  OMP_REQ_NONE = 0x001,
  /// reverse_offload clause.
  OMP_REQ_REVERSE_OFFLOAD = 0x002,
  /// unified_address clause.
  OMP_REQ_UNIFIED_ADDRESS = 0x004,
  /// unified_shared_memory clause.
  OMP_REQ_UNIFIED_SHARED_MEMORY = 0x008,
  /// dynamic_allocators clause.
  OMP_REQ_DYNAMIC_ALLOCATORS = 0x010,
  LLVM_MARK_AS_BITMASK_ENUM(/*LargestValue=*/OMP_REQ_DYNAMIC_ALLOCATORS)
};

} // anonymous namespace

OpenMPIRBuilderConfig::OpenMPIRBuilderConfig()
    : RequiresFlags(OMP_REQ_UNDEFINED) {}

OpenMPIRBuilderConfig::OpenMPIRBuilderConfig(
    bool IsTargetDevice, bool IsGPU, bool OpenMPOffloadMandatory,
    bool HasRequiresReverseOffload, bool HasRequiresUnifiedAddress,
    bool HasRequiresUnifiedSharedMemory, bool HasRequiresDynamicAllocators)
    : IsTargetDevice(IsTargetDevice), IsGPU(IsGPU),
      OpenMPOffloadMandatory(OpenMPOffloadMandatory),
      RequiresFlags(OMP_REQ_UNDEFINED) {
  if (HasRequiresReverseOffload)
    RequiresFlags |= OMP_REQ_REVERSE_OFFLOAD;
  if (HasRequiresUnifiedAddress)
    RequiresFlags |= OMP_REQ_UNIFIED_ADDRESS;
  if (HasRequiresUnifiedSharedMemory)
    RequiresFlags |= OMP_REQ_UNIFIED_SHARED_MEMORY;
  if (HasRequiresDynamicAllocators)
    RequiresFlags |= OMP_REQ_DYNAMIC_ALLOCATORS;
}

bool OpenMPIRBuilderConfig::hasRequiresReverseOffload() const {
  return RequiresFlags & OMP_REQ_REVERSE_OFFLOAD;
}

bool OpenMPIRBuilderConfig::hasRequiresUnifiedAddress() const {
  return RequiresFlags & OMP_REQ_UNIFIED_ADDRESS;
}

bool OpenMPIRBuilderConfig::hasRequiresUnifiedSharedMemory() const {
  return RequiresFlags & OMP_REQ_UNIFIED_SHARED_MEMORY;
}

bool OpenMPIRBuilderConfig::hasRequiresDynamicAllocators() const {
  return RequiresFlags & OMP_REQ_DYNAMIC_ALLOCATORS;
}

int64_t OpenMPIRBuilderConfig::getRequiresFlags() const {
  return hasRequiresFlags() ? RequiresFlags
                            : static_cast<int64_t>(OMP_REQ_NONE);
}

void OpenMPIRBuilderConfig::setHasRequiresReverseOffload(bool Value) {
  if (Value)
    RequiresFlags |= OMP_REQ_REVERSE_OFFLOAD;
  else
    RequiresFlags &= ~OMP_REQ_REVERSE_OFFLOAD;
}

void OpenMPIRBuilderConfig::setHasRequiresUnifiedAddress(bool Value) {
  if (Value)
    RequiresFlags |= OMP_REQ_UNIFIED_ADDRESS;
  else
    RequiresFlags &= ~OMP_REQ_UNIFIED_ADDRESS;
}

void OpenMPIRBuilderConfig::setHasRequiresUnifiedSharedMemory(bool Value) {
  if (Value)
    RequiresFlags |= OMP_REQ_UNIFIED_SHARED_MEMORY;
  else
    RequiresFlags &= ~OMP_REQ_UNIFIED_SHARED_MEMORY;
}

void OpenMPIRBuilderConfig::setHasRequiresDynamicAllocators(bool Value) {
  if (Value)
    RequiresFlags |= OMP_REQ_DYNAMIC_ALLOCATORS;
  else
    RequiresFlags &= ~OMP_REQ_DYNAMIC_ALLOCATORS;
}
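
// For illustration (a sketch, not part of the upstream source; the setConfig
// call is how tests typically wire this up): a frontend would configure the
// builder roughly like
//
//   OpenMPIRBuilderConfig Config(/*IsTargetDevice=*/false, /*IsGPU=*/false,
//                                /*OpenMPOffloadMandatory=*/false,
//                                /*HasRequiresReverseOffload=*/false,
//                                /*HasRequiresUnifiedAddress=*/false,
//                                /*HasRequiresUnifiedSharedMemory=*/true,
//                                /*HasRequiresDynamicAllocators=*/false);
//   OMPBuilder.setConfig(Config);
//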

//===----------------------------------------------------------------------===//
// OpenMPIRBuilder
//===----------------------------------------------------------------------===//

void OpenMPIRBuilder::getKernelArgsVector(TargetKernelArgs &KernelArgs,
                                          IRBuilderBase &Builder,
                                          SmallVector<Value *> &ArgsVector) {
  Value *Version = Builder.getInt32(OMP_KERNEL_ARG_VERSION);
  Value *PointerNum = Builder.getInt32(KernelArgs.NumTargetItems);
  auto Int32Ty = Type::getInt32Ty(Builder.getContext());
  constexpr const size_t MaxDim = 3;
  Value *ZeroArray = Constant::getNullValue(ArrayType::get(Int32Ty, MaxDim));
  Value *Flags = Builder.getInt64(KernelArgs.HasNoWait);

  assert(!KernelArgs.NumTeams.empty() && !KernelArgs.NumThreads.empty());

  Value *NumTeams3D =
      Builder.CreateInsertValue(ZeroArray, KernelArgs.NumTeams[0], {0});
  Value *NumThreads3D =
      Builder.CreateInsertValue(ZeroArray, KernelArgs.NumThreads[0], {0});
  for (unsigned I :
       seq<unsigned>(1, std::min(KernelArgs.NumTeams.size(), MaxDim)))
    NumTeams3D =
        Builder.CreateInsertValue(NumTeams3D, KernelArgs.NumTeams[I], {I});
  for (unsigned I :
       seq<unsigned>(1, std::min(KernelArgs.NumThreads.size(), MaxDim)))
    NumThreads3D =
        Builder.CreateInsertValue(NumThreads3D, KernelArgs.NumThreads[I], {I});

  ArgsVector = {Version,
                PointerNum,
                KernelArgs.RTArgs.BasePointersArray,
                KernelArgs.RTArgs.PointersArray,
                KernelArgs.RTArgs.SizesArray,
                KernelArgs.RTArgs.MapTypesArray,
                KernelArgs.RTArgs.MapNamesArray,
                KernelArgs.RTArgs.MappersArray,
                KernelArgs.NumIterations,
                Flags,
                NumTeams3D,
                NumThreads3D,
                KernelArgs.DynCGGroupMem};
}

void OpenMPIRBuilder::addAttributes(omp::RuntimeFunction FnID, Function &Fn) {
  LLVMContext &Ctx = Fn.getContext();

  // Get the function's current attributes.
  auto Attrs = Fn.getAttributes();
  auto FnAttrs = Attrs.getFnAttrs();
  auto RetAttrs = Attrs.getRetAttrs();
  SmallVector<AttributeSet, 4> ArgAttrs;
  for (size_t ArgNo = 0; ArgNo < Fn.arg_size(); ++ArgNo)
    ArgAttrs.emplace_back(Attrs.getParamAttrs(ArgNo));

  // Add AS to FnAS while taking special care with integer extensions.
  auto addAttrSet = [&](AttributeSet &FnAS, const AttributeSet &AS,
                        bool Param = true) -> void {
    bool HasSignExt = AS.hasAttribute(Attribute::SExt);
    bool HasZeroExt = AS.hasAttribute(Attribute::ZExt);
    if (HasSignExt || HasZeroExt) {
      assert(AS.getNumAttributes() == 1 &&
             "Currently not handling extension attr combined with others.");
      if (Param) {
        if (auto AK = TargetLibraryInfo::getExtAttrForI32Param(T, HasSignExt))
          FnAS = FnAS.addAttribute(Ctx, AK);
      } else if (auto AK =
                     TargetLibraryInfo::getExtAttrForI32Return(T, HasSignExt))
        FnAS = FnAS.addAttribute(Ctx, AK);
    } else {
      FnAS = FnAS.addAttributes(Ctx, AS);
    }
  };

#define OMP_ATTRS_SET(VarName, AttrSet) AttributeSet VarName = AttrSet;
#include "llvm/Frontend/OpenMP/OMPKinds.def"

  // Add attributes to the function declaration.
  switch (FnID) {
#define OMP_RTL_ATTRS(Enum, FnAttrSet, RetAttrSet, ArgAttrSets)                \
  case Enum:                                                                   \
    FnAttrs = FnAttrs.addAttributes(Ctx, FnAttrSet);                           \
    addAttrSet(RetAttrs, RetAttrSet, /*Param*/ false);                         \
    for (size_t ArgNo = 0; ArgNo < ArgAttrSets.size(); ++ArgNo)                \
      addAttrSet(ArgAttrs[ArgNo], ArgAttrSets[ArgNo]);                         \
    Fn.setAttributes(AttributeList::get(Ctx, FnAttrs, RetAttrs, ArgAttrs));    \
    break;
#include "llvm/Frontend/OpenMP/OMPKinds.def"
  default:
    // Attributes are optional.
    break;
  }
}

FunctionCallee
OpenMPIRBuilder::getOrCreateRuntimeFunction(Module &M, RuntimeFunction FnID) {
  FunctionType *FnTy = nullptr;
  Function *Fn = nullptr;

  // Try to find the declaration in the module first.
  switch (FnID) {
#define OMP_RTL(Enum, Str, IsVarArg, ReturnType, ...)                          \
  case Enum:                                                                   \
    FnTy = FunctionType::get(ReturnType, ArrayRef<Type *>{__VA_ARGS__},        \
                             IsVarArg);                                        \
    Fn = M.getFunction(Str);                                                   \
    break;
#include "llvm/Frontend/OpenMP/OMPKinds.def"
  }

  if (!Fn) {
    // Create a new declaration if we need one.
    switch (FnID) {
#define OMP_RTL(Enum, Str, ...)                                                \
  case Enum:                                                                   \
    Fn = Function::Create(FnTy, GlobalValue::ExternalLinkage, Str, M);         \
    break;
#include "llvm/Frontend/OpenMP/OMPKinds.def"
    }

    // Add information if the runtime function takes a callback function.
    if (FnID == OMPRTL___kmpc_fork_call || FnID == OMPRTL___kmpc_fork_teams) {
      if (!Fn->hasMetadata(LLVMContext::MD_callback)) {
        LLVMContext &Ctx = Fn->getContext();
        MDBuilder MDB(Ctx);
        // Annotate the callback behavior of the runtime function:
        // - The callback callee is argument number 2 (microtask).
        // - The first two arguments of the callback callee are unknown (-1).
        // - All variadic arguments to the runtime function are passed to the
        //   callback callee.
        Fn->addMetadata(
            LLVMContext::MD_callback,
            *MDNode::get(Ctx, {MDB.createCallbackEncoding(
                                  2, {-1, -1}, /* VarArgsArePassed */ true)}));
      }
    }

    LLVM_DEBUG(dbgs() << "Created OpenMP runtime function " << Fn->getName()
                      << " with type " << *Fn->getFunctionType() << "\n");
    addAttributes(FnID, *Fn);

  } else {
    LLVM_DEBUG(dbgs() << "Found OpenMP runtime function " << Fn->getName()
                      << " with type " << *Fn->getFunctionType() << "\n");
  }

  assert(Fn && "Failed to create OpenMP runtime function");

  return {FnTy, Fn};
}
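
// For illustration (a sketch, not part of the upstream source): callers pick
// declarations by enum, e.g.
//
//   FunctionCallee Barrier =
//       OMPBuilder.getOrCreateRuntimeFunction(M, OMPRTL___kmpc_barrier);
//
// where the name and type come from the corresponding OMP_RTL entry in
// OMPKinds.def.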

Function *OpenMPIRBuilder::getOrCreateRuntimeFunctionPtr(RuntimeFunction FnID) {
  FunctionCallee RTLFn = getOrCreateRuntimeFunction(M, FnID);
  auto *Fn = dyn_cast<llvm::Function>(RTLFn.getCallee());
  assert(Fn && "Failed to create OpenMP runtime function pointer");
  return Fn;
}

void OpenMPIRBuilder::initialize() { initializeTypes(M); }

static void raiseUserConstantDataAllocasToEntryBlock(IRBuilderBase &Builder,
                                                     Function *Function) {
  BasicBlock &EntryBlock = Function->getEntryBlock();
  Instruction *MoveLocInst = EntryBlock.getFirstNonPHI();

  // Loop over blocks looking for constant allocas, skipping the entry block
  // as any allocas there are already in the desired location.
  for (auto Block = std::next(Function->begin(), 1); Block != Function->end();
       Block++) {
    for (auto Inst = Block->getReverseIterator()->begin();
         Inst != Block->getReverseIterator()->end();) {
      if (auto *AllocaInst = dyn_cast_if_present<llvm::AllocaInst>(Inst)) {
        Inst++;
        if (!isa<ConstantData>(AllocaInst->getArraySize()))
          continue;
        AllocaInst->moveBeforePreserving(MoveLocInst);
      } else {
        Inst++;
      }
    }
  }
}

void OpenMPIRBuilder::finalize(Function *Fn) {
  SmallPtrSet<BasicBlock *, 32> ParallelRegionBlockSet;
  SmallVector<BasicBlock *, 32> Blocks;
  SmallVector<OutlineInfo, 16> DeferredOutlines;
  for (OutlineInfo &OI : OutlineInfos) {
    // Skip functions that have not finalized yet; may happen with nested
    // function generation.
    if (Fn && OI.getFunction() != Fn) {
      DeferredOutlines.push_back(OI);
      continue;
    }

    ParallelRegionBlockSet.clear();
    Blocks.clear();
    OI.collectBlocks(ParallelRegionBlockSet, Blocks);

    Function *OuterFn = OI.getFunction();
    CodeExtractorAnalysisCache CEAC(*OuterFn);
    // If we generate code for the target device, we need to allocate the
    // struct for aggregate params in the device default alloca address space.
    // The OpenMP runtime requires that the params of the extracted functions
    // are passed as zero address space pointers. This flag ensures that
    // CodeExtractor generates correct code for extracted functions
    // which are used by the OpenMP runtime.
    bool ArgsInZeroAddressSpace = Config.isTargetDevice();
    CodeExtractor Extractor(Blocks, /* DominatorTree */ nullptr,
                            /* AggregateArgs */ true,
                            /* BlockFrequencyInfo */ nullptr,
                            /* BranchProbabilityInfo */ nullptr,
                            /* AssumptionCache */ nullptr,
                            /* AllowVarArgs */ true,
                            /* AllowAlloca */ true,
                            /* AllocaBlock */ OI.OuterAllocaBB,
                            /* Suffix */ ".omp_par", ArgsInZeroAddressSpace);

    LLVM_DEBUG(dbgs() << "Before outlining: " << *OuterFn << "\n");
    LLVM_DEBUG(dbgs() << "Entry " << OI.EntryBB->getName()
                      << " Exit: " << OI.ExitBB->getName() << "\n");
    assert(Extractor.isEligible() &&
           "Expected OpenMP outlining to be possible!");

    for (auto *V : OI.ExcludeArgsFromAggregate)
      Extractor.excludeArgFromAggregate(V);

    Function *OutlinedFn = Extractor.extractCodeRegion(CEAC);

    // Forward target-cpu, target-features attributes to the outlined function.
    auto TargetCpuAttr = OuterFn->getFnAttribute("target-cpu");
    if (TargetCpuAttr.isStringAttribute())
      OutlinedFn->addFnAttr(TargetCpuAttr);

    auto TargetFeaturesAttr = OuterFn->getFnAttribute("target-features");
    if (TargetFeaturesAttr.isStringAttribute())
      OutlinedFn->addFnAttr(TargetFeaturesAttr);

    LLVM_DEBUG(dbgs() << "After outlining: " << *OuterFn << "\n");
    LLVM_DEBUG(dbgs() << " Outlined function: " << *OutlinedFn << "\n");
    assert(OutlinedFn->getReturnType()->isVoidTy() &&
           "OpenMP outlined functions should not return a value!");

    // For compatibility with the clang CG we move the outlined function after
    // the one with the parallel region.
    OutlinedFn->removeFromParent();
    M.getFunctionList().insertAfter(OuterFn->getIterator(), OutlinedFn);

    // Remove the artificial entry introduced by the extractor right away, we
    // made our own entry block after all.
    {
      BasicBlock &ArtificialEntry = OutlinedFn->getEntryBlock();
      assert(ArtificialEntry.getUniqueSuccessor() == OI.EntryBB);
      assert(OI.EntryBB->getUniquePredecessor() == &ArtificialEntry);
      // Move instructions from the to-be-deleted ArtificialEntry to the entry
      // basic block of the parallel region. CodeExtractor generates
      // instructions to unwrap the aggregate argument and may sink
      // allocas/bitcasts for values that are solely used in the outlined
      // region and do not escape.
      assert(!ArtificialEntry.empty() &&
             "Expected instructions to add in the outlined region entry");
      for (BasicBlock::reverse_iterator It = ArtificialEntry.rbegin(),
                                        End = ArtificialEntry.rend();
           It != End;) {
        Instruction &I = *It;
        It++;

        if (I.isTerminator())
          continue;

        I.moveBeforePreserving(*OI.EntryBB, OI.EntryBB->getFirstInsertionPt());
      }

      OI.EntryBB->moveBefore(&ArtificialEntry);
      ArtificialEntry.eraseFromParent();
    }
    assert(&OutlinedFn->getEntryBlock() == OI.EntryBB);
    assert(OutlinedFn && OutlinedFn->getNumUses() == 1);

    // Run a user callback, e.g. to add attributes.
    if (OI.PostOutlineCB)
      OI.PostOutlineCB(*OutlinedFn);
  }

  // Remove work items that have been completed.
  OutlineInfos = std::move(DeferredOutlines);

  // The createTarget functions embed user-written code into the target
  // region, which may inject allocas that need to be moved to the entry
  // block of our target or risk being malformed by later optimisations.
  // This is only relevant for the device pass, which appears to be a little
  // more delicate when it comes to optimisations (however, we do not block
  // on that here; it's up to whoever inserts into the list to do so).
  // This notably has to occur after the OutlinedInfo candidates have been
  // extracted, so we have an end product that will not be implicitly
  // adversely affected by any raises unless intentionally appended to the
  // list.
  // NOTE: This only does so for ConstantData; it could be extended to
  // ConstantExprs with further effort, however, they should largely be
  // folded by the time they get here. Extending it to runtime-defined or
  // read+writeable allocation sizes would be non-trivial (we would need to
  // factor in movement of any stores to variables the allocation size
  // depends on, as well as the usual loads, otherwise it would yield the
  // wrong result after movement) and would likely be more suitable as an
  // LLVM optimisation pass.
  for (Function *F : ConstantAllocaRaiseCandidates)
    raiseUserConstantDataAllocasToEntryBlock(Builder, F);

  EmitMetadataErrorReportFunctionTy &&ErrorReportFn =
      [](EmitMetadataErrorKind Kind,
         const TargetRegionEntryInfo &EntryInfo) -> void {
    errs() << "Error of kind: " << Kind
           << " when emitting offload entries and metadata during "
              "OMPIRBuilder finalization \n";
  };

  if (!OffloadInfoManager.empty())
    createOffloadEntriesAndInfoMetadata(ErrorReportFn);

  if (Config.EmitLLVMUsedMetaInfo.value_or(false)) {
    std::vector<WeakTrackingVH> LLVMCompilerUsed = {
        M.getGlobalVariable("__openmp_nvptx_data_transfer_temporary_storage")};
    emitUsed("llvm.compiler.used", LLVMCompilerUsed);
  }
}

OpenMPIRBuilder::~OpenMPIRBuilder() {
  assert(OutlineInfos.empty() && "There must be no outstanding outlinings");
}

GlobalValue *OpenMPIRBuilder::createGlobalFlag(unsigned Value, StringRef Name) {
  Type *I32Ty = Type::getInt32Ty(M.getContext());
  auto *GV =
      new GlobalVariable(M, I32Ty,
                         /* isConstant = */ true, GlobalValue::WeakODRLinkage,
                         ConstantInt::get(I32Ty, Value), Name);
  GV->setVisibility(GlobalValue::HiddenVisibility);

  return GV;
}

void OpenMPIRBuilder::emitUsed(StringRef Name, ArrayRef<WeakTrackingVH> List) {
  if (List.empty())
    return;

  // Convert List to what ConstantArray needs.
  SmallVector<Constant *, 8> UsedArray;
  UsedArray.resize(List.size());
  for (unsigned I = 0, E = List.size(); I != E; ++I)
    UsedArray[I] = ConstantExpr::getPointerBitCastOrAddrSpaceCast(
        cast<Constant>(&*List[I]), Builder.getPtrTy());

  if (UsedArray.empty())
    return;
  ArrayType *ATy = ArrayType::get(Builder.getPtrTy(), UsedArray.size());

  auto *GV = new GlobalVariable(M, ATy, false, GlobalValue::AppendingLinkage,
                                ConstantArray::get(ATy, UsedArray), Name);

  GV->setSection("llvm.metadata");
}

GlobalVariable *
OpenMPIRBuilder::emitKernelExecutionMode(StringRef KernelName,
                                         OMPTgtExecModeFlags Mode) {
  auto *Int8Ty = Builder.getInt8Ty();
  auto *GVMode = new GlobalVariable(
      M, Int8Ty, /*isConstant=*/true, GlobalValue::WeakAnyLinkage,
      ConstantInt::get(Int8Ty, Mode), Twine(KernelName, "_exec_mode"));
  GVMode->setVisibility(GlobalVariable::ProtectedVisibility);
  return GVMode;
}

Constant *OpenMPIRBuilder::getOrCreateIdent(Constant *SrcLocStr,
                                            uint32_t SrcLocStrSize,
                                            IdentFlag LocFlags,
                                            unsigned Reserve2Flags) {
  // Enable "C-mode".
  LocFlags |= OMP_IDENT_FLAG_KMPC;

  Constant *&Ident =
      IdentMap[{SrcLocStr, uint64_t(LocFlags) << 31 | Reserve2Flags}];
  if (!Ident) {
    Constant *I32Null = ConstantInt::getNullValue(Int32);
    Constant *IdentData[] = {I32Null,
                             ConstantInt::get(Int32, uint32_t(LocFlags)),
                             ConstantInt::get(Int32, Reserve2Flags),
                             ConstantInt::get(Int32, SrcLocStrSize), SrcLocStr};
    Constant *Initializer =
        ConstantStruct::get(OpenMPIRBuilder::Ident, IdentData);

    // Look for existing encoding of the location + flags, not needed but
    // minimizes the difference to the existing solution while we transition.
    for (GlobalVariable &GV : M.globals())
      if (GV.getValueType() == OpenMPIRBuilder::Ident && GV.hasInitializer())
        if (GV.getInitializer() == Initializer)
          Ident = &GV;

    if (!Ident) {
      auto *GV = new GlobalVariable(
          M, OpenMPIRBuilder::Ident,
          /* isConstant = */ true, GlobalValue::PrivateLinkage, Initializer, "",
          nullptr, GlobalValue::NotThreadLocal,
          M.getDataLayout().getDefaultGlobalsAddressSpace());
      GV->setUnnamedAddr(GlobalValue::UnnamedAddr::Global);
      GV->setAlignment(Align(8));
      Ident = GV;
    }
  }

  return ConstantExpr::getPointerBitCastOrAddrSpaceCast(Ident, IdentPtr);
}
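
// For illustration (a sketch, not part of the upstream source): the emitted
// global is the classic ident_t of the KMP runtime, roughly
//
//   @0 = private unnamed_addr constant %struct.ident_t
//            { i32 0, i32 2, i32 0, i32 <SrcLocStrSize>, ptr @.str }, align 8
//
// where i32 2 is OMP_IDENT_FLAG_KMPC and @.str is the ";file;func;line;col;;"
// source-location string built below.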

Constant *OpenMPIRBuilder::getOrCreateSrcLocStr(StringRef LocStr,
                                                uint32_t &SrcLocStrSize) {
  SrcLocStrSize = LocStr.size();
  Constant *&SrcLocStr = SrcLocStrMap[LocStr];
  if (!SrcLocStr) {
    Constant *Initializer =
        ConstantDataArray::getString(M.getContext(), LocStr);

    // Look for existing encoding of the location, not needed but minimizes the
    // difference to the existing solution while we transition.
    for (GlobalVariable &GV : M.globals())
      if (GV.isConstant() && GV.hasInitializer() &&
          GV.getInitializer() == Initializer)
        return SrcLocStr = ConstantExpr::getPointerCast(&GV, Int8Ptr);

    SrcLocStr = Builder.CreateGlobalString(LocStr, /* Name */ "",
                                           /* AddressSpace */ 0, &M);
  }
  return SrcLocStr;
}

Constant *OpenMPIRBuilder::getOrCreateSrcLocStr(StringRef FunctionName,
                                                StringRef FileName,
                                                unsigned Line, unsigned Column,
                                                uint32_t &SrcLocStrSize) {
  SmallString<128> Buffer;
  Buffer.push_back(';');
  Buffer.append(FileName);
  Buffer.push_back(';');
  Buffer.append(FunctionName);
  Buffer.push_back(';');
  Buffer.append(std::to_string(Line));
  Buffer.push_back(';');
  Buffer.append(std::to_string(Column));
  Buffer.push_back(';');
  Buffer.push_back(';');
  return getOrCreateSrcLocStr(Buffer.str(), SrcLocStrSize);
}

Constant *
OpenMPIRBuilder::getOrCreateDefaultSrcLocStr(uint32_t &SrcLocStrSize) {
  StringRef UnknownLoc = ";unknown;unknown;0;0;;";
  return getOrCreateSrcLocStr(UnknownLoc, SrcLocStrSize);
}

Constant *OpenMPIRBuilder::getOrCreateSrcLocStr(DebugLoc DL,
                                                uint32_t &SrcLocStrSize,
                                                Function *F) {
  DILocation *DIL = DL.get();
  if (!DIL)
    return getOrCreateDefaultSrcLocStr(SrcLocStrSize);
  StringRef FileName = M.getName();
  if (DIFile *DIF = DIL->getFile())
    if (std::optional<StringRef> Source = DIF->getSource())
      FileName = *Source;
  StringRef Function = DIL->getScope()->getSubprogram()->getName();
  if (Function.empty() && F)
    Function = F->getName();
  return getOrCreateSrcLocStr(Function, FileName, DIL->getLine(),
                              DIL->getColumn(), SrcLocStrSize);
}

Constant *OpenMPIRBuilder::getOrCreateSrcLocStr(const LocationDescription &Loc,
                                                uint32_t &SrcLocStrSize) {
  return getOrCreateSrcLocStr(Loc.DL, SrcLocStrSize,
                              Loc.IP.getBlock()->getParent());
}

Value *OpenMPIRBuilder::getOrCreateThreadID(Value *Ident) {
  return Builder.CreateCall(
      getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_global_thread_num), Ident,
      "omp_global_thread_num");
}

OpenMPIRBuilder::InsertPointOrErrorTy
OpenMPIRBuilder::createBarrier(const LocationDescription &Loc, Directive Kind,
                               bool ForceSimpleCall, bool CheckCancelFlag) {
  if (!updateToLocation(Loc))
    return Loc.IP;

  // Build call __kmpc_cancel_barrier(loc, thread_id) or
  //            __kmpc_barrier(loc, thread_id);

  IdentFlag BarrierLocFlags;
  switch (Kind) {
  case OMPD_for:
    BarrierLocFlags = OMP_IDENT_FLAG_BARRIER_IMPL_FOR;
    break;
  case OMPD_sections:
    BarrierLocFlags = OMP_IDENT_FLAG_BARRIER_IMPL_SECTIONS;
    break;
  case OMPD_single:
    BarrierLocFlags = OMP_IDENT_FLAG_BARRIER_IMPL_SINGLE;
    break;
  case OMPD_barrier:
    BarrierLocFlags = OMP_IDENT_FLAG_BARRIER_EXPL;
    break;
  default:
    BarrierLocFlags = OMP_IDENT_FLAG_BARRIER_IMPL;
    break;
  }

  uint32_t SrcLocStrSize;
  Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
  Value *Args[] = {
      getOrCreateIdent(SrcLocStr, SrcLocStrSize, BarrierLocFlags),
      getOrCreateThreadID(getOrCreateIdent(SrcLocStr, SrcLocStrSize))};

  // If we are in a cancellable parallel region, barriers are cancellation
  // points.
  // TODO: Check why we would force simple calls or to ignore the cancel flag.
  bool UseCancelBarrier =
      !ForceSimpleCall && isLastFinalizationInfoCancellable(OMPD_parallel);

  Value *Result =
      Builder.CreateCall(getOrCreateRuntimeFunctionPtr(
                             UseCancelBarrier ? OMPRTL___kmpc_cancel_barrier
                                              : OMPRTL___kmpc_barrier),
                         Args);

  if (UseCancelBarrier && CheckCancelFlag)
    if (Error Err = emitCancelationCheckImpl(Result, OMPD_parallel))
      return Err;

  return Builder.saveIP();
}
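
// For illustration (a sketch, not part of the upstream source): inside a
// cancellable parallel region this emits
//
//   %r = call i32 @__kmpc_cancel_barrier(ptr @loc, i32 %tid)
//
// followed by the cancellation check on %r; otherwise it is a plain
// "call void @__kmpc_barrier(ptr @loc, i32 %tid)" with no flag to test.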

OpenMPIRBuilder::InsertPointOrErrorTy
OpenMPIRBuilder::createCancel(const LocationDescription &Loc,
                              Value *IfCondition,
                              omp::Directive CanceledDirective) {
  if (!updateToLocation(Loc))
    return Loc.IP;

  // LLVM utilities like blocks with terminators.
  auto *UI = Builder.CreateUnreachable();

  Instruction *ThenTI = UI, *ElseTI = nullptr;
  if (IfCondition)
    SplitBlockAndInsertIfThenElse(IfCondition, UI, &ThenTI, &ElseTI);
  Builder.SetInsertPoint(ThenTI);

  Value *CancelKind = nullptr;
  switch (CanceledDirective) {
#define OMP_CANCEL_KIND(Enum, Str, DirectiveEnum, Value)                       \
  case DirectiveEnum:                                                          \
    CancelKind = Builder.getInt32(Value);                                      \
    break;
#include "llvm/Frontend/OpenMP/OMPKinds.def"
  default:
    llvm_unreachable("Unknown cancel kind!");
  }

  uint32_t SrcLocStrSize;
  Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
  Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
  Value *Args[] = {Ident, getOrCreateThreadID(Ident), CancelKind};
  Value *Result = Builder.CreateCall(
      getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_cancel), Args);
  auto ExitCB = [this, CanceledDirective, Loc](InsertPointTy IP) -> Error {
    if (CanceledDirective == OMPD_parallel) {
      IRBuilder<>::InsertPointGuard IPG(Builder);
      Builder.restoreIP(IP);
      return createBarrier(LocationDescription(Builder.saveIP(), Loc.DL),
                           omp::Directive::OMPD_unknown,
                           /* ForceSimpleCall */ false,
                           /* CheckCancelFlag */ false)
          .takeError();
    }
    return Error::success();
  };

  // The actual cancel logic is shared with others, e.g., cancel_barriers.
  if (Error Err = emitCancelationCheckImpl(Result, CanceledDirective, ExitCB))
    return Err;

  // Update the insertion point and remove the terminator we introduced.
  Builder.SetInsertPoint(UI->getParent());
  UI->eraseFromParent();

  return Builder.saveIP();
}

OpenMPIRBuilder::InsertPointTy OpenMPIRBuilder::emitTargetKernel(
    const LocationDescription &Loc, InsertPointTy AllocaIP, Value *&Return,
    Value *Ident, Value *DeviceID, Value *NumTeams, Value *NumThreads,
    Value *HostPtr, ArrayRef<Value *> KernelArgs) {
  if (!updateToLocation(Loc))
    return Loc.IP;

  Builder.restoreIP(AllocaIP);
  auto *KernelArgsPtr =
      Builder.CreateAlloca(OpenMPIRBuilder::KernelArgs, nullptr, "kernel_args");
  Builder.restoreIP(Loc.IP);

  for (unsigned I = 0, Size = KernelArgs.size(); I != Size; ++I) {
    llvm::Value *Arg =
        Builder.CreateStructGEP(OpenMPIRBuilder::KernelArgs, KernelArgsPtr, I);
    Builder.CreateAlignedStore(
        KernelArgs[I], Arg,
        M.getDataLayout().getPrefTypeAlign(KernelArgs[I]->getType()));
  }

  SmallVector<Value *> OffloadingArgs{Ident,      DeviceID, NumTeams,
                                      NumThreads, HostPtr,  KernelArgsPtr};

  Return = Builder.CreateCall(
      getOrCreateRuntimeFunction(M, OMPRTL___tgt_target_kernel),
      OffloadingArgs);

  return Builder.saveIP();
}

OpenMPIRBuilder::InsertPointOrErrorTy OpenMPIRBuilder::emitKernelLaunch(
    const LocationDescription &Loc, Value *OutlinedFnID,
    EmitFallbackCallbackTy EmitTargetCallFallbackCB, TargetKernelArgs &Args,
    Value *DeviceID, Value *RTLoc, InsertPointTy AllocaIP) {

  if (!updateToLocation(Loc))
    return Loc.IP;

  Builder.restoreIP(Loc.IP);
  // On top of the arrays that were filled up, the target offloading call
  // takes as arguments the device id as well as the host pointer. The host
  // pointer is used by the runtime library to identify the current target
  // region, so it only has to be unique and not necessarily point to
  // anything. It could be the pointer to the outlined function that
  // implements the target region, but we aren't using that so that the
  // compiler doesn't need to keep that, and could therefore inline the host
  // function if proven worthwhile during optimization.

  // From this point on, we need to have an ID of the target region defined.
  assert(OutlinedFnID && "Invalid outlined function ID!");
  (void)OutlinedFnID;

  // Return value of the runtime offloading call.
  Value *Return = nullptr;

  // Arguments for the target kernel.
  SmallVector<Value *> ArgsVector;
  getKernelArgsVector(Args, Builder, ArgsVector);

  // The target region is an outlined function launched by the runtime
  // via calls to __tgt_target_kernel().
  //
  // Note that on the host and CPU targets, the runtime implementation of
  // these calls simply call the outlined function without forking threads.
  // The outlined functions themselves have runtime calls to
  // __kmpc_fork_teams() and __kmpc_fork() for this purpose, codegen'd by
  // the compiler in emitTeamsCall() and emitParallelCall().
  //
  // In contrast, on the NVPTX target, the implementation of
  // __tgt_target_teams() launches a GPU kernel with the requested number
  // of teams and threads so no additional calls to the runtime are required.
  // Check the error code and execute the host version if required.
  Builder.restoreIP(emitTargetKernel(
      Builder, AllocaIP, Return, RTLoc, DeviceID, Args.NumTeams.front(),
      Args.NumThreads.front(), OutlinedFnID, ArgsVector));

  BasicBlock *OffloadFailedBlock =
      BasicBlock::Create(Builder.getContext(), "omp_offload.failed");
  BasicBlock *OffloadContBlock =
      BasicBlock::Create(Builder.getContext(), "omp_offload.cont");
  Value *Failed = Builder.CreateIsNotNull(Return);
  Builder.CreateCondBr(Failed, OffloadFailedBlock, OffloadContBlock);

  auto CurFn = Builder.GetInsertBlock()->getParent();
  emitBlock(OffloadFailedBlock, CurFn);
  InsertPointOrErrorTy AfterIP = EmitTargetCallFallbackCB(Builder.saveIP());
  if (!AfterIP)
    return AfterIP.takeError();
  Builder.restoreIP(*AfterIP);
  emitBranch(OffloadContBlock);
  emitBlock(OffloadContBlock, CurFn, /*IsFinished=*/true);
  return Builder.saveIP();
}
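
// For illustration (a sketch, not part of the upstream source): the control
// flow emitted above is
//
//   %rc = call i32 @__tgt_target_kernel(...)   ; 0 on success
//   br i1 (%rc != 0), label %omp_offload.failed, label %omp_offload.cont
//
// with the host fallback (EmitTargetCallFallbackCB) filling in the failed
// block before both paths rejoin at omp_offload.cont.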

Error OpenMPIRBuilder::emitCancelationCheckImpl(
    Value *CancelFlag, omp::Directive CanceledDirective,
    FinalizeCallbackTy ExitCB) {
  assert(isLastFinalizationInfoCancellable(CanceledDirective) &&
         "Unexpected cancellation!");

  // For a cancel barrier we create two new blocks.
  BasicBlock *BB = Builder.GetInsertBlock();
  BasicBlock *NonCancellationBlock;
  if (Builder.GetInsertPoint() == BB->end()) {
    // TODO: This branch will not be needed once we moved to the
    // OpenMPIRBuilder codegen completely.
    NonCancellationBlock = BasicBlock::Create(
        BB->getContext(), BB->getName() + ".cont", BB->getParent());
  } else {
    NonCancellationBlock = SplitBlock(BB, &*Builder.GetInsertPoint());
    BB->getTerminator()->eraseFromParent();
    Builder.SetInsertPoint(BB);
  }
  BasicBlock *CancellationBlock = BasicBlock::Create(
      BB->getContext(), BB->getName() + ".cncl", BB->getParent());

  // Jump to them based on the return value.
  Value *Cmp = Builder.CreateIsNull(CancelFlag);
  Builder.CreateCondBr(Cmp, NonCancellationBlock, CancellationBlock,
                       /* TODO weight */ nullptr, nullptr);

  // From the cancellation block we finalize all variables and go to the
  // post finalization block that is known to the FiniCB callback.
  Builder.SetInsertPoint(CancellationBlock);
  if (ExitCB)
    if (Error Err = ExitCB(Builder.saveIP()))
      return Err;
  auto &FI = FinalizationStack.back();
  if (Error Err = FI.FiniCB(Builder.saveIP()))
    return Err;

  // The continuation block is where code generation continues.
  Builder.SetInsertPoint(NonCancellationBlock, NonCancellationBlock->begin());
  return Error::success();
}

// Callback used to create OpenMP runtime calls to support the
// "omp parallel" clause for the device.
// We need this callback to replace the call to OutlinedFn in OuterFn with a
// call to the OpenMP DeviceRTL runtime function (kmpc_parallel_51).
static void targetParallelCallback(
    OpenMPIRBuilder *OMPIRBuilder, Function &OutlinedFn, Function *OuterFn,
    BasicBlock *OuterAllocaBB, Value *Ident, Value *IfCondition,
    Value *NumThreads, Instruction *PrivTID, AllocaInst *PrivTIDAddr,
    Value *ThreadID, const SmallVector<Instruction *, 4> &ToBeDeleted) {
  // Add some known attributes.
  IRBuilder<> &Builder = OMPIRBuilder->Builder;
  OutlinedFn.addParamAttr(0, Attribute::NoAlias);
  OutlinedFn.addParamAttr(1, Attribute::NoAlias);
  OutlinedFn.addParamAttr(0, Attribute::NoUndef);
  OutlinedFn.addParamAttr(1, Attribute::NoUndef);
  OutlinedFn.addFnAttr(Attribute::NoUnwind);

  assert(OutlinedFn.arg_size() >= 2 &&
         "Expected at least tid and bounded tid as arguments");
  unsigned NumCapturedVars = OutlinedFn.arg_size() - /* tid & bounded tid */ 2;

  CallInst *CI = cast<CallInst>(OutlinedFn.user_back());
  assert(CI && "Expected call instruction to outlined function");
  CI->getParent()->setName("omp_parallel");

  Builder.SetInsertPoint(CI);
  Type *PtrTy = OMPIRBuilder->VoidPtr;
  Value *NullPtrValue = Constant::getNullValue(PtrTy);

  // Add an alloca for the kernel args.
  OpenMPIRBuilder::InsertPointTy CurrentIP = Builder.saveIP();
  Builder.SetInsertPoint(OuterAllocaBB, OuterAllocaBB->getFirstInsertionPt());
  AllocaInst *ArgsAlloca =
      Builder.CreateAlloca(ArrayType::get(PtrTy, NumCapturedVars));
  Value *Args = ArgsAlloca;
  // Add an address space cast if the array for storing arguments is not
  // allocated in address space 0.
  if (ArgsAlloca->getAddressSpace())
    Args = Builder.CreatePointerCast(ArgsAlloca, PtrTy);
  Builder.restoreIP(CurrentIP);

  // Store captured vars which are used by kmpc_parallel_51.
  for (unsigned Idx = 0; Idx < NumCapturedVars; Idx++) {
    Value *V = *(CI->arg_begin() + 2 + Idx);
    Value *StoreAddress = Builder.CreateConstInBoundsGEP2_64(
        ArrayType::get(PtrTy, NumCapturedVars), Args, 0, Idx);
    Builder.CreateStore(V, StoreAddress);
  }

  Value *Cond =
      IfCondition ? Builder.CreateSExtOrTrunc(IfCondition, OMPIRBuilder->Int32)
                  : Builder.getInt32(1);

  // Build the kmpc_parallel_51 call.
  Value *Parallel51CallArgs[] = {
      /* identifier */ Ident,
      /* global thread num */ ThreadID,
      /* if expression */ Cond,
      /* number of threads */ NumThreads ? NumThreads : Builder.getInt32(-1),
      /* proc bind */ Builder.getInt32(-1),
      /* outlined function */
      Builder.CreateBitCast(&OutlinedFn, OMPIRBuilder->ParallelTaskPtr),
      /* wrapper function */ NullPtrValue,
      /* arguments of the outlined function */ Args,
      /* number of arguments */ Builder.getInt64(NumCapturedVars)};

  FunctionCallee RTLFn =
      OMPIRBuilder->getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_parallel_51);

  Builder.CreateCall(RTLFn, Parallel51CallArgs);

  LLVM_DEBUG(dbgs() << "With kmpc_parallel_51 placed: "
                    << *Builder.GetInsertBlock()->getParent() << "\n");

  // Initialize the local TID stack location with the argument value.
  Builder.SetInsertPoint(PrivTID);
  Function::arg_iterator OutlinedAI = OutlinedFn.arg_begin();
  Builder.CreateStore(Builder.CreateLoad(OMPIRBuilder->Int32, OutlinedAI),
                      PrivTIDAddr);

  // Remove the redundant call to the outlined function.
  CI->eraseFromParent();

  for (Instruction *I : ToBeDeleted) {
    I->eraseFromParent();
  }
}
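
// For reference (a hedged sketch, not part of the upstream source): the device
// runtime entry point targeted above is declared in the OpenMP DeviceRTL
// roughly as
//
//   void __kmpc_parallel_51(ident_t *ident, int32_t gtid, int32_t if_expr,
//                           int32_t num_threads, int32_t proc_bind, void *fn,
//                           void *wrapper_fn, void **args, int64_t nargs);
//
// which matches the nine arguments assembled in Parallel51CallArgs.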

// Callback used to create OpenMP runtime calls to support the
// "omp parallel" clause for the host.
// We need this callback to replace the call to OutlinedFn in OuterFn with a
// call to the OpenMP host runtime function (__kmpc_fork_call[_if]).
static void
hostParallelCallback(OpenMPIRBuilder *OMPIRBuilder, Function &OutlinedFn,
                     Function *OuterFn, Value *Ident, Value *IfCondition,
                     Instruction *PrivTID, AllocaInst *PrivTIDAddr,
                     const SmallVector<Instruction *, 4> &ToBeDeleted) {
  IRBuilder<> &Builder = OMPIRBuilder->Builder;
  FunctionCallee RTLFn;
  if (IfCondition) {
    RTLFn =
        OMPIRBuilder->getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_fork_call_if);
  } else {
    RTLFn =
        OMPIRBuilder->getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_fork_call);
  }
  if (auto *F = dyn_cast<Function>(RTLFn.getCallee())) {
    if (!F->hasMetadata(LLVMContext::MD_callback)) {
      LLVMContext &Ctx = F->getContext();
      MDBuilder MDB(Ctx);
      // Annotate the callback behavior of the __kmpc_fork_call:
      // - The callback callee is argument number 2 (microtask).
      // - The first two arguments of the callback callee are unknown (-1).
      // - All variadic arguments to the __kmpc_fork_call are passed to the
      //   callback callee.
      F->addMetadata(LLVMContext::MD_callback,
                     *MDNode::get(Ctx, {MDB.createCallbackEncoding(
                                           2, {-1, -1},
                                           /* VarArgsArePassed */ true)}));
    }
  }
  // Add some known attributes.
  OutlinedFn.addParamAttr(0, Attribute::NoAlias);
  OutlinedFn.addParamAttr(1, Attribute::NoAlias);
  OutlinedFn.addFnAttr(Attribute::NoUnwind);

  assert(OutlinedFn.arg_size() >= 2 &&
         "Expected at least tid and bounded tid as arguments");
  unsigned NumCapturedVars = OutlinedFn.arg_size() - /* tid & bounded tid */ 2;

  CallInst *CI = cast<CallInst>(OutlinedFn.user_back());
  CI->getParent()->setName("omp_parallel");
  Builder.SetInsertPoint(CI);

  // Build call __kmpc_fork_call[_if](Ident, n, microtask, var1, .., varn);
  Value *ForkCallArgs[] = {
      Ident, Builder.getInt32(NumCapturedVars),
      Builder.CreateBitCast(&OutlinedFn, OMPIRBuilder->ParallelTaskPtr)};

  SmallVector<Value *, 16> RealArgs;
  RealArgs.append(std::begin(ForkCallArgs), std::end(ForkCallArgs));
  if (IfCondition) {
    Value *Cond = Builder.CreateSExtOrTrunc(IfCondition, OMPIRBuilder->Int32);
    RealArgs.push_back(Cond);
  }
  RealArgs.append(CI->arg_begin() + /* tid & bound tid */ 2, CI->arg_end());

  // __kmpc_fork_call_if always expects a void ptr as the last argument.
  // If there are no arguments, pass a null pointer.
  auto PtrTy = OMPIRBuilder->VoidPtr;
  if (IfCondition && NumCapturedVars == 0) {
    Value *NullPtrValue = Constant::getNullValue(PtrTy);
    RealArgs.push_back(NullPtrValue);
  }
  if (IfCondition && RealArgs.back()->getType() != PtrTy)
    RealArgs.back() = Builder.CreateBitCast(RealArgs.back(), PtrTy);

  Builder.CreateCall(RTLFn, RealArgs);

  LLVM_DEBUG(dbgs() << "With fork_call placed: "
                    << *Builder.GetInsertBlock()->getParent() << "\n");

  // Initialize the local TID stack location with the argument value.
  Builder.SetInsertPoint(PrivTID);
  Function::arg_iterator OutlinedAI = OutlinedFn.arg_begin();
  Builder.CreateStore(Builder.CreateLoad(OMPIRBuilder->Int32, OutlinedAI),
                      PrivTIDAddr);

  // Remove the redundant call to the outlined function.
  CI->eraseFromParent();

  for (Instruction *I : ToBeDeleted) {
    I->eraseFromParent();
  }
}
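
// For reference (a hedged sketch, not part of the upstream source): the host
// entry point used above is the variadic KMP runtime function, declared
// roughly as
//
//   void __kmpc_fork_call(ident_t *loc, kmp_int32 argc,
//                         kmpc_micro microtask, ...);
//
// so the captured variables collected in RealArgs travel as trailing varargs;
// __kmpc_fork_call_if additionally takes the "if" condition.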
1399
1401 const LocationDescription &Loc, InsertPointTy OuterAllocaIP,
1402 BodyGenCallbackTy BodyGenCB, PrivatizeCallbackTy PrivCB,
1403 FinalizeCallbackTy FiniCB, Value *IfCondition, Value *NumThreads,
1404 omp::ProcBindKind ProcBind, bool IsCancellable) {
1405 assert(!isConflictIP(Loc.IP, OuterAllocaIP) && "IPs must not be ambiguous");
1406
1407 if (!updateToLocation(Loc))
1408 return Loc.IP;
1409
1410 uint32_t SrcLocStrSize;
1411 Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
1412 Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
1413 Value *ThreadID = getOrCreateThreadID(Ident);
1414 // If we generate code for the target device, we need to allocate
1415 // struct for aggregate params in the device default alloca address space.
1416 // OpenMP runtime requires that the params of the extracted functions are
1417 // passed as zero address space pointers. This flag ensures that extracted
1418 // function arguments are declared in zero address space
1419 bool ArgsInZeroAddressSpace = Config.isTargetDevice();
1420
1421 // Build call __kmpc_push_num_threads(&Ident, global_tid, num_threads)
1422 // only if we compile for host side.
1423 if (NumThreads && !Config.isTargetDevice()) {
1424 Value *Args[] = {
1425 Ident, ThreadID,
1426 Builder.CreateIntCast(NumThreads, Int32, /*isSigned*/ false)};
1428 getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_push_num_threads), Args);
1429 }
1430
1431 if (ProcBind != OMP_PROC_BIND_default) {
1432 // Build call __kmpc_push_proc_bind(&Ident, global_tid, proc_bind)
1433 Value *Args[] = {
1434 Ident, ThreadID,
1435 ConstantInt::get(Int32, unsigned(ProcBind), /*isSigned=*/true)};
1437 getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_push_proc_bind), Args);
1438 }
1439
1440 BasicBlock *InsertBB = Builder.GetInsertBlock();
1441 Function *OuterFn = InsertBB->getParent();
1442
1443 // Save the outer alloca block because the insertion iterator may get
1444 // invalidated and we still need this later.
1445 BasicBlock *OuterAllocaBlock = OuterAllocaIP.getBlock();
1446
1447 // Vector to remember instructions we used only during the modeling but which
1448 // we want to delete at the end.
1450
1451 // Change the location to the outer alloca insertion point to create and
1452 // initialize the allocas we pass into the parallel region.
1453 InsertPointTy NewOuter(OuterAllocaBlock, OuterAllocaBlock->begin());
1454 Builder.restoreIP(NewOuter);
1455 AllocaInst *TIDAddrAlloca = Builder.CreateAlloca(Int32, nullptr, "tid.addr");
1456 AllocaInst *ZeroAddrAlloca =
1457 Builder.CreateAlloca(Int32, nullptr, "zero.addr");
1458 Instruction *TIDAddr = TIDAddrAlloca;
1459 Instruction *ZeroAddr = ZeroAddrAlloca;
1460 if (ArgsInZeroAddressSpace && M.getDataLayout().getAllocaAddrSpace() != 0) {
1461 // Add additional casts to enforce pointers in zero address space
1462 TIDAddr = new AddrSpaceCastInst(
1463 TIDAddrAlloca, PointerType ::get(M.getContext(), 0), "tid.addr.ascast");
1464 TIDAddr->insertAfter(TIDAddrAlloca);
1465 ToBeDeleted.push_back(TIDAddr);
1466 ZeroAddr = new AddrSpaceCastInst(ZeroAddrAlloca,
1467 PointerType ::get(M.getContext(), 0),
1468 "zero.addr.ascast");
1469 ZeroAddr->insertAfter(ZeroAddrAlloca);
1470 ToBeDeleted.push_back(ZeroAddr);
1471 }
1472
1473 // We only need TIDAddr and ZeroAddr for modeling purposes to get the
1474 // associated arguments in the outlined function, so we delete them later.
1475 ToBeDeleted.push_back(TIDAddrAlloca);
1476 ToBeDeleted.push_back(ZeroAddrAlloca);
1477
1478 // Create an artificial insertion point that will also ensure the blocks we
1479 // are about to split are not degenerated.
1480 auto *UI = new UnreachableInst(Builder.getContext(), InsertBB);
1481
1482 BasicBlock *EntryBB = UI->getParent();
1483 BasicBlock *PRegEntryBB = EntryBB->splitBasicBlock(UI, "omp.par.entry");
1484 BasicBlock *PRegBodyBB = PRegEntryBB->splitBasicBlock(UI, "omp.par.region");
1485 BasicBlock *PRegPreFiniBB =
1486 PRegBodyBB->splitBasicBlock(UI, "omp.par.pre_finalize");
1487 BasicBlock *PRegExitBB = PRegPreFiniBB->splitBasicBlock(UI, "omp.par.exit");
1488
1489 auto FiniCBWrapper = [&](InsertPointTy IP) {
1490 // Hide "open-ended" blocks from the given FiniCB by setting the right jump
1491 // target to the region exit block.
1492 if (IP.getBlock()->end() == IP.getPoint()) {
1494 Builder.restoreIP(IP);
1495 Instruction *I = Builder.CreateBr(PRegExitBB);
1496 IP = InsertPointTy(I->getParent(), I->getIterator());
1497 }
1498 assert(IP.getBlock()->getTerminator()->getNumSuccessors() == 1 &&
1499 IP.getBlock()->getTerminator()->getSuccessor(0) == PRegExitBB &&
1500 "Unexpected insertion point for finalization call!");
1501 return FiniCB(IP);
1502 };
1503
1504 FinalizationStack.push_back({FiniCBWrapper, OMPD_parallel, IsCancellable});
1505
1506 // Generate the privatization allocas in the block that will become the entry
1507 // of the outlined function.
1508 Builder.SetInsertPoint(PRegEntryBB->getTerminator());
1509 InsertPointTy InnerAllocaIP = Builder.saveIP();
1510
1511 AllocaInst *PrivTIDAddr =
1512 Builder.CreateAlloca(Int32, nullptr, "tid.addr.local");
1513 Instruction *PrivTID = Builder.CreateLoad(Int32, PrivTIDAddr, "tid");
1514
1515 // Add some fake uses for OpenMP provided arguments.
1516 ToBeDeleted.push_back(Builder.CreateLoad(Int32, TIDAddr, "tid.addr.use"));
1517 Instruction *ZeroAddrUse =
1518 Builder.CreateLoad(Int32, ZeroAddr, "zero.addr.use");
1519 ToBeDeleted.push_back(ZeroAddrUse);
1520
1521 // EntryBB
1522 // |
1523 // V
1524 // PRegionEntryBB <- Privatization allocas are placed here.
1525 // |
1526 // V
1527 // PRegionBodyBB <- BodeGen is invoked here.
1528 // |
1529 // V
1530 // PRegPreFiniBB <- The block we will start finalization from.
1531 // |
1532 // V
1533 // PRegionExitBB <- A common exit to simplify block collection.
1534 //
1535
1536 LLVM_DEBUG(dbgs() << "Before body codegen: " << *OuterFn << "\n");
1537
1538 // Let the caller create the body.
1539 assert(BodyGenCB && "Expected body generation callback!");
1540 InsertPointTy CodeGenIP(PRegBodyBB, PRegBodyBB->begin());
1541 if (Error Err = BodyGenCB(InnerAllocaIP, CodeGenIP))
1542 return Err;
1543
1544 LLVM_DEBUG(dbgs() << "After body codegen: " << *OuterFn << "\n");
1545
1546 OutlineInfo OI;
1547 if (Config.isTargetDevice()) {
1548 // Generate OpenMP target specific runtime call
1549 OI.PostOutlineCB = [=, ToBeDeletedVec =
1550 std::move(ToBeDeleted)](Function &OutlinedFn) {
1551 targetParallelCallback(this, OutlinedFn, OuterFn, OuterAllocaBlock, Ident,
1552 IfCondition, NumThreads, PrivTID, PrivTIDAddr,
1553 ThreadID, ToBeDeletedVec);
1554 };
1555 } else {
1556 // Generate OpenMP host runtime call
1557 OI.PostOutlineCB = [=, ToBeDeletedVec =
1558 std::move(ToBeDeleted)](Function &OutlinedFn) {
1559 hostParallelCallback(this, OutlinedFn, OuterFn, Ident, IfCondition,
1560 PrivTID, PrivTIDAddr, ToBeDeletedVec);
1561 };
1562 }
1563
1564 OI.OuterAllocaBB = OuterAllocaBlock;
1565 OI.EntryBB = PRegEntryBB;
1566 OI.ExitBB = PRegExitBB;
1567
1568 SmallPtrSet<BasicBlock *, 32> ParallelRegionBlockSet;
1570 OI.collectBlocks(ParallelRegionBlockSet, Blocks);
1571
1572 // Ensure a single exit node for the outlined region by creating one.
1573 // We might have multiple incoming edges to the exit now due to finalizations,
1574 // e.g., cancel calls that cause the control flow to leave the region.
1575 BasicBlock *PRegOutlinedExitBB = PRegExitBB;
1576 PRegExitBB = SplitBlock(PRegExitBB, &*PRegExitBB->getFirstInsertionPt());
1577 PRegOutlinedExitBB->setName("omp.par.outlined.exit");
1578 Blocks.push_back(PRegOutlinedExitBB);
1579
1580 CodeExtractorAnalysisCache CEAC(*OuterFn);
1581 CodeExtractor Extractor(Blocks, /* DominatorTree */ nullptr,
1582 /* AggregateArgs */ false,
1583 /* BlockFrequencyInfo */ nullptr,
1584 /* BranchProbabilityInfo */ nullptr,
1585 /* AssumptionCache */ nullptr,
1586 /* AllowVarArgs */ true,
1587 /* AllowAlloca */ true,
1588 /* AllocationBlock */ OuterAllocaBlock,
1589 /* Suffix */ ".omp_par", ArgsInZeroAddressSpace);
1590
1591 // Find inputs to, outputs from the code region.
1592 BasicBlock *CommonExit = nullptr;
1593 SetVector<Value *> Inputs, Outputs, SinkingCands, HoistingCands;
1594 Extractor.findAllocas(CEAC, SinkingCands, HoistingCands, CommonExit);
1595
1596 Extractor.findInputsOutputs(Inputs, Outputs, SinkingCands,
1597 /*CollectGlobalInputs=*/true);
1598
1599 Inputs.remove_if([&](Value *I) {
1600 if (auto *GV = dyn_cast_if_present<GlobalVariable>(I))
1601 return GV->getValueType() == OpenMPIRBuilder::Ident;
1602
1603 return false;
1604 });
1605
1606 LLVM_DEBUG(dbgs() << "Before privatization: " << *OuterFn << "\n");
1607
1608 FunctionCallee TIDRTLFn =
1609 getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_global_thread_num);
1610
1611 auto PrivHelper = [&](Value &V) -> Error {
1612 if (&V == TIDAddr || &V == ZeroAddr) {
1613 OI.ExcludeArgsFromAggregate.push_back(&V);
1614 return Error::success();
1615 }
1616
1617 SetVector<Use *> Uses;
1618 for (Use &U : V.uses())
1619 if (auto *UserI = dyn_cast<Instruction>(U.getUser()))
1620 if (ParallelRegionBlockSet.count(UserI->getParent()))
1621 Uses.insert(&U);
1622
1623 // __kmpc_fork_call expects extra arguments as pointers. If the input
1624 // already has a pointer type, everything is fine. Otherwise, store the
1625 // value onto the stack and load it back inside the to-be-outlined region.
1626 // This ensures that only the pointer is passed to the function.
1627 // FIXME: if there are more than 15 trailing arguments, they must be
1628 // additionally packed in a struct.
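// As an illustrative sketch (value names hypothetical), an i32 input %x is
// forwarded as:
// \code{c}
//   %x.reloaded = alloca i32            ; at the outer alloca point
//   store i32 %x, ptr %x.reloaded       ; end of block entering the region
//   %inner = load i32, ptr %x.reloaded  ; at the inner alloca point
// \endcode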
1629 Value *Inner = &V;
1630 if (!V.getType()->isPointerTy()) {
1632 LLVM_DEBUG(llvm::dbgs() << "Forwarding input as pointer: " << V << "\n");
1633
1634 Builder.restoreIP(OuterAllocaIP);
1635 Value *Ptr =
1636 Builder.CreateAlloca(V.getType(), nullptr, V.getName() + ".reloaded");
1637
1638 // Store to stack at end of the block that currently branches to the entry
1639 // block of the to-be-outlined region.
1640 Builder.SetInsertPoint(InsertBB,
1641 InsertBB->getTerminator()->getIterator());
1642 Builder.CreateStore(&V, Ptr);
1643
1644 // Load back next to allocations in the to-be-outlined region.
1645 Builder.restoreIP(InnerAllocaIP);
1646 Inner = Builder.CreateLoad(V.getType(), Ptr);
1647 }
1648
1649 Value *ReplacementValue = nullptr;
1650 CallInst *CI = dyn_cast<CallInst>(&V);
1651 if (CI && CI->getCalledFunction() == TIDRTLFn.getCallee()) {
1652 ReplacementValue = PrivTID;
1653 } else {
1654 InsertPointOrErrorTy AfterIP =
1655 PrivCB(InnerAllocaIP, Builder.saveIP(), V, *Inner, ReplacementValue);
1656 if (!AfterIP)
1657 return AfterIP.takeError();
1658 Builder.restoreIP(*AfterIP);
1659 InnerAllocaIP = {
1660 InnerAllocaIP.getBlock(),
1661 InnerAllocaIP.getBlock()->getTerminator()->getIterator()};
1662
1663 assert(ReplacementValue &&
1664 "Expected copy/create callback to set replacement value!");
1665 if (ReplacementValue == &V)
1666 return Error::success();
1667 }
1668
1669 for (Use *UPtr : Uses)
1670 UPtr->set(ReplacementValue);
1671
1672 return Error::success();
1673 };
1674
1675 // Reset the inner alloca insertion as it will be used for loading the values
1676 // wrapped into pointers before passing them into the to-be-outlined region.
1677 // Configure it to insert immediately after the fake use of the zero address
1678 // so that the reloaded values are available in the generated body and so
1679 // that the OpenMP-related values (thread ID and zero address pointers)
1680 // remain leading in the argument list.
1681 InnerAllocaIP = IRBuilder<>::InsertPoint(
1682 ZeroAddrUse->getParent(), ZeroAddrUse->getNextNode()->getIterator());
1683
1684 // Reset the outer alloca insertion point to the entry of the relevant block
1685 // in case it was invalidated.
1686 OuterAllocaIP = IRBuilder<>::InsertPoint(
1687 OuterAllocaBlock, OuterAllocaBlock->getFirstInsertionPt());
1688
1689 for (Value *Input : Inputs) {
1690 LLVM_DEBUG(dbgs() << "Captured input: " << *Input << "\n");
1691 if (Error Err = PrivHelper(*Input))
1692 return Err;
1693 }
1694 LLVM_DEBUG({
1695 for (Value *Output : Outputs)
1696 LLVM_DEBUG(dbgs() << "Captured output: " << *Output << "\n");
1697 });
1698 assert(Outputs.empty() &&
1699 "OpenMP outlining should not produce live-out values!");
1700
1701 LLVM_DEBUG(dbgs() << "After privatization: " << *OuterFn << "\n");
1702 LLVM_DEBUG({
1703 for (auto *BB : Blocks)
1704 dbgs() << " PBR: " << BB->getName() << "\n";
1705 });
1706
1707 // Adjust the finalization stack, verify the adjustment, and call the
1708 // finalize function a last time to finalize values between the pre-fini
1709 // block and the exit block if we left the parallel region "the normal way".
1710 auto FiniInfo = FinalizationStack.pop_back_val();
1711 (void)FiniInfo;
1712 assert(FiniInfo.DK == OMPD_parallel &&
1713 "Unexpected finalization stack state!");
1714
1715 Instruction *PRegPreFiniTI = PRegPreFiniBB->getTerminator();
1716
1717 InsertPointTy PreFiniIP(PRegPreFiniBB, PRegPreFiniTI->getIterator());
1718 if (Error Err = FiniCB(PreFiniIP))
1719 return Err;
1720
1721 // Register the outlined info.
1722 addOutlineInfo(std::move(OI));
1723
1724 InsertPointTy AfterIP(UI->getParent(), UI->getParent()->end());
1725 UI->eraseFromParent();
1726
1727 return AfterIP;
1728}
1729
1730 void OpenMPIRBuilder::emitFlush(const LocationDescription &Loc) {
1731 // Build call void __kmpc_flush(ident_t *loc)
1732 uint32_t SrcLocStrSize;
1733 Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
1734 Value *Args[] = {getOrCreateIdent(SrcLocStr, SrcLocStrSize)};
1735
1736 Builder.CreateCall(getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_flush), Args);
1737}
1738
1739 void OpenMPIRBuilder::createFlush(const LocationDescription &Loc) {
1740 if (!updateToLocation(Loc))
1741 return;
1742 emitFlush(Loc);
1743}
1744
1745 void OpenMPIRBuilder::emitTaskwaitImpl(const LocationDescription &Loc) {
1746 // Build call kmp_int32 __kmpc_omp_taskwait(ident_t *loc, kmp_int32
1747 // global_tid);
1748 uint32_t SrcLocStrSize;
1749 Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
1750 Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
1751 Value *Args[] = {Ident, getOrCreateThreadID(Ident)};
1752
1753 // Ignore return result until untied tasks are supported.
1754 Builder.CreateCall(getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_omp_taskwait),
1755 Args);
1756}
1757
1758 void OpenMPIRBuilder::createTaskwait(const LocationDescription &Loc) {
1759 if (!updateToLocation(Loc))
1760 return;
1761 emitTaskwaitImpl(Loc);
1762}
1763
1764 void OpenMPIRBuilder::emitTaskyieldImpl(const LocationDescription &Loc) {
1765 // Build call __kmpc_omp_taskyield(loc, thread_id, 0);
1766 uint32_t SrcLocStrSize;
1767 Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
1768 Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
1769 Constant *I32Null = ConstantInt::getNullValue(Int32);
1770 Value *Args[] = {Ident, getOrCreateThreadID(Ident), I32Null};
1771
1772 Builder.CreateCall(getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_omp_taskyield),
1773 Args);
1774}
1775
1776 void OpenMPIRBuilder::createTaskyield(const LocationDescription &Loc) {
1777 if (!updateToLocation(Loc))
1778 return;
1779 emitTaskyieldImpl(Loc);
1780}
1781
1782 // Processes the dependencies in Dependencies and does the following:
1783 // - Allocates space on the stack for an array of DependInfo objects
1784 // - Populates each DependInfo object with the relevant information about
1785 // the corresponding dependence
1786 // - All code is inserted in the entry block of the current function.
1787 static Value *emitTaskDependencies(
1788 OpenMPIRBuilder &OMPBuilder,
1789 SmallVectorImpl<OpenMPIRBuilder::DependData> &Dependencies) {
1790 // Early return if we have no dependencies to process
1791 if (Dependencies.empty())
1792 return nullptr;
1793
1794 // Given a vector of DependData objects, in this function we create an
1795 // array on the stack that holds kmp_depend_info objects corresponding
1796 // to each dependency. This is then passed to the OpenMP runtime.
1797 // For example, if there are 'n' dependencies then the following pseudo
1798 // code is generated. Assume the first dependence is on a variable 'a'.
1799 //
1800 // \code{c}
1801 // DepArray = alloc(n x sizeof(kmp_depend_info));
1802 // idx = 0;
1803 // DepArray[idx].base_addr = ptrtoint(&a);
1804 // DepArray[idx].len = 8;
1805 // DepArray[idx].flags = Dep.DepKind; /*(See OMPConstants.h for DepKind)*/
1806 // ++idx;
1807 // DepArray[idx].base_addr = ...;
1808 // \endcode
1809
1810 IRBuilderBase &Builder = OMPBuilder.Builder;
1811 Type *DependInfo = OMPBuilder.DependInfo;
1812 Module &M = OMPBuilder.M;
1813
1814 Value *DepArray = nullptr;
1815 OpenMPIRBuilder::InsertPointTy OldIP = Builder.saveIP();
1816 Builder.SetInsertPoint(
1817 &OldIP.getBlock()->getParent()->getEntryBlock().back());
1818
1819 Type *DepArrayTy = ArrayType::get(DependInfo, Dependencies.size());
1820 DepArray = Builder.CreateAlloca(DepArrayTy, nullptr, ".dep.arr.addr");
1821
1822 for (const auto &[DepIdx, Dep] : enumerate(Dependencies)) {
1823 Value *Base =
1824 Builder.CreateConstInBoundsGEP2_64(DepArrayTy, DepArray, 0, DepIdx);
1825 // Store the pointer to the variable
1826 Value *Addr = Builder.CreateStructGEP(
1827 DependInfo, Base,
1828 static_cast<unsigned int>(RTLDependInfoFields::BaseAddr));
1829 Value *DepValPtr = Builder.CreatePtrToInt(Dep.DepVal, Builder.getInt64Ty());
1830 Builder.CreateStore(DepValPtr, Addr);
1831 // Store the size of the variable
1832 Value *Size = Builder.CreateStructGEP(
1833 DependInfo, Base, static_cast<unsigned int>(RTLDependInfoFields::Len));
1834 Builder.CreateStore(
1835 Builder.getInt64(M.getDataLayout().getTypeStoreSize(Dep.DepValueType)),
1836 Size);
1837 // Store the dependency kind
1838 Value *Flags = Builder.CreateStructGEP(
1839 DependInfo, Base,
1840 static_cast<unsigned int>(RTLDependInfoFields::Flags));
1841 Builder.CreateStore(
1842 ConstantInt::get(Builder.getInt8Ty(),
1843 static_cast<unsigned int>(Dep.DepKind)),
1844 Flags);
1845 }
1846 Builder.restoreIP(OldIP);
1847 return DepArray;
1848}
1849
1850 OpenMPIRBuilder::InsertPointOrErrorTy OpenMPIRBuilder::createTask(
1851 const LocationDescription &Loc, InsertPointTy AllocaIP,
1852 BodyGenCallbackTy BodyGenCB, bool Tied, Value *Final, Value *IfCondition,
1853 SmallVector<DependData> Dependencies, bool Mergeable, Value *EventHandle,
1854 Value *Priority) {
1855
1856 if (!updateToLocation(Loc))
1857 return InsertPointTy();
1858
1859 uint32_t SrcLocStrSize;
1860 Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
1861 Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
1862 // The current basic block is split into four basic blocks. After outlining,
1863 // they will be mapped as follows:
1864 // ```
1865 // def current_fn() {
1866 // current_basic_block:
1867 // br label %task.exit
1868 // task.exit:
1869 // ; instructions after task
1870 // }
1871 // def outlined_fn() {
1872 // task.alloca:
1873 // br label %task.body
1874 // task.body:
1875 // ret void
1876 // }
1877 // ```
1878 BasicBlock *TaskExitBB = splitBB(Builder, /*CreateBranch=*/true, "task.exit");
1879 BasicBlock *TaskBodyBB = splitBB(Builder, /*CreateBranch=*/true, "task.body");
1880 BasicBlock *TaskAllocaBB =
1881 splitBB(Builder, /*CreateBranch=*/true, "task.alloca");
1882
1883 InsertPointTy TaskAllocaIP =
1884 InsertPointTy(TaskAllocaBB, TaskAllocaBB->begin());
1885 InsertPointTy TaskBodyIP = InsertPointTy(TaskBodyBB, TaskBodyBB->begin());
1886 if (Error Err = BodyGenCB(TaskAllocaIP, TaskBodyIP))
1887 return Err;
1888
1889 OutlineInfo OI;
1890 OI.EntryBB = TaskAllocaBB;
1891 OI.OuterAllocaBB = AllocaIP.getBlock();
1892 OI.ExitBB = TaskExitBB;
1893
1894 // Add the thread ID argument.
1895 SmallVector<Instruction *, 4> ToBeDeleted;
1896 OI.ExcludeArgsFromAggregate.push_back(createFakeIntVal(
1897 Builder, AllocaIP, ToBeDeleted, TaskAllocaIP, "global.tid", false));
1898
1899 OI.PostOutlineCB = [this, Ident, Tied, Final, IfCondition, Dependencies,
1900 Mergeable, Priority, EventHandle, TaskAllocaBB,
1901 ToBeDeleted](Function &OutlinedFn) mutable {
1902 // Replace the stale CI with the appropriate RTL function call.
1903 assert(OutlinedFn.getNumUses() == 1 &&
1904 "there must be a single user for the outlined function");
1905 CallInst *StaleCI = cast<CallInst>(OutlinedFn.user_back());
1906
1907 // HasShareds is true if any variables are captured in the outlined region,
1908 // false otherwise.
1909 bool HasShareds = StaleCI->arg_size() > 1;
1910 Builder.SetInsertPoint(StaleCI);
1911
1912 // Gather the arguments for emitting the runtime call for
1913 // @__kmpc_omp_task_alloc
1914 Function *TaskAllocFn =
1915 getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_omp_task_alloc);
1916
1917 // Arguments - `loc_ref` (Ident) and `gtid` (ThreadID)
1918 // call.
1919 Value *ThreadID = getOrCreateThreadID(Ident);
1920
1921 // Argument - `flags`
1922 // Task is tied iff (Flags & 1) == 1.
1923 // Task is untied iff (Flags & 1) == 0.
1924 // Task is final iff (Flags & 2) == 2.
1925 // Task is not final iff (Flags & 2) == 0.
1926 // Task is mergeable iff (Flags & 4) == 4.
1927 // Task is not mergeable iff (Flags & 4) == 0.
1928 // Task has priority iff (Flags & 32) == 32.
1929 // Task has no priority iff (Flags & 32) == 0.
1930 // TODO: Handle the other flags.
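// As a worked example of the masks above: a tied task that is also final,
// mergeable, and carries a priority clause is allocated with
// Flags = 1 | 2 | 4 | 32 == 39, while a plain untied task uses Flags = 0.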
1931 Value *Flags = Builder.getInt32(Tied);
1932 if (Final) {
1933 Value *FinalFlag =
1934 Builder.CreateSelect(Final, Builder.getInt32(2), Builder.getInt32(0));
1935 Flags = Builder.CreateOr(FinalFlag, Flags);
1936 }
1937
1938 if (Mergeable)
1939 Flags = Builder.CreateOr(Builder.getInt32(4), Flags);
1940 if (Priority)
1941 Flags = Builder.CreateOr(Builder.getInt32(32), Flags);
1942
1943 // Argument - `sizeof_kmp_task_t` (TaskSize)
1944 // TaskSize refers to the size in bytes of the kmp_task_t data structure
1945 // including private vars accessed in the task.
1946 // TODO: add kmp_task_t_with_privates (privates)
1947 Value *TaskSize = Builder.getInt64(
1948 divideCeil(M.getDataLayout().getTypeStoreSize(Task), 8) * 8);
1949
1950 // Argument - `sizeof_shareds` (SharedsSize)
1951 // SharedsSize refers to the shareds array size in the kmp_task_t data
1952 // structure.
1953 Value *SharedsSize = Builder.getInt64(0);
1954 if (HasShareds) {
1955 AllocaInst *ArgStructAlloca =
1956 dyn_cast<AllocaInst>(StaleCI->getArgOperand(1));
1957 assert(ArgStructAlloca &&
1958 "Unable to find the alloca instruction corresponding to arguments "
1959 "for extracted function");
1960 StructType *ArgStructType =
1961 dyn_cast<StructType>(ArgStructAlloca->getAllocatedType());
1962 assert(ArgStructType && "Unable to find struct type corresponding to "
1963 "arguments for extracted function");
1964 SharedsSize =
1965 Builder.getInt64(M.getDataLayout().getTypeStoreSize(ArgStructType));
1966 }
1967 // Emit the @__kmpc_omp_task_alloc runtime call
1968 // The runtime call returns a pointer to an area where the task captured
1969 // variables must be copied before the task is run (TaskData)
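// Illustrative shape of the emitted call (the i64 sizes are hypothetical):
// \code{c}
// %task.data = call ptr @__kmpc_omp_task_alloc(ptr @loc, i32 %gtid,
//                                              i32 %flags, i64 40, i64 8,
//                                              ptr @outlined_fn)
// \endcode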
1970 CallInst *TaskData = Builder.CreateCall(
1971 TaskAllocFn, {/*loc_ref=*/Ident, /*gtid=*/ThreadID, /*flags=*/Flags,
1972 /*sizeof_task=*/TaskSize, /*sizeof_shared=*/SharedsSize,
1973 /*task_func=*/&OutlinedFn});
1974
1975 // Emit detach clause initialization.
1976 // evt = (typeof(evt))__kmpc_task_allow_completion_event(loc, tid,
1977 // task_descriptor);
1978 if (EventHandle) {
1979 Function *TaskDetachFn = getOrCreateRuntimeFunctionPtr(
1980 OMPRTL___kmpc_task_allow_completion_event);
1981 llvm::Value *EventVal =
1982 Builder.CreateCall(TaskDetachFn, {Ident, ThreadID, TaskData});
1983 llvm::Value *EventHandleAddr =
1984 Builder.CreatePointerBitCastOrAddrSpaceCast(EventHandle,
1985 Builder.getPtrTy(0));
1986 EventVal = Builder.CreatePtrToInt(EventVal, Builder.getInt64Ty());
1987 Builder.CreateStore(EventVal, EventHandleAddr);
1988 }
1989 // Copy the arguments for outlined function
1990 if (HasShareds) {
1991 Value *Shareds = StaleCI->getArgOperand(1);
1992 Align Alignment = TaskData->getPointerAlignment(M.getDataLayout());
1993 Value *TaskShareds = Builder.CreateLoad(VoidPtr, TaskData);
1994 Builder.CreateMemCpy(TaskShareds, Alignment, Shareds, Alignment,
1995 SharedsSize);
1996 }
1997
1998 if (Priority) {
1999 //
2000 // The return type of "__kmpc_omp_task_alloc" is "kmp_task_t *",
2001 // we populate the priority information into the "kmp_task_t" here
2002 //
2003 // The struct "kmp_task_t" definition is available in kmp.h
2004 // kmp_task_t = { shareds, routine, part_id, data1, data2 }
2005 // data2 is used for priority
2006 //
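// In C terms the GEP chain below amounts to (a sketch; field names assumed
// from kmp.h): task->data2.priority = priority, using indices {0,0} on the
// kmp_task_t pointer, {0,4} to select data2, and {0,0} to select its first
// member.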
2007 Type *Int32Ty = Builder.getInt32Ty();
2008 Constant *Zero = ConstantInt::get(Int32Ty, 0);
2009 // kmp_task_t* => { ptr }
2010 Type *TaskPtr = StructType::get(VoidPtr);
2011 Value *TaskGEP =
2012 Builder.CreateInBoundsGEP(TaskPtr, TaskData, {Zero, Zero});
2013 // kmp_task_t => { ptr, ptr, i32, ptr, ptr }
2014 Type *TaskStructType = StructType::get(
2015 VoidPtr, VoidPtr, Builder.getInt32Ty(), VoidPtr, VoidPtr);
2016 Value *PriorityData = Builder.CreateInBoundsGEP(
2017 TaskStructType, TaskGEP, {Zero, ConstantInt::get(Int32Ty, 4)});
2018 // kmp_cmplrdata_t => { ptr, ptr }
2019 Type *CmplrStructType = StructType::get(VoidPtr, VoidPtr);
2020 Value *CmplrData = Builder.CreateInBoundsGEP(CmplrStructType,
2021 PriorityData, {Zero, Zero});
2022 Builder.CreateStore(Priority, CmplrData);
2023 }
2024
2025 Value *DepArray = nullptr;
2026 if (Dependencies.size()) {
2027 InsertPointTy OldIP = Builder.saveIP();
2028 Builder.SetInsertPoint(
2029 &OldIP.getBlock()->getParent()->getEntryBlock().back());
2030
2031 Type *DepArrayTy = ArrayType::get(DependInfo, Dependencies.size());
2032 DepArray = Builder.CreateAlloca(DepArrayTy, nullptr, ".dep.arr.addr");
2033
2034 unsigned P = 0;
2035 for (const DependData &Dep : Dependencies) {
2036 Value *Base =
2037 Builder.CreateConstInBoundsGEP2_64(DepArrayTy, DepArray, 0, P);
2038 // Store the pointer to the variable
2039 Value *Addr = Builder.CreateStructGEP(
2040 DependInfo, Base,
2041 static_cast<unsigned int>(RTLDependInfoFields::BaseAddr));
2042 Value *DepValPtr =
2043 Builder.CreatePtrToInt(Dep.DepVal, Builder.getInt64Ty());
2044 Builder.CreateStore(DepValPtr, Addr);
2045 // Store the size of the variable
2046 Value *Size = Builder.CreateStructGEP(
2047 DependInfo, Base,
2048 static_cast<unsigned int>(RTLDependInfoFields::Len));
2049 Builder.CreateStore(Builder.getInt64(M.getDataLayout().getTypeStoreSize(
2050 Dep.DepValueType)),
2051 Size);
2052 // Store the dependency kind
2053 Value *Flags = Builder.CreateStructGEP(
2054 DependInfo, Base,
2055 static_cast<unsigned int>(RTLDependInfoFields::Flags));
2056 Builder.CreateStore(
2057 ConstantInt::get(Builder.getInt8Ty(),
2058 static_cast<unsigned int>(Dep.DepKind)),
2059 Flags);
2060 ++P;
2061 }
2062
2063 Builder.restoreIP(OldIP);
2064 }
2065
2066 // In the presence of the `if` clause, the following IR is generated:
2067 // ...
2068 // %data = call @__kmpc_omp_task_alloc(...)
2069 // br i1 %if_condition, label %then, label %else
2070 // then:
2071 // call @__kmpc_omp_task(...)
2072 // br label %exit
2073 // else:
2074 // ;; Wait for resolution of dependencies, if any, before
2075 // ;; beginning the task
2076 // call @__kmpc_omp_wait_deps(...)
2077 // call @__kmpc_omp_task_begin_if0(...)
2078 // call @outlined_fn(...)
2079 // call @__kmpc_omp_task_complete_if0(...)
2080 // br label %exit
2081 // exit:
2082 // ...
2083 if (IfCondition) {
2084 // `SplitBlockAndInsertIfThenElse` requires the block to have a
2085 // terminator.
2086 splitBB(Builder, /*CreateBranch=*/true, "if.end");
2087 Instruction *IfTerminator =
2088 Builder.GetInsertPoint()->getParent()->getTerminator();
2089 Instruction *ThenTI = IfTerminator, *ElseTI = nullptr;
2090 Builder.SetInsertPoint(IfTerminator);
2091 SplitBlockAndInsertIfThenElse(IfCondition, IfTerminator, &ThenTI,
2092 &ElseTI);
2093 Builder.SetInsertPoint(ElseTI);
2094
2095 if (Dependencies.size()) {
2096 Function *TaskWaitFn =
2097 getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_omp_wait_deps);
2098 Builder.CreateCall(
2099 TaskWaitFn,
2100 {Ident, ThreadID, Builder.getInt32(Dependencies.size()), DepArray,
2101 ConstantInt::get(Builder.getInt32Ty(), 0),
2102 ConstantInt::getNullValue(Builder.getPtrTy())});
2103 }
2104 Function *TaskBeginFn =
2105 getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_omp_task_begin_if0);
2106 Function *TaskCompleteFn =
2107 getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_omp_task_complete_if0);
2108 Builder.CreateCall(TaskBeginFn, {Ident, ThreadID, TaskData});
2109 CallInst *CI = nullptr;
2110 if (HasShareds)
2111 CI = Builder.CreateCall(&OutlinedFn, {ThreadID, TaskData});
2112 else
2113 CI = Builder.CreateCall(&OutlinedFn, {ThreadID});
2114 CI->setDebugLoc(StaleCI->getDebugLoc());
2115 Builder.CreateCall(TaskCompleteFn, {Ident, ThreadID, TaskData});
2116 Builder.SetInsertPoint(ThenTI);
2117 }
2118
2119 if (Dependencies.size()) {
2120 Function *TaskFn =
2121 getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_omp_task_with_deps);
2122 Builder.CreateCall(
2123 TaskFn,
2124 {Ident, ThreadID, TaskData, Builder.getInt32(Dependencies.size()),
2125 DepArray, ConstantInt::get(Builder.getInt32Ty(), 0),
2126 ConstantInt::getNullValue(Builder.getPtrTy())});
2127
2128 } else {
2129 // Emit the @__kmpc_omp_task runtime call to spawn the task
2130 Function *TaskFn = getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_omp_task);
2131 Builder.CreateCall(TaskFn, {Ident, ThreadID, TaskData});
2132 }
2133
2134 StaleCI->eraseFromParent();
2135
2136 Builder.SetInsertPoint(TaskAllocaBB, TaskAllocaBB->begin());
2137 if (HasShareds) {
2138 LoadInst *Shareds = Builder.CreateLoad(VoidPtr, OutlinedFn.getArg(1));
2139 OutlinedFn.getArg(1)->replaceUsesWithIf(
2140 Shareds, [Shareds](Use &U) { return U.getUser() != Shareds; });
2141 }
2142
2143 for (Instruction *I : llvm::reverse(ToBeDeleted))
2144 I->eraseFromParent();
2145 };
2146
2147 addOutlineInfo(std::move(OI));
2148 Builder.SetInsertPoint(TaskExitBB, TaskExitBB->begin());
2149
2150 return Builder.saveIP();
2151}
2152
2153 OpenMPIRBuilder::InsertPointOrErrorTy
2154 OpenMPIRBuilder::createTaskgroup(const LocationDescription &Loc,
2155 InsertPointTy AllocaIP,
2156 BodyGenCallbackTy BodyGenCB) {
2157 if (!updateToLocation(Loc))
2158 return InsertPointTy();
2159
2160 uint32_t SrcLocStrSize;
2161 Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
2162 Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
2163 Value *ThreadID = getOrCreateThreadID(Ident);
2164
2165 // Emit the @__kmpc_taskgroup runtime call to start the taskgroup
2166 Function *TaskgroupFn =
2167 getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_taskgroup);
2168 Builder.CreateCall(TaskgroupFn, {Ident, ThreadID});
2169
2170 BasicBlock *TaskgroupExitBB = splitBB(Builder, true, "taskgroup.exit");
2171 if (Error Err = BodyGenCB(AllocaIP, Builder.saveIP()))
2172 return Err;
2173
2174 Builder.SetInsertPoint(TaskgroupExitBB);
2175 // Emit the @__kmpc_end_taskgroup runtime call to end the taskgroup
2176 Function *EndTaskgroupFn =
2177 getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_end_taskgroup);
2178 Builder.CreateCall(EndTaskgroupFn, {Ident, ThreadID});
2179
2180 return Builder.saveIP();
2181}
2182
2183 OpenMPIRBuilder::InsertPointOrErrorTy OpenMPIRBuilder::createSections(
2184 const LocationDescription &Loc, InsertPointTy AllocaIP,
2185 ArrayRef<StorableBodyGenCallbackTy> SectionCBs, PrivatizeCallbackTy PrivCB,
2186 FinalizeCallbackTy FiniCB, bool IsCancellable, bool IsNowait) {
2187 assert(!isConflictIP(AllocaIP, Loc.IP) && "Dedicated IP allocas required");
2188
2189 if (!updateToLocation(Loc))
2190 return Loc.IP;
2191
2192 auto FiniCBWrapper = [&](InsertPointTy IP) {
2193 if (IP.getBlock()->end() != IP.getPoint())
2194 return FiniCB(IP);
2195 // This must be done otherwise any nested constructs using FinalizeOMPRegion
2196 // will fail because that function requires the Finalization Basic Block to
2197 // have a terminator, which is already removed by EmitOMPRegionBody.
2198 // IP is currently at the cancellation block.
2199 // We need to backtrack to the condition block to fetch
2200 // the exit block and create a branch from the cancellation
2201 // block to the exit block.
2203 Builder.restoreIP(IP);
2204 auto *CaseBB = IP.getBlock()->getSinglePredecessor();
2205 auto *CondBB = CaseBB->getSinglePredecessor()->getSinglePredecessor();
2206 auto *ExitBB = CondBB->getTerminator()->getSuccessor(1);
2207 Instruction *I = Builder.CreateBr(ExitBB);
2208 IP = InsertPointTy(I->getParent(), I->getIterator());
2209 return FiniCB(IP);
2210 };
2211
2212 FinalizationStack.push_back({FiniCBWrapper, OMPD_sections, IsCancellable});
2213
2214 // Each section is emitted as a switch case.
2215 // Each finalization callback is handled from clang's EmitOMPSectionDirective()
2216 // -> OMP.createSection(), which generates the IR for each section.
2217 // Iterate through all sections and emit a switch construct:
2218 // switch (IV) {
2219 // case 0:
2220 // <SectionStmt[0]>;
2221 // break;
2222 // ...
2223 // case <NumSection> - 1:
2224 // <SectionStmt[<NumSection> - 1]>;
2225 // break;
2226 // }
2227 // ...
2228 // section_loop.after:
2229 // <FiniCB>;
2230 auto LoopBodyGenCB = [&](InsertPointTy CodeGenIP, Value *IndVar) -> Error {
2231 Builder.restoreIP(CodeGenIP);
2232 BasicBlock *Continue =
2233 splitBBWithSuffix(Builder, /*CreateBranch=*/false, ".sections.after");
2234 Function *CurFn = Continue->getParent();
2235 SwitchInst *SwitchStmt = Builder.CreateSwitch(IndVar, Continue);
2236
2237 unsigned CaseNumber = 0;
2238 for (auto SectionCB : SectionCBs) {
2239 BasicBlock *CaseBB = BasicBlock::Create(
2240 M.getContext(), "omp_section_loop.body.case", CurFn, Continue);
2241 SwitchStmt->addCase(Builder.getInt32(CaseNumber), CaseBB);
2242 Builder.SetInsertPoint(CaseBB);
2243 BranchInst *CaseEndBr = Builder.CreateBr(Continue);
2244 if (Error Err = SectionCB(InsertPointTy(), {CaseEndBr->getParent(),
2245 CaseEndBr->getIterator()}))
2246 return Err;
2247 CaseNumber++;
2248 }
2249 // remove the existing terminator from body BB since there can be no
2250 // terminators after switch/case
2251 return Error::success();
2252 };
2253 // Loop body ends here
2254 // LowerBound, UpperBound, and Stride for createCanonicalLoop
2255 Type *I32Ty = Type::getInt32Ty(M.getContext());
2256 Value *LB = ConstantInt::get(I32Ty, 0);
2257 Value *UB = ConstantInt::get(I32Ty, SectionCBs.size());
2258 Value *ST = ConstantInt::get(I32Ty, 1);
2259 Expected<CanonicalLoopInfo *> LoopInfo = createCanonicalLoop(
2260 Loc, LoopBodyGenCB, LB, UB, ST, true, false, AllocaIP, "section_loop");
2261 if (!LoopInfo)
2262 return LoopInfo.takeError();
2263
2264 InsertPointOrErrorTy WsloopIP =
2265 applyStaticWorkshareLoop(Loc.DL, *LoopInfo, AllocaIP, !IsNowait);
2266 if (!WsloopIP)
2267 return WsloopIP.takeError();
2268 InsertPointTy AfterIP = *WsloopIP;
2269
2270 // Apply the finalization callback in LoopAfterBB
2271 auto FiniInfo = FinalizationStack.pop_back_val();
2272 assert(FiniInfo.DK == OMPD_sections &&
2273 "Unexpected finalization stack state!");
2274 if (FinalizeCallbackTy &CB = FiniInfo.FiniCB) {
2275 Builder.restoreIP(AfterIP);
2276 BasicBlock *FiniBB =
2277 splitBBWithSuffix(Builder, /*CreateBranch=*/true, "sections.fini");
2278 if (Error Err = CB(Builder.saveIP()))
2279 return Err;
2280 AfterIP = {FiniBB, FiniBB->begin()};
2281 }
2282
2283 return AfterIP;
2284}
2285
2286 OpenMPIRBuilder::InsertPointOrErrorTy
2287 OpenMPIRBuilder::createSection(const LocationDescription &Loc,
2288 BodyGenCallbackTy BodyGenCB,
2289 FinalizeCallbackTy FiniCB) {
2290 if (!updateToLocation(Loc))
2291 return Loc.IP;
2292
2293 auto FiniCBWrapper = [&](InsertPointTy IP) {
2294 if (IP.getBlock()->end() != IP.getPoint())
2295 return FiniCB(IP);
2296 // This must be done otherwise any nested constructs using FinalizeOMPRegion
2297 // will fail because that function requires the Finalization Basic Block to
2298 // have a terminator, which is already removed by EmitOMPRegionBody.
2299 // IP is currently at the cancellation block.
2300 // We need to backtrack to the condition block to fetch
2301 // the exit block and create a branch from the cancellation
2302 // block to the exit block.
2304 Builder.restoreIP(IP);
2305 auto *CaseBB = Loc.IP.getBlock();
2306 auto *CondBB = CaseBB->getSinglePredecessor()->getSinglePredecessor();
2307 auto *ExitBB = CondBB->getTerminator()->getSuccessor(1);
2308 Instruction *I = Builder.CreateBr(ExitBB);
2309 IP = InsertPointTy(I->getParent(), I->getIterator());
2310 return FiniCB(IP);
2311 };
2312
2313 Directive OMPD = Directive::OMPD_sections;
2314 // Since we are using Finalization Callback here, HasFinalize
2315 // and IsCancellable have to be true
2316 return EmitOMPInlinedRegion(OMPD, nullptr, nullptr, BodyGenCB, FiniCBWrapper,
2317 /*Conditional*/ false, /*hasFinalize*/ true,
2318 /*IsCancellable*/ true);
2319}
2320
2321 static OpenMPIRBuilder::InsertPointTy getInsertPointAfterInstr(Instruction *I) {
2322 BasicBlock::iterator IT(I);
2323 IT++;
2324 return OpenMPIRBuilder::InsertPointTy(I->getParent(), IT);
2325}
2326
2327Value *OpenMPIRBuilder::getGPUThreadID() {
2328 return Builder.CreateCall(
2329 getOrCreateRuntimeFunction(M,
2330 OMPRTL___kmpc_get_hardware_thread_id_in_block),
2331 {});
2332}
2333
2334Value *OpenMPIRBuilder::getGPUWarpSize() {
2335 return Builder.CreateCall(
2336 getOrCreateRuntimeFunction(M, OMPRTL___kmpc_get_warp_size), {});
2337}
2338
2339Value *OpenMPIRBuilder::getNVPTXWarpID() {
2340 unsigned LaneIDBits = Log2_32(Config.getGridValue().GV_Warp_Size);
2341 return Builder.CreateAShr(getGPUThreadID(), LaneIDBits, "nvptx_warp_id");
2342}
2343
2344Value *OpenMPIRBuilder::getNVPTXLaneID() {
2345 unsigned LaneIDBits = Log2_32(Config.getGridValue().GV_Warp_Size);
2346 assert(LaneIDBits < 32 && "Invalid LaneIDBits size in NVPTX device.");
2347 unsigned LaneIDMask = ~0u >> (32u - LaneIDBits);
2348 return Builder.CreateAnd(getGPUThreadID(), Builder.getInt32(LaneIDMask),
2349 "nvptx_lane_id");
2350}
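// For the common warp size of 32, LaneIDBits is 5 and LaneIDMask is 0x1f,
// so the two helpers above compute warp id = tid >> 5 and lane id = tid & 31.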
2351
2352Value *OpenMPIRBuilder::castValueToType(InsertPointTy AllocaIP, Value *From,
2353 Type *ToType) {
2354 Type *FromType = From->getType();
2355 uint64_t FromSize = M.getDataLayout().getTypeStoreSize(FromType);
2356 uint64_t ToSize = M.getDataLayout().getTypeStoreSize(ToType);
2357 assert(FromSize > 0 && "From size must be greater than zero");
2358 assert(ToSize > 0 && "To size must be greater than zero");
2359 if (FromType == ToType)
2360 return From;
2361 if (FromSize == ToSize)
2362 return Builder.CreateBitCast(From, ToType);
2363 if (ToType->isIntegerTy() && FromType->isIntegerTy())
2364 return Builder.CreateIntCast(From, ToType, /*isSigned*/ true);
2365 InsertPointTy SaveIP = Builder.saveIP();
2366 Builder.restoreIP(AllocaIP);
2367 Value *CastItem = Builder.CreateAlloca(ToType);
2368 Builder.restoreIP(SaveIP);
2369
2370 Value *ValCastItem = Builder.CreatePointerBitCastOrAddrSpaceCast(
2371 CastItem, Builder.getPtrTy(0));
2372 Builder.CreateStore(From, ValCastItem);
2373 return Builder.CreateLoad(ToType, CastItem);
2374}
2375
2376Value *OpenMPIRBuilder::createRuntimeShuffleFunction(InsertPointTy AllocaIP,
2377 Value *Element,
2378 Type *ElementType,
2379 Value *Offset) {
2380 uint64_t Size = M.getDataLayout().getTypeStoreSize(ElementType);
2381 assert(Size <= 8 && "Unsupported bitwidth in shuffle instruction");
2382
2383 // Cast all types to 32- or 64-bit values before calling shuffle routines.
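// E.g., a 2-byte element is widened to i32 and exchanged through
// __kmpc_shuffle_int32, while an 8-byte element goes through
// __kmpc_shuffle_int64.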
2384 Type *CastTy = Builder.getIntNTy(Size <= 4 ? 32 : 64);
2385 Value *ElemCast = castValueToType(AllocaIP, Element, CastTy);
2386 Value *WarpSize =
2387 Builder.CreateIntCast(getGPUWarpSize(), Builder.getInt16Ty(), true);
2388 Function *ShuffleFunc = getOrCreateRuntimeFunctionPtr(
2389 Size <= 4 ? RuntimeFunction::OMPRTL___kmpc_shuffle_int32
2390 : RuntimeFunction::OMPRTL___kmpc_shuffle_int64);
2391 Value *WarpSizeCast =
2392 Builder.CreateIntCast(WarpSize, Builder.getInt16Ty(), /*isSigned=*/true);
2393 Value *ShuffleCall =
2394 Builder.CreateCall(ShuffleFunc, {ElemCast, Offset, WarpSizeCast});
2395 return castValueToType(AllocaIP, ShuffleCall, CastTy);
2396}
2397
2398void OpenMPIRBuilder::shuffleAndStore(InsertPointTy AllocaIP, Value *SrcAddr,
2399 Value *DstAddr, Type *ElemType,
2400 Value *Offset, Type *ReductionArrayTy) {
2401 uint64_t Size = M.getDataLayout().getTypeStoreSize(ElemType);
2402 // Create the loop over the big sized data.
2403 // ptr = (void*)Elem;
2404 // ptrEnd = (void*) Elem + 1;
2405 // Step = 8;
2406 // while (ptr + Step < ptrEnd)
2407 // shuffle((int64_t)*ptr);
2408 // Step = 4;
2409 // while (ptr + Step < ptrEnd)
2410 // shuffle((int32_t)*ptr);
2411 // ...
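// For instance, a 12-byte element is exchanged as one 8-byte shuffle
// followed by one 4-byte shuffle (Size becomes 12 % 8 == 4 after the first
// iteration).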
2412 Type *IndexTy = Builder.getIndexTy(
2413 M.getDataLayout(), M.getDataLayout().getDefaultGlobalsAddressSpace());
2414 Value *ElemPtr = DstAddr;
2415 Value *Ptr = SrcAddr;
2416 for (unsigned IntSize = 8; IntSize >= 1; IntSize /= 2) {
2417 if (Size < IntSize)
2418 continue;
2419 Type *IntType = Builder.getIntNTy(IntSize * 8);
2420 Ptr = Builder.CreatePointerBitCastOrAddrSpaceCast(
2421 Ptr, Builder.getPtrTy(0), Ptr->getName() + ".ascast");
2422 Value *SrcAddrGEP =
2423 Builder.CreateGEP(ElemType, SrcAddr, {ConstantInt::get(IndexTy, 1)});
2424 ElemPtr = Builder.CreatePointerBitCastOrAddrSpaceCast(
2425 ElemPtr, Builder.getPtrTy(0), ElemPtr->getName() + ".ascast");
2426
2427 Function *CurFunc = Builder.GetInsertBlock()->getParent();
2428 if ((Size / IntSize) > 1) {
2429 Value *PtrEnd = Builder.CreatePointerBitCastOrAddrSpaceCast(
2430 SrcAddrGEP, Builder.getPtrTy());
2431 BasicBlock *PreCondBB =
2432 BasicBlock::Create(M.getContext(), ".shuffle.pre_cond");
2433 BasicBlock *ThenBB = BasicBlock::Create(M.getContext(), ".shuffle.then");
2434 BasicBlock *ExitBB = BasicBlock::Create(M.getContext(), ".shuffle.exit");
2435 BasicBlock *CurrentBB = Builder.GetInsertBlock();
2436 emitBlock(PreCondBB, CurFunc);
2437 PHINode *PhiSrc =
2438 Builder.CreatePHI(Ptr->getType(), /*NumReservedValues=*/2);
2439 PhiSrc->addIncoming(Ptr, CurrentBB);
2440 PHINode *PhiDest =
2441 Builder.CreatePHI(ElemPtr->getType(), /*NumReservedValues=*/2);
2442 PhiDest->addIncoming(ElemPtr, CurrentBB);
2443 Ptr = PhiSrc;
2444 ElemPtr = PhiDest;
2445 Value *PtrDiff = Builder.CreatePtrDiff(
2446 Builder.getInt8Ty(), PtrEnd,
2447 Builder.CreatePointerBitCastOrAddrSpaceCast(Ptr, Builder.getPtrTy()));
2448 Builder.CreateCondBr(
2449 Builder.CreateICmpSGT(PtrDiff, Builder.getInt64(IntSize - 1)), ThenBB,
2450 ExitBB);
2451 emitBlock(ThenBB, CurFunc);
2452 Value *Res = createRuntimeShuffleFunction(
2453 AllocaIP,
2454 Builder.CreateAlignedLoad(
2455 IntType, Ptr, M.getDataLayout().getPrefTypeAlign(ElemType)),
2456 IntType, Offset);
2457 Builder.CreateAlignedStore(Res, ElemPtr,
2458 M.getDataLayout().getPrefTypeAlign(ElemType));
2459 Value *LocalPtr =
2460 Builder.CreateGEP(IntType, Ptr, {ConstantInt::get(IndexTy, 1)});
2461 Value *LocalElemPtr =
2462 Builder.CreateGEP(IntType, ElemPtr, {ConstantInt::get(IndexTy, 1)});
2463 PhiSrc->addIncoming(LocalPtr, ThenBB);
2464 PhiDest->addIncoming(LocalElemPtr, ThenBB);
2465 emitBranch(PreCondBB);
2466 emitBlock(ExitBB, CurFunc);
2467 } else {
2468 Value *Res = createRuntimeShuffleFunction(
2469 AllocaIP, Builder.CreateLoad(IntType, Ptr), IntType, Offset);
2470 if (ElemType->isIntegerTy() && ElemType->getScalarSizeInBits() <
2471 Res->getType()->getScalarSizeInBits())
2472 Res = Builder.CreateTrunc(Res, ElemType);
2473 Builder.CreateStore(Res, ElemPtr);
2474 Ptr = Builder.CreateGEP(IntType, Ptr, {ConstantInt::get(IndexTy, 1)});
2475 ElemPtr =
2476 Builder.CreateGEP(IntType, ElemPtr, {ConstantInt::get(IndexTy, 1)});
2477 }
2478 Size = Size % IntSize;
2479 }
2480}
2481
2482void OpenMPIRBuilder::emitReductionListCopy(
2483 InsertPointTy AllocaIP, CopyAction Action, Type *ReductionArrayTy,
2484 ArrayRef<ReductionInfo> ReductionInfos, Value *SrcBase, Value *DestBase,
2485 CopyOptionsTy CopyOptions) {
2486 Type *IndexTy = Builder.getIndexTy(
2487 M.getDataLayout(), M.getDataLayout().getDefaultGlobalsAddressSpace());
2488 Value *RemoteLaneOffset = CopyOptions.RemoteLaneOffset;
2489
2490 // Iterates, element by element, through the source Reduce list and
2491 // makes a copy.
2492 for (auto En : enumerate(ReductionInfos)) {
2493 const ReductionInfo &RI = En.value();
2494 Value *SrcElementAddr = nullptr;
2495 Value *DestElementAddr = nullptr;
2496 Value *DestElementPtrAddr = nullptr;
2497 // Should we shuffle in an element from a remote lane?
2498 bool ShuffleInElement = false;
2499 // Set to true to update the pointer in the dest Reduce list to a
2500 // newly created element.
2501 bool UpdateDestListPtr = false;
2502
2503 // Step 1.1: Get the address for the src element in the Reduce list.
2504 Value *SrcElementPtrAddr = Builder.CreateInBoundsGEP(
2505 ReductionArrayTy, SrcBase,
2506 {ConstantInt::get(IndexTy, 0), ConstantInt::get(IndexTy, En.index())});
2507 SrcElementAddr = Builder.CreateLoad(Builder.getPtrTy(), SrcElementPtrAddr);
2508
2509 // Step 1.2: Create a temporary to store the element in the destination
2510 // Reduce list.
2511 DestElementPtrAddr = Builder.CreateInBoundsGEP(
2512 ReductionArrayTy, DestBase,
2513 {ConstantInt::get(IndexTy, 0), ConstantInt::get(IndexTy, En.index())});
2514 switch (Action) {
2515 case CopyAction::RemoteLaneToThread: {
2516 InsertPointTy CurIP = Builder.saveIP();
2517 Builder.restoreIP(AllocaIP);
2518 AllocaInst *DestAlloca = Builder.CreateAlloca(RI.ElementType, nullptr,
2519 ".omp.reduction.element");
2520 DestAlloca->setAlignment(
2521 M.getDataLayout().getPrefTypeAlign(RI.ElementType));
2522 DestElementAddr = DestAlloca;
2523 DestElementAddr =
2524 Builder.CreateAddrSpaceCast(DestElementAddr, Builder.getPtrTy(),
2525 DestElementAddr->getName() + ".ascast");
2526 Builder.restoreIP(CurIP);
2527 ShuffleInElement = true;
2528 UpdateDestListPtr = true;
2529 break;
2530 }
2531 case CopyAction::ThreadCopy: {
2532 DestElementAddr =
2533 Builder.CreateLoad(Builder.getPtrTy(), DestElementPtrAddr);
2534 break;
2535 }
2536 }
2537
2538 // Now that all active lanes have read the element in the
2539 // Reduce list, shuffle over the value from the remote lane.
2540 if (ShuffleInElement) {
2541 shuffleAndStore(AllocaIP, SrcElementAddr, DestElementAddr, RI.ElementType,
2542 RemoteLaneOffset, ReductionArrayTy);
2543 } else {
2544 switch (RI.EvaluationKind) {
2545 case EvalKind::Scalar: {
2546 Value *Elem = Builder.CreateLoad(RI.ElementType, SrcElementAddr);
2547 // Store the source element value to the dest element address.
2548 Builder.CreateStore(Elem, DestElementAddr);
2549 break;
2550 }
2551 case EvalKind::Complex: {
2552 Value *SrcRealPtr = Builder.CreateConstInBoundsGEP2_32(
2553 RI.ElementType, SrcElementAddr, 0, 0, ".realp");
2554 Value *SrcReal = Builder.CreateLoad(
2555 RI.ElementType->getStructElementType(0), SrcRealPtr, ".real");
2556 Value *SrcImgPtr = Builder.CreateConstInBoundsGEP2_32(
2557 RI.ElementType, SrcElementAddr, 0, 1, ".imagp");
2558 Value *SrcImg = Builder.CreateLoad(
2559 RI.ElementType->getStructElementType(1), SrcImgPtr, ".imag");
2560
2561 Value *DestRealPtr = Builder.CreateConstInBoundsGEP2_32(
2562 RI.ElementType, DestElementAddr, 0, 0, ".realp");
2563 Value *DestImgPtr = Builder.CreateConstInBoundsGEP2_32(
2564 RI.ElementType, DestElementAddr, 0, 1, ".imagp");
2565 Builder.CreateStore(SrcReal, DestRealPtr);
2566 Builder.CreateStore(SrcImg, DestImgPtr);
2567 break;
2568 }
2569 case EvalKind::Aggregate: {
2570 Value *SizeVal = Builder.getInt64(
2571 M.getDataLayout().getTypeStoreSize(RI.ElementType));
2572 Builder.CreateMemCpy(
2573 DestElementAddr, M.getDataLayout().getPrefTypeAlign(RI.ElementType),
2574 SrcElementAddr, M.getDataLayout().getPrefTypeAlign(RI.ElementType),
2575 SizeVal, false);
2576 break;
2577 }
2578 };
2579 }
2580
2581 // Step 3.1: Modify reference in dest Reduce list as needed.
2582 // Modifying the reference in Reduce list to point to the newly
2583 // created element. The element is live in the current function
2584 // scope and that of functions it invokes (i.e., reduce_function).
2585 // RemoteReduceData[i] = (void*)&RemoteElem
2586 if (UpdateDestListPtr) {
2587 Value *CastDestAddr = Builder.CreatePointerBitCastOrAddrSpaceCast(
2588 DestElementAddr, Builder.getPtrTy(),
2589 DestElementAddr->getName() + ".ascast");
2590 Builder.CreateStore(CastDestAddr, DestElementPtrAddr);
2591 }
2592 }
2593}
2594
2595Expected<Function *> OpenMPIRBuilder::emitInterWarpCopyFunction(
2596 const LocationDescription &Loc, ArrayRef<ReductionInfo> ReductionInfos,
2597 AttributeList FuncAttrs) {
2598 InsertPointTy SavedIP = Builder.saveIP();
2599 LLVMContext &Ctx = M.getContext();
2600 FunctionType *FuncTy = FunctionType::get(
2601 Builder.getVoidTy(), {Builder.getPtrTy(), Builder.getInt32Ty()},
2602 /* IsVarArg */ false);
2603 Function *WcFunc =
2604 Function::Create(FuncTy, GlobalVariable::InternalLinkage,
2605 "_omp_reduction_inter_warp_copy_func", &M);
2606 WcFunc->setAttributes(FuncAttrs);
2607 WcFunc->addParamAttr(0, Attribute::NoUndef);
2608 WcFunc->addParamAttr(1, Attribute::NoUndef);
2609 BasicBlock *EntryBB = BasicBlock::Create(M.getContext(), "entry", WcFunc);
2610 Builder.SetInsertPoint(EntryBB);
2611
2612 // ReduceList: thread local Reduce list.
2613 // At the stage of the computation when this function is called, partially
2614 // aggregated values reside in the first lane of every active warp.
2615 Argument *ReduceListArg = WcFunc->getArg(0);
2616 // NumWarps: number of warps active in the parallel region. This could
2617 // be smaller than 32 (max warps in a CTA) for partial block reduction.
2618 Argument *NumWarpsArg = WcFunc->getArg(1);
2619
2620 // This array is used as a medium to transfer, one reduce element at a time,
2621 // the data from the first lane of every warp to lanes in the first warp
2622 // in order to perform the final step of a reduction in a parallel region
2623 // (reduction across warps). The array is placed in NVPTX __shared__ memory
2624 // for reduced latency, as well as to have a distinct copy for concurrently
2625 // executing target regions. The array is declared with weak linkage so
2626 // that it is shared across compilation units.
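// With a warp size of 32, the medium is in effect (illustrative IR):
// \code{c}
// @__openmp_nvptx_data_transfer_temporary_storage =
//     weak addrspace(3) global [32 x i32] undef
// \endcode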
2627 StringRef TransferMediumName =
2628 "__openmp_nvptx_data_transfer_temporary_storage";
2629 GlobalVariable *TransferMedium = M.getGlobalVariable(TransferMediumName);
2630 unsigned WarpSize = Config.getGridValue().GV_Warp_Size;
2631 ArrayType *ArrayTy = ArrayType::get(Builder.getInt32Ty(), WarpSize);
2632 if (!TransferMedium) {
2633 TransferMedium = new GlobalVariable(
2634 M, ArrayTy, /*isConstant=*/false, GlobalVariable::WeakAnyLinkage,
2635 UndefValue::get(ArrayTy), TransferMediumName,
2636 /*InsertBefore=*/nullptr, GlobalVariable::NotThreadLocal,
2637 /*AddressSpace=*/3);
2638 }
2639
2640 // Get the CUDA thread id of the current OpenMP thread on the GPU.
2641 Value *GPUThreadID = getGPUThreadID();
2642 // nvptx_lane_id = nvptx_id % warpsize
2643 Value *LaneID = getNVPTXLaneID();
2644 // nvptx_warp_id = nvptx_id / warpsize
2645 Value *WarpID = getNVPTXWarpID();
2646
2647 InsertPointTy AllocaIP =
2648 InsertPointTy(Builder.GetInsertBlock(),
2649 Builder.GetInsertBlock()->getFirstInsertionPt());
2650 Type *Arg0Type = ReduceListArg->getType();
2651 Type *Arg1Type = NumWarpsArg->getType();
2652 Builder.restoreIP(AllocaIP);
2653 AllocaInst *ReduceListAlloca = Builder.CreateAlloca(
2654 Arg0Type, nullptr, ReduceListArg->getName() + ".addr");
2655 AllocaInst *NumWarpsAlloca =
2656 Builder.CreateAlloca(Arg1Type, nullptr, NumWarpsArg->getName() + ".addr");
2657 Value *ReduceListAddrCast = Builder.CreatePointerBitCastOrAddrSpaceCast(
2658 ReduceListAlloca, Arg0Type, ReduceListAlloca->getName() + ".ascast");
2659 Value *NumWarpsAddrCast = Builder.CreatePointerBitCastOrAddrSpaceCast(
2660 NumWarpsAlloca, Builder.getPtrTy(0),
2661 NumWarpsAlloca->getName() + ".ascast");
2662 Builder.CreateStore(ReduceListArg, ReduceListAddrCast);
2663 Builder.CreateStore(NumWarpsArg, NumWarpsAddrCast);
2664 AllocaIP = getInsertPointAfterInstr(NumWarpsAlloca);
2665 InsertPointTy CodeGenIP =
2666 getInsertPointAfterInstr(&Builder.GetInsertBlock()->back());
2667 Builder.restoreIP(CodeGenIP);
2668
2669 Value *ReduceList =
2670 Builder.CreateLoad(Builder.getPtrTy(), ReduceListAddrCast);
2671
2672 for (auto En : enumerate(ReductionInfos)) {
2673 //
2674 // Warp master copies reduce element to transfer medium in __shared__
2675 // memory.
2676 //
2677 const ReductionInfo &RI = En.value();
2678 unsigned RealTySize = M.getDataLayout().getTypeAllocSize(RI.ElementType);
2679 for (unsigned TySize = 4; TySize > 0 && RealTySize > 0; TySize /= 2) {
2680 Type *CType = Builder.getIntNTy(TySize * 8);
2681
2682 unsigned NumIters = RealTySize / TySize;
2683 if (NumIters == 0)
2684 continue;
2685 Value *Cnt = nullptr;
2686 Value *CntAddr = nullptr;
2687 BasicBlock *PrecondBB = nullptr;
2688 BasicBlock *ExitBB = nullptr;
2689 if (NumIters > 1) {
2690 CodeGenIP = Builder.saveIP();
2691 Builder.restoreIP(AllocaIP);
2692 CntAddr =
2693 Builder.CreateAlloca(Builder.getInt32Ty(), nullptr, ".cnt.addr");
2694
2695 CntAddr = Builder.CreateAddrSpaceCast(CntAddr, Builder.getPtrTy(),
2696 CntAddr->getName() + ".ascast");
2697 Builder.restoreIP(CodeGenIP);
2698 Builder.CreateStore(Constant::getNullValue(Builder.getInt32Ty()),
2699 CntAddr,
2700 /*Volatile=*/false);
2701 PrecondBB = BasicBlock::Create(Ctx, "precond");
2702 ExitBB = BasicBlock::Create(Ctx, "exit");
2703 BasicBlock *BodyBB = BasicBlock::Create(Ctx, "body");
2704 emitBlock(PrecondBB, Builder.GetInsertBlock()->getParent());
2705 Cnt = Builder.CreateLoad(Builder.getInt32Ty(), CntAddr,
2706 /*Volatile=*/false);
2707 Value *Cmp = Builder.CreateICmpULT(
2708 Cnt, ConstantInt::get(Builder.getInt32Ty(), NumIters));
2709 Builder.CreateCondBr(Cmp, BodyBB, ExitBB);
2710 emitBlock(BodyBB, Builder.GetInsertBlock()->getParent());
2711 }
2712
2713 // kmpc_barrier.
2714 InsertPointOrErrorTy BarrierIP1 =
2715 createBarrier(LocationDescription(Builder.saveIP(), Loc.DL),
2716 omp::Directive::OMPD_unknown,
2717 /* ForceSimpleCall */ false,
2718 /* CheckCancelFlag */ true);
2719 if (!BarrierIP1)
2720 return BarrierIP1.takeError();
2721 BasicBlock *ThenBB = BasicBlock::Create(Ctx, "then");
2722 BasicBlock *ElseBB = BasicBlock::Create(Ctx, "else");
2723 BasicBlock *MergeBB = BasicBlock::Create(Ctx, "ifcont");
2724
2725 // if (lane_id == 0)
2726 Value *IsWarpMaster = Builder.CreateIsNull(LaneID, "warp_master");
2727 Builder.CreateCondBr(IsWarpMaster, ThenBB, ElseBB);
2728 emitBlock(ThenBB, Builder.GetInsertBlock()->getParent());
2729
2730 // Reduce element = LocalReduceList[i]
2731 auto *RedListArrayTy =
2732 ArrayType::get(Builder.getPtrTy(), ReductionInfos.size());
2733 Type *IndexTy = Builder.getIndexTy(
2734 M.getDataLayout(), M.getDataLayout().getDefaultGlobalsAddressSpace());
2735 Value *ElemPtrPtr =
2736 Builder.CreateInBoundsGEP(RedListArrayTy, ReduceList,
2737 {ConstantInt::get(IndexTy, 0),
2738 ConstantInt::get(IndexTy, En.index())});
2739 // elemptr = ((CopyType*)(elemptrptr)) + I
2740 Value *ElemPtr = Builder.CreateLoad(Builder.getPtrTy(), ElemPtrPtr);
2741 if (NumIters > 1)
2742 ElemPtr = Builder.CreateGEP(Builder.getInt32Ty(), ElemPtr, Cnt);
2743
2744 // Get pointer to location in transfer medium.
2745 // MediumPtr = &medium[warp_id]
2746 Value *MediumPtr = Builder.CreateInBoundsGEP(
2747 ArrayTy, TransferMedium, {Builder.getInt64(0), WarpID});
2748 // elem = *elemptr
2749 //*MediumPtr = elem
2750 Value *Elem = Builder.CreateLoad(CType, ElemPtr);
2751 // Store the source element value to the dest element address.
2752 Builder.CreateStore(Elem, MediumPtr,
2753 /*IsVolatile*/ true);
2754 Builder.CreateBr(MergeBB);
2755
2756 // else
2757 emitBlock(ElseBB, Builder.GetInsertBlock()->getParent());
2758 Builder.CreateBr(MergeBB);
2759
2760 // endif
2761 emitBlock(MergeBB, Builder.GetInsertBlock()->getParent());
2762 InsertPointOrErrorTy BarrierIP2 =
2763 createBarrier(LocationDescription(Builder.saveIP(), Loc.DL),
2764 omp::Directive::OMPD_unknown,
2765 /* ForceSimpleCall */ false,
2766 /* CheckCancelFlag */ true);
2767 if (!BarrierIP2)
2768 return BarrierIP2.takeError();
2769
2770 // Warp 0 copies reduce element from transfer medium
2771 BasicBlock *W0ThenBB = BasicBlock::Create(Ctx, "then");
2772 BasicBlock *W0ElseBB = BasicBlock::Create(Ctx, "else");
2773 BasicBlock *W0MergeBB = BasicBlock::Create(Ctx, "ifcont");
2774
2775 Value *NumWarpsVal =
2776 Builder.CreateLoad(Builder.getInt32Ty(), NumWarpsAddrCast);
2777 // Up to 32 threads in warp 0 are active.
2778 Value *IsActiveThread =
2779 Builder.CreateICmpULT(GPUThreadID, NumWarpsVal, "is_active_thread");
2780 Builder.CreateCondBr(IsActiveThread, W0ThenBB, W0ElseBB);
2781
2782 emitBlock(W0ThenBB, Builder.GetInsertBlock()->getParent());
2783
2784 // SrcMediumPtr = &medium[tid]
2785 // SrcMediumVal = *SrcMediumPtr
2786 Value *SrcMediumPtrVal = Builder.CreateInBoundsGEP(
2787 ArrayTy, TransferMedium, {Builder.getInt64(0), GPUThreadID});
2788 // TargetElemPtr = (CopyType*)(SrcDataAddr[i]) + I
2789 Value *TargetElemPtrPtr =
2790 Builder.CreateInBoundsGEP(RedListArrayTy, ReduceList,
2791 {ConstantInt::get(IndexTy, 0),
2792 ConstantInt::get(IndexTy, En.index())});
2793 Value *TargetElemPtrVal =
2794 Builder.CreateLoad(Builder.getPtrTy(), TargetElemPtrPtr);
2795 Value *TargetElemPtr = TargetElemPtrVal;
2796 if (NumIters > 1)
2797 TargetElemPtr =
2798 Builder.CreateGEP(Builder.getInt32Ty(), TargetElemPtr, Cnt);
2799
2800 // *TargetElemPtr = SrcMediumVal;
2801 Value *SrcMediumValue =
2802 Builder.CreateLoad(CType, SrcMediumPtrVal, /*IsVolatile*/ true);
2803 Builder.CreateStore(SrcMediumValue, TargetElemPtr);
2804 Builder.CreateBr(W0MergeBB);
2805
2806 emitBlock(W0ElseBB, Builder.GetInsertBlock()->getParent());
2807 Builder.CreateBr(W0MergeBB);
2808
2809 emitBlock(W0MergeBB, Builder.GetInsertBlock()->getParent());
2810
2811 if (NumIters > 1) {
2812 Cnt = Builder.CreateNSWAdd(
2813 Cnt, ConstantInt::get(Builder.getInt32Ty(), /*V=*/1));
2814 Builder.CreateStore(Cnt, CntAddr, /*Volatile=*/false);
2815
2816 auto *CurFn = Builder.GetInsertBlock()->getParent();
2817 emitBranch(PrecondBB);
2818 emitBlock(ExitBB, CurFn);
2819 }
2820 RealTySize %= TySize;
2821 }
2822 }
2823
2824 Builder.CreateRetVoid();
2825 Builder.restoreIP(SavedIP);
2826
2827 return WcFunc;
2828}
2829
2830Function *OpenMPIRBuilder::emitShuffleAndReduceFunction(
2831 ArrayRef<ReductionInfo> ReductionInfos, Function *ReduceFn,
2832 AttributeList FuncAttrs) {
2833 LLVMContext &Ctx = M.getContext();
2834 FunctionType *FuncTy =
2835 FunctionType::get(Builder.getVoidTy(),
2836 {Builder.getPtrTy(), Builder.getInt16Ty(),
2837 Builder.getInt16Ty(), Builder.getInt16Ty()},
2838 /* IsVarArg */ false);
2839 Function *SarFunc =
2840 Function::Create(FuncTy, GlobalVariable::InternalLinkage,
2841 "_omp_reduction_shuffle_and_reduce_func", &M);
2842 SarFunc->setAttributes(FuncAttrs);
2843 SarFunc->addParamAttr(0, Attribute::NoUndef);
2844 SarFunc->addParamAttr(1, Attribute::NoUndef);
2845 SarFunc->addParamAttr(2, Attribute::NoUndef);
2846 SarFunc->addParamAttr(3, Attribute::NoUndef);
2847 SarFunc->addParamAttr(1, Attribute::SExt);
2848 SarFunc->addParamAttr(2, Attribute::SExt);
2849 SarFunc->addParamAttr(3, Attribute::SExt);
2850 BasicBlock *EntryBB = BasicBlock::Create(M.getContext(), "entry", SarFunc);
2851 Builder.SetInsertPoint(EntryBB);
2852
2853 // Thread local Reduce list used to host the values of data to be reduced.
2854 Argument *ReduceListArg = SarFunc->getArg(0);
2855 // Current lane id; could be logical.
2856 Argument *LaneIDArg = SarFunc->getArg(1);
2857 // Offset of the remote source lane relative to the current lane.
2858 Argument *RemoteLaneOffsetArg = SarFunc->getArg(2);
2859 // Algorithm version. This is expected to be known at compile time.
2860 Argument *AlgoVerArg = SarFunc->getArg(3);
2861
2862 Type *ReduceListArgType = ReduceListArg->getType();
2863 Type *LaneIDArgType = LaneIDArg->getType();
2864 Type *LaneIDArgPtrType = Builder.getPtrTy(0);
2865 Value *ReduceListAlloca = Builder.CreateAlloca(
2866 ReduceListArgType, nullptr, ReduceListArg->getName() + ".addr");
2867 Value *LaneIdAlloca = Builder.CreateAlloca(LaneIDArgType, nullptr,
2868 LaneIDArg->getName() + ".addr");
2869 Value *RemoteLaneOffsetAlloca = Builder.CreateAlloca(
2870 LaneIDArgType, nullptr, RemoteLaneOffsetArg->getName() + ".addr");
2871 Value *AlgoVerAlloca = Builder.CreateAlloca(LaneIDArgType, nullptr,
2872 AlgoVerArg->getName() + ".addr");
2873 ArrayType *RedListArrayTy =
2874 ArrayType::get(Builder.getPtrTy(), ReductionInfos.size());
2875
2876 // Create a local thread-private variable to host the Reduce list
2877 // from a remote lane.
2878 Instruction *RemoteReductionListAlloca = Builder.CreateAlloca(
2879 RedListArrayTy, nullptr, ".omp.reduction.remote_reduce_list");
2880
2881 Value *ReduceListAddrCast = Builder.CreatePointerBitCastOrAddrSpaceCast(
2882 ReduceListAlloca, ReduceListArgType,
2883 ReduceListAlloca->getName() + ".ascast");
2884 Value *LaneIdAddrCast = Builder.CreatePointerBitCastOrAddrSpaceCast(
2885 LaneIdAlloca, LaneIDArgPtrType, LaneIdAlloca->getName() + ".ascast");
2886 Value *RemoteLaneOffsetAddrCast = Builder.CreatePointerBitCastOrAddrSpaceCast(
2887 RemoteLaneOffsetAlloca, LaneIDArgPtrType,
2888 RemoteLaneOffsetAlloca->getName() + ".ascast");
2889 Value *AlgoVerAddrCast = Builder.CreatePointerBitCastOrAddrSpaceCast(
2890 AlgoVerAlloca, LaneIDArgPtrType, AlgoVerAlloca->getName() + ".ascast");
2891 Value *RemoteListAddrCast = Builder.CreatePointerBitCastOrAddrSpaceCast(
2892 RemoteReductionListAlloca, Builder.getPtrTy(),
2893 RemoteReductionListAlloca->getName() + ".ascast");
2894
2895 Builder.CreateStore(ReduceListArg, ReduceListAddrCast);
2896 Builder.CreateStore(LaneIDArg, LaneIdAddrCast);
2897 Builder.CreateStore(RemoteLaneOffsetArg, RemoteLaneOffsetAddrCast);
2898 Builder.CreateStore(AlgoVerArg, AlgoVerAddrCast);
2899
2900 Value *ReduceList = Builder.CreateLoad(ReduceListArgType, ReduceListAddrCast);
2901 Value *LaneId = Builder.CreateLoad(LaneIDArgType, LaneIdAddrCast);
2902 Value *RemoteLaneOffset =
2903 Builder.CreateLoad(LaneIDArgType, RemoteLaneOffsetAddrCast);
2904 Value *AlgoVer = Builder.CreateLoad(LaneIDArgType, AlgoVerAddrCast);
2905
2906 InsertPointTy AllocaIP = getInsertPointAfterInstr(RemoteReductionListAlloca);
2907
2908 // This loop iterates through the list of reduce elements and copies,
2909 // element by element, from a remote lane in the warp to RemoteReduceList,
2910 // hosted on the thread's stack.
2911 emitReductionListCopy(
2912 AllocaIP, CopyAction::RemoteLaneToThread, RedListArrayTy, ReductionInfos,
2913 ReduceList, RemoteListAddrCast, {RemoteLaneOffset, nullptr, nullptr});
2914
2915 // The actions to be performed on the Remote Reduce list are dependent
2916 // on the algorithm version.
2917 //
2918 // if (AlgoVer==0) || (AlgoVer==1 && (LaneId < Offset)) || (AlgoVer==2 &&
2919 // LaneId % 2 == 0 && Offset > 0):
2920 // do the reduction value aggregation
2921 //
2922 // The thread local variable Reduce list is mutated in place to host the
2923 // reduced data, which is the aggregated value produced from local and
2924 // remote lanes.
2925 //
2926 // Note that AlgoVer is expected to be a constant integer known at compile
2927 // time.
2928 // When AlgoVer==0, the first conjunction evaluates to true, making
2929 // the entire predicate true at compile time.
2930 // When AlgoVer==1, only the second part of the second conjunction needs to
2931 // be evaluated at runtime. The other conjunctions evaluate to false
2932 // at compile time.
2933 // When AlgoVer==2, only the second part of the third conjunction needs to
2934 // be evaluated at runtime. The other conjunctions evaluate to false
2935 // at compile time.
2936 Value *CondAlgo0 = Builder.CreateIsNull(AlgoVer);
2937 Value *Algo1 = Builder.CreateICmpEQ(AlgoVer, Builder.getInt16(1));
2938 Value *LaneComp = Builder.CreateICmpULT(LaneId, RemoteLaneOffset);
2939 Value *CondAlgo1 = Builder.CreateAnd(Algo1, LaneComp);
2940 Value *Algo2 = Builder.CreateICmpEQ(AlgoVer, Builder.getInt16(2));
2941 Value *LaneIdAnd1 = Builder.CreateAnd(LaneId, Builder.getInt16(1));
2942 Value *LaneIdComp = Builder.CreateIsNull(LaneIdAnd1);
2943 Value *Algo2AndLaneIdComp = Builder.CreateAnd(Algo2, LaneIdComp);
2944 Value *RemoteOffsetComp =
2945 Builder.CreateICmpSGT(RemoteLaneOffset, Builder.getInt16(0));
2946 Value *CondAlgo2 = Builder.CreateAnd(Algo2AndLaneIdComp, RemoteOffsetComp);
2947 Value *CA0OrCA1 = Builder.CreateOr(CondAlgo0, CondAlgo1);
2948 Value *CondReduce = Builder.CreateOr(CA0OrCA1, CondAlgo2);
2949
2950 BasicBlock *ThenBB = BasicBlock::Create(Ctx, "then");
2951 BasicBlock *ElseBB = BasicBlock::Create(Ctx, "else");
2952 BasicBlock *MergeBB = BasicBlock::Create(Ctx, "ifcont");
2953
2954 Builder.CreateCondBr(CondReduce, ThenBB, ElseBB);
2955 emitBlock(ThenBB, Builder.GetInsertBlock()->getParent());
2956 Value *LocalReduceListPtr = Builder.CreatePointerBitCastOrAddrSpaceCast(
2957 ReduceList, Builder.getPtrTy());
2958 Value *RemoteReduceListPtr = Builder.CreatePointerBitCastOrAddrSpaceCast(
2959 RemoteListAddrCast, Builder.getPtrTy());
2960 Builder.CreateCall(ReduceFn, {LocalReduceListPtr, RemoteReduceListPtr})
2961 ->addFnAttr(Attribute::NoUnwind);
2962 Builder.CreateBr(MergeBB);
2963
2964 emitBlock(ElseBB, Builder.GetInsertBlock()->getParent());
2965 Builder.CreateBr(MergeBB);
2966
2967 emitBlock(MergeBB, Builder.GetInsertBlock()->getParent());
2968
2969 // if (AlgoVer==1 && (LaneId >= Offset)) copy Remote Reduce list to local
2970 // Reduce list.
2971 Algo1 = Builder.CreateICmpEQ(AlgoVer, Builder.getInt16(1));
2972 Value *LaneIdGtOffset = Builder.CreateICmpUGE(LaneId, RemoteLaneOffset);
2973 Value *CondCopy = Builder.CreateAnd(Algo1, LaneIdGtOffset);
2974
2975 BasicBlock *CpyThenBB = BasicBlock::Create(Ctx, "then");
2976 BasicBlock *CpyElseBB = BasicBlock::Create(Ctx, "else");
2977 BasicBlock *CpyMergeBB = BasicBlock::Create(Ctx, "ifcont");
2978 Builder.CreateCondBr(CondCopy, CpyThenBB, CpyElseBB);
2979
2980 emitBlock(CpyThenBB, Builder.GetInsertBlock()->getParent());
2981 emitReductionListCopy(AllocaIP, CopyAction::ThreadCopy, RedListArrayTy,
2982 ReductionInfos, RemoteListAddrCast, ReduceList);
2983 Builder.CreateBr(CpyMergeBB);
2984
2985 emitBlock(CpyElseBB, Builder.GetInsertBlock()->getParent());
2986 Builder.CreateBr(CpyMergeBB);
2987
2988 emitBlock(CpyMergeBB, Builder.GetInsertBlock()->getParent());
2989
2990 Builder.CreateRetVoid();
2991
2992 return SarFunc;
2993}
2994
2995Function *OpenMPIRBuilder::emitListToGlobalCopyFunction(
2996 ArrayRef<ReductionInfo> ReductionInfos, Type *ReductionsBufferTy,
2997 AttributeList FuncAttrs) {
2998 OpenMPIRBuilder::InsertPointTy OldIP = Builder.saveIP();
2999 LLVMContext &Ctx = M.getContext();
3000 FunctionType *FuncTy = FunctionType::get(
3001 Builder.getVoidTy(),
3002 {Builder.getPtrTy(), Builder.getInt32Ty(), Builder.getPtrTy()},
3003 /* IsVarArg */ false);
3004 Function *LtGCFunc =
3005 Function::Create(FuncTy, GlobalVariable::InternalLinkage,
3006 "_omp_reduction_list_to_global_copy_func", &M);
3007 LtGCFunc->setAttributes(FuncAttrs);
3008 LtGCFunc->addParamAttr(0, Attribute::NoUndef);
3009 LtGCFunc->addParamAttr(1, Attribute::NoUndef);
3010 LtGCFunc->addParamAttr(2, Attribute::NoUndef);
3011
3012 BasicBlock *EntryBlock = BasicBlock::Create(Ctx, "entry", LtGCFunc);
3013 Builder.SetInsertPoint(EntryBlock);
3014
3015 // Buffer: global reduction buffer.
3016 Argument *BufferArg = LtGCFunc->getArg(0);
3017 // Idx: index of the buffer.
3018 Argument *IdxArg = LtGCFunc->getArg(1);
3019 // ReduceList: thread local Reduce list.
3020 Argument *ReduceListArg = LtGCFunc->getArg(2);
3021
3022 Value *BufferArgAlloca = Builder.CreateAlloca(Builder.getPtrTy(), nullptr,
3023 BufferArg->getName() + ".addr");
3024 Value *IdxArgAlloca = Builder.CreateAlloca(Builder.getInt32Ty(), nullptr,
3025 IdxArg->getName() + ".addr");
3026 Value *ReduceListArgAlloca = Builder.CreateAlloca(
3027 Builder.getPtrTy(), nullptr, ReduceListArg->getName() + ".addr");
3028 Value *BufferArgAddrCast = Builder.CreatePointerBitCastOrAddrSpaceCast(
3029 BufferArgAlloca, Builder.getPtrTy(),
3030 BufferArgAlloca->getName() + ".ascast");
3031 Value *IdxArgAddrCast = Builder.CreatePointerBitCastOrAddrSpaceCast(
3032 IdxArgAlloca, Builder.getPtrTy(), IdxArgAlloca->getName() + ".ascast");
3033 Value *ReduceListArgAddrCast = Builder.CreatePointerBitCastOrAddrSpaceCast(
3034 ReduceListArgAlloca, Builder.getPtrTy(),
3035 ReduceListArgAlloca->getName() + ".ascast");
3036
3037 Builder.CreateStore(BufferArg, BufferArgAddrCast);
3038 Builder.CreateStore(IdxArg, IdxArgAddrCast);
3039 Builder.CreateStore(ReduceListArg, ReduceListArgAddrCast);
3040
3041 Value *LocalReduceList =
3042 Builder.CreateLoad(Builder.getPtrTy(), ReduceListArgAddrCast);
3043 Value *BufferArgVal =
3044 Builder.CreateLoad(Builder.getPtrTy(), BufferArgAddrCast);
3045 Value *Idxs[] = {Builder.CreateLoad(Builder.getInt32Ty(), IdxArgAddrCast)};
3046 Type *IndexTy = Builder.getIndexTy(
3047 M.getDataLayout(), M.getDataLayout().getDefaultGlobalsAddressSpace());
3048 for (auto En : enumerate(ReductionInfos)) {
3049 const ReductionInfo &RI = En.value();
3050 auto *RedListArrayTy =
3051 ArrayType::get(Builder.getPtrTy(), ReductionInfos.size());
3052 // Reduce element = LocalReduceList[i]
3053 Value *ElemPtrPtr = Builder.CreateInBoundsGEP(
3054 RedListArrayTy, LocalReduceList,
3055 {ConstantInt::get(IndexTy, 0), ConstantInt::get(IndexTy, En.index())});
3056 // elemptr = ((CopyType*)(elemptrptr)) + I
3057 Value *ElemPtr = Builder.CreateLoad(Builder.getPtrTy(), ElemPtrPtr);
3058
3059 // Global = Buffer.VD[Idx];
3060 Value *BufferVD =
3061 Builder.CreateInBoundsGEP(ReductionsBufferTy, BufferArgVal, Idxs);
3062 Value *GlobVal = Builder.CreateConstInBoundsGEP2_32(
3063 ReductionsBufferTy, BufferVD, 0, En.index());
3064
3065 switch (RI.EvaluationKind) {
3066 case EvalKind::Scalar: {
3067 Value *TargetElement = Builder.CreateLoad(RI.ElementType, ElemPtr);
3068 Builder.CreateStore(TargetElement, GlobVal);
3069 break;
3070 }
3071 case EvalKind::Complex: {
3072 Value *SrcRealPtr = Builder.CreateConstInBoundsGEP2_32(
3073 RI.ElementType, ElemPtr, 0, 0, ".realp");
3074 Value *SrcReal = Builder.CreateLoad(
3075 RI.ElementType->getStructElementType(0), SrcRealPtr, ".real");
3076 Value *SrcImgPtr = Builder.CreateConstInBoundsGEP2_32(
3077 RI.ElementType, ElemPtr, 0, 1, ".imagp");
3078 Value *SrcImg = Builder.CreateLoad(
3079 RI.ElementType->getStructElementType(1), SrcImgPtr, ".imag");
3080
3081 Value *DestRealPtr = Builder.CreateConstInBoundsGEP2_32(
3082 RI.ElementType, GlobVal, 0, 0, ".realp");
3083 Value *DestImgPtr = Builder.CreateConstInBoundsGEP2_32(
3084 RI.ElementType, GlobVal, 0, 1, ".imagp");
3085 Builder.CreateStore(SrcReal, DestRealPtr);
3086 Builder.CreateStore(SrcImg, DestImgPtr);
3087 break;
3088 }
3089 case EvalKind::Aggregate: {
3090 Value *SizeVal =
3091 Builder.getInt64(M.getDataLayout().getTypeStoreSize(RI.ElementType));
3092 Builder.CreateMemCpy(
3093 GlobVal, M.getDataLayout().getPrefTypeAlign(RI.ElementType), ElemPtr,
3094 M.getDataLayout().getPrefTypeAlign(RI.ElementType), SizeVal, false);
3095 break;
3096 }
3097 }
3098 }
3099
3100 Builder.CreateRetVoid();
3101 Builder.restoreIP(OldIP);
3102 return LtGCFunc;
3103}
3104
3105Function *OpenMPIRBuilder::emitListToGlobalReduceFunction(
3106 ArrayRef<ReductionInfo> ReductionInfos, Function *ReduceFn,
3107 Type *ReductionsBufferTy, AttributeList FuncAttrs) {
3108 OpenMPIRBuilder::InsertPointTy OldIP = Builder.saveIP();
3109 LLVMContext &Ctx = M.getContext();
3110 FunctionType *FuncTy = FunctionType::get(
3111 Builder.getVoidTy(),
3112 {Builder.getPtrTy(), Builder.getInt32Ty(), Builder.getPtrTy()},
3113 /* IsVarArg */ false);
3114 Function *LtGRFunc =
3116 "_omp_reduction_list_to_global_reduce_func", &M);
3117 LtGRFunc->setAttributes(FuncAttrs);
3118 LtGRFunc->addParamAttr(0, Attribute::NoUndef);
3119 LtGRFunc->addParamAttr(1, Attribute::NoUndef);
3120 LtGRFunc->addParamAttr(2, Attribute::NoUndef);
3121
3122 BasicBlock *EntryBlock = BasicBlock::Create(Ctx, "entry", LtGRFunc);
3123 Builder.SetInsertPoint(EntryBlock);
3124
3125 // Buffer: global reduction buffer.
3126 Argument *BufferArg = LtGRFunc->getArg(0);
3127 // Idx: index of the buffer.
3128 Argument *IdxArg = LtGRFunc->getArg(1);
3129 // ReduceList: thread local Reduce list.
3130 Argument *ReduceListArg = LtGRFunc->getArg(2);
3131
3132 Value *BufferArgAlloca = Builder.CreateAlloca(Builder.getPtrTy(), nullptr,
3133 BufferArg->getName() + ".addr");
3134 Value *IdxArgAlloca = Builder.CreateAlloca(Builder.getInt32Ty(), nullptr,
3135 IdxArg->getName() + ".addr");
3136 Value *ReduceListArgAlloca = Builder.CreateAlloca(
3137 Builder.getPtrTy(), nullptr, ReduceListArg->getName() + ".addr");
3138 auto *RedListArrayTy =
3139 ArrayType::get(Builder.getPtrTy(), ReductionInfos.size());
3140
3141 // 1. Build a list of reduction variables.
3142 // void *RedList[<n>] = {<ReductionVars>[0], ..., <ReductionVars>[<n>-1]};
3143 Value *LocalReduceList =
3144 Builder.CreateAlloca(RedListArrayTy, nullptr, ".omp.reduction.red_list");
3145
3146 Value *BufferArgAddrCast = Builder.CreatePointerBitCastOrAddrSpaceCast(
3147 BufferArgAlloca, Builder.getPtrTy(),
3148 BufferArgAlloca->getName() + ".ascast");
3149 Value *IdxArgAddrCast = Builder.CreatePointerBitCastOrAddrSpaceCast(
3150 IdxArgAlloca, Builder.getPtrTy(), IdxArgAlloca->getName() + ".ascast");
3151 Value *ReduceListArgAddrCast = Builder.CreatePointerBitCastOrAddrSpaceCast(
3152 ReduceListArgAlloca, Builder.getPtrTy(),
3153 ReduceListArgAlloca->getName() + ".ascast");
3154 Value *LocalReduceListAddrCast = Builder.CreatePointerBitCastOrAddrSpaceCast(
3155 LocalReduceList, Builder.getPtrTy(),
3156 LocalReduceList->getName() + ".ascast");
3157
3158 Builder.CreateStore(BufferArg, BufferArgAddrCast);
3159 Builder.CreateStore(IdxArg, IdxArgAddrCast);
3160 Builder.CreateStore(ReduceListArg, ReduceListArgAddrCast);
3161
3162 Value *BufferVal = Builder.CreateLoad(Builder.getPtrTy(), BufferArgAddrCast);
3163 Value *Idxs[] = {Builder.CreateLoad(Builder.getInt32Ty(), IdxArgAddrCast)};
3164 Type *IndexTy = Builder.getIndexTy(
3165 M.getDataLayout(), M.getDataLayout().getDefaultGlobalsAddressSpace());
3166 for (auto En : enumerate(ReductionInfos)) {
3167 Value *TargetElementPtrPtr = Builder.CreateInBoundsGEP(
3168 RedListArrayTy, LocalReduceListAddrCast,
3169 {ConstantInt::get(IndexTy, 0), ConstantInt::get(IndexTy, En.index())});
3170 Value *BufferVD =
3171 Builder.CreateInBoundsGEP(ReductionsBufferTy, BufferVal, Idxs);
3172 // Global = Buffer.VD[Idx];
3173 Value *GlobValPtr = Builder.CreateConstInBoundsGEP2_32(
3174 ReductionsBufferTy, BufferVD, 0, En.index());
3175 Builder.CreateStore(GlobValPtr, TargetElementPtrPtr);
3176 }
3177
3178 // Call reduce_function(GlobalReduceList, ReduceList)
3179 Value *ReduceList =
3180 Builder.CreateLoad(Builder.getPtrTy(), ReduceListArgAddrCast);
3181 Builder.CreateCall(ReduceFn, {LocalReduceListAddrCast, ReduceList})
3182 ->addFnAttr(Attribute::NoUnwind);
3183 Builder.CreateRetVoid();
3184 Builder.restoreIP(OldIP);
3185 return LtGRFunc;
3186}
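//
// For orientation, the two helpers above behave roughly like the following
// C-style pseudo-code (a sketch only; names such as buf and red_list are
// illustrative and not part of the emitted IR):
//
//   void list_to_global_copy(Buffer *buf, int idx, void **red_list) {
//     for (i in 0 .. n-1)
//       buf[idx].field_i = *red_list[i]; // scalar, complex or memcpy copy
//   }
//   void list_to_global_reduce(Buffer *buf, int idx, void **red_list) {
//     void *tmp[n] = {&buf[idx].field_0, ..., &buf[idx].field_n_1};
//     reduce_fn(tmp, red_list); // function built by createReductionFunction
//   }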
3187
3188Function *OpenMPIRBuilder::emitGlobalToListCopyFunction(
3189 ArrayRef<ReductionInfo> ReductionInfos, Type *ReductionsBufferTy,
3190 AttributeList FuncAttrs) {
3191 OpenMPIRBuilder::InsertPointTy OldIP = Builder.saveIP();
3192 LLVMContext &Ctx = M.getContext();
3193 auto *FuncTy = FunctionType::get(
3194 Builder.getVoidTy(),
3195 {Builder.getPtrTy(), Builder.getInt32Ty(), Builder.getPtrTy()},
3196 /* IsVarArg */ false);
3197 Function *LtGCFunc =
3198 Function::Create(FuncTy, GlobalVariable::InternalLinkage,
3199 "_omp_reduction_global_to_list_copy_func", &M);
3200 LtGCFunc->setAttributes(FuncAttrs);
3201 LtGCFunc->addParamAttr(0, Attribute::NoUndef);
3202 LtGCFunc->addParamAttr(1, Attribute::NoUndef);
3203 LtGCFunc->addParamAttr(2, Attribute::NoUndef);
3204
3205 BasicBlock *EntryBlock = BasicBlock::Create(Ctx, "entry", LtGCFunc);
3206 Builder.SetInsertPoint(EntryBlock);
3207
3208 // Buffer: global reduction buffer.
3209 Argument *BufferArg = LtGCFunc->getArg(0);
3210 // Idx: index of the buffer.
3211 Argument *IdxArg = LtGCFunc->getArg(1);
3212 // ReduceList: thread local Reduce list.
3213 Argument *ReduceListArg = LtGCFunc->getArg(2);
3214
3215 Value *BufferArgAlloca = Builder.CreateAlloca(Builder.getPtrTy(), nullptr,
3216 BufferArg->getName() + ".addr");
3217 Value *IdxArgAlloca = Builder.CreateAlloca(Builder.getInt32Ty(), nullptr,
3218 IdxArg->getName() + ".addr");
3219 Value *ReduceListArgAlloca = Builder.CreateAlloca(
3220 Builder.getPtrTy(), nullptr, ReduceListArg->getName() + ".addr");
3221 Value *BufferArgAddrCast = Builder.CreatePointerBitCastOrAddrSpaceCast(
3222 BufferArgAlloca, Builder.getPtrTy(),
3223 BufferArgAlloca->getName() + ".ascast");
3224 Value *IdxArgAddrCast = Builder.CreatePointerBitCastOrAddrSpaceCast(
3225 IdxArgAlloca, Builder.getPtrTy(), IdxArgAlloca->getName() + ".ascast");
3226 Value *ReduceListArgAddrCast = Builder.CreatePointerBitCastOrAddrSpaceCast(
3227 ReduceListArgAlloca, Builder.getPtrTy(),
3228 ReduceListArgAlloca->getName() + ".ascast");
3229 Builder.CreateStore(BufferArg, BufferArgAddrCast);
3230 Builder.CreateStore(IdxArg, IdxArgAddrCast);
3231 Builder.CreateStore(ReduceListArg, ReduceListArgAddrCast);
3232
3233 Value *LocalReduceList =
3234 Builder.CreateLoad(Builder.getPtrTy(), ReduceListArgAddrCast);
3235 Value *BufferVal = Builder.CreateLoad(Builder.getPtrTy(), BufferArgAddrCast);
3236 Value *Idxs[] = {Builder.CreateLoad(Builder.getInt32Ty(), IdxArgAddrCast)};
3237 Type *IndexTy = Builder.getIndexTy(
3238 M.getDataLayout(), M.getDataLayout().getDefaultGlobalsAddressSpace());
3239 for (auto En : enumerate(ReductionInfos)) {
3240 const OpenMPIRBuilder::ReductionInfo &RI = En.value();
3241 auto *RedListArrayTy =
3242 ArrayType::get(Builder.getPtrTy(), ReductionInfos.size());
3243 // Reduce element = LocalReduceList[i]
3244 Value *ElemPtrPtr = Builder.CreateInBoundsGEP(
3245 RedListArrayTy, LocalReduceList,
3246 {ConstantInt::get(IndexTy, 0), ConstantInt::get(IndexTy, En.index())});
3247 // elemptr = ((CopyType*)(elemptrptr)) + I
3248 Value *ElemPtr = Builder.CreateLoad(Builder.getPtrTy(), ElemPtrPtr);
3249 // Global = Buffer.VD[Idx];
3250 Value *BufferVD =
3251 Builder.CreateInBoundsGEP(ReductionsBufferTy, BufferVal, Idxs);
3252 Value *GlobValPtr = Builder.CreateConstInBoundsGEP2_32(
3253 ReductionsBufferTy, BufferVD, 0, En.index());
3254
3255 switch (RI.EvaluationKind) {
3256 case EvalKind::Scalar: {
3257 Value *TargetElement = Builder.CreateLoad(RI.ElementType, GlobValPtr);
3258 Builder.CreateStore(TargetElement, ElemPtr);
3259 break;
3260 }
3261 case EvalKind::Complex: {
3262 Value *SrcRealPtr = Builder.CreateConstInBoundsGEP2_32(
3263 RI.ElementType, GlobValPtr, 0, 0, ".realp");
3264 Value *SrcReal = Builder.CreateLoad(
3265 RI.ElementType->getStructElementType(0), SrcRealPtr, ".real");
3266 Value *SrcImgPtr = Builder.CreateConstInBoundsGEP2_32(
3267 RI.ElementType, GlobValPtr, 0, 1, ".imagp");
3268 Value *SrcImg = Builder.CreateLoad(
3269 RI.ElementType->getStructElementType(1), SrcImgPtr, ".imag");
3270
3271 Value *DestRealPtr = Builder.CreateConstInBoundsGEP2_32(
3272 RI.ElementType, ElemPtr, 0, 0, ".realp");
3273 Value *DestImgPtr = Builder.CreateConstInBoundsGEP2_32(
3274 RI.ElementType, ElemPtr, 0, 1, ".imagp");
3275 Builder.CreateStore(SrcReal, DestRealPtr);
3276 Builder.CreateStore(SrcImg, DestImgPtr);
3277 break;
3278 }
3279 case EvalKind::Aggregate: {
3280 Value *SizeVal =
3281 Builder.getInt64(M.getDataLayout().getTypeStoreSize(RI.ElementType));
3282 Builder.CreateMemCpy(
3283 ElemPtr, M.getDataLayout().getPrefTypeAlign(RI.ElementType),
3284 GlobValPtr, M.getDataLayout().getPrefTypeAlign(RI.ElementType),
3285 SizeVal, false);
3286 break;
3287 }
3288 }
3289 }
3290
3291 Builder.CreateRetVoid();
3292 Builder.restoreIP(OldIP);
3293 return LtGCFunc;
3294}
3295
3296Function *OpenMPIRBuilder::emitGlobalToListReduceFunction(
3297 ArrayRef<ReductionInfo> ReductionInfos, Function *ReduceFn,
3298 Type *ReductionsBufferTy, AttributeList FuncAttrs) {
3299 OpenMPIRBuilder::InsertPointTy OldIP = Builder.saveIP();
3300 LLVMContext &Ctx = M.getContext();
3301 auto *FuncTy = FunctionType::get(
3302 Builder.getVoidTy(),
3303 {Builder.getPtrTy(), Builder.getInt32Ty(), Builder.getPtrTy()},
3304 /* IsVarArg */ false);
3305 Function *LtGRFunc =
3306 Function::Create(FuncTy, GlobalVariable::InternalLinkage,
3307 "_omp_reduction_global_to_list_reduce_func", &M);
3308 LtGRFunc->setAttributes(FuncAttrs);
3309 LtGRFunc->addParamAttr(0, Attribute::NoUndef);
3310 LtGRFunc->addParamAttr(1, Attribute::NoUndef);
3311 LtGRFunc->addParamAttr(2, Attribute::NoUndef);
3312
3313 BasicBlock *EntryBlock = BasicBlock::Create(Ctx, "entry", LtGRFunc);
3314 Builder.SetInsertPoint(EntryBlock);
3315
3316 // Buffer: global reduction buffer.
3317 Argument *BufferArg = LtGRFunc->getArg(0);
3318 // Idx: index of the buffer.
3319 Argument *IdxArg = LtGRFunc->getArg(1);
3320 // ReduceList: thread local Reduce list.
3321 Argument *ReduceListArg = LtGRFunc->getArg(2);
3322
3323 Value *BufferArgAlloca = Builder.CreateAlloca(Builder.getPtrTy(), nullptr,
3324 BufferArg->getName() + ".addr");
3325 Value *IdxArgAlloca = Builder.CreateAlloca(Builder.getInt32Ty(), nullptr,
3326 IdxArg->getName() + ".addr");
3327 Value *ReduceListArgAlloca = Builder.CreateAlloca(
3328 Builder.getPtrTy(), nullptr, ReduceListArg->getName() + ".addr");
3329 ArrayType *RedListArrayTy =
3330 ArrayType::get(Builder.getPtrTy(), ReductionInfos.size());
3331
3332 // 1. Build a list of reduction variables.
3333 // void *RedList[<n>] = {<ReductionVars>[0], ..., <ReductionVars>[<n>-1]};
3334 Value *LocalReduceList =
3335 Builder.CreateAlloca(RedListArrayTy, nullptr, ".omp.reduction.red_list");
3336
3337 Value *BufferArgAddrCast = Builder.CreatePointerBitCastOrAddrSpaceCast(
3338 BufferArgAlloca, Builder.getPtrTy(),
3339 BufferArgAlloca->getName() + ".ascast");
3340 Value *IdxArgAddrCast = Builder.CreatePointerBitCastOrAddrSpaceCast(
3341 IdxArgAlloca, Builder.getPtrTy(), IdxArgAlloca->getName() + ".ascast");
3342 Value *ReduceListArgAddrCast = Builder.CreatePointerBitCastOrAddrSpaceCast(
3343 ReduceListArgAlloca, Builder.getPtrTy(),
3344 ReduceListArgAlloca->getName() + ".ascast");
3345 Value *ReductionList = Builder.CreatePointerBitCastOrAddrSpaceCast(
3346 LocalReduceList, Builder.getPtrTy(),
3347 LocalReduceList->getName() + ".ascast");
3348
3349 Builder.CreateStore(BufferArg, BufferArgAddrCast);
3350 Builder.CreateStore(IdxArg, IdxArgAddrCast);
3351 Builder.CreateStore(ReduceListArg, ReduceListArgAddrCast);
3352
3353 Value *BufferVal = Builder.CreateLoad(Builder.getPtrTy(), BufferArgAddrCast);
3354 Value *Idxs[] = {Builder.CreateLoad(Builder.getInt32Ty(), IdxArgAddrCast)};
3355 Type *IndexTy = Builder.getIndexTy(
3356 M.getDataLayout(), M.getDataLayout().getDefaultGlobalsAddressSpace());
3357 for (auto En : enumerate(ReductionInfos)) {
3358 Value *TargetElementPtrPtr = Builder.CreateInBoundsGEP(
3359 RedListArrayTy, ReductionList,
3360 {ConstantInt::get(IndexTy, 0), ConstantInt::get(IndexTy, En.index())});
3361 // Global = Buffer.VD[Idx];
3362 Value *BufferVD =
3363 Builder.CreateInBoundsGEP(ReductionsBufferTy, BufferVal, Idxs);
3364 Value *GlobValPtr = Builder.CreateConstInBoundsGEP2_32(
3365 ReductionsBufferTy, BufferVD, 0, En.index());
3366 Builder.CreateStore(GlobValPtr, TargetElementPtrPtr);
3367 }
3368
3369 // Call reduce_function(ReduceList, GlobalReduceList)
3370 Value *ReduceList =
3371 Builder.CreateLoad(Builder.getPtrTy(), ReduceListArgAddrCast);
3372 Builder.CreateCall(ReduceFn, {ReduceList, ReductionList})
3373 ->addFnAttr(Attribute::NoUnwind);
3374 Builder.CreateRetVoid();
3375 Builder.restoreIP(OldIP);
3376 return LtGRFunc;
3377}
3378
3379std::string OpenMPIRBuilder::getReductionFuncName(StringRef Name) const {
3380 std::string Suffix =
3381 createPlatformSpecificName({"omp", "reduction", "reduction_func"});
3382 return (Name + Suffix).str();
3383}
3384
3385Expected<Function *> OpenMPIRBuilder::createReductionFunction(
3386 StringRef ReducerName, ArrayRef<ReductionInfo> ReductionInfos,
3387 ReductionGenCBKind ReductionGenCBKind, AttributeList FuncAttrs) {
3388 auto *FuncTy = FunctionType::get(Builder.getVoidTy(),
3389 {Builder.getPtrTy(), Builder.getPtrTy()},
3390 /* IsVarArg */ false);
3391 std::string Name = getReductionFuncName(ReducerName);
3392 Function *ReductionFunc =
3393 Function::Create(FuncTy, GlobalVariable::InternalLinkage, Name, &M);
3394 ReductionFunc->setAttributes(FuncAttrs);
3395 ReductionFunc->addParamAttr(0, Attribute::NoUndef);
3396 ReductionFunc->addParamAttr(1, Attribute::NoUndef);
3397 BasicBlock *EntryBB =
3398 BasicBlock::Create(M.getContext(), "entry", ReductionFunc);
3399 Builder.SetInsertPoint(EntryBB);
3400
3401 // We need to alloca memory here and deal with the pointers before getting
3402 // the LHS/RHS pointers out
3403 Value *LHSArrayPtr = nullptr;
3404 Value *RHSArrayPtr = nullptr;
3405 Argument *Arg0 = ReductionFunc->getArg(0);
3406 Argument *Arg1 = ReductionFunc->getArg(1);
3407 Type *Arg0Type = Arg0->getType();
3408 Type *Arg1Type = Arg1->getType();
3409
3410 Value *LHSAlloca =
3411 Builder.CreateAlloca(Arg0Type, nullptr, Arg0->getName() + ".addr");
3412 Value *RHSAlloca =
3413 Builder.CreateAlloca(Arg1Type, nullptr, Arg1->getName() + ".addr");
3414 Value *LHSAddrCast = Builder.CreatePointerBitCastOrAddrSpaceCast(
3415 LHSAlloca, Arg0Type, LHSAlloca->getName() + ".ascast");
3416 Value *RHSAddrCast = Builder.CreatePointerBitCastOrAddrSpaceCast(
3417 RHSAlloca, Arg1Type, RHSAlloca->getName() + ".ascast");
3418 Builder.CreateStore(Arg0, LHSAddrCast);
3419 Builder.CreateStore(Arg1, RHSAddrCast);
3420 LHSArrayPtr = Builder.CreateLoad(Arg0Type, LHSAddrCast);
3421 RHSArrayPtr = Builder.CreateLoad(Arg1Type, RHSAddrCast);
3422
3423 Type *RedArrayTy = ArrayType::get(Builder.getPtrTy(), ReductionInfos.size());
3424 Type *IndexTy = Builder.getIndexTy(
3425 M.getDataLayout(), M.getDataLayout().getDefaultGlobalsAddressSpace());
3426 SmallVector<Value *> LHSPtrs, RHSPtrs;
3427 for (auto En : enumerate(ReductionInfos)) {
3428 const ReductionInfo &RI = En.value();
3429 Value *RHSI8PtrPtr = Builder.CreateInBoundsGEP(
3430 RedArrayTy, RHSArrayPtr,
3431 {ConstantInt::get(IndexTy, 0), ConstantInt::get(IndexTy, En.index())});
3432 Value *RHSI8Ptr = Builder.CreateLoad(Builder.getPtrTy(), RHSI8PtrPtr);
3433 Value *RHSPtr = Builder.CreatePointerBitCastOrAddrSpaceCast(
3434 RHSI8Ptr, RI.PrivateVariable->getType(),
3435 RHSI8Ptr->getName() + ".ascast");
3436
3437 Value *LHSI8PtrPtr = Builder.CreateInBoundsGEP(
3438 RedArrayTy, LHSArrayPtr,
3439 {ConstantInt::get(IndexTy, 0), ConstantInt::get(IndexTy, En.index())});
3440 Value *LHSI8Ptr = Builder.CreateLoad(Builder.getPtrTy(), LHSI8PtrPtr);
3441 Value *LHSPtr = Builder.CreatePointerBitCastOrAddrSpaceCast(
3442 LHSI8Ptr, RI.Variable->getType(), LHSI8Ptr->getName() + ".ascast");
3443
3444 if (ReductionGenCBKind == ReductionGenCBKind::Clang) {
3445 LHSPtrs.emplace_back(LHSPtr);
3446 RHSPtrs.emplace_back(RHSPtr);
3447 } else {
3448 Value *LHS = Builder.CreateLoad(RI.ElementType, LHSPtr);
3449 Value *RHS = Builder.CreateLoad(RI.ElementType, RHSPtr);
3450 Value *Reduced;
3451 InsertPointOrErrorTy AfterIP =
3452 RI.ReductionGen(Builder.saveIP(), LHS, RHS, Reduced);
3453 if (!AfterIP)
3454 return AfterIP.takeError();
3455 if (!Builder.GetInsertBlock())
3456 return ReductionFunc;
3457 Builder.CreateStore(Reduced, LHSPtr);
3458 }
3459 }
3460
3461 if (ReductionGenCBKind == ReductionGenCBKind::Clang)
3462 for (auto En : enumerate(ReductionInfos)) {
3463 unsigned Index = En.index();
3464 const ReductionInfo &RI = En.value();
3465 Value *LHSFixupPtr, *RHSFixupPtr;
3466 Builder.restoreIP(RI.ReductionGenClang(
3467 Builder.saveIP(), Index, &LHSFixupPtr, &RHSFixupPtr, ReductionFunc));
3468
3469 // Fix the callback code generated to use the correct Values for the LHS
3470 // and RHS
3471 LHSFixupPtr->replaceUsesWithIf(
3472 LHSPtrs[Index], [ReductionFunc](const Use &U) {
3473 return cast<Instruction>(U.getUser())->getParent()->getParent() ==
3474 ReductionFunc;
3475 });
3476 RHSFixupPtr->replaceUsesWithIf(
3477 RHSPtrs[Index], [ReductionFunc](const Use &U) {
3478 return cast<Instruction>(U.getUser())->getParent()->getParent() ==
3479 ReductionFunc;
3480 });
3481 }
3482
3483 Builder.CreateRetVoid();
3484 return ReductionFunc;
3485}
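//
// Shape of the emitted function, as a C-style pseudo-code sketch (lhs_list
// and rhs_list are illustrative names for the two type-erased pointer
// arrays; the per-element operation comes from the ReductionGen or
// ReductionGenClang callback):
//
//   void <name-from-getReductionFuncName>(void **lhs_list, void **rhs_list) {
//     for (i in 0 .. n-1)
//       *lhs_list[i] = reduce_i(*lhs_list[i], *rhs_list[i]);
//   }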
3486
3487static void
3488checkReductionInfos(ArrayRef<OpenMPIRBuilder::ReductionInfo> ReductionInfos,
3489 bool IsGPU) {
3490 for (const OpenMPIRBuilder::ReductionInfo &RI : ReductionInfos) {
3491 (void)RI;
3492 assert(RI.Variable && "expected non-null variable");
3493 assert(RI.PrivateVariable && "expected non-null private variable");
3494 assert((RI.ReductionGen || RI.ReductionGenClang) &&
3495 "expected non-null reduction generator callback");
3496 if (!IsGPU) {
3497 assert(
3498 RI.Variable->getType() == RI.PrivateVariable->getType() &&
3499 "expected variables and their private equivalents to have the same "
3500 "type");
3501 }
3502 assert(RI.Variable->getType()->isPointerTy() &&
3503 "expected variables to be pointers");
3504 }
3505}
3506
3507OpenMPIRBuilder::InsertPointOrErrorTy OpenMPIRBuilder::createReductionsGPU(
3508 const LocationDescription &Loc, InsertPointTy AllocaIP,
3509 InsertPointTy CodeGenIP, ArrayRef<ReductionInfo> ReductionInfos,
3510 bool IsNoWait, bool IsTeamsReduction, bool HasDistribute,
3511 ReductionGenCBKind ReductionGenCBKind, std::optional<omp::GV> GridValue,
3512 unsigned ReductionBufNum, Value *SrcLocInfo) {
3513 if (!updateToLocation(Loc))
3514 return InsertPointTy();
3515 Builder.restoreIP(CodeGenIP);
3516 checkReductionInfos(ReductionInfos, /*IsGPU*/ true);
3517 LLVMContext &Ctx = M.getContext();
3518
3519 // Source location for the ident struct
3520 if (!SrcLocInfo) {
3521 uint32_t SrcLocStrSize;
3522 Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
3523 SrcLocInfo = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
3524 }
3525
3526 if (ReductionInfos.size() == 0)
3527 return Builder.saveIP();
3528
3529 Function *CurFunc = Builder.GetInsertBlock()->getParent();
3530 AttributeList FuncAttrs;
3531 AttrBuilder AttrBldr(Ctx);
3532 for (auto Attr : CurFunc->getAttributes().getFnAttrs())
3533 AttrBldr.addAttribute(Attr);
3534 AttrBldr.removeAttribute(Attribute::OptimizeNone);
3535 FuncAttrs = FuncAttrs.addFnAttributes(Ctx, AttrBldr);
3536
3537 CodeGenIP = Builder.saveIP();
3538 Expected<Function *> ReductionResult =
3539 createReductionFunction(Builder.GetInsertBlock()->getParent()->getName(),
3540 ReductionInfos, ReductionGenCBKind, FuncAttrs);
3541 if (!ReductionResult)
3542 return ReductionResult.takeError();
3543 Function *ReductionFunc = *ReductionResult;
3544 Builder.restoreIP(CodeGenIP);
3545
3546 // Set the grid value in the config needed for lowering later on
3547 if (GridValue.has_value())
3548 Config.setGridValue(GridValue.value());
3549 else
3550 Config.setGridValue(getGridValue(T, ReductionFunc));
3551
3552 // Build res = __kmpc_reduce{_nowait}(<gtid>, <n>, sizeof(RedList),
3553 // RedList, shuffle_reduce_func, interwarp_copy_func);
3554 // or
3555 // Build res = __kmpc_reduce_teams_nowait_simple(<loc>, <gtid>, <lck>);
3556 Value *Res;
3557
3558 // 1. Build a list of reduction variables.
3559 // void *RedList[<n>] = {<ReductionVars>[0], ..., <ReductionVars>[<n>-1]};
3560 auto Size = ReductionInfos.size();
3561 Type *PtrTy = PointerType::getUnqual(Ctx);
3562 Type *RedArrayTy = ArrayType::get(PtrTy, Size);
3563 CodeGenIP = Builder.saveIP();
3564 Builder.restoreIP(AllocaIP);
3565 Value *ReductionListAlloca =
3566 Builder.CreateAlloca(RedArrayTy, nullptr, ".omp.reduction.red_list");
3567 Value *ReductionList = Builder.CreatePointerBitCastOrAddrSpaceCast(
3568 ReductionListAlloca, PtrTy, ReductionListAlloca->getName() + ".ascast");
3569 Builder.restoreIP(CodeGenIP);
3570 Type *IndexTy = Builder.getIndexTy(
3571 M.getDataLayout(), M.getDataLayout().getDefaultGlobalsAddressSpace());
3572 for (auto En : enumerate(ReductionInfos)) {
3573 const ReductionInfo &RI = En.value();
3574 Value *ElemPtr = Builder.CreateInBoundsGEP(
3575 RedArrayTy, ReductionList,
3576 {ConstantInt::get(IndexTy, 0), ConstantInt::get(IndexTy, En.index())});
3577 Value *CastElem =
3578 Builder.CreatePointerBitCastOrAddrSpaceCast(RI.PrivateVariable, PtrTy);
3579 Builder.CreateStore(CastElem, ElemPtr);
3580 }
3581 CodeGenIP = Builder.saveIP();
3582 Function *SarFunc =
3583 emitShuffleAndReduceFunction(ReductionInfos, ReductionFunc, FuncAttrs);
3584 Expected<Function *> CopyResult =
3585 emitInterWarpCopyFunction(Loc, ReductionInfos, FuncAttrs);
3586 if (!CopyResult)
3587 return CopyResult.takeError();
3588 Function *WcFunc = *CopyResult;
3589 Builder.restoreIP(CodeGenIP);
3590
3591 Value *RL = Builder.CreatePointerBitCastOrAddrSpaceCast(ReductionList, PtrTy);
3592
3593 unsigned MaxDataSize = 0;
3594 SmallVector<Type *> ReductionTypeArgs;
3595 for (auto En : enumerate(ReductionInfos)) {
3596 auto Size = M.getDataLayout().getTypeStoreSize(En.value().ElementType);
3597 if (Size > MaxDataSize)
3598 MaxDataSize = Size;
3599 ReductionTypeArgs.emplace_back(En.value().ElementType);
3600 }
3601 Value *ReductionDataSize =
3602 Builder.getInt64(MaxDataSize * ReductionInfos.size());
3603 if (!IsTeamsReduction) {
3604 Value *SarFuncCast =
3605 Builder.CreatePointerBitCastOrAddrSpaceCast(SarFunc, PtrTy);
3606 Value *WcFuncCast =
3607 Builder.CreatePointerBitCastOrAddrSpaceCast(WcFunc, PtrTy);
3608 Value *Args[] = {SrcLocInfo, ReductionDataSize, RL, SarFuncCast,
3609 WcFuncCast};
3610 Function *Pv2Ptr = getOrCreateRuntimeFunctionPtr(
3611 RuntimeFunction::OMPRTL___kmpc_nvptx_parallel_reduce_nowait_v2);
3612 Res = Builder.CreateCall(Pv2Ptr, Args);
3613 } else {
3614 CodeGenIP = Builder.saveIP();
3615 StructType *ReductionsBufferTy = StructType::create(
3616 Ctx, ReductionTypeArgs, "struct._globalized_locals_ty");
3617 Function *RedFixedBuferFn = getOrCreateRuntimeFunctionPtr(
3618 RuntimeFunction::OMPRTL___kmpc_reduction_get_fixed_buffer);
3619 Function *LtGCFunc = emitListToGlobalCopyFunction(
3620 ReductionInfos, ReductionsBufferTy, FuncAttrs);
3621 Function *LtGRFunc = emitListToGlobalReduceFunction(
3622 ReductionInfos, ReductionFunc, ReductionsBufferTy, FuncAttrs);
3623 Function *GtLCFunc = emitGlobalToListCopyFunction(
3624 ReductionInfos, ReductionsBufferTy, FuncAttrs);
3625 Function *GtLRFunc = emitGlobalToListReduceFunction(
3626 ReductionInfos, ReductionFunc, ReductionsBufferTy, FuncAttrs);
3627 Builder.restoreIP(CodeGenIP);
3628
3629 Value *KernelTeamsReductionPtr = Builder.CreateCall(
3630 RedFixedBuferFn, {}, "_openmp_teams_reductions_buffer_$_$ptr");
3631
3632 Value *Args3[] = {SrcLocInfo,
3633 KernelTeamsReductionPtr,
3634 Builder.getInt32(ReductionBufNum),
3635 ReductionDataSize,
3636 RL,
3637 SarFunc,
3638 WcFunc,
3639 LtGCFunc,
3640 LtGRFunc,
3641 GtLCFunc,
3642 GtLRFunc};
3643
3644 Function *TeamsReduceFn = getOrCreateRuntimeFunctionPtr(
3645 RuntimeFunction::OMPRTL___kmpc_nvptx_teams_reduce_nowait_v2);
3646 Res = Builder.CreateCall(TeamsReduceFn, Args3);
3647 }
3648
3649 // 5. Build if (res == 1)
3650 BasicBlock *ExitBB = BasicBlock::Create(Ctx, ".omp.reduction.done");
3651 BasicBlock *ThenBB = BasicBlock::Create(Ctx, ".omp.reduction.then");
3652 Value *Cond = Builder.CreateICmpEQ(Res, Builder.getInt32(1));
3653 Builder.CreateCondBr(Cond, ThenBB, ExitBB);
3654
3655 // 6. Build then branch: where we have reduced values in the master
3656 // thread in each team.
3657 // __kmpc_end_reduce{_nowait}(<gtid>);
3658 // break;
3659 emitBlock(ThenBB, CurFunc);
3660
3661 // Add emission of __kmpc_end_reduce{_nowait}(<gtid>);
3662 for (auto En : enumerate(ReductionInfos)) {
3663 const ReductionInfo &RI = En.value();
3664 Value *LHS = RI.Variable;
3665 Value *RHS =
3666 Builder.CreatePointerBitCastOrAddrSpaceCast(RI.PrivateVariable, PtrTy);
3667
3668 if (ReductionGenCBKind == ReductionGenCBKind::Clang) {
3669 Value *LHSPtr, *RHSPtr;
3670 Builder.restoreIP(RI.ReductionGenClang(Builder.saveIP(), En.index(),
3671 &LHSPtr, &RHSPtr, CurFunc));
3672
3673 // Fix the callback code generated to use the correct Values for the LHS
3674 // and RHS
3675 LHSPtr->replaceUsesWithIf(LHS, [ReductionFunc](const Use &U) {
3676 return cast<Instruction>(U.getUser())->getParent()->getParent() ==
3677 ReductionFunc;
3678 });
3679 RHSPtr->replaceUsesWithIf(RHS, [ReductionFunc](const Use &U) {
3680 return cast<Instruction>(U.getUser())->getParent()->getParent() ==
3681 ReductionFunc;
3682 });
3683 } else {
3684 assert(false && "Unhandled ReductionGenCBKind");
3685 }
3686 }
3687 emitBlock(ExitBB, CurFunc);
3688
3689 Config.setEmitLLVMUsed();
3690
3691 return Builder.saveIP();
3692}
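//
// Schematically, the IR emitted above reduces to the following sketch (teams
// reductions additionally pass the four list<->global helpers and the buffer
// returned by __kmpc_reduction_get_fixed_buffer):
//
//   res = __kmpc_nvptx_parallel_reduce_nowait_v2(loc, reduce_data_size,
//                                                RedList, shuffle_reduce_fn,
//                                                interwarp_copy_fn);
//   if (res == 1) {
//     <store the reduced values back into the original variables>
//   }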
3693
3694static Function *getFreshReductionFunc(Module &M) {
3695 Type *VoidTy = Type::getVoidTy(M.getContext());
3696 Type *Int8PtrTy = PointerType::getUnqual(M.getContext());
3697 auto *FuncTy =
3698 FunctionType::get(VoidTy, {Int8PtrTy, Int8PtrTy}, /* IsVarArg */ false);
3699 return Function::Create(FuncTy, GlobalVariable::InternalLinkage,
3700 ".omp.reduction.func", &M);
3701}
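//
// The declaration created here has the shape
//   void .omp.reduction.func(void *lhs_array, void *rhs_array)
// and its body is populated later by createReductions below.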
3702
3703OpenMPIRBuilder::InsertPointOrErrorTy
3704OpenMPIRBuilder::createReductions(const LocationDescription &Loc,
3705 InsertPointTy AllocaIP,
3706 ArrayRef<ReductionInfo> ReductionInfos,
3707 ArrayRef<bool> IsByRef, bool IsNoWait) {
3708 assert(ReductionInfos.size() == IsByRef.size());
3709 for (const ReductionInfo &RI : ReductionInfos) {
3710 (void)RI;
3711 assert(RI.Variable && "expected non-null variable");
3712 assert(RI.PrivateVariable && "expected non-null private variable");
3713 assert(RI.ReductionGen && "expected non-null reduction generator callback");
3714 assert(RI.Variable->getType() == RI.PrivateVariable->getType() &&
3715 "expected variables and their private equivalents to have the same "
3716 "type");
3717 assert(RI.Variable->getType()->isPointerTy() &&
3718 "expected variables to be pointers");
3719 }
3720
3721 if (!updateToLocation(Loc))
3722 return InsertPointTy();
3723
3724 BasicBlock *InsertBlock = Loc.IP.getBlock();
3725 BasicBlock *ContinuationBlock =
3726 InsertBlock->splitBasicBlock(Loc.IP.getPoint(), "reduce.finalize");
3727 InsertBlock->getTerminator()->eraseFromParent();
3728
3729 // Create and populate array of type-erased pointers to private reduction
3730 // values.
3731 unsigned NumReductions = ReductionInfos.size();
3732 Type *RedArrayTy = ArrayType::get(Builder.getPtrTy(), NumReductions);
3733 Builder.SetInsertPoint(AllocaIP.getBlock()->getTerminator());
3734 Value *RedArray = Builder.CreateAlloca(RedArrayTy, nullptr, "red.array");
3735
3736 Builder.SetInsertPoint(InsertBlock, InsertBlock->end());
3737
3738 for (auto En : enumerate(ReductionInfos)) {
3739 unsigned Index = En.index();
3740 const ReductionInfo &RI = En.value();
3741 Value *RedArrayElemPtr = Builder.CreateConstInBoundsGEP2_64(
3742 RedArrayTy, RedArray, 0, Index, "red.array.elem." + Twine(Index));
3743 Builder.CreateStore(RI.PrivateVariable, RedArrayElemPtr);
3744 }
3745
3746 // Emit a call to the runtime function that orchestrates the reduction.
3747 // Declare the reduction function in the process.
3748 Function *Func = Builder.GetInsertBlock()->getParent();
3749 Module *Module = Func->getParent();
3750 uint32_t SrcLocStrSize;
3751 Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
3752 bool CanGenerateAtomic = all_of(ReductionInfos, [](const ReductionInfo &RI) {
3753 return RI.AtomicReductionGen;
3754 });
3755 Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize,
3756 CanGenerateAtomic
3757 ? IdentFlag::OMP_IDENT_FLAG_ATOMIC_REDUCE
3758 : IdentFlag(0));
3759 Value *ThreadId = getOrCreateThreadID(Ident);
3760 Constant *NumVariables = Builder.getInt32(NumReductions);
3761 const DataLayout &DL = Module->getDataLayout();
3762 unsigned RedArrayByteSize = DL.getTypeStoreSize(RedArrayTy);
3763 Constant *RedArraySize = Builder.getInt64(RedArrayByteSize);
3764 Function *ReductionFunc = getFreshReductionFunc(*Module);
3765 Value *Lock = getOMPCriticalRegionLock(".reduction");
3766 Function *ReduceFunc = getOrCreateRuntimeFunctionPtr(
3767 IsNoWait ? RuntimeFunction::OMPRTL___kmpc_reduce_nowait
3768 : RuntimeFunction::OMPRTL___kmpc_reduce);
3769 CallInst *ReduceCall =
3770 Builder.CreateCall(ReduceFunc,
3771 {Ident, ThreadId, NumVariables, RedArraySize, RedArray,
3772 ReductionFunc, Lock},
3773 "reduce");
3774
3775 // Create final reduction entry blocks for the atomic and non-atomic case.
3776 // Emit IR that dispatches control flow to one of the blocks based on the
3777 // reduction supporting the atomic mode.
3778 BasicBlock *NonAtomicRedBlock =
3779 BasicBlock::Create(Module->getContext(), "reduce.switch.nonatomic", Func);
3780 BasicBlock *AtomicRedBlock =
3781 BasicBlock::Create(Module->getContext(), "reduce.switch.atomic", Func);
3782 SwitchInst *Switch =
3783 Builder.CreateSwitch(ReduceCall, ContinuationBlock, /* NumCases */ 2);
3784 Switch->addCase(Builder.getInt32(1), NonAtomicRedBlock);
3785 Switch->addCase(Builder.getInt32(2), AtomicRedBlock);
3786
3787 // Populate the non-atomic reduction using the elementwise reduction function.
3788 // This loads the elements from the global and private variables and reduces
3789 // them before storing back the result to the global variable.
3790 Builder.SetInsertPoint(NonAtomicRedBlock);
3791 for (auto En : enumerate(ReductionInfos)) {
3792 const ReductionInfo &RI = En.value();
3793 Type *ValueType = RI.ElementType;
3794 // We have one less load for the by-ref case because that load is now
3795 // inside the reduction region.
3796 Value *RedValue = RI.Variable;
3797 if (!IsByRef[En.index()]) {
3798 RedValue = Builder.CreateLoad(ValueType, RI.Variable,
3799 "red.value." + Twine(En.index()));
3800 }
3801 Value *PrivateRedValue =
3802 Builder.CreateLoad(ValueType, RI.PrivateVariable,
3803 "red.private.value." + Twine(En.index()));
3804 Value *Reduced;
3805 InsertPointOrErrorTy AfterIP =
3806 RI.ReductionGen(Builder.saveIP(), RedValue, PrivateRedValue, Reduced);
3807 if (!AfterIP)
3808 return AfterIP.takeError();
3809 Builder.restoreIP(*AfterIP);
3810
3811 if (!Builder.GetInsertBlock())
3812 return InsertPointTy();
3813 // For the by-ref case, the load is inside the reduction region.
3814 if (!IsByRef[En.index()])
3815 Builder.CreateStore(Reduced, RI.Variable);
3816 }
3817 Function *EndReduceFunc = getOrCreateRuntimeFunctionPtr(
3818 IsNoWait ? RuntimeFunction::OMPRTL___kmpc_end_reduce_nowait
3819 : RuntimeFunction::OMPRTL___kmpc_end_reduce);
3820 Builder.CreateCall(EndReduceFunc, {Ident, ThreadId, Lock});
3821 Builder.CreateBr(ContinuationBlock);
3822
3823 // Populate the atomic reduction using the atomic elementwise reduction
3824 // function. There are no loads/stores here because they will be happening
3825 // inside the atomic elementwise reduction.
3826 Builder.SetInsertPoint(AtomicRedBlock);
3827 if (CanGenerateAtomic && llvm::none_of(IsByRef, [](bool P) { return P; })) {
3828 for (const ReductionInfo &RI : ReductionInfos) {
3829 InsertPointOrErrorTy AfterIP = RI.AtomicReductionGen(
3830 Builder.saveIP(), RI.ElementType, RI.Variable, RI.PrivateVariable);
3831 if (!AfterIP)
3832 return AfterIP.takeError();
3833 Builder.restoreIP(*AfterIP);
3834 if (!Builder.GetInsertBlock())
3835 return InsertPointTy();
3836 }
3837 Builder.CreateBr(ContinuationBlock);
3838 } else {
3839 Builder.CreateUnreachable();
3840 }
3841
3842 // Populate the outlined reduction function using the elementwise reduction
3843 // function. Partial values are extracted from the type-erased array of
3844 // pointers to private variables.
3845 BasicBlock *ReductionFuncBlock =
3846 BasicBlock::Create(Module->getContext(), "", ReductionFunc);
3847 Builder.SetInsertPoint(ReductionFuncBlock);
3848 Value *LHSArrayPtr = ReductionFunc->getArg(0);
3849 Value *RHSArrayPtr = ReductionFunc->getArg(1);
3850
3851 for (auto En : enumerate(ReductionInfos)) {
3852 const ReductionInfo &RI = En.value();
3853 Value *LHSI8PtrPtr = Builder.CreateConstInBoundsGEP2_64(
3854 RedArrayTy, LHSArrayPtr, 0, En.index());
3855 Value *LHSI8Ptr = Builder.CreateLoad(Builder.getPtrTy(), LHSI8PtrPtr);
3856 Value *LHSPtr = Builder.CreateBitCast(LHSI8Ptr, RI.Variable->getType());
3857 Value *LHS = Builder.CreateLoad(RI.ElementType, LHSPtr);
3858 Value *RHSI8PtrPtr = Builder.CreateConstInBoundsGEP2_64(
3859 RedArrayTy, RHSArrayPtr, 0, En.index());
3860 Value *RHSI8Ptr = Builder.CreateLoad(Builder.getPtrTy(), RHSI8PtrPtr);
3861 Value *RHSPtr =
3862 Builder.CreateBitCast(RHSI8Ptr, RI.PrivateVariable->getType());
3863 Value *RHS = Builder.CreateLoad(RI.ElementType, RHSPtr);
3864 Value *Reduced;
3865 InsertPointOrErrorTy AfterIP =
3866 RI.ReductionGen(Builder.saveIP(), LHS, RHS, Reduced);
3867 if (!AfterIP)
3868 return AfterIP.takeError();
3869 Builder.restoreIP(*AfterIP);
3870 if (!Builder.GetInsertBlock())
3871 return InsertPointTy();
3872 // The store is inside the reduction region when using by-ref.
3873 if (!IsByRef[En.index()])
3874 Builder.CreateStore(Reduced, LHSPtr);
3875 }
3876 Builder.CreateRetVoid();
3877
3878 Builder.SetInsertPoint(ContinuationBlock);
3879 return Builder.saveIP();
3880}
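//
// The control flow emitted above is, as a sketch:
//
//   switch (__kmpc_reduce{_nowait}(loc, tid, n, sizeof(RedList), RedList,
//                                  .omp.reduction.func, &lock)) {
//   case 1: // non-atomic: reduce elementwise, then call
//           // __kmpc_end_reduce{_nowait}(loc, tid, &lock)
//   case 2: // atomic: apply AtomicReductionGen per element (when available)
//   default: break; // nothing to do; the runtime handled the rest
//   }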
3881
3882OpenMPIRBuilder::InsertPointOrErrorTy
3883OpenMPIRBuilder::createMaster(const LocationDescription &Loc,
3884 BodyGenCallbackTy BodyGenCB,
3885 FinalizeCallbackTy FiniCB) {
3886 if (!updateToLocation(Loc))
3887 return Loc.IP;
3888
3889 Directive OMPD = Directive::OMPD_master;
3890 uint32_t SrcLocStrSize;
3891 Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
3892 Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
3893 Value *ThreadId = getOrCreateThreadID(Ident);
3894 Value *Args[] = {Ident, ThreadId};
3895
3896 Function *EntryRTLFn = getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_master);
3897 Instruction *EntryCall = Builder.CreateCall(EntryRTLFn, Args);
3898
3899 Function *ExitRTLFn = getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_end_master);
3900 Instruction *ExitCall = Builder.CreateCall(ExitRTLFn, Args);
3901
3902 return EmitOMPInlinedRegion(OMPD, EntryCall, ExitCall, BodyGenCB, FiniCB,
3903 /*Conditional*/ true, /*hasFinalize*/ true);
3904}
3905
3906OpenMPIRBuilder::InsertPointOrErrorTy
3907OpenMPIRBuilder::createMasked(const LocationDescription &Loc,
3908 BodyGenCallbackTy BodyGenCB,
3909 FinalizeCallbackTy FiniCB, Value *Filter) {
3910 if (!updateToLocation(Loc))
3911 return Loc.IP;
3912
3913 Directive OMPD = Directive::OMPD_masked;
3914 uint32_t SrcLocStrSize;
3915 Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
3916 Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
3917 Value *ThreadId = getOrCreateThreadID(Ident);
3918 Value *Args[] = {Ident, ThreadId, Filter};
3919 Value *ArgsEnd[] = {Ident, ThreadId};
3920
3921 Function *EntryRTLFn = getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_masked);
3922 Instruction *EntryCall = Builder.CreateCall(EntryRTLFn, Args);
3923
3924 Function *ExitRTLFn = getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_end_masked);
3925 Instruction *ExitCall = Builder.CreateCall(ExitRTLFn, ArgsEnd);
3926
3927 return EmitOMPInlinedRegion(OMPD, EntryCall, ExitCall, BodyGenCB, FiniCB,
3928 /*Conditional*/ true, /*hasFinalize*/ true);
3929}
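//
// Because the region is emitted with Conditional=true, the result is
// schematically:
//
//   if (__kmpc_masked(loc, tid, filter)) {
//     <masked region body>
//     __kmpc_end_masked(loc, tid);
//   }
//
// createMaster above is analogous, using __kmpc_master/__kmpc_end_master.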
3930
3931CanonicalLoopInfo *OpenMPIRBuilder::createLoopSkeleton(
3932 DebugLoc DL, Value *TripCount, Function *F, BasicBlock *PreInsertBefore,
3933 BasicBlock *PostInsertBefore, const Twine &Name) {
3934 Module *M = F->getParent();
3935 LLVMContext &Ctx = M->getContext();
3936 Type *IndVarTy = TripCount->getType();
3937
3938 // Create the basic block structure.
3939 BasicBlock *Preheader =
3940 BasicBlock::Create(Ctx, "omp_" + Name + ".preheader", F, PreInsertBefore);
3941 BasicBlock *Header =
3942 BasicBlock::Create(Ctx, "omp_" + Name + ".header", F, PreInsertBefore);
3943 BasicBlock *Cond =
3944 BasicBlock::Create(Ctx, "omp_" + Name + ".cond", F, PreInsertBefore);
3945 BasicBlock *Body =
3946 BasicBlock::Create(Ctx, "omp_" + Name + ".body", F, PreInsertBefore);
3947 BasicBlock *Latch =
3948 BasicBlock::Create(Ctx, "omp_" + Name + ".inc", F, PostInsertBefore);
3949 BasicBlock *Exit =
3950 BasicBlock::Create(Ctx, "omp_" + Name + ".exit", F, PostInsertBefore);
3951 BasicBlock *After =
3952 BasicBlock::Create(Ctx, "omp_" + Name + ".after", F, PostInsertBefore);
3953
3954 // Use specified DebugLoc for new instructions.
3955 Builder.SetCurrentDebugLocation(DL);
3956
3957 Builder.SetInsertPoint(Preheader);
3958 Builder.CreateBr(Header);
3959
3960 Builder.SetInsertPoint(Header);
3961 PHINode *IndVarPHI = Builder.CreatePHI(IndVarTy, 2, "omp_" + Name + ".iv");
3962 IndVarPHI->addIncoming(ConstantInt::get(IndVarTy, 0), Preheader);
3963 Builder.CreateBr(Cond);
3964
3965 Builder.SetInsertPoint(Cond);
3966 Value *Cmp =
3967 Builder.CreateICmpULT(IndVarPHI, TripCount, "omp_" + Name + ".cmp");
3968 Builder.CreateCondBr(Cmp, Body, Exit);
3969
3970 Builder.SetInsertPoint(Body);
3971 Builder.CreateBr(Latch);
3972
3973 Builder.SetInsertPoint(Latch);
3974 Value *Next = Builder.CreateAdd(IndVarPHI, ConstantInt::get(IndVarTy, 1),
3975 "omp_" + Name + ".next", /*HasNUW=*/true);
3976 Builder.CreateBr(Header);
3977 IndVarPHI->addIncoming(Next, Latch);
3978
3979 Builder.SetInsertPoint(Exit);
3980 Builder.CreateBr(After);
3981
3982 // Remember and return the canonical control flow.
3983 LoopInfos.emplace_front();
3984 CanonicalLoopInfo *CL = &LoopInfos.front();
3985
3986 CL->Header = Header;
3987 CL->Cond = Cond;
3988 CL->Latch = Latch;
3989 CL->Exit = Exit;
3990
3991#ifndef NDEBUG
3992 CL->assertOK();
3993#endif
3994 return CL;
3995}
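//
// The skeleton built above has the canonical control flow
//
//   preheader -> header -> cond --true--> body -> latch -> header
//                            \---false--> exit -> after
//
// with the induction variable PHI in the header counting from 0 up to
// TripCount - 1.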
3996
3997Expected<CanonicalLoopInfo *>
3998OpenMPIRBuilder::createCanonicalLoop(const LocationDescription &Loc,
3999 LoopBodyGenCallbackTy BodyGenCB,
4000 Value *TripCount, const Twine &Name) {
4001 BasicBlock *BB = Loc.IP.getBlock();
4002 BasicBlock *NextBB = BB->getNextNode();
4003
4004 CanonicalLoopInfo *CL = createLoopSkeleton(Loc.DL, TripCount, BB->getParent(),
4005 NextBB, NextBB, Name);
4006 BasicBlock *After = CL->getAfter();
4007
4008 // If location is not set, don't connect the loop.
4009 if (updateToLocation(Loc)) {
4010 // Split the loop at the insertion point: Branch to the preheader and move
4011 // every following instruction to after the loop (the After BB). Also, the
4012 // new successor is the loop's after block.
4013 spliceBB(Builder, After, /*CreateBranch=*/false);
4014 Builder.CreateBr(CL->getPreheader());
4015 }
4016
4017 // Emit the body content. We do it after connecting the loop to the CFG to
4018 // avoid that the callback encounters degenerate BBs.
4019 if (Error Err = BodyGenCB(CL->getBodyIP(), CL->getIndVar()))
4020 return Err;
4021
4022#ifndef NDEBUG
4023 CL->assertOK();
4024#endif
4025 return CL;
4026}
4027
4028Expected<CanonicalLoopInfo *> OpenMPIRBuilder::createCanonicalLoop(
4029 const LocationDescription &Loc, LoopBodyGenCallbackTy BodyGenCB,
4030 Value *Start, Value *Stop, Value *Step, bool IsSigned, bool InclusiveStop,
4031 InsertPointTy ComputeIP, const Twine &Name) {
4032
4033 // Consider the following difficulties (assuming 8-bit signed integers):
4034 // * Adding \p Step to the loop counter which passes \p Stop may overflow:
4035 // DO I = 1, 100, 50
4036 // * A \p Step of INT_MIN cannot be normalized to a positive direction:
4037 // DO I = 100, 0, -128
4038
4039 // Start, Stop and Step must be of the same integer type.
4040 auto *IndVarTy = cast<IntegerType>(Start->getType());
4041 assert(IndVarTy == Stop->getType() && "Stop type mismatch");
4042 assert(IndVarTy == Step->getType() && "Step type mismatch");
4043
4044 LocationDescription ComputeLoc =
4045 ComputeIP.isSet() ? LocationDescription(ComputeIP, Loc.DL) : Loc;
4046 updateToLocation(ComputeLoc);
4047
4048 ConstantInt *Zero = ConstantInt::get(IndVarTy, 0);
4049 ConstantInt *One = ConstantInt::get(IndVarTy, 1);
4050
4051 // Like Step, but always positive.
4052 Value *Incr = Step;
4053
4054 // Distance between Start and Stop; always positive.
4055 Value *Span;
4056
4057 // Condition checking whether no iterations are executed at all, e.g.
4058 // because UB < LB.
4059 Value *ZeroCmp;
4060
4061 if (IsSigned) {
4062 // Ensure that increment is positive. If not, negate and invert LB and UB.
4063 Value *IsNeg = Builder.CreateICmpSLT(Step, Zero);
4064 Incr = Builder.CreateSelect(IsNeg, Builder.CreateNeg(Step), Step);
4065 Value *LB = Builder.CreateSelect(IsNeg, Stop, Start);
4066 Value *UB = Builder.CreateSelect(IsNeg, Start, Stop);
4067 Span = Builder.CreateSub(UB, LB, "", false, true);
4068 ZeroCmp = Builder.CreateICmp(
4069 InclusiveStop ? CmpInst::ICMP_SLT : CmpInst::ICMP_SLE, UB, LB);
4070 } else {
4071 Span = Builder.CreateSub(Stop, Start, "", true);
4072 ZeroCmp = Builder.CreateICmp(
4073 InclusiveStop ? CmpInst::ICMP_ULT : CmpInst::ICMP_ULE, Stop, Start);
4074 }
4075
4076 Value *CountIfLooping;
4077 if (InclusiveStop) {
4078 CountIfLooping = Builder.CreateAdd(Builder.CreateUDiv(Span, Incr), One);
4079 } else {
4080 // Avoid incrementing past stop since it could overflow.
4081 Value *CountIfTwo = Builder.CreateAdd(
4082 Builder.CreateUDiv(Builder.CreateSub(Span, One), Incr), One);
4083 Value *OneCmp = Builder.CreateICmp(CmpInst::ICMP_ULE, Span, Incr);
4084 CountIfLooping = Builder.CreateSelect(OneCmp, One, CountIfTwo);
4085 }
4086 Value *TripCount = Builder.CreateSelect(ZeroCmp, Zero, CountIfLooping,
4087 "omp_" + Name + ".tripcount");
4088
4089 auto BodyGen = [=](InsertPointTy CodeGenIP, Value *IV) {
4090 Builder.restoreIP(CodeGenIP);
4091 Value *Span = Builder.CreateMul(IV, Step);
4092 Value *IndVar = Builder.CreateAdd(Span, Start);
4093 return BodyGenCB(Builder.saveIP(), IndVar);
4094 };
4095 LocationDescription LoopLoc = ComputeIP.isSet() ? Loc.IP : Builder.saveIP();
4096 return createCanonicalLoop(LoopLoc, BodyGen, TripCount, Name);
4097}
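//
// Worked example for the inclusive, signed case DO I = 1, 100, 50 quoted
// above: Incr = 50, Span = 100 - 1 = 99, so
// TripCount = 99 / 50 + 1 = 2 and the body runs for I = 1 and I = 51; the
// overflowing value 101 is never materialized.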
4098
4099// Returns an LLVM function to call for initializing loop bounds using OpenMP
4100// static scheduling depending on `type`. Only i32 and i64 are supported by the
4101// runtime. Always interpret integers as unsigned similarly to
4102// CanonicalLoopInfo.
4103static FunctionCallee getKmpcForStaticInitForType(Type *Ty, Module &M,
4104 OpenMPIRBuilder &OMPBuilder) {
4105 unsigned Bitwidth = Ty->getIntegerBitWidth();
4106 if (Bitwidth == 32)
4107 return OMPBuilder.getOrCreateRuntimeFunction(
4108 M, omp::RuntimeFunction::OMPRTL___kmpc_for_static_init_4u);
4109 if (Bitwidth == 64)
4110 return OMPBuilder.getOrCreateRuntimeFunction(
4111 M, omp::RuntimeFunction::OMPRTL___kmpc_for_static_init_8u);
4112 llvm_unreachable("unknown OpenMP loop iterator bitwidth");
4113}
4114
4115OpenMPIRBuilder::InsertPointOrErrorTy
4116OpenMPIRBuilder::applyStaticWorkshareLoop(DebugLoc DL, CanonicalLoopInfo *CLI,
4117 InsertPointTy AllocaIP,
4118 bool NeedsBarrier) {
4119 assert(CLI->isValid() && "Requires a valid canonical loop");
4120 assert(!isConflictIP(AllocaIP, CLI->getPreheaderIP()) &&
4121 "Require dedicated allocate IP");
4122
4123 // Set up the source location value for OpenMP runtime.
4124 Builder.restoreIP(CLI->getPreheaderIP());
4125 Builder.SetCurrentDebugLocation(DL);
4126
4127 uint32_t SrcLocStrSize;
4128 Constant *SrcLocStr = getOrCreateSrcLocStr(DL, SrcLocStrSize);
4129 Value *SrcLoc = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
4130
4131 // Declare useful OpenMP runtime functions.
4132 Value *IV = CLI->getIndVar();
4133 Type *IVTy = IV->getType();
4134 FunctionCallee StaticInit = getKmpcForStaticInitForType(IVTy, M, *this);
4135 FunctionCallee StaticFini =
4136 getOrCreateRuntimeFunction(M, omp::OMPRTL___kmpc_for_static_fini);
4137
4138 // Allocate space for computed loop bounds as expected by the "init" function.
4139 Builder.SetInsertPoint(AllocaIP.getBlock()->getFirstNonPHIOrDbgOrAlloca());
4140
4141 Type *I32Type = Type::getInt32Ty(M.getContext());
4142 Value *PLastIter = Builder.CreateAlloca(I32Type, nullptr, "p.lastiter");
4143 Value *PLowerBound = Builder.CreateAlloca(IVTy, nullptr, "p.lowerbound");
4144 Value *PUpperBound = Builder.CreateAlloca(IVTy, nullptr, "p.upperbound");
4145 Value *PStride = Builder.CreateAlloca(IVTy, nullptr, "p.stride");
4146
4147 // At the end of the preheader, prepare for calling the "init" function by
4148 // storing the current loop bounds into the allocated space. A canonical loop
4149 // always iterates from 0 to trip-count with step 1. Note that "init" expects
4150 // and produces an inclusive upper bound.
4151 Builder.SetInsertPoint(CLI->getPreheader()->getTerminator());
4152 Constant *Zero = ConstantInt::get(IVTy, 0);
4153 Constant *One = ConstantInt::get(IVTy, 1);
4154 Builder.CreateStore(Zero, PLowerBound);
4155 Value *UpperBound = Builder.CreateSub(CLI->getTripCount(), One);
4156 Builder.CreateStore(UpperBound, PUpperBound);
4157 Builder.CreateStore(One, PStride);
4158
4159 Value *ThreadNum = getOrCreateThreadID(SrcLoc);
4160
4161 Constant *SchedulingType = ConstantInt::get(
4162 I32Type, static_cast<int>(OMPScheduleType::UnorderedStatic));
4163
4164 // Call the "init" function and update the trip count of the loop with the
4165 // value it produced.
4166 Builder.CreateCall(StaticInit,
4167 {SrcLoc, ThreadNum, SchedulingType, PLastIter, PLowerBound,
4168 PUpperBound, PStride, One, Zero});
4169 Value *LowerBound = Builder.CreateLoad(IVTy, PLowerBound);
4170 Value *InclusiveUpperBound = Builder.CreateLoad(IVTy, PUpperBound);
4171 Value *TripCountMinusOne = Builder.CreateSub(InclusiveUpperBound, LowerBound);
4172 Value *TripCount = Builder.CreateAdd(TripCountMinusOne, One);
4173 CLI->setTripCount(TripCount);
4174
4175 // Update all uses of the induction variable except the one in the condition
4176 // block that compares it with the actual upper bound, and the increment in
4177 // the latch block.
4178
4179 CLI->mapIndVar([&](Instruction *OldIV) -> Value * {
4180 Builder.SetInsertPoint(CLI->getBody(),
4181 CLI->getBody()->getFirstInsertionPt());
4182 Builder.SetCurrentDebugLocation(DL);
4183 return Builder.CreateAdd(OldIV, LowerBound);
4184 });
4185
4186 // In the "exit" block, call the "fini" function.
4187 Builder.SetInsertPoint(CLI->getExit(),
4188 CLI->getExit()->getTerminator()->getIterator());
4189 Builder.CreateCall(StaticFini, {SrcLoc, ThreadNum});
4190
4191 // Add the barrier if requested.
4192 if (NeedsBarrier) {
4193 InsertPointOrErrorTy BarrierIP =
4194 createBarrier(LocationDescription(Builder.saveIP(), DL),
4195 omp::Directive::OMPD_for, /* ForceSimpleCall */ false,
4196 /* CheckCancelFlag */ false);
4197 if (!BarrierIP)
4198 return BarrierIP.takeError();
4199 }
4200
4201 InsertPointTy AfterIP = CLI->getAfterIP();
4202 CLI->invalidate();
4203
4204 return AfterIP;
4205}
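//
// After this transformation the loop is bracketed by the static-schedule
// runtime calls, schematically:
//
//   __kmpc_for_static_init_{4u|8u}(loc, tid, /*sched=*/static, &last, &lb,
//                                  &ub, &stride, 1, 0);
//   for (iv = 0; iv < ub - lb + 1; ++iv) body(iv + lb);
//   __kmpc_for_static_fini(loc, tid);
//   // plus an optional barrier when NeedsBarrier is set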
4206
4207OpenMPIRBuilder::InsertPointOrErrorTy
4208OpenMPIRBuilder::applyStaticChunkedWorkshareLoop(DebugLoc DL,
4209 CanonicalLoopInfo *CLI,
4210 InsertPointTy AllocaIP,
4211 bool NeedsBarrier,
4212 Value *ChunkSize) {
4213 assert(CLI->isValid() && "Requires a valid canonical loop");
4214 assert(ChunkSize && "Chunk size is required");
4215
4216 LLVMContext &Ctx = CLI->getFunction()->getContext();
4217 Value *IV = CLI->getIndVar();
4218 Value *OrigTripCount = CLI->getTripCount();
4219 Type *IVTy = IV->getType();
4220 assert(IVTy->getIntegerBitWidth() <= 64 &&
4221 "Max supported tripcount bitwidth is 64 bits");
4222 Type *InternalIVTy = IVTy->getIntegerBitWidth() <= 32 ? Type::getInt32Ty(Ctx)
4223 : Type::getInt64Ty(Ctx);
4224 Type *I32Type = Type::getInt32Ty(M.getContext());
4225 Constant *Zero = ConstantInt::get(InternalIVTy, 0);
4226 Constant *One = ConstantInt::get(InternalIVTy, 1);
4227
4228 // Declare useful OpenMP runtime functions.
4229 FunctionCallee StaticInit =
4230 getKmpcForStaticInitForType(InternalIVTy, M, *this);
4231 FunctionCallee StaticFini =
4232 getOrCreateRuntimeFunction(M, omp::OMPRTL___kmpc_for_static_fini);
4233
4234 // Allocate space for computed loop bounds as expected by the "init" function.
4235 Builder.restoreIP(AllocaIP);
4236 Builder.SetCurrentDebugLocation(DL);
4237 Value *PLastIter = Builder.CreateAlloca(I32Type, nullptr, "p.lastiter");
4238 Value *PLowerBound =
4239 Builder.CreateAlloca(InternalIVTy, nullptr, "p.lowerbound");
4240 Value *PUpperBound =
4241 Builder.CreateAlloca(InternalIVTy, nullptr, "p.upperbound");
4242 Value *PStride = Builder.CreateAlloca(InternalIVTy, nullptr, "p.stride");
4243
4244 // Set up the source location value for the OpenMP runtime.
4245 Builder.restoreIP(CLI->getPreheaderIP());
4246 Builder.SetCurrentDebugLocation(DL);
4247
4248 // TODO: Detect overflow in ubsan or max-out with current tripcount.
4249 Value *CastedChunkSize =
4250 Builder.CreateZExtOrTrunc(ChunkSize, InternalIVTy, "chunksize");
4251 Value *CastedTripCount =
4252 Builder.CreateZExt(OrigTripCount, InternalIVTy, "tripcount");
4253
4254 Constant *SchedulingType = ConstantInt::get(
4255 I32Type, static_cast<int>(OMPScheduleType::UnorderedStaticChunked));
4256 Builder.CreateStore(Zero, PLowerBound);
4257 Value *OrigUpperBound = Builder.CreateSub(CastedTripCount, One);
4258 Builder.CreateStore(OrigUpperBound, PUpperBound);
4259 Builder.CreateStore(One, PStride);
4260
4261 // Call the "init" function and update the trip count of the loop with the
4262 // value it produced.
4263 uint32_t SrcLocStrSize;
4264 Constant *SrcLocStr = getOrCreateSrcLocStr(DL, SrcLocStrSize);
4265 Value *SrcLoc = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
4266 Value *ThreadNum = getOrCreateThreadID(SrcLoc);
4267 Builder.CreateCall(StaticInit,
4268 {/*loc=*/SrcLoc, /*global_tid=*/ThreadNum,
4269 /*schedtype=*/SchedulingType, /*plastiter=*/PLastIter,
4270 /*plower=*/PLowerBound, /*pupper=*/PUpperBound,
4271 /*pstride=*/PStride, /*incr=*/One,
4272 /*chunk=*/CastedChunkSize});
4273
4274 // Load values written by the "init" function.
4275 Value *FirstChunkStart =
4276 Builder.CreateLoad(InternalIVTy, PLowerBound, "omp_firstchunk.lb");
4277 Value *FirstChunkStop =
4278 Builder.CreateLoad(InternalIVTy, PUpperBound, "omp_firstchunk.ub");
4279 Value *FirstChunkEnd = Builder.CreateAdd(FirstChunkStop, One);
4280 Value *ChunkRange =
4281 Builder.CreateSub(FirstChunkEnd, FirstChunkStart, "omp_chunk.range");
4282 Value *NextChunkStride =
4283 Builder.CreateLoad(InternalIVTy, PStride, "omp_dispatch.stride");
4284
4285 // Create outer "dispatch" loop for enumerating the chunks.
4286 BasicBlock *DispatchEnter = splitBB(Builder, true);
4287 Value *DispatchCounter;
4288
4289 // It is safe to assume this didn't return an error because the callback
4290 // passed into createCanonicalLoop is the only possible error source, and it
4291 // always returns success.
4292 CanonicalLoopInfo *DispatchCLI = cantFail(createCanonicalLoop(
4293 {Builder.saveIP(), DL},
4294 [&](InsertPointTy BodyIP, Value *Counter) {
4295 DispatchCounter = Counter;
4296 return Error::success();
4297 },
4298 FirstChunkStart, CastedTripCount, NextChunkStride,
4299 /*IsSigned=*/false, /*InclusiveStop=*/false, /*ComputeIP=*/{},
4300 "dispatch"));
4301
4302 // Remember the BasicBlocks of the dispatch loop we need, then invalidate to
4303 // not have to preserve the canonical invariant.
4304 BasicBlock *DispatchBody = DispatchCLI->getBody();
4305 BasicBlock *DispatchLatch = DispatchCLI->getLatch();
4306 BasicBlock *DispatchExit = DispatchCLI->getExit();
4307 BasicBlock *DispatchAfter = DispatchCLI->getAfter();
4308 DispatchCLI->invalidate();
4309
4310 // Rewire the original loop to become the chunk loop inside the dispatch loop.
4311 redirectTo(DispatchAfter, CLI->getAfter(), DL);
4312 redirectTo(CLI->getExit(), DispatchLatch, DL);
4313 redirectTo(DispatchBody, DispatchEnter, DL);
4314
4315 // Prepare the prolog of the chunk loop.
4316 Builder.restoreIP(CLI->getPreheaderIP());
4317 Builder.SetCurrentDebugLocation(DL);
4318
4319 // Compute the number of iterations of the chunk loop.
4320 Builder.SetInsertPoint(CLI->getPreheader()->getTerminator());
4321 Value *ChunkEnd = Builder.CreateAdd(DispatchCounter, ChunkRange);
4322 Value *IsLastChunk =
4323 Builder.CreateICmpUGE(ChunkEnd, CastedTripCount, "omp_chunk.is_last");
4324 Value *CountUntilOrigTripCount =
4325 Builder.CreateSub(CastedTripCount, DispatchCounter);
4326 Value *ChunkTripCount = Builder.CreateSelect(
4327 IsLastChunk, CountUntilOrigTripCount, ChunkRange, "omp_chunk.tripcount");
4328 Value *BackcastedChunkTC =
4329 Builder.CreateTrunc(ChunkTripCount, IVTy, "omp_chunk.tripcount.trunc");
4330 CLI->setTripCount(BackcastedChunkTC);
4331
4332 // Update all uses of the induction variable except the one in the condition
4333 // block that compares it with the actual upper bound, and the increment in
4334 // the latch block.
4335 Value *BackcastedDispatchCounter =
4336 Builder.CreateTrunc(DispatchCounter, IVTy, "omp_dispatch.iv.trunc");
4337 CLI->mapIndVar([&](Instruction *) -> Value * {
4338 Builder.restoreIP(CLI->getBodyIP());
4339 return Builder.CreateAdd(IV, BackcastedDispatchCounter);
4340 });
4341
4342 // In the "exit" block, call the "fini" function.
4343 Builder.SetInsertPoint(DispatchExit, DispatchExit->getFirstInsertionPt());
4344 Builder.CreateCall(StaticFini, {SrcLoc, ThreadNum});
4345
4346 // Add the barrier if requested.
4347 if (NeedsBarrier) {
4348 InsertPointOrErrorTy AfterIP =
4349 createBarrier(LocationDescription(Builder.saveIP(), DL), OMPD_for,
4350 /*ForceSimpleCall=*/false, /*CheckCancelFlag=*/false);
4351 if (!AfterIP)
4352 return AfterIP.takeError();
4353 }
4354
4355#ifndef NDEBUG
4356 // Even though we currently do not support applying additional methods to it,
4357 // the chunk loop should remain a canonical loop.
4358 CLI->assertOK();
4359#endif
4360
4361 return InsertPointTy(DispatchAfter, DispatchAfter->getFirstInsertionPt());
4362}
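//
// The resulting loop nest is, schematically:
//
//   for (disp = lb; disp < tripcount; disp += stride)          // dispatch loop
//     for (iv = 0; iv < min(chunk_range, tripcount - disp); ++iv)
//       body(iv + disp);                                       // chunk loop
//
// where lb, stride and the first chunk's bounds are produced by the
// __kmpc_for_static_init call above.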
4363
4364// Returns an LLVM function to call for executing an OpenMP static worksharing
4365// for loop depending on `type`. Only i32 and i64 are supported by the runtime.
4366// Always interpret integers as unsigned similarly to CanonicalLoopInfo.
4367static FunctionCallee
4368getKmpcForStaticLoopForType(Type *Ty, OpenMPIRBuilder *OMPBuilder,
4369 WorksharingLoopType LoopType) {
4370 unsigned Bitwidth = Ty->getIntegerBitWidth();
4371 Module &M = OMPBuilder->M;
4372 switch (LoopType) {
4373 case WorksharingLoopType::ForStaticLoop:
4374 if (Bitwidth == 32)
4375 return OMPBuilder->getOrCreateRuntimeFunction(
4376 M, omp::RuntimeFunction::OMPRTL___kmpc_for_static_loop_4u);
4377 if (Bitwidth == 64)
4378 return OMPBuilder->getOrCreateRuntimeFunction(
4379 M, omp::RuntimeFunction::OMPRTL___kmpc_for_static_loop_8u);
4380 break;
4381 case WorksharingLoopType::DistributeStaticLoop:
4382 if (Bitwidth == 32)
4383 return OMPBuilder->getOrCreateRuntimeFunction(
4384 M, omp::RuntimeFunction::OMPRTL___kmpc_distribute_static_loop_4u);
4385 if (Bitwidth == 64)
4386 return OMPBuilder->getOrCreateRuntimeFunction(
4387 M, omp::RuntimeFunction::OMPRTL___kmpc_distribute_static_loop_8u);
4388 break;
4389 case WorksharingLoopType::DistributeForStaticLoop:
4390 if (Bitwidth == 32)
4391 return OMPBuilder->getOrCreateRuntimeFunction(
4392 M, omp::RuntimeFunction::OMPRTL___kmpc_distribute_for_static_loop_4u);
4393 if (Bitwidth == 64)
4394 return OMPBuilder->getOrCreateRuntimeFunction(
4395 M, omp::RuntimeFunction::OMPRTL___kmpc_distribute_for_static_loop_8u);
4396 break;
4397 }
4398 if (Bitwidth != 32 && Bitwidth != 64) {
4399 llvm_unreachable("Unknown OpenMP loop iterator bitwidth");
4400 }
4401 llvm_unreachable("Unknown type of OpenMP worksharing loop");
4402}
4403
4404// Inserts a call to the proper OpenMP device RTL function which handles
4405// loop worksharing.
4406static void createTargetLoopWorkshareCall(
4407 OpenMPIRBuilder *OMPBuilder, WorksharingLoopType LoopType,
4408 BasicBlock *InsertBlock, Value *Ident, Value *LoopBodyArg,
4409 Type *ParallelTaskPtr, Value *TripCount, Function &LoopBodyFn) {
4410 Type *TripCountTy = TripCount->getType();
4411 Module &M = OMPBuilder->M;
4412 IRBuilder<> &Builder = OMPBuilder->Builder;
4413 FunctionCallee RTLFn =
4414 getKmpcForStaticLoopForType(TripCountTy, OMPBuilder, LoopType);
4415 SmallVector<Value *, 8> RealArgs;
4416 RealArgs.push_back(Ident);
4417 RealArgs.push_back(Builder.CreateBitCast(&LoopBodyFn, ParallelTaskPtr));
4418 RealArgs.push_back(LoopBodyArg);
4419 RealArgs.push_back(TripCount);
4420 if (LoopType == WorksharingLoopType::DistributeStaticLoop) {
4421 RealArgs.push_back(ConstantInt::get(TripCountTy, 0));
4422 Builder.CreateCall(RTLFn, RealArgs);
4423 return;
4424 }
4425 FunctionCallee RTLNumThreads = OMPBuilder->getOrCreateRuntimeFunction(
4426 M, omp::RuntimeFunction::OMPRTL_omp_get_num_threads);
4427 Builder.restoreIP({InsertBlock, std::prev(InsertBlock->end())});
4428 Value *NumThreads = Builder.CreateCall(RTLNumThreads, {});
4429
4430 RealArgs.push_back(
4431 Builder.CreateZExtOrTrunc(NumThreads, TripCountTy, "num.threads.cast"));
4432 RealArgs.push_back(ConstantInt::get(TripCountTy, 0));
4433 if (LoopType == WorksharingLoopType::DistributeForStaticLoop) {
4434 RealArgs.push_back(ConstantInt::get(TripCountTy, 0));
4435 }
4436
4437 Builder.CreateCall(RTLFn, RealArgs);
4438}
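//
// For example, for a plain worksharing loop with a 32-bit trip count this
// emits, as a sketch (the trailing zero is the chunk argument expected by
// the device RTL):
//
//   num_threads = omp_get_num_threads();
//   __kmpc_for_static_loop_4u(ident, body_fn, body_args, tripcount,
//                             num_threads, 0);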
4439
4440static void
4441workshareLoopTargetCallback(OpenMPIRBuilder *OMPIRBuilder,
4442 CanonicalLoopInfo *CLI, Value *Ident,
4443 Function &OutlinedFn, Type *ParallelTaskPtr,
4444 const SmallVector<Instruction *, 4> &ToBeDeleted,
4445 WorksharingLoopType LoopType) {
4446 IRBuilder<> &Builder = OMPIRBuilder->Builder;
4447 BasicBlock *Preheader = CLI->getPreheader();
4448 Value *TripCount = CLI->getTripCount();
4449
4450 // After loop body outlining, the loop body contains only the setup of the
4451 // loop body argument structure and the call to the outlined loop body
4452 // function. First, we need to move the setup of the loop body args into
4453 // the loop preheader.
4454 Preheader->splice(std::prev(Preheader->end()), CLI->getBody(),
4455 CLI->getBody()->begin(), std::prev(CLI->getBody()->end()));
4456
4457 // The next step is to remove the whole loop; we do not need it anymore.
4458 // That's why we make an unconditional branch from the loop preheader to
4459 // the loop exit block.
4460 Builder.restoreIP({Preheader, Preheader->end()});
4461 Builder.SetCurrentDebugLocation(Preheader->getTerminator()->getDebugLoc());
4462 Preheader->getTerminator()->eraseFromParent();
4463 Builder.CreateBr(CLI->getExit());
4464
4465 // Delete dead loop blocks
4466 OpenMPIRBuilder::OutlineInfo CleanUpInfo;
4467 SmallPtrSet<BasicBlock *, 32> RegionBlockSet;
4468 SmallVector<BasicBlock *, 32> BlocksToBeRemoved;
4469 CleanUpInfo.EntryBB = CLI->getHeader();
4470 CleanUpInfo.ExitBB = CLI->getExit();
4471 CleanUpInfo.collectBlocks(RegionBlockSet, BlocksToBeRemoved);
4472 DeleteDeadBlocks(BlocksToBeRemoved);
4473
4474 // Find the instruction which corresponds to the loop body argument
4475 // structure and remove the call to the loop body function.
4476 Value *LoopBodyArg;
4477 User *OutlinedFnUser = OutlinedFn.getUniqueUndroppableUser();
4478 assert(OutlinedFnUser &&
4479 "Expected unique undroppable user of outlined function");
4480 CallInst *OutlinedFnCallInstruction = dyn_cast<CallInst>(OutlinedFnUser);
4481 assert(OutlinedFnCallInstruction && "Expected outlined function call");
4482 assert((OutlinedFnCallInstruction->getParent() == Preheader) &&
4483 "Expected outlined function call to be located in loop preheader");
4484 // Check in case no argument structure has been passed.
4485 if (OutlinedFnCallInstruction->arg_size() > 1)
4486 LoopBodyArg = OutlinedFnCallInstruction->getArgOperand(1);
4487 else
4488 LoopBodyArg = Constant::getNullValue(Builder.getPtrTy());
4489 OutlinedFnCallInstruction->eraseFromParent();
4490
4491 createTargetLoopWorkshareCall(OMPIRBuilder, LoopType, Preheader, Ident,
4492 LoopBodyArg, ParallelTaskPtr, TripCount,
4493 OutlinedFn);
4494
4495 for (auto &ToBeDeletedItem : ToBeDeleted)
4496 ToBeDeletedItem->eraseFromParent();
4497 CLI->invalidate();
4498}
4499
4500OpenMPIRBuilder::InsertPointTy
4501OpenMPIRBuilder::applyWorkshareLoopTarget(DebugLoc DL, CanonicalLoopInfo *CLI,
4502 InsertPointTy AllocaIP,
4503 WorksharingLoopType LoopType) {
4504 uint32_t SrcLocStrSize;
4505 Constant *SrcLocStr = getOrCreateSrcLocStr(DL, SrcLocStrSize);
4506 Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
4507
4508 OutlineInfo OI;
4509 OI.OuterAllocaBB = CLI->getPreheader();
4510 Function *OuterFn = CLI->getPreheader()->getParent();
4511
4512 // Instructions which need to be deleted at the end of code generation
4513 SmallVector<Instruction *, 4> ToBeDeleted;
4514
4515 OI.OuterAllocaBB = AllocaIP.getBlock();
4516
4517 // Mark the body loop as region which needs to be extracted
4518 OI.EntryBB = CLI->getBody();
4519 OI.ExitBB = CLI->getLatch()->splitBasicBlock(CLI->getLatch()->begin(),
4520 "omp.prelatch", true);
4521
4522 // Prepare loop body for extraction
4523 Builder.restoreIP({CLI->getPreheader(), CLI->getPreheader()->begin()});
4524
4525 // Insert new loop counter variable which will be used only in loop
4526 // body.
4527 AllocaInst *NewLoopCnt = Builder.CreateAlloca(CLI->getIndVarType(), 0, "");
4528 Instruction *NewLoopCntLoad =
4529 Builder.CreateLoad(CLI->getIndVarType(), NewLoopCnt);
4530 // New loop counter instructions are redundant in the loop preheader when
4531 // code generation for the workshare loop is finished. That's why we mark
4532 // them as ready for deletion.
4533 ToBeDeleted.push_back(NewLoopCntLoad);
4534 ToBeDeleted.push_back(NewLoopCnt);
4535
4536 // Analyse the loop body region: find all input variables which are used
4537 // inside the loop body region.
4538 SmallPtrSet<BasicBlock *, 32> ParallelRegionBlockSet;
4539 SmallVector<BasicBlock *, 32> Blocks;
4540 OI.collectBlocks(ParallelRegionBlockSet, Blocks);
4541 SmallVector<BasicBlock *, 32> BlocksT(ParallelRegionBlockSet.begin(),
4542 ParallelRegionBlockSet.end());
4543
4544 CodeExtractorAnalysisCache CEAC(*OuterFn);
4545 CodeExtractor Extractor(Blocks,
4546 /* DominatorTree */ nullptr,
4547 /* AggregateArgs */ true,
4548 /* BlockFrequencyInfo */ nullptr,
4549 /* BranchProbabilityInfo */ nullptr,
4550 /* AssumptionCache */ nullptr,
4551 /* AllowVarArgs */ true,
4552 /* AllowAlloca */ true,
4553 /* AllocationBlock */ CLI->getPreheader(),
4554 /* Suffix */ ".omp_wsloop",
4555 /* AggrArgsIn0AddrSpace */ true);
4556
4557 BasicBlock *CommonExit = nullptr;
4558 SetVector<Value *> Inputs, Outputs, SinkingCands, HoistingCands;
4559
4560 // Find allocas outside the loop body region which are used inside loop
4561 // body
4562 Extractor.findAllocas(CEAC, SinkingCands, HoistingCands, CommonExit);
4563
4564 // We need to model loop body region as the function f(cnt, loop_arg).
4565 // That's why we replace the loop induction variable with the new counter,
4566 // which will be one of the loop body function's arguments.
4567 SmallVector<User *> Users(CLI->getIndVar()->user_begin(),
4568 CLI->getIndVar()->user_end());
4569 for (auto Use : Users) {
4570 if (Instruction *Inst = dyn_cast<Instruction>(Use)) {
4571 if (ParallelRegionBlockSet.count(Inst->getParent())) {
4572 Inst->replaceUsesOfWith(CLI->getIndVar(), NewLoopCntLoad);
4573 }
4574 }
4575 }
4576 // Make sure that the loop counter variable is not merged into the loop body
4577 // function's argument structure and is passed as a separate variable.
4578 OI.ExcludeArgsFromAggregate.push_back(NewLoopCntLoad);
4579
4580 // The PostOutline callback is invoked once the loop body function has been
4581 // outlined and the loop body replaced by a call to it. We need to add a
4582 // call to the OpenMP device runtime inside the loop preheader; the runtime
4583 // function will handle the loop control logic.
4584 //
4585 OI.PostOutlineCB = [=, ToBeDeletedVec =
4586 std::move(ToBeDeleted)](Function &OutlinedFn) {
4587 workshareLoopTargetCallback(this, CLI, Ident, OutlinedFn, ParallelTaskPtr,
4588 ToBeDeletedVec, LoopType);
4589 };
4590 addOutlineInfo(std::move(OI));
4591 return CLI->getAfterIP();
4592}
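// The net effect of the two functions above is roughly the following
// (illustrative only; the actual runtime entry point is selected by
// createTargetLoopWorkshareCall based on LoopType and the induction
// variable width):
//
//   preheader:
//     call void @__kmpc_for_static_loop_4u(ptr %ident, ptr @body.omp_wsloop,
//                                          ptr %loop_arg, i32 %tripcount, ...)
//
// The outlined body function has the shape f(cnt, loop_arg); all loop control
// is delegated to the OpenMP device runtime.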
4593
4594OpenMPIRBuilder::InsertPointOrErrorTy OpenMPIRBuilder::applyWorkshareLoop(
4595 DebugLoc DL, CanonicalLoopInfo *CLI, InsertPointTy AllocaIP,
4596 bool NeedsBarrier, omp::ScheduleKind SchedKind, Value *ChunkSize,
4597 bool HasSimdModifier, bool HasMonotonicModifier,
4598 bool HasNonmonotonicModifier, bool HasOrderedClause,
4599 WorksharingLoopType LoopType) {
4600 if (Config.isTargetDevice())
4601 return applyWorkshareLoopTarget(DL, CLI, AllocaIP, LoopType);
4602 OMPScheduleType EffectiveScheduleType = computeOpenMPScheduleType(
4603 SchedKind, ChunkSize, HasSimdModifier, HasMonotonicModifier,
4604 HasNonmonotonicModifier, HasOrderedClause);
4605
4606 bool IsOrdered = (EffectiveScheduleType & OMPScheduleType::ModifierOrdered) ==
4607 OMPScheduleType::ModifierOrdered;
4608 switch (EffectiveScheduleType & ~OMPScheduleType::ModifierMask) {
4609 case OMPScheduleType::BaseStatic:
4610 assert(!ChunkSize && "No chunk size with static-chunked schedule");
4611 if (IsOrdered)
4612 return applyDynamicWorkshareLoop(DL, CLI, AllocaIP, EffectiveScheduleType,
4613 NeedsBarrier, ChunkSize);
4614 // FIXME: Monotonicity ignored?
4615 return applyStaticWorkshareLoop(DL, CLI, AllocaIP, NeedsBarrier);
4616
4617 case OMPScheduleType::BaseStaticChunked:
4618 if (IsOrdered)
4619 return applyDynamicWorkshareLoop(DL, CLI, AllocaIP, EffectiveScheduleType,
4620 NeedsBarrier, ChunkSize);
4621 // FIXME: Monotonicity ignored?
4622 return applyStaticChunkedWorkshareLoop(DL, CLI, AllocaIP, NeedsBarrier,
4623 ChunkSize);
4624
4625 case OMPScheduleType::BaseRuntime:
4626 case OMPScheduleType::BaseAuto:
4627 case OMPScheduleType::BaseGreedy:
4628 case OMPScheduleType::BaseBalanced:
4629 case OMPScheduleType::BaseSteal:
4630 case OMPScheduleType::BaseGuidedSimd:
4631 case OMPScheduleType::BaseRuntimeSimd:
4632 assert(!ChunkSize &&
4633 "schedule type does not support user-defined chunk sizes");
4634 [[fallthrough]];
4635 case OMPScheduleType::BaseDynamicChunked:
4636 case OMPScheduleType::BaseGuidedChunked:
4637 case OMPScheduleType::BaseGuidedIterativeChunked:
4638 case OMPScheduleType::BaseGuidedAnalyticalChunked:
4639 case OMPScheduleType::BaseStaticBalancedChunked:
4640 return applyDynamicWorkshareLoop(DL, CLI, AllocaIP, EffectiveScheduleType,
4641 NeedsBarrier, ChunkSize);
4642
4643 default:
4644 llvm_unreachable("Unknown/unimplemented schedule kind");
4645 }
4646}
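// For reference, the dispatch above realizes, e.g. (host codegen; mapping is
// a sketch derived from the switch, not an exhaustive list):
//
//   #pragma omp for schedule(static)         -> applyStaticWorkshareLoop
//   #pragma omp for schedule(static, 4)      -> applyStaticChunkedWorkshareLoop
//   #pragma omp for schedule(dynamic, 4)     -> applyDynamicWorkshareLoop
//   #pragma omp for ordered schedule(static) -> applyDynamicWorkshareLoop
//
// Any ordered clause forces the dynamic code path, since the runtime's
// dispatch interface is what implements cross-iteration ordering.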
4647
4648/// Returns an LLVM function to call for initializing loop bounds using OpenMP
4649/// dynamic scheduling depending on `type`. Only i32 and i64 are supported by
4650/// the runtime. Always interpret integers as unsigned similarly to
4651/// CanonicalLoopInfo.
4652static FunctionCallee
4653getKmpcForDynamicInitForType(Type *Ty, Module &M, OpenMPIRBuilder &OMPBuilder) {
4654 unsigned Bitwidth = Ty->getIntegerBitWidth();
4655 if (Bitwidth == 32)
4656 return OMPBuilder.getOrCreateRuntimeFunction(
4657 M, omp::RuntimeFunction::OMPRTL___kmpc_dispatch_init_4u);
4658 if (Bitwidth == 64)
4659 return OMPBuilder.getOrCreateRuntimeFunction(
4660 M, omp::RuntimeFunction::OMPRTL___kmpc_dispatch_init_8u);
4661 llvm_unreachable("unknown OpenMP loop iterator bitwidth");
4662}
4663
4664/// Returns an LLVM function to call for updating the next loop using OpenMP
4665/// dynamic scheduling depending on `type`. Only i32 and i64 are supported by
4666/// the runtime. Always interpret integers as unsigned similarly to
4667/// CanonicalLoopInfo.
4668static FunctionCallee
4669getKmpcForDynamicNextForType(Type *Ty, Module &M, OpenMPIRBuilder &OMPBuilder) {
4670 unsigned Bitwidth = Ty->getIntegerBitWidth();
4671 if (Bitwidth == 32)
4672 return OMPBuilder.getOrCreateRuntimeFunction(
4673 M, omp::RuntimeFunction::OMPRTL___kmpc_dispatch_next_4u);
4674 if (Bitwidth == 64)
4675 return OMPBuilder.getOrCreateRuntimeFunction(
4676 M, omp::RuntimeFunction::OMPRTL___kmpc_dispatch_next_8u);
4677 llvm_unreachable("unknown OpenMP loop iterator bitwidth");
4678}
4679
4680/// Returns an LLVM function to call for finalizing the dynamic loop,
4681/// depending on `type`. Only i32 and i64 are supported by the runtime. Always
4682/// interpret integers as unsigned similarly to CanonicalLoopInfo.
4683static FunctionCallee
4684getKmpcForDynamicFiniForType(Type *Ty, Module &M, OpenMPIRBuilder &OMPBuilder) {
4685 unsigned Bitwidth = Ty->getIntegerBitWidth();
4686 if (Bitwidth == 32)
4687 return OMPBuilder.getOrCreateRuntimeFunction(
4688 M, omp::RuntimeFunction::OMPRTL___kmpc_dispatch_fini_4u);
4689 if (Bitwidth == 64)
4690 return OMPBuilder.getOrCreateRuntimeFunction(
4691 M, omp::RuntimeFunction::OMPRTL___kmpc_dispatch_fini_8u);
4692 llvm_unreachable("unknown OpenMP loop iterator bitwidth");
4693}
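// Taken together, these helpers bind the runtime's dispatch protocol that
// applyDynamicWorkshareLoop (below) emits. In rough C terms (illustrative
// names, 32-bit case):
//
//   __kmpc_dispatch_init_4u(loc, tid, sched, lb, ub, stride, chunk);
//   while (__kmpc_dispatch_next_4u(loc, tid, &last, &lb, &ub, &stride)) {
//     for (uint32_t iv = lb; iv <= ub; ++iv)   // inclusive upper bound
//       body(iv);
//     // with the ordered modifier: __kmpc_dispatch_fini_4u(loc, tid)
//   }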
4694
4695OpenMPIRBuilder::InsertPointOrErrorTy
4696OpenMPIRBuilder::applyDynamicWorkshareLoop(DebugLoc DL, CanonicalLoopInfo *CLI,
4697 InsertPointTy AllocaIP,
4698 OMPScheduleType SchedType,
4699 bool NeedsBarrier, Value *Chunk) {
4700 assert(CLI->isValid() && "Requires a valid canonical loop");
4701 assert(!isConflictIP(AllocaIP, CLI->getPreheaderIP()) &&
4702 "Require dedicated allocate IP");
4704 "Require valid schedule type");
4705
4706 bool Ordered = (SchedType & OMPScheduleType::ModifierOrdered) ==
4707 OMPScheduleType::ModifierOrdered;
4708
4709 // Set up the source location value for OpenMP runtime.
4710 Builder.SetCurrentDebugLocation(DL);
4711
4712 uint32_t SrcLocStrSize;
4713 Constant *SrcLocStr = getOrCreateSrcLocStr(DL, SrcLocStrSize);
4714 Value *SrcLoc = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
4715
4716 // Declare useful OpenMP runtime functions.
4717 Value *IV = CLI->getIndVar();
4718 Type *IVTy = IV->getType();
4719 FunctionCallee DynamicInit = getKmpcForDynamicInitForType(IVTy, M, *this);
4720 FunctionCallee DynamicNext = getKmpcForDynamicNextForType(IVTy, M, *this);
4721
4722 // Allocate space for computed loop bounds as expected by the "init" function.
4723 Builder.SetInsertPoint(AllocaIP.getBlock()->getFirstNonPHIOrDbgOrAlloca());
4724 Type *I32Type = Type::getInt32Ty(M.getContext());
4725 Value *PLastIter = Builder.CreateAlloca(I32Type, nullptr, "p.lastiter");
4726 Value *PLowerBound = Builder.CreateAlloca(IVTy, nullptr, "p.lowerbound");
4727 Value *PUpperBound = Builder.CreateAlloca(IVTy, nullptr, "p.upperbound");
4728 Value *PStride = Builder.CreateAlloca(IVTy, nullptr, "p.stride");
4729
4730 // At the end of the preheader, prepare for calling the "init" function by
4731 // storing the current loop bounds into the allocated space. A canonical loop
4732 // always iterates from 0 to trip-count with step 1. Note that "init" expects
4733 // and produces an inclusive upper bound.
4734 BasicBlock *PreHeader = CLI->getPreheader();
4735 Builder.SetInsertPoint(PreHeader->getTerminator());
4736 Constant *One = ConstantInt::get(IVTy, 1);
4737 Builder.CreateStore(One, PLowerBound);
4738 Value *UpperBound = CLI->getTripCount();
4739 Builder.CreateStore(UpperBound, PUpperBound);
4740 Builder.CreateStore(One, PStride);
4741
4742 BasicBlock *Header = CLI->getHeader();
4743 BasicBlock *Exit = CLI->getExit();
4744 BasicBlock *Cond = CLI->getCond();
4745 BasicBlock *Latch = CLI->getLatch();
4746 InsertPointTy AfterIP = CLI->getAfterIP();
4747
4748 // The CLI will be "broken" in the code below, as the loop is no longer
4749 // a valid canonical loop.
4750
4751 if (!Chunk)
4752 Chunk = One;
4753
4754 Value *ThreadNum = getOrCreateThreadID(SrcLoc);
4755
4756 Constant *SchedulingType =
4757 ConstantInt::get(I32Type, static_cast<int>(SchedType));
4758
4759 // Call the "init" function.
4760 Builder.CreateCall(DynamicInit,
4761 {SrcLoc, ThreadNum, SchedulingType, /* LowerBound */ One,
4762 UpperBound, /* step */ One, Chunk});
4763
4764 // An outer loop around the existing one.
4765 BasicBlock *OuterCond = BasicBlock::Create(
4766 PreHeader->getContext(), Twine(PreHeader->getName()) + ".outer.cond",
4767 PreHeader->getParent());
4768 // This needs to be 32-bit always, so can't use the IVTy Zero above.
4769 Builder.SetInsertPoint(OuterCond, OuterCond->getFirstInsertionPt());
4770 Value *Res =
4771 Builder.CreateCall(DynamicNext, {SrcLoc, ThreadNum, PLastIter,
4772 PLowerBound, PUpperBound, PStride});
4773 Constant *Zero32 = ConstantInt::get(I32Type, 0);
4774 Value *MoreWork = Builder.CreateCmp(CmpInst::ICMP_NE, Res, Zero32);
4775 Value *LowerBound =
4776 Builder.CreateSub(Builder.CreateLoad(IVTy, PLowerBound), One, "lb");
4777 Builder.CreateCondBr(MoreWork, Header, Exit);
4778
4779 // Change PHI-node in loop header to use outer cond rather than preheader,
4780 // and set IV to the LowerBound.
4781 Instruction *Phi = &Header->front();
4782 auto *PI = cast<PHINode>(Phi);
4783 PI->setIncomingBlock(0, OuterCond);
4784 PI->setIncomingValue(0, LowerBound);
4785
4786 // Then set the pre-header to jump to the OuterCond
4787 Instruction *Term = PreHeader->getTerminator();
4788 auto *Br = cast<BranchInst>(Term);
4789 Br->setSuccessor(0, OuterCond);
4790
4791 // Modify the inner condition:
4792 // * Use the UpperBound returned from the DynamicNext call.
4793 // * Jump to the outer loop when done with one of the inner loops.
4794 Builder.SetInsertPoint(Cond, Cond->getFirstInsertionPt());
4795 UpperBound = Builder.CreateLoad(IVTy, PUpperBound, "ub");
4796 Instruction *Comp = &*Builder.GetInsertPoint();
4797 auto *CI = cast<CmpInst>(Comp);
4798 CI->setOperand(1, UpperBound);
4799 // Redirect the inner exit to branch to outer condition.
4800 Instruction *Branch = &Cond->back();
4801 auto *BI = cast<BranchInst>(Branch);
4802 assert(BI->getSuccessor(1) == Exit);
4803 BI->setSuccessor(1, OuterCond);
4804
4805 // Call the "fini" function if "ordered" is present in wsloop directive.
4806 if (Ordered) {
4807 Builder.SetInsertPoint(&Latch->back());
4808 FunctionCallee DynamicFini = getKmpcForDynamicFiniForType(IVTy, M, *this);
4809 Builder.CreateCall(DynamicFini, {SrcLoc, ThreadNum});
4810 }
4811
4812 // Add the barrier if requested.
4813 if (NeedsBarrier) {
4814 Builder.SetInsertPoint(&Exit->back());
4815 InsertPointOrErrorTy BarrierIP =
4816 createBarrier(LocationDescription(Builder.saveIP(), DL),
4817 omp::Directive::OMPD_for, /* ForceSimpleCall */ false,
4818 /* CheckCancelFlag */ false);
4819 if (!BarrierIP)
4820 return BarrierIP.takeError();
4821 }
4822
4823 CLI->invalidate();
4824 return AfterIP;
4825}
4826
4827/// Redirect all edges that branch to \p OldTarget to \p NewTarget. That is,
4828/// after this \p OldTarget will be orphaned.
4829static void redirectAllPredecessorsTo(BasicBlock *OldTarget,
4830 BasicBlock *NewTarget, DebugLoc DL) {
4831 for (BasicBlock *Pred : make_early_inc_range(predecessors(OldTarget)))
4832 redirectTo(Pred, NewTarget, DL);
4833}
4834
4835/// Determine which blocks in \p BBs are reachable from outside and remove the
4836/// ones that are not reachable from the function.
4837static void removeUnusedBlocksFromParent(ArrayRef<BasicBlock *> BBs) {
4838 SmallPtrSet<BasicBlock *, 6> BBsToErase{BBs.begin(), BBs.end()};
4839 auto HasRemainingUses = [&BBsToErase](BasicBlock *BB) {
4840 for (Use &U : BB->uses()) {
4841 auto *UseInst = dyn_cast<Instruction>(U.getUser());
4842 if (!UseInst)
4843 continue;
4844 if (BBsToErase.count(UseInst->getParent()))
4845 continue;
4846 return true;
4847 }
4848 return false;
4849 };
4850
4851 while (BBsToErase.remove_if(HasRemainingUses)) {
4852 // Try again if anything was removed.
4853 }
4854
4855 SmallVector<BasicBlock *, 7> BBVec(BBsToErase.begin(), BBsToErase.end());
4856 DeleteDeadBlocks(BBVec);
4857}
4858
4859CanonicalLoopInfo *
4860OpenMPIRBuilder::collapseLoops(DebugLoc DL, ArrayRef<CanonicalLoopInfo *> Loops,
4861 InsertPointTy ComputeIP) {
4862 assert(Loops.size() >= 1 && "At least one loop required");
4863 size_t NumLoops = Loops.size();
4864
4865 // Nothing to do if there is already just one loop.
4866 if (NumLoops == 1)
4867 return Loops.front();
4868
4869 CanonicalLoopInfo *Outermost = Loops.front();
4870 CanonicalLoopInfo *Innermost = Loops.back();
4871 BasicBlock *OrigPreheader = Outermost->getPreheader();
4872 BasicBlock *OrigAfter = Outermost->getAfter();
4873 Function *F = OrigPreheader->getParent();
4874
4875 // Loop control blocks that may become orphaned later.
4876 SmallVector<BasicBlock *, 12> OldControlBBs;
4877 OldControlBBs.reserve(6 * Loops.size());
4878 for (CanonicalLoopInfo *Loop : Loops)
4879 Loop->collectControlBlocks(OldControlBBs);
4880
4881 // Setup the IRBuilder for inserting the trip count computation.
4882 Builder.SetCurrentDebugLocation(DL);
4883 if (ComputeIP.isSet())
4884 Builder.restoreIP(ComputeIP);
4885 else
4886 Builder.restoreIP(Outermost->getPreheaderIP());
4887
4888 // Derive the collapsed loop's trip count.
4889 // TODO: Find common/largest indvar type.
4890 Value *CollapsedTripCount = nullptr;
4891 for (CanonicalLoopInfo *L : Loops) {
4892 assert(L->isValid() &&
4893 "All loops to collapse must be valid canonical loops");
4894 Value *OrigTripCount = L->getTripCount();
4895 if (!CollapsedTripCount) {
4896 CollapsedTripCount = OrigTripCount;
4897 continue;
4898 }
4899
4900 // TODO: Enable UndefinedSanitizer to diagnose an overflow here.
4901 CollapsedTripCount = Builder.CreateMul(CollapsedTripCount, OrigTripCount,
4902 {}, /*HasNUW=*/true);
4903 }
4904
4905 // Create the collapsed loop control flow.
4906 CanonicalLoopInfo *Result =
4907 createLoopSkeleton(DL, CollapsedTripCount, F,
4908 OrigPreheader->getNextNode(), OrigAfter, "collapsed");
4909
4910 // Build the collapsed loop body code.
4911 // Start with deriving the input loop induction variables from the collapsed
4912 // one, using a divmod scheme. To preserve the original loops' order, the
4913 // innermost loop uses the least significant bits.
4914 Builder.restoreIP(Result->getBodyIP());
4915
4916 Value *Leftover = Result->getIndVar();
4917 SmallVector<Value *> NewIndVars;
4918 NewIndVars.resize(NumLoops);
4919 for (int i = NumLoops - 1; i >= 1; --i) {
4920 Value *OrigTripCount = Loops[i]->getTripCount();
4921
4922 Value *NewIndVar = Builder.CreateURem(Leftover, OrigTripCount);
4923 NewIndVars[i] = NewIndVar;
4924
4925 Leftover = Builder.CreateUDiv(Leftover, OrigTripCount);
4926 }
4927 // Outermost loop gets all the remaining bits.
4928 NewIndVars[0] = Leftover;
4929
4930 // Construct the loop body control flow.
4931 // We progressively construct the branch structure following the direction of
4932 // the control flow, from the leading in-between code, the loop nest body, the
4933 // trailing in-between code, and rejoining the collapsed loop's latch.
4934 // ContinueBlock and ContinuePred keep track of the source(s) of the next edge. If
4935 // the ContinueBlock is set, continue with that block. If ContinuePred, use
4936 // its predecessors as sources.
4937 BasicBlock *ContinueBlock = Result->getBody();
4938 BasicBlock *ContinuePred = nullptr;
4939 auto ContinueWith = [&ContinueBlock, &ContinuePred, DL](BasicBlock *Dest,
4940 BasicBlock *NextSrc) {
4941 if (ContinueBlock)
4942 redirectTo(ContinueBlock, Dest, DL);
4943 else
4944 redirectAllPredecessorsTo(ContinuePred, Dest, DL);
4945
4946 ContinueBlock = nullptr;
4947 ContinuePred = NextSrc;
4948 };
4949
4950 // The code before the nested loop of each level.
4951 // Because we are sinking it into the nest, it will be executed more often
4952 // than the original loop. More sophisticated schemes could keep track of what
4953 // the in-between code is and instantiate it only once per thread.
4954 for (size_t i = 0; i < NumLoops - 1; ++i)
4955 ContinueWith(Loops[i]->getBody(), Loops[i + 1]->getHeader());
4956
4957 // Connect the loop nest body.
4958 ContinueWith(Innermost->getBody(), Innermost->getLatch());
4959
4960 // The code after the nested loop at each level.
4961 for (size_t i = NumLoops - 1; i > 0; --i)
4962 ContinueWith(Loops[i]->getAfter(), Loops[i - 1]->getLatch());
4963
4964 // Connect the finished loop to the collapsed loop latch.
4965 ContinueWith(Result->getLatch(), nullptr);
4966
4967 // Replace the input loops with the new collapsed loop.
4968 redirectTo(Outermost->getPreheader(), Result->getPreheader(), DL);
4969 redirectTo(Result->getAfter(), Outermost->getAfter(), DL);
4970
4971 // Replace the input loop indvars with the derived ones.
4972 for (size_t i = 0; i < NumLoops; ++i)
4973 Loops[i]->getIndVar()->replaceAllUsesWith(NewIndVars[i]);
4974
4975 // Remove unused parts of the input loops.
4976 removeUnusedBlocksFromParent(OldControlBBs);
4977
4978 for (CanonicalLoopInfo *L : Loops)
4979 L->invalidate();
4980
4981#ifndef NDEBUG
4982 Result->assertOK();
4983#endif
4984 return Result;
4985}
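// For example, collapsing two canonical loops with trip counts %tc0 and %tc1
// produces (illustrative IR):
//
//   %tc  = mul nuw i32 %tc0, %tc1      ; collapsed trip count
//   ...
//   %iv1 = urem i32 %iv, %tc1          ; innermost: least significant bits
//   %iv0 = udiv i32 %iv, %tc1          ; outermost: remaining bits
//
// after which the original induction variables are replaced by %iv0 and %iv1.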
4986
4987std::vector<CanonicalLoopInfo *>
4988OpenMPIRBuilder::tileLoops(DebugLoc DL, ArrayRef<CanonicalLoopInfo *> Loops,
4989 ArrayRef<Value *> TileSizes) {
4990 assert(TileSizes.size() == Loops.size() &&
4991 "Must pass as many tile sizes as there are loops");
4992 int NumLoops = Loops.size();
4993 assert(NumLoops >= 1 && "At least one loop to tile required");
4994
4995 CanonicalLoopInfo *OutermostLoop = Loops.front();
4996 CanonicalLoopInfo *InnermostLoop = Loops.back();
4997 Function *F = OutermostLoop->getBody()->getParent();
4998 BasicBlock *InnerEnter = InnermostLoop->getBody();
4999 BasicBlock *InnerLatch = InnermostLoop->getLatch();
5000
5001 // Loop control blocks that may become orphaned later.
5002 SmallVector<BasicBlock *, 12> OldControlBBs;
5003 OldControlBBs.reserve(6 * Loops.size());
5004 for (CanonicalLoopInfo *Loop : Loops)
5005 Loop->collectControlBlocks(OldControlBBs);
5006
5007 // Collect original trip counts and induction variable to be accessible by
5008 // index. Also, the structure of the original loops is not preserved during
5009 // the construction of the tiled loops, so do it before we scavenge the BBs of
5010 // any original CanonicalLoopInfo.
5011 SmallVector<Value *, 4> OrigTripCounts, OrigIndVars;
5012 for (CanonicalLoopInfo *L : Loops) {
5013 assert(L->isValid() && "All input loops must be valid canonical loops");
5014 OrigTripCounts.push_back(L->getTripCount());
5015 OrigIndVars.push_back(L->getIndVar());
5016 }
5017
5018 // Collect the code between loop headers. These may contain SSA definitions
5019 // that are used in the loop nest body. To be usable within the innermost
5020 // body, these BasicBlocks will be sunk into the loop nest body. That is,
5021 // these instructions may be executed more often than before the tiling.
5022 // TODO: It would be sufficient to only sink them into body of the
5023 // corresponding tile loop.
5024 SmallVector<std::pair<BasicBlock *, BasicBlock *>, 4> InbetweenCode;
5025 for (int i = 0; i < NumLoops - 1; ++i) {
5026 CanonicalLoopInfo *Surrounding = Loops[i];
5027 CanonicalLoopInfo *Nested = Loops[i + 1];
5028
5029 BasicBlock *EnterBB = Surrounding->getBody();
5030 BasicBlock *ExitBB = Nested->getHeader();
5031 InbetweenCode.emplace_back(EnterBB, ExitBB);
5032 }
5033
5034 // Compute the trip counts of the floor loops.
5035 Builder.SetCurrentDebugLocation(DL);
5036 Builder.restoreIP(OutermostLoop->getPreheaderIP());
5037 SmallVector<Value *, 4> FloorCount, FloorRems;
5038 for (int i = 0; i < NumLoops; ++i) {
5039 Value *TileSize = TileSizes[i];
5040 Value *OrigTripCount = OrigTripCounts[i];
5041 Type *IVType = OrigTripCount->getType();
5042
5043 Value *FloorTripCount = Builder.CreateUDiv(OrigTripCount, TileSize);
5044 Value *FloorTripRem = Builder.CreateURem(OrigTripCount, TileSize);
5045
5046 // 0 if the tilesize divides the tripcount, 1 otherwise.
5047 // 1 means we need an additional iteration for a partial tile.
5048 //
5049 // Unfortunately we cannot just use the roundup-formula
5050 // (tripcount + tilesize - 1)/tilesize
5051 // because the summation might overflow. We do not want to introduce undefined
5052 // behavior when the untiled loop nest did not.
5053 Value *FloorTripOverflow =
5054 Builder.CreateICmpNE(FloorTripRem, ConstantInt::get(IVType, 0));
5055
5056 FloorTripOverflow = Builder.CreateZExt(FloorTripOverflow, IVType);
5057 FloorTripCount =
5058 Builder.CreateAdd(FloorTripCount, FloorTripOverflow,
5059 "omp_floor" + Twine(i) + ".tripcount", true);
5060
5061 // Remember some values for later use.
5062 FloorCount.push_back(FloorTripCount);
5063 FloorRems.push_back(FloorTripRem);
5064 }
5065
5066 // Generate the new loop nest, from the outermost to the innermost.
5067 std::vector<CanonicalLoopInfo *> Result;
5068 Result.reserve(NumLoops * 2);
5069
5070 // The basic block of the surrounding loop that enters the nest generated
5071 // loop.
5072 BasicBlock *Enter = OutermostLoop->getPreheader();
5073
5074 // The basic block of the surrounding loop where the inner code should
5075 // continue.
5076 BasicBlock *Continue = OutermostLoop->getAfter();
5077
5078 // Where the next loop basic block should be inserted.
5079 BasicBlock *OutroInsertBefore = InnermostLoop->getExit();
5080
5081 auto EmbeddNewLoop =
5082 [this, DL, F, InnerEnter, &Enter, &Continue, &OutroInsertBefore](
5083 Value *TripCount, const Twine &Name) -> CanonicalLoopInfo * {
5084 CanonicalLoopInfo *EmbeddedLoop = createLoopSkeleton(
5085 DL, TripCount, F, InnerEnter, OutroInsertBefore, Name);
5086 redirectTo(Enter, EmbeddedLoop->getPreheader(), DL);
5087 redirectTo(EmbeddedLoop->getAfter(), Continue, DL);
5088
5089 // Setup the position where the next embedded loop connects to this loop.
5090 Enter = EmbeddedLoop->getBody();
5091 Continue = EmbeddedLoop->getLatch();
5092 OutroInsertBefore = EmbeddedLoop->getLatch();
5093 return EmbeddedLoop;
5094 };
5095
5096 auto EmbeddNewLoops = [&Result, &EmbeddNewLoop](ArrayRef<Value *> TripCounts,
5097 const Twine &NameBase) {
5098 for (auto P : enumerate(TripCounts)) {
5099 CanonicalLoopInfo *EmbeddedLoop =
5100 EmbeddNewLoop(P.value(), NameBase + Twine(P.index()));
5101 Result.push_back(EmbeddedLoop);
5102 }
5103 };
5104
5105 EmbeddNewLoops(FloorCount, "floor");
5106
5107 // Within the innermost floor loop, emit the code that computes the tile
5108 // sizes.
5109 Builder.restoreIP(Result.back()->getBodyIP());
5110 SmallVector<Value *, 4> TileCounts;
5111 for (int i = 0; i < NumLoops; ++i) {
5112 CanonicalLoopInfo *FloorLoop = Result[i];
5113 Value *TileSize = TileSizes[i];
5114
5115 Value *FloorIsEpilogue =
5116 Builder.CreateICmpEQ(FloorLoop->getIndVar(), FloorCount[i]);
5117 Value *TileTripCount =
5118 Builder.CreateSelect(FloorIsEpilogue, FloorRems[i], TileSize);
5119
5120 TileCounts.push_back(TileTripCount);
5121 }
5122
5123 // Create the tile loops.
5124 EmbeddNewLoops(TileCounts, "tile");
5125
5126 // Insert the inbetween code into the body.
5127 BasicBlock *BodyEnter = Enter;
5128 BasicBlock *BodyEntered = nullptr;
5129 for (std::pair<BasicBlock *, BasicBlock *> P : InbetweenCode) {
5130 BasicBlock *EnterBB = P.first;
5131 BasicBlock *ExitBB = P.second;
5132
5133 if (BodyEnter)
5134 redirectTo(BodyEnter, EnterBB, DL);
5135 else
5136 redirectAllPredecessorsTo(BodyEntered, EnterBB, DL);
5137
5138 BodyEnter = nullptr;
5139 BodyEntered = ExitBB;
5140 }
5141
5142 // Append the original loop nest body into the generated loop nest body.
5143 if (BodyEnter)
5144 redirectTo(BodyEnter, InnerEnter, DL);
5145 else
5146 redirectAllPredecessorsTo(BodyEntered, InnerEnter, DL);
5147 redirectTo(InnerLatch, Continue, DL);
5148
5149 // Replace the original induction variable with an induction variable computed
5150 // from the tile and floor induction variables.
5151 Builder.restoreIP(Result.back()->getBodyIP());
5152 for (int i = 0; i < NumLoops; ++i) {
5153 CanonicalLoopInfo *FloorLoop = Result[i];
5154 CanonicalLoopInfo *TileLoop = Result[NumLoops + i];
5155 Value *OrigIndVar = OrigIndVars[i];
5156 Value *Size = TileSizes[i];
5157
5158 Value *Scale =
5159 Builder.CreateMul(Size, FloorLoop->getIndVar(), {}, /*HasNUW=*/true);
5160 Value *Shift =
5161 Builder.CreateAdd(Scale, TileLoop->getIndVar(), {}, /*HasNUW=*/true);
5162 OrigIndVar->replaceAllUsesWith(Shift);
5163 }
5164
5165 // Remove unused parts of the original loops.
5166 removeUnusedBlocksFromParent(OldControlBBs);
5167
5168 for (CanonicalLoopInfo *L : Loops)
5169 L->invalidate();
5170
5171#ifndef NDEBUG
5172 for (CanonicalLoopInfo *GenL : Result)
5173 GenL->assertOK();
5174#endif
5175 return Result;
5176}
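// Illustrative shape of the result for a single loop of trip count TC tiled
// by TS (names for exposition only):
//
//   for (f = 0; f < floorTC; ++f)    // floorTC = TC/TS, +1 for a partial tile
//     for (t = 0; t < tileTC; ++t)   // tileTC chosen by the FloorIsEpilogue
//       body(f * TS + t);            // select: TS, or the remainder TC % TS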
5177
5178/// Attach metadata \p Properties to the basic block described by \p BB. If the
5179/// basic block already has metadata, the basic block properties are appended.
5180static void addBasicBlockMetadata(BasicBlock *BB,
5181 ArrayRef<Metadata *> Properties) {
5182 // Nothing to do if no property to attach.
5183 if (Properties.empty())
5184 return;
5185
5186 LLVMContext &Ctx = BB->getContext();
5187 SmallVector<Metadata *> NewProperties;
5188 NewProperties.push_back(nullptr);
5189
5190 // If the basic block already has metadata, prepend it to the new metadata.
5191 MDNode *Existing = BB->getTerminator()->getMetadata(LLVMContext::MD_loop);
5192 if (Existing)
5193 append_range(NewProperties, drop_begin(Existing->operands(), 1));
5194
5195 append_range(NewProperties, Properties);
5196 MDNode *BasicBlockID = MDNode::getDistinct(Ctx, NewProperties);
5197 BasicBlockID->replaceOperandWith(0, BasicBlockID);
5198
5199 BB->getTerminator()->setMetadata(LLVMContext::MD_loop, BasicBlockID);
5200}
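// The first operand is the node itself, making the metadata distinct and
// self-referential as required for !llvm.loop, e.g. (illustrative):
//
//   br i1 %cond, label %header, label %exit, !llvm.loop !0
//   !0 = distinct !{!0, !1}
//   !1 = !{!"llvm.loop.unroll.enable"}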
5201
5202/// Attach loop metadata \p Properties to the loop described by \p Loop. If the
5203/// loop already has metadata, the loop properties are appended.
5204static void addLoopMetadata(CanonicalLoopInfo *Loop,
5205 ArrayRef<Metadata *> Properties) {
5206 assert(Loop->isValid() && "Expecting a valid CanonicalLoopInfo");
5207
5208 // Attach metadata to the loop's latch
5209 BasicBlock *Latch = Loop->getLatch();
5210 assert(Latch && "A valid CanonicalLoopInfo must have a unique latch");
5211 addBasicBlockMetadata(Latch, Properties);
5212}
5213
5214/// Attach llvm.access.group metadata to the memref instructions of \p Block
5215static void addSimdMetadata(BasicBlock *Block, MDNode *AccessGroup,
5216 LoopInfo &LI) {
5217 for (Instruction &I : *Block) {
5218 if (I.mayReadOrWriteMemory()) {
5219 // TODO: This instruction may already have access group from
5220 // other pragmas e.g. #pragma clang loop vectorize. Append
5221 // so that the existing metadata is not overwritten.
5222 I.setMetadata(LLVMContext::MD_access_group, AccessGroup);
5223 }
5224 }
5225}
5226
5227void OpenMPIRBuilder::unrollLoopFull(DebugLoc, CanonicalLoopInfo *Loop) {
5228 LLVMContext &Ctx = Builder.getContext();
5229 addLoopMetadata(
5230 Loop, {MDNode::get(Ctx, MDString::get(Ctx, "llvm.loop.unroll.enable")),
5231 MDNode::get(Ctx, MDString::get(Ctx, "llvm.loop.unroll.full"))});
5232}
5233
5234void OpenMPIRBuilder::unrollLoopHeuristic(DebugLoc, CanonicalLoopInfo *Loop) {
5235 LLVMContext &Ctx = Builder.getContext();
5236 addLoopMetadata(
5237 Loop, {
5238 MDNode::get(Ctx, MDString::get(Ctx, "llvm.loop.unroll.enable")),
5239 });
5240}
5241
5242void OpenMPIRBuilder::createIfVersion(CanonicalLoopInfo *CanonicalLoop,
5243 Value *IfCond, ValueToValueMapTy &VMap,
5244 const Twine &NamePrefix) {
5245 Function *F = CanonicalLoop->getFunction();
5246
5247 // Define where if branch should be inserted
5248 Instruction *SplitBefore = CanonicalLoop->getPreheader()->getTerminator();
5249
5250 // TODO: We should not rely on pass manager. Currently we use pass manager
5251 // only for getting llvm::Loop which corresponds to given CanonicalLoopInfo
5252 // object. We should have a method which returns all blocks between
5253 // CanonicalLoopInfo::getHeader() and CanonicalLoopInfo::getAfter()
5254 FunctionAnalysisManager FAM;
5255 FAM.registerPass([]() { return DominatorTreeAnalysis(); });
5256 FAM.registerPass([]() { return LoopAnalysis(); });
5257 FAM.registerPass([]() { return PassInstrumentationAnalysis(); });
5258
5259 // Get the loop which needs to be cloned
5260 LoopAnalysis LIA;
5261 LoopInfo &&LI = LIA.run(*F, FAM);
5262 Loop *L = LI.getLoopFor(CanonicalLoop->getHeader());
5263
5264 // Create additional blocks for the if statement
5265 BasicBlock *Head = SplitBefore->getParent();
5266 Instruction *HeadOldTerm = Head->getTerminator();
5267 llvm::LLVMContext &C = Head->getContext();
5268 BasicBlock *ThenBlock = BasicBlock::Create(
5269 C, NamePrefix + ".if.then", Head->getParent(), Head->getNextNode());
5270 BasicBlock *ElseBlock = BasicBlock::Create(
5271 C, NamePrefix + ".if.else", Head->getParent(), CanonicalLoop->getExit());
5272
5273 // Create if condition branch.
5274 Builder.SetInsertPoint(HeadOldTerm);
5275 Instruction *BrInstr =
5276 Builder.CreateCondBr(IfCond, ThenBlock, /*ifFalse*/ ElseBlock);
5277 InsertPointTy IP{BrInstr->getParent(), ++BrInstr->getIterator()};
5278 // Then block contains branch to omp loop which needs to be vectorized
5279 spliceBB(IP, ThenBlock, false);
5280 ThenBlock->replaceSuccessorsPhiUsesWith(Head, ThenBlock);
5281
5282 Builder.SetInsertPoint(ElseBlock);
5283
5284 // Clone loop for the else branch
5285 SmallVector<BasicBlock *, 8> NewBlocks;
5286
5287 VMap[CanonicalLoop->getPreheader()] = ElseBlock;
5288 for (BasicBlock *Block : L->getBlocks()) {
5289 BasicBlock *NewBB = CloneBasicBlock(Block, VMap, "", F);
5290 NewBB->moveBefore(CanonicalLoop->getExit());
5291 VMap[Block] = NewBB;
5292 NewBlocks.push_back(NewBB);
5293 }
5294 remapInstructionsInBlocks(NewBlocks, VMap);
5295 Builder.CreateBr(NewBlocks.front());
5296}
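// The resulting structure is roughly (block names follow NamePrefix;
// illustrative):
//
//   head:         br i1 %ifcond, label %simd.if.then, label %simd.if.else
//   simd.if.then: original loop (the caller typically enables vectorization)
//   simd.if.else: cloned loop, later annotated by applySimd with
//                 !{"llvm.loop.vectorize.enable", i1 false}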
5297
5298unsigned
5299OpenMPIRBuilder::getOpenMPDefaultSimdAlign(const Triple &TargetTriple,
5300 const StringMap<bool> &Features) {
5301 if (TargetTriple.isX86()) {
5302 if (Features.lookup("avx512f"))
5303 return 512;
5304 else if (Features.lookup("avx"))
5305 return 256;
5306 return 128;
5307 }
5308 if (TargetTriple.isPPC())
5309 return 128;
5310 if (TargetTriple.isWasm())
5311 return 128;
5312 return 0;
5313}
5314
5315void OpenMPIRBuilder::applySimd(CanonicalLoopInfo *CanonicalLoop,
5316 MapVector<Value *, Value *> AlignedVars,
5317 Value *IfCond, OrderKind Order,
5318 ConstantInt *Simdlen, ConstantInt *Safelen) {
5319 LLVMContext &Ctx = Builder.getContext();
5320
5321 Function *F = CanonicalLoop->getFunction();
5322
5323 // TODO: We should not rely on pass manager. Currently we use pass manager
5324 // only for getting llvm::Loop which corresponds to given CanonicalLoopInfo
5325 // object. We should have a method which returns all blocks between
5326 // CanonicalLoopInfo::getHeader() and CanonicalLoopInfo::getAfter()
5328 FAM.registerPass([]() { return DominatorTreeAnalysis(); });
5329 FAM.registerPass([]() { return LoopAnalysis(); });
5330 FAM.registerPass([]() { return PassInstrumentationAnalysis(); });
5331
5332 LoopAnalysis LIA;
5333 LoopInfo &&LI = LIA.run(*F, FAM);
5334
5335 Loop *L = LI.getLoopFor(CanonicalLoop->getHeader());
5336 if (AlignedVars.size()) {
5337 InsertPointTy IP = Builder.saveIP();
5338 for (auto &AlignedItem : AlignedVars) {
5339 Value *AlignedPtr = AlignedItem.first;
5340 Value *Alignment = AlignedItem.second;
5341 Instruction *loadInst = dyn_cast<Instruction>(AlignedPtr);
5342 Builder.SetInsertPoint(loadInst->getNextNode());
5343 Builder.CreateAlignmentAssumption(F->getDataLayout(), AlignedPtr,
5344 Alignment);
5345 }
5346 Builder.restoreIP(IP);
5347 }
5348
5349 if (IfCond) {
5350 ValueToValueMapTy VMap;
5351 createIfVersion(CanonicalLoop, IfCond, VMap, "simd");
5352 // Add metadata to the cloned loop which disables vectorization
5353 Value *MappedLatch = VMap.lookup(CanonicalLoop->getLatch());
5354 assert(MappedLatch &&
5355 "Cannot find value which corresponds to original loop latch");
5356 assert(isa<BasicBlock>(MappedLatch) &&
5357 "Cannot cast mapped latch block value to BasicBlock");
5358 BasicBlock *NewLatchBlock = dyn_cast<BasicBlock>(MappedLatch);
5359 ConstantAsMetadata *BoolConst =
5360 ConstantAsMetadata::get(ConstantInt::getFalse(Type::getInt1Ty(Ctx)));
5361 addBasicBlockMetadata(
5362 NewLatchBlock,
5363 {MDNode::get(Ctx, {MDString::get(Ctx, "llvm.loop.vectorize.enable"),
5364 BoolConst})});
5365 }
5366
5367 SmallSet<BasicBlock *, 8> Reachable;
5368
5369 // Get the basic blocks from the loop in which memref instructions
5370 // can be found.
5371 // TODO: Generalize getting all blocks inside a CanonicalizeLoopInfo,
5372 // preferably without running any passes.
5373 for (BasicBlock *Block : L->getBlocks()) {
5374 if (Block == CanonicalLoop->getCond() ||
5375 Block == CanonicalLoop->getHeader())
5376 continue;
5377 Reachable.insert(Block);
5378 }
5379
5380 SmallVector<Metadata *> LoopMDList;
5381
5382 // In the presence of a finite 'safelen', it may be unsafe to mark all
5383 // the memory instructions parallel, because loop-carried
5384 // dependences of 'safelen' iterations are possible.
5385 // If clause order(concurrent) is specified then the memory instructions
5386 // are marked parallel even if 'safelen' is finite.
5387 if ((Safelen == nullptr) || (Order == OrderKind::OMP_ORDER_concurrent)) {
5388 // Add access group metadata to memory-access instructions.
5389 MDNode *AccessGroup = MDNode::getDistinct(Ctx, {});
5390 for (BasicBlock *BB : Reachable)
5391 addSimdMetadata(BB, AccessGroup, LI);
5392 // TODO: If the loop has existing parallel access metadata, have
5393 // to combine two lists.
5394 LoopMDList.push_back(MDNode::get(
5395 Ctx, {MDString::get(Ctx, "llvm.loop.parallel_accesses"), AccessGroup}));
5396 }
5397
5398 // Use the above access group metadata to create loop level
5399 // metadata, which should be distinct for each loop.
5400 ConstantAsMetadata *BoolConst =
5401 ConstantAsMetadata::get(ConstantInt::getTrue(Type::getInt1Ty(Ctx)));
5402 LoopMDList.push_back(MDNode::get(
5403 Ctx, {MDString::get(Ctx, "llvm.loop.vectorize.enable"), BoolConst}));
5404
5405 if (Simdlen || Safelen) {
5406 // If both simdlen and safelen clauses are specified, the value of the
5407 // simdlen parameter must be less than or equal to the value of the safelen
5408 // parameter. Therefore, use safelen only in the absence of simdlen.
5409 ConstantInt *VectorizeWidth = Simdlen == nullptr ? Safelen : Simdlen;
5410 LoopMDList.push_back(
5411 MDNode::get(Ctx, {MDString::get(Ctx, "llvm.loop.vectorize.width"),
5412 ConstantAsMetadata::get(VectorizeWidth)}));
5413 }
5414
5415 addLoopMetadata(CanonicalLoop, LoopMDList);
5416}
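// For example, with simdlen(8) and no finite safelen, the loop latch ends up
// with metadata along these lines (illustrative):
//
//   !llvm.loop !0
//   !0 = distinct !{!0, !1, !2, !3}
//   !1 = !{!"llvm.loop.parallel_accesses", !4}
//   !2 = !{!"llvm.loop.vectorize.enable", i1 true}
//   !3 = !{!"llvm.loop.vectorize.width", i32 8}
//   !4 = distinct !{}                  ; access group on memory instructions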
5417
5418/// Create the TargetMachine object to query the backend for optimization
5419/// preferences.
5420///
5421/// Ideally, this would be passed from the front-end to the OpenMPBuilder, but
5422/// e.g. Clang does not pass it to its CodeGen layer and creates it only when
5423 /// needed for the LLVM pass pipeline. We use some default options to avoid
5424/// having to pass too many settings from the frontend that probably do not
5425/// matter.
5426///
5427/// Currently, TargetMachine is only used sometimes by the unrollLoopPartial
5428/// method. If we are going to use TargetMachine for more purposes, especially
5429/// those that are sensitive to TargetOptions, RelocModel and CodeModel, it
5430 /// might become worth requiring front-ends to pass on their TargetMachine,
5431 /// or at least cache it between methods. Note that while frontends such as Clang
5432/// have just a single main TargetMachine per translation unit, "target-cpu" and
5433/// "target-features" that determine the TargetMachine are per-function and can
5434 /// be overridden using __attribute__((target("OPTIONS"))).
5435static std::unique_ptr<TargetMachine>
5436createTargetMachine(Function *F, CodeGenOptLevel OptLevel) {
5437 Module *M = F->getParent();
5438
5439 StringRef CPU = F->getFnAttribute("target-cpu").getValueAsString();
5440 StringRef Features = F->getFnAttribute("target-features").getValueAsString();
5441 const std::string &Triple = M->getTargetTriple();
5442
5443 std::string Error;
5444 const llvm::Target *TheTarget = TargetRegistry::lookupTarget(Triple, Error);
5445 if (!TheTarget)
5446 return {};
5447
5448 llvm::TargetOptions Options;
5449 return std::unique_ptr<TargetMachine>(TheTarget->createTargetMachine(
5450 Triple, CPU, Features, Options, /*RelocModel=*/std::nullopt,
5451 /*CodeModel=*/std::nullopt, OptLevel));
5452}
5453
5454/// Heuristically determine the best-performant unroll factor for \p CLI. This
5455/// depends on the target processor. We are re-using the same heuristics as the
5456/// LoopUnrollPass.
5457static unsigned computeHeuristicUnrollFactor(CanonicalLoopInfo *CLI) {
5458 Function *F = CLI->getFunction();
5459
5460 // Assume the user requests the most aggressive unrolling, even if the rest of
5461 // the code is optimized using a lower setting.
5462 CodeGenOptLevel OptLevel = CodeGenOptLevel::Aggressive;
5463 std::unique_ptr<TargetMachine> TM = createTargetMachine(F, OptLevel);
5464
5465 FunctionAnalysisManager FAM;
5466 FAM.registerPass([]() { return TargetLibraryAnalysis(); });
5467 FAM.registerPass([]() { return AssumptionAnalysis(); });
5468 FAM.registerPass([]() { return DominatorTreeAnalysis(); });
5469 FAM.registerPass([]() { return LoopAnalysis(); });
5470 FAM.registerPass([]() { return ScalarEvolutionAnalysis(); });
5471 FAM.registerPass([]() { return PassInstrumentationAnalysis(); });
5472 TargetIRAnalysis TIRA;
5473 if (TM)
5474 TIRA = TargetIRAnalysis(
5475 [&](const Function &F) { return TM->getTargetTransformInfo(F); });
5476 FAM.registerPass([&]() { return TIRA; });
5477
5478 TargetIRAnalysis::Result &&TTI = TIRA.run(*F, FAM);
5479 ScalarEvolutionAnalysis SEA;
5480 ScalarEvolution &&SE = SEA.run(*F, FAM);
5481 DominatorTreeAnalysis DTA;
5482 DominatorTree &&DT = DTA.run(*F, FAM);
5483 LoopAnalysis LIA;
5484 LoopInfo &&LI = LIA.run(*F, FAM);
5485 AssumptionAnalysis ACT;
5486 AssumptionCache &&AC = ACT.run(*F, FAM);
5487 OptimizationRemarkEmitter ORE{F};
5488
5489 Loop *L = LI.getLoopFor(CLI->getHeader());
5490 assert(L && "Expecting CanonicalLoopInfo to be recognized as a loop");
5491
5493 L, SE, TTI,
5494 /*BlockFrequencyInfo=*/nullptr,
5495 /*ProfileSummaryInfo=*/nullptr, ORE, static_cast<int>(OptLevel),
5496 /*UserThreshold=*/std::nullopt,
5497 /*UserCount=*/std::nullopt,
5498 /*UserAllowPartial=*/true,
5499 /*UserAllowRuntime=*/true,
5500 /*UserUpperBound=*/std::nullopt,
5501 /*UserFullUnrollMaxCount=*/std::nullopt);
5502
5503 UP.Force = true;
5504
5505 // Account for additional optimizations taking place before the LoopUnrollPass
5506 // would unroll the loop.
5507 UP.Threshold *= UnrollThresholdFactor;
5508 UP.PartialThreshold *= UnrollThresholdFactor;
5509
5510 // Use normal unroll factors even if the rest of the code is optimized for
5511 // size.
5512 UP.OptSizeThreshold = UP.Threshold;
5513 UP.PartialOptSizeThreshold = UP.PartialThreshold;
5514
5515 LLVM_DEBUG(dbgs() << "Unroll heuristic thresholds:\n"
5516 << " Threshold=" << UP.Threshold << "\n"
5517 << " PartialThreshold=" << UP.PartialThreshold << "\n"
5518 << " OptSizeThreshold=" << UP.OptSizeThreshold << "\n"
5519 << " PartialOptSizeThreshold="
5520 << UP.PartialOptSizeThreshold << "\n");
5521
5522 // Disable peeling.
5523 TargetTransformInfo::PeelingPreferences PP = gatherPeelingPreferences(
5524 L, SE, TTI,
5525 /*UserAllowPeeling=*/false,
5526 /*UserAllowProfileBasedPeeling=*/false,
5527 /*UnrollingSpecficValues=*/false);
5528
5528
5529 SmallPtrSet<const Value *, 32> EphValues;
5530 CodeMetrics::collectEphemeralValues(L, &AC, EphValues);
5531
5532 // Assume that reads and writes to stack variables can be eliminated by
5533 // Mem2Reg, SROA or LICM. That is, don't count them towards the loop body's
5534 // size.
5535 for (BasicBlock *BB : L->blocks()) {
5536 for (Instruction &I : *BB) {
5537 Value *Ptr;
5538 if (auto *Load = dyn_cast<LoadInst>(&I)) {
5539 Ptr = Load->getPointerOperand();
5540 } else if (auto *Store = dyn_cast<StoreInst>(&I)) {
5541 Ptr = Store->getPointerOperand();
5542 } else
5543 continue;
5544
5545 Ptr = Ptr->stripPointerCasts();
5546
5547 if (auto *Alloca = dyn_cast<AllocaInst>(Ptr)) {
5548 if (Alloca->getParent() == &F->getEntryBlock())
5549 EphValues.insert(&I);
5550 }
5551 }
5552 }
5553
5554 UnrollCostEstimator UCE(L, TTI, EphValues, UP.BEInsns);
5555
5556 // Loop is not unrollable if the loop contains certain instructions.
5557 if (!UCE.canUnroll()) {
5558 LLVM_DEBUG(dbgs() << "Loop not considered unrollable\n");
5559 return 1;
5560 }
5561
5562 LLVM_DEBUG(dbgs() << "Estimated loop size is " << UCE.getRolledLoopSize()
5563 << "\n");
5564
5565 // TODO: Determine trip count of \p CLI if constant, computeUnrollCount might
5566 // be able to use it.
5567 int TripCount = 0;
5568 int MaxTripCount = 0;
5569 bool MaxOrZero = false;
5570 unsigned TripMultiple = 0;
5571
5572 bool UseUpperBound = false;
5573 computeUnrollCount(L, TTI, DT, &LI, &AC, SE, EphValues, &ORE, TripCount,
5574 MaxTripCount, MaxOrZero, TripMultiple, UCE, UP, PP,
5575 UseUpperBound);
5576 unsigned Factor = UP.Count;
5577 LLVM_DEBUG(dbgs() << "Suggesting unroll factor of " << Factor << "\n");
5578
5579 // This function returns 1 to signal to not unroll a loop.
5580 if (Factor == 0)
5581 return 1;
5582 return Factor;
5583}
5584
5586 int32_t Factor,
5587 CanonicalLoopInfo **UnrolledCLI) {
5588 assert(Factor >= 0 && "Unroll factor must not be negative");
5589
5590 Function *F = Loop->getFunction();
5591 LLVMContext &Ctx = F->getContext();
5592
5593 // If the unrolled loop is not used for another loop-associated directive, it
5594 // is sufficient to add metadata for the LoopUnrollPass.
5595 if (!UnrolledCLI) {
5596 SmallVector<Metadata *, 2> LoopMetadata;
5597 LoopMetadata.push_back(
5598 MDNode::get(Ctx, MDString::get(Ctx, "llvm.loop.unroll.enable")));
5599
5600 if (Factor >= 1) {
5601 ConstantAsMetadata *FactorConst = ConstantAsMetadata::get(
5602 ConstantInt::get(Type::getInt32Ty(Ctx), APInt(32, Factor)));
5603 LoopMetadata.push_back(MDNode::get(
5604 Ctx, {MDString::get(Ctx, "llvm.loop.unroll.count"), FactorConst}));
5605 }
5606
5607 addLoopMetadata(Loop, LoopMetadata);
5608 return;
5609 }
5610
5611 // Heuristically determine the unroll factor.
5612 if (Factor == 0)
5613 Factor = computeHeuristicUnrollFactor(Loop);
5614
5615 // No change required with unroll factor 1.
5616 if (Factor == 1) {
5617 *UnrolledCLI = Loop;
5618 return;
5619 }
5620
5621 assert(Factor >= 2 &&
5622 "unrolling only makes sense with a factor of 2 or larger");
5623
5624 Type *IndVarTy = Loop->getIndVarType();
5625
5626 // Apply partial unrolling by tiling the loop by the unroll-factor, then fully
5627 // unroll the inner loop.
5628 Value *FactorVal =
5629 ConstantInt::get(IndVarTy, APInt(IndVarTy->getIntegerBitWidth(), Factor,
5630 /*isSigned=*/false));
5631 std::vector<CanonicalLoopInfo *> LoopNest =
5632 tileLoops(DL, {Loop}, {FactorVal});
5633 assert(LoopNest.size() == 2 && "Expect 2 loops after tiling");
5634 *UnrolledCLI = LoopNest[0];
5635 CanonicalLoopInfo *InnerLoop = LoopNest[1];
5636
5637 // LoopUnrollPass can only fully unroll loops with constant trip count.
5638 // Unroll by the unroll factor with a fallback epilog for the remainder
5639 // iterations if necessary.
5640 ConstantAsMetadata *FactorConst = ConstantAsMetadata::get(
5641 ConstantInt::get(Type::getInt32Ty(Ctx), APInt(32, Factor)));
5642 addLoopMetadata(
5643 InnerLoop,
5644 {MDNode::get(Ctx, MDString::get(Ctx, "llvm.loop.unroll.enable")),
5645 MDNode::get(
5646 Ctx, {MDString::get(Ctx, "llvm.loop.unroll.count"), FactorConst})});
5647
5648#ifndef NDEBUG
5649 (*UnrolledCLI)->assertOK();
5650#endif
5651}
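// E.g. with Factor=4 and UnrolledCLI requested, the loop is tiled by 4 and
// the inner tile loop receives (illustrative):
//
//   !0 = distinct !{!0, !1, !2}
//   !1 = !{!"llvm.loop.unroll.enable"}
//   !2 = !{!"llvm.loop.unroll.count", i32 4}
//
// and LoopUnrollPass later unrolls it by that count, with an epilog covering
// any remainder iterations.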
5652
5653OpenMPIRBuilder::InsertPointTy
5654OpenMPIRBuilder::createCopyPrivate(const LocationDescription &Loc,
5655 llvm::Value *BufSize, llvm::Value *CpyBuf,
5656 llvm::Value *CpyFn, llvm::Value *DidIt) {
5657 if (!updateToLocation(Loc))
5658 return Loc.IP;
5659
5660 uint32_t SrcLocStrSize;
5661 Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
5662 Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
5663 Value *ThreadId = getOrCreateThreadID(Ident);
5664
5665 llvm::Value *DidItLD = Builder.CreateLoad(Builder.getInt32Ty(), DidIt);
5666
5667 Value *Args[] = {Ident, ThreadId, BufSize, CpyBuf, CpyFn, DidItLD};
5668
5669 Function *Fn = getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_copyprivate);
5670 Builder.CreateCall(Fn, Args);
5671
5672 return Builder.saveIP();
5673}
5674
5675OpenMPIRBuilder::InsertPointOrErrorTy OpenMPIRBuilder::createSingle(
5676 const LocationDescription &Loc, BodyGenCallbackTy BodyGenCB,
5677 FinalizeCallbackTy FiniCB, bool IsNowait, ArrayRef<llvm::Value *> CPVars,
5678 ArrayRef<llvm::Function *> CPFuncs) {
5679
5680 if (!updateToLocation(Loc))
5681 return Loc.IP;
5682
5683 // If needed allocate and initialize `DidIt` with 0.
5684 // DidIt: flag variable: 1=single thread; 0=not single thread.
5685 llvm::Value *DidIt = nullptr;
5686 if (!CPVars.empty()) {
5687 DidIt = Builder.CreateAlloca(llvm::Type::getInt32Ty(Builder.getContext()));
5688 Builder.CreateStore(Builder.getInt32(0), DidIt);
5689 }
5690
5691 Directive OMPD = Directive::OMPD_single;
5692 uint32_t SrcLocStrSize;
5693 Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
5694 Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
5695 Value *ThreadId = getOrCreateThreadID(Ident);
5696 Value *Args[] = {Ident, ThreadId};
5697
5698 Function *EntryRTLFn = getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_single);
5699 Instruction *EntryCall = Builder.CreateCall(EntryRTLFn, Args);
5700
5701 Function *ExitRTLFn = getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_end_single);
5702 Instruction *ExitCall = Builder.CreateCall(ExitRTLFn, Args);
5703
5704 auto FiniCBWrapper = [&](InsertPointTy IP) -> Error {
5705 if (Error Err = FiniCB(IP))
5706 return Err;
5707
5708 // The thread that executes the single region must set `DidIt` to 1.
5709 // This is used by __kmpc_copyprivate, to know if the caller is the
5710 // single thread or not.
5711 if (DidIt)
5712 Builder.CreateStore(Builder.getInt32(1), DidIt);
5713
5714 return Error::success();
5715 };
5716
5717 // generates the following:
5718 // if (__kmpc_single()) {
5719 // .... single region ...
5720 // __kmpc_end_single
5721 // }
5722 // __kmpc_copyprivate
5723 // __kmpc_barrier
5724
5725 InsertPointOrErrorTy AfterIP =
5726 EmitOMPInlinedRegion(OMPD, EntryCall, ExitCall, BodyGenCB, FiniCBWrapper,
5727 /*Conditional*/ true,
5728 /*hasFinalize*/ true);
5729 if (!AfterIP)
5730 return AfterIP.takeError();
5731
5732 if (DidIt) {
5733 for (size_t I = 0, E = CPVars.size(); I < E; ++I)
5734 // NOTE BufSize is currently unused, so just pass 0.
5735 createCopyPrivate(LocationDescription(Builder.saveIP(), Loc.DL),
5736 /*BufSize=*/ConstantInt::get(Int64, 0), CPVars[I],
5737 CPFuncs[I], DidIt);
5738 // NOTE __kmpc_copyprivate already inserts a barrier
5739 } else if (!IsNowait) {
5740 InsertPointOrErrorTy AfterIP =
5741 createBarrier(LocationDescription(Builder.saveIP(), Loc.DL),
5742 omp::Directive::OMPD_unknown, /* ForceSimpleCall */ false,
5743 /* CheckCancelFlag */ false);
5744 if (!AfterIP)
5745 return AfterIP.takeError();
5746 }
5747 return Builder.saveIP();
5748}
5749
5750OpenMPIRBuilder::InsertPointOrErrorTy OpenMPIRBuilder::createCritical(
5751 const LocationDescription &Loc, BodyGenCallbackTy BodyGenCB,
5752 FinalizeCallbackTy FiniCB, StringRef CriticalName, Value *HintInst) {
5753
5754 if (!updateToLocation(Loc))
5755 return Loc.IP;
5756
5757 Directive OMPD = Directive::OMPD_critical;
5758 uint32_t SrcLocStrSize;
5759 Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
5760 Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
5761 Value *ThreadId = getOrCreateThreadID(Ident);
5762 Value *LockVar = getOMPCriticalRegionLock(CriticalName);
5763 Value *Args[] = {Ident, ThreadId, LockVar};
5764
5765 SmallVector<llvm::Value *, 4> EnterArgs(std::begin(Args), std::end(Args));
5766 Function *RTFn = nullptr;
5767 if (HintInst) {
5768 // Add Hint to entry Args and create call
5769 EnterArgs.push_back(HintInst);
5770 RTFn = getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_critical_with_hint);
5771 } else {
5772 RTFn = getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_critical);
5773 }
5774 Instruction *EntryCall = Builder.CreateCall(RTFn, EnterArgs);
5775
5776 Function *ExitRTLFn =
5777 getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_end_critical);
5778 Instruction *ExitCall = Builder.CreateCall(ExitRTLFn, Args);
5779
5780 return EmitOMPInlinedRegion(OMPD, EntryCall, ExitCall, BodyGenCB, FiniCB,
5781 /*Conditional*/ false, /*hasFinalize*/ true);
5782}
5783
5784OpenMPIRBuilder::InsertPointTy
5785OpenMPIRBuilder::createOrderedDepend(const LocationDescription &Loc,
5786 InsertPointTy AllocaIP, unsigned NumLoops,
5787 ArrayRef<llvm::Value *> StoreValues,
5788 const Twine &Name, bool IsDependSource) {
5789 assert(
5790 llvm::all_of(StoreValues,
5791 [](Value *SV) { return SV->getType()->isIntegerTy(64); }) &&
5792 "OpenMP runtime requires depend vec with i64 type");
5793
5794 if (!updateToLocation(Loc))
5795 return Loc.IP;
5796
5797 // Allocate space for vector and generate alloc instruction.
5798 auto *ArrI64Ty = ArrayType::get(Int64, NumLoops);
5799 Builder.restoreIP(AllocaIP);
5800 AllocaInst *ArgsBase = Builder.CreateAlloca(ArrI64Ty, nullptr, Name);
5801 ArgsBase->setAlignment(Align(8));
5802 Builder.restoreIP(Loc.IP);
5803
5804 // Store the index value with offset in depend vector.
5805 for (unsigned I = 0; I < NumLoops; ++I) {
5806 Value *DependAddrGEPIter = Builder.CreateInBoundsGEP(
5807 ArrI64Ty, ArgsBase, {Builder.getInt64(0), Builder.getInt64(I)});
5808 StoreInst *STInst = Builder.CreateStore(StoreValues[I], DependAddrGEPIter);
5809 STInst->setAlignment(Align(8));
5810 }
5811
5812 Value *DependBaseAddrGEP = Builder.CreateInBoundsGEP(
5813 ArrI64Ty, ArgsBase, {Builder.getInt64(0), Builder.getInt64(0)});
5814
5815 uint32_t SrcLocStrSize;
5816 Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
5817 Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
5818 Value *ThreadId = getOrCreateThreadID(Ident);
5819 Value *Args[] = {Ident, ThreadId, DependBaseAddrGEP};
5820
5821 Function *RTLFn = nullptr;
5822 if (IsDependSource)
5823 RTLFn = getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_doacross_post);
5824 else
5825 RTLFn = getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_doacross_wait);
5826 Builder.CreateCall(RTLFn, Args);
5827
5828 return Builder.saveIP();
5829}
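// Sketch of the emitted code for `ordered depend(source)` in a doacross nest
// of depth 2 (illustrative values):
//
//   %vec = alloca [2 x i64], align 8
//   store i64 %iv0, ptr %slot0, align 8   ; one slot per associated loop
//   store i64 %iv1, ptr %slot1, align 8
//   call void @__kmpc_doacross_post(ptr %ident, i32 %tid, ptr %slot0)
//
// depend(sink : ...) emits the same shape but calls __kmpc_doacross_wait.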
5830
5831OpenMPIRBuilder::InsertPointOrErrorTy OpenMPIRBuilder::createOrderedThreadsSimd(
5832 const LocationDescription &Loc, BodyGenCallbackTy BodyGenCB,
5833 FinalizeCallbackTy FiniCB, bool IsThreads) {
5834 if (!updateToLocation(Loc))
5835 return Loc.IP;
5836
5837 Directive OMPD = Directive::OMPD_ordered;
5838 Instruction *EntryCall = nullptr;
5839 Instruction *ExitCall = nullptr;
5840
5841 if (IsThreads) {
5842 uint32_t SrcLocStrSize;
5843 Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
5844 Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
5845 Value *ThreadId = getOrCreateThreadID(Ident);
5846 Value *Args[] = {Ident, ThreadId};
5847
5848 Function *EntryRTLFn = getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_ordered);
5849 EntryCall = Builder.CreateCall(EntryRTLFn, Args);
5850
5851 Function *ExitRTLFn =
5852 getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_end_ordered);
5853 ExitCall = Builder.CreateCall(ExitRTLFn, Args);
5854 }
5855
5856 return EmitOMPInlinedRegion(OMPD, EntryCall, ExitCall, BodyGenCB, FiniCB,
5857 /*Conditional*/ false, /*hasFinalize*/ true);
5858}
5859
5860OpenMPIRBuilder::InsertPointOrErrorTy OpenMPIRBuilder::EmitOMPInlinedRegion(
5861 Directive OMPD, Instruction *EntryCall, Instruction *ExitCall,
5862 BodyGenCallbackTy BodyGenCB, FinalizeCallbackTy FiniCB, bool Conditional,
5863 bool HasFinalize, bool IsCancellable) {
5864
5865 if (HasFinalize)
5866 FinalizationStack.push_back({FiniCB, OMPD, IsCancellable});
5867
5868 // Create inlined region's entry and body blocks, in preparation
5869 // for conditional creation
5870 BasicBlock *EntryBB = Builder.GetInsertBlock();
5871 Instruction *SplitPos = EntryBB->getTerminator();
5872 if (!isa_and_nonnull<BranchInst>(SplitPos))
5873 SplitPos = new UnreachableInst(Builder.getContext(), EntryBB);
5874 BasicBlock *ExitBB = EntryBB->splitBasicBlock(SplitPos, "omp_region.end");
5875 BasicBlock *FiniBB =
5876 EntryBB->splitBasicBlock(EntryBB->getTerminator(), "omp_region.finalize");
5877
5879 emitCommonDirectiveEntry(OMPD, EntryCall, ExitBB, Conditional);
5880
5881 // generate body
5882 if (Error Err = BodyGenCB(/* AllocaIP */ InsertPointTy(),
5883 /* CodeGenIP */ Builder.saveIP()))
5884 return Err;
5885
5886 // emit exit call and do any needed finalization.
5887 auto FinIP = InsertPointTy(FiniBB, FiniBB->getFirstInsertionPt());
5888 assert(FiniBB->getTerminator()->getNumSuccessors() == 1 &&
5889 FiniBB->getTerminator()->getSuccessor(0) == ExitBB &&
5890 "Unexpected control flow graph state!!");
5891 InsertPointOrErrorTy AfterIP =
5892 emitCommonDirectiveExit(OMPD, FinIP, ExitCall, HasFinalize);
5893 if (!AfterIP)
5894 return AfterIP.takeError();
5895 assert(FiniBB->getUniquePredecessor()->getUniqueSuccessor() == FiniBB &&
5896 "Unexpected Control Flow State!");
5897 MergeBlockIntoPredecessor(FiniBB);
5898
5899 // If we are skipping the region of a non conditional, remove the exit
5900 // block, and clear the builder's insertion point.
5901 assert(SplitPos->getParent() == ExitBB &&
5902 "Unexpected Insertion point location!");
5903 auto merged = MergeBlockIntoPredecessor(ExitBB);
5904 BasicBlock *ExitPredBB = SplitPos->getParent();
5905 auto InsertBB = merged ? ExitPredBB : ExitBB;
5906 if (!isa_and_nonnull<BranchInst>(SplitPos))
5907 SplitPos->eraseFromParent();
5908 Builder.SetInsertPoint(InsertBB);
5909
5910 return Builder.saveIP();
5911}
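// Shape of the emitted region for the Conditional case, roughly:
//
//   entry:               %res = call @entry_fn(...)
//                        br i1 (%res != 0), label %omp_region.body,
//                                           label %omp_region.end
//   omp_region.body:     ... BodyGenCB output ...
//   omp_region.finalize: FiniCB output; call @exit_fn(...)
//   omp_region.end:      ...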
5912
5913OpenMPIRBuilder::InsertPointTy OpenMPIRBuilder::emitCommonDirectiveEntry(
5914 Directive OMPD, Value *EntryCall, BasicBlock *ExitBB, bool Conditional) {
5915 // If there is nothing to do, return the current insertion point.
5916 if (!Conditional || !EntryCall)
5917 return Builder.saveIP();
5918
5919 BasicBlock *EntryBB = Builder.GetInsertBlock();
5920 Value *CallBool = Builder.CreateIsNotNull(EntryCall);
5921 auto *ThenBB = BasicBlock::Create(M.getContext(), "omp_region.body");
5922 auto *UI = new UnreachableInst(Builder.getContext(), ThenBB);
5923
5924 // Emit thenBB and set the Builder's insertion point there for
5925 // body generation next. Place the block after the current block.
5926 Function *CurFn = EntryBB->getParent();
5927 CurFn->insert(std::next(EntryBB->getIterator()), ThenBB);
5928
5929 // Move Entry branch to end of ThenBB, and replace with conditional
5930 // branch (If-stmt)
5931 Instruction *EntryBBTI = EntryBB->getTerminator();
5932 Builder.CreateCondBr(CallBool, ThenBB, ExitBB);
5933 EntryBBTI->removeFromParent();
5934 Builder.SetInsertPoint(UI);
5935 Builder.Insert(EntryBBTI);
5936 UI->eraseFromParent();
5937 Builder.SetInsertPoint(ThenBB->getTerminator());
5938
5939 // return an insertion point to ExitBB.
5940 return IRBuilder<>::InsertPoint(ExitBB, ExitBB->getFirstInsertionPt());
5941}
5942
5943OpenMPIRBuilder::InsertPointOrErrorTy OpenMPIRBuilder::emitCommonDirectiveExit(
5944 omp::Directive OMPD, InsertPointTy FinIP, Instruction *ExitCall,
5945 bool HasFinalize) {
5946
5947 Builder.restoreIP(FinIP);
5948
5949 // If there is finalization to do, emit it before the exit call
5950 if (HasFinalize) {
5951 assert(!FinalizationStack.empty() &&
5952 "Unexpected finalization stack state!");
5953
5954 FinalizationInfo Fi = FinalizationStack.pop_back_val();
5955 assert(Fi.DK == OMPD && "Unexpected Directive for Finalization call!");
5956
5957 if (Error Err = Fi.FiniCB(FinIP))
5958 return Err;
5959
5960 BasicBlock *FiniBB = FinIP.getBlock();
5961 Instruction *FiniBBTI = FiniBB->getTerminator();
5962
5963 // set Builder IP for call creation
5964 Builder.SetInsertPoint(FiniBBTI);
5965 }
5966
5967 if (!ExitCall)
5968 return Builder.saveIP();
5969
5970 // Place the exit call as the last instruction before the finalization block terminator.
5971 ExitCall->removeFromParent();
5972 Builder.Insert(ExitCall);
5973
5974 return IRBuilder<>::InsertPoint(ExitCall->getParent(),
5975 ExitCall->getIterator());
5976}
5977
5978OpenMPIRBuilder::InsertPointTy OpenMPIRBuilder::createCopyinClauseBlocks(
5979 InsertPointTy IP, Value *MasterAddr, Value *PrivateAddr,
5980 llvm::IntegerType *IntPtrTy, bool BranchtoEnd) {
5981 if (!IP.isSet())
5982 return IP;
5983
5984 IRBuilder<>::InsertPointGuard IPG(Builder);
5985
5986 // creates the following CFG structure
5987 // OMP_Entry : (MasterAddr != PrivateAddr)?
5988 // F T
5989 // | \
5990 // | copyin.not.master
5991 // | /
5992 // v /
5993 // copyin.not.master.end
5994 // |
5995 // v
5996 // OMP.Entry.Next
5997
5998 BasicBlock *OMP_Entry = IP.getBlock();
5999 Function *CurFn = OMP_Entry->getParent();
6000 BasicBlock *CopyBegin =
6001 BasicBlock::Create(M.getContext(), "copyin.not.master", CurFn);
6002 BasicBlock *CopyEnd = nullptr;
6003
6004 // If entry block is terminated, split to preserve the branch to following
6005 // basic block (i.e. OMP.Entry.Next), otherwise, leave everything as is.
6006 if (isa_and_nonnull<BranchInst>(OMP_Entry->getTerminator())) {
6007 CopyEnd = OMP_Entry->splitBasicBlock(OMP_Entry->getTerminator(),
6008 "copyin.not.master.end");
6009 OMP_Entry->getTerminator()->eraseFromParent();
6010 } else {
6011 CopyEnd =
6012 BasicBlock::Create(M.getContext(), "copyin.not.master.end", CurFn);
6013 }
6014
6015 Builder.SetInsertPoint(OMP_Entry);
6016 Value *MasterPtr = Builder.CreatePtrToInt(MasterAddr, IntPtrTy);
6017 Value *PrivatePtr = Builder.CreatePtrToInt(PrivateAddr, IntPtrTy);
6018 Value *cmp = Builder.CreateICmpNE(MasterPtr, PrivatePtr);
6019 Builder.CreateCondBr(cmp, CopyBegin, CopyEnd);
6020
6021 Builder.SetInsertPoint(CopyBegin);
6022 if (BranchtoEnd)
6023 Builder.CreateBr(CopyEnd);
6024
6025 return Builder.saveIP();
6026}
6027
6028CallInst *OpenMPIRBuilder::createOMPAlloc(const LocationDescription &Loc,
6029 Value *Size, Value *Allocator,
6030 std::string Name) {
6031 IRBuilder<>::InsertPointGuard IPG(Builder);
6032 updateToLocation(Loc);
6033
6034 uint32_t SrcLocStrSize;
6035 Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
6036 Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
6037 Value *ThreadId = getOrCreateThreadID(Ident);
6038 Value *Args[] = {ThreadId, Size, Allocator};
6039
6040 Function *Fn = getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_alloc);
6041
6042 return Builder.CreateCall(Fn, Args, Name);
6043}
6044
6045CallInst *OpenMPIRBuilder::createOMPFree(const LocationDescription &Loc,
6046 Value *Addr, Value *Allocator,
6047 std::string Name) {
6048 IRBuilder<>::InsertPointGuard IPG(Builder);
6049 updateToLocation(Loc);
6050
6051 uint32_t SrcLocStrSize;
6052 Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
6053 Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
6054 Value *ThreadId = getOrCreateThreadID(Ident);
6055 Value *Args[] = {ThreadId, Addr, Allocator};
6056 Function *Fn = getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_free);
6057 return Builder.CreateCall(Fn, Args, Name);
6058}
6059
6060CallInst *OpenMPIRBuilder::createOMPInteropInit(
6061 const LocationDescription &Loc, Value *InteropVar,
6062 omp::OMPInteropType InteropType, Value *Device, Value *NumDependences,
6063 Value *DependenceAddress, bool HaveNowaitClause) {
6064 IRBuilder<>::InsertPointGuard IPG(Builder);
6065 updateToLocation(Loc);
6066
6067 uint32_t SrcLocStrSize;
6068 Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
6069 Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
6070 Value *ThreadId = getOrCreateThreadID(Ident);
6071 if (Device == nullptr)
6072 Device = ConstantInt::get(Int32, -1);
6073 Constant *InteropTypeVal = ConstantInt::get(Int32, (int)InteropType);
6074 if (NumDependences == nullptr) {
6075 NumDependences = ConstantInt::get(Int32, 0);
6076 PointerType *PointerTypeVar = PointerType::getUnqual(M.getContext());
6077 DependenceAddress = ConstantPointerNull::get(PointerTypeVar);
6078 }
6079 Value *HaveNowaitClauseVal = ConstantInt::get(Int32, HaveNowaitClause);
6080 Value *Args[] = {
6081 Ident, ThreadId, InteropVar, InteropTypeVal,
6082 Device, NumDependences, DependenceAddress, HaveNowaitClauseVal};
6083
6084 Function *Fn = getOrCreateRuntimeFunctionPtr(OMPRTL___tgt_interop_init);
6085
6086 return Builder.CreateCall(Fn, Args);
6087}
6088
6090 const LocationDescription &Loc, Value *InteropVar, Value *Device,
6091 Value *NumDependences, Value *DependenceAddress, bool HaveNowaitClause) {
6093 updateToLocation(Loc);
6094
6095 uint32_t SrcLocStrSize;
6096 Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
6097 Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
6098 Value *ThreadId = getOrCreateThreadID(Ident);
6099 if (Device == nullptr)
6100 Device = ConstantInt::get(Int32, -1);
6101 if (NumDependences == nullptr) {
6102 NumDependences = ConstantInt::get(Int32, 0);
6103 PointerType *PointerTypeVar = PointerType::getUnqual(M.getContext());
6104 DependenceAddress = ConstantPointerNull::get(PointerTypeVar);
6105 }
6106 Value *HaveNowaitClauseVal = ConstantInt::get(Int32, HaveNowaitClause);
6107 Value *Args[] = {
6108 Ident, ThreadId, InteropVar, Device,
6109 NumDependences, DependenceAddress, HaveNowaitClauseVal};
6110
6111 Function *Fn = getOrCreateRuntimeFunctionPtr(OMPRTL___tgt_interop_destroy);
6112
6113 return Builder.CreateCall(Fn, Args);
6114}
6115
6116CallInst *OpenMPIRBuilder::createOMPInteropUse(const LocationDescription &Loc,
6117 Value *InteropVar, Value *Device,
6118 Value *NumDependences,
6119 Value *DependenceAddress,
6120 bool HaveNowaitClause) {
6121 IRBuilder<>::InsertPointGuard IPG(Builder);
6122 updateToLocation(Loc);
6123 uint32_t SrcLocStrSize;
6124 Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
6125 Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
6126 Value *ThreadId = getOrCreateThreadID(Ident);
6127 if (Device == nullptr)
6128 Device = ConstantInt::get(Int32, -1);
6129 if (NumDependences == nullptr) {
6130 NumDependences = ConstantInt::get(Int32, 0);
6131 PointerType *PointerTypeVar = PointerType::getUnqual(M.getContext());
6132 DependenceAddress = ConstantPointerNull::get(PointerTypeVar);
6133 }
6134 Value *HaveNowaitClauseVal = ConstantInt::get(Int32, HaveNowaitClause);
6135 Value *Args[] = {
6136 Ident, ThreadId, InteropVar, Device,
6137 NumDependences, DependenceAddress, HaveNowaitClauseVal};
6138
6139 Function *Fn = getOrCreateRuntimeFunctionPtr(OMPRTL___tgt_interop_use);
6140
6141 return Builder.CreateCall(Fn, Args);
6142}
6143
6144CallInst *OpenMPIRBuilder::createCachedThreadPrivate(
6145 const LocationDescription &Loc, llvm::Value *Pointer,
6146 llvm::ConstantInt *Size, const llvm::Twine &Name) {
6147 IRBuilder<>::InsertPointGuard IPG(Builder);
6148 updateToLocation(Loc);
6149
6150 uint32_t SrcLocStrSize;
6151 Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
6152 Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
6153 Value *ThreadId = getOrCreateThreadID(Ident);
6154 Constant *ThreadPrivateCache =
6155 getOrCreateInternalVariable(Int8PtrPtr, Name.str());
6156 llvm::Value *Args[] = {Ident, ThreadId, Pointer, Size, ThreadPrivateCache};
6157
6158 Function *Fn =
6159 getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_threadprivate_cached);
6160
6161 return Builder.CreateCall(Fn, Args);
6162}
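// A hedged sketch of the emitted call; the last operand is the internal cache
// global created above, which the runtime fills in on first use (names are
// illustrative):
//   %v = call ptr @__kmpc_threadprivate_cached(ptr @ident, i32 %tid,
//                                              ptr %var, i64 %size, ptr @cache)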
6163
6164OpenMPIRBuilder::InsertPointTy OpenMPIRBuilder::createTargetInit(
6165 const LocationDescription &Loc,
6166 const llvm::OpenMPIRBuilder::TargetKernelDefaultAttrs &Attrs) {
6167 assert(!Attrs.MaxThreads.empty() && !Attrs.MaxTeams.empty() &&
6168 "expected num_threads and num_teams to be specified");
6169
6170 if (!updateToLocation(Loc))
6171 return Loc.IP;
6172
6173 uint32_t SrcLocStrSize;
6174 Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
6175 Constant *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
6176 Constant *IsSPMDVal = ConstantInt::getSigned(Int8, Attrs.ExecFlags);
6177 Constant *UseGenericStateMachineVal = ConstantInt::getSigned(
6178 Int8, Attrs.ExecFlags != omp::OMP_TGT_EXEC_MODE_SPMD);
6179 Constant *MayUseNestedParallelismVal = ConstantInt::getSigned(Int8, true);
6180 Constant *DebugIndentionLevelVal = ConstantInt::getSigned(Int16, 0);
6181
6182 Function *DebugKernelWrapper = Builder.GetInsertBlock()->getParent();
6183 Function *Kernel = DebugKernelWrapper;
6184
6185 // We need to strip the debug prefix to get the correct kernel name.
6186 StringRef KernelName = Kernel->getName();
6187 const std::string DebugPrefix = "_debug__";
6188 if (KernelName.ends_with(DebugPrefix)) {
6189 KernelName = KernelName.drop_back(DebugPrefix.length());
6190 Kernel = M.getFunction(KernelName);
6191 assert(Kernel && "Expected the real kernel to exist");
6192 }
6193
6194 // Manifest the launch configuration in the metadata matching the kernel
6195 // environment.
6196 if (Attrs.MinTeams > 1 || Attrs.MaxTeams.front() > 0)
6197 writeTeamsForKernel(T, *Kernel, Attrs.MinTeams, Attrs.MaxTeams.front());
6198
6199 // If MaxThreads is not set, select the maximum between the default workgroup
6200 // size and the MinThreads value.
6201 int32_t MaxThreadsVal = Attrs.MaxThreads.front();
6202 if (MaxThreadsVal < 0)
6203 MaxThreadsVal = std::max(
6204 int32_t(getGridValue(T, Kernel).GV_Default_WG_Size), Attrs.MinThreads);
6205
6206 if (MaxThreadsVal > 0)
6207 writeThreadBoundsForKernel(T, *Kernel, Attrs.MinThreads, MaxThreadsVal);
6208
6209 Constant *MinThreads = ConstantInt::getSigned(Int32, Attrs.MinThreads);
6210 Constant *MaxThreads = ConstantInt::getSigned(Int32, MaxThreadsVal);
6211 Constant *MinTeams = ConstantInt::getSigned(Int32, Attrs.MinTeams);
6212 Constant *MaxTeams = ConstantInt::getSigned(Int32, Attrs.MaxTeams.front());
6213 Constant *ReductionDataSize = ConstantInt::getSigned(Int32, 0);
6214 Constant *ReductionBufferLength = ConstantInt::getSigned(Int32, 0);
6215
6216 Function *Fn = getOrCreateRuntimeFunctionPtr(
6217 omp::RuntimeFunction::OMPRTL___kmpc_target_init);
6218 const DataLayout &DL = Fn->getDataLayout();
6219
6220 Twine DynamicEnvironmentName = KernelName + "_dynamic_environment";
6221 Constant *DynamicEnvironmentInitializer =
6222 ConstantStruct::get(DynamicEnvironment, {DebugIndentionLevelVal});
6223 GlobalVariable *DynamicEnvironmentGV = new GlobalVariable(
6224 M, DynamicEnvironment, /*IsConstant=*/false, GlobalValue::WeakODRLinkage,
6225 DynamicEnvironmentInitializer, DynamicEnvironmentName,
6226 /*InsertBefore=*/nullptr, GlobalValue::NotThreadLocal,
6227 DL.getDefaultGlobalsAddressSpace());
6228 DynamicEnvironmentGV->setVisibility(GlobalValue::ProtectedVisibility);
6229
6230 Constant *DynamicEnvironment =
6231 DynamicEnvironmentGV->getType() == DynamicEnvironmentPtr
6232 ? DynamicEnvironmentGV
6233 : ConstantExpr::getAddrSpaceCast(DynamicEnvironmentGV,
6234 DynamicEnvironmentPtr);
6235
6236 Constant *ConfigurationEnvironmentInitializer = ConstantStruct::get(
6237 ConfigurationEnvironment, {
6238 UseGenericStateMachineVal,
6239 MayUseNestedParallelismVal,
6240 IsSPMDVal,
6241 MinThreads,
6242 MaxThreads,
6243 MinTeams,
6244 MaxTeams,
6245 ReductionDataSize,
6246 ReductionBufferLength,
6247 });
6248 Constant *KernelEnvironmentInitializer = ConstantStruct::get(
6249 KernelEnvironment, {
6250 ConfigurationEnvironmentInitializer,
6251 Ident,
6252 DynamicEnvironment,
6253 });
6254 std::string KernelEnvironmentName =
6255 (KernelName + "_kernel_environment").str();
6256 GlobalVariable *KernelEnvironmentGV = new GlobalVariable(
6257 M, KernelEnvironment, /*IsConstant=*/true, GlobalValue::WeakODRLinkage,
6258 KernelEnvironmentInitializer, KernelEnvironmentName,
6259 /*InsertBefore=*/nullptr, GlobalValue::NotThreadLocal,
6260 DL.getDefaultGlobalsAddressSpace());
6261 KernelEnvironmentGV->setVisibility(GlobalValue::ProtectedVisibility);
6262
6263 Constant *KernelEnvironment =
6264 KernelEnvironmentGV->getType() == KernelEnvironmentPtr
6265 ? KernelEnvironmentGV
6266 : ConstantExpr::getAddrSpaceCast(KernelEnvironmentGV,
6267 KernelEnvironmentPtr);
6268 Value *KernelLaunchEnvironment = DebugKernelWrapper->getArg(0);
6269 CallInst *ThreadKind =
6270 Builder.CreateCall(Fn, {KernelEnvironment, KernelLaunchEnvironment});
6271
6272 Value *ExecUserCode = Builder.CreateICmpEQ(
6273 ThreadKind, Constant::getAllOnesValue(ThreadKind->getType()),
6274 "exec_user_code");
6275
6276 // ThreadKind = __kmpc_target_init(...)
6277 // if (ThreadKind == -1)
6278 // user_code
6279 // else
6280 // return;
6281
6282 auto *UI = Builder.CreateUnreachable();
6283 BasicBlock *CheckBB = UI->getParent();
6284 BasicBlock *UserCodeEntryBB = CheckBB->splitBasicBlock(UI, "user_code.entry");
6285
6286 BasicBlock *WorkerExitBB = BasicBlock::Create(
6287 CheckBB->getContext(), "worker.exit", CheckBB->getParent());
6288 Builder.SetInsertPoint(WorkerExitBB);
6289 Builder.CreateRetVoid();
6290
6291 auto *CheckBBTI = CheckBB->getTerminator();
6292 Builder.SetInsertPoint(CheckBBTI);
6293 Builder.CreateCondBr(ExecUserCode, UI->getParent(), WorkerExitBB);
6294
6295 CheckBBTI->eraseFromParent();
6296 UI->eraseFromParent();
6297
6298 // Continue in the "user_code" block, see diagram above and in
6299 // openmp/libomptarget/deviceRTLs/common/include/target.h .
6300 return InsertPointTy(UserCodeEntryBB, UserCodeEntryBB->getFirstInsertionPt());
6301}
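// The net effect as a minimal IR sketch (names are illustrative):
//   %tk = call i32 @__kmpc_target_init(ptr @<kernel>_kernel_environment,
//                                      ptr %dyn)
//   %exec_user_code = icmp eq i32 %tk, -1
//   br i1 %exec_user_code, label %user_code.entry, label %worker.exit
// A result of -1 marks a thread that executes the user code; other threads
// (e.g. generic-mode workers) leave through %worker.exit.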
6302
6303void OpenMPIRBuilder::createTargetDeinit(const LocationDescription &Loc,
6304 int32_t TeamsReductionDataSize,
6305 int32_t TeamsReductionBufferLength) {
6306 if (!updateToLocation(Loc))
6307 return;
6308
6309 Function *Fn = getOrCreateRuntimeFunctionPtr(
6310 omp::RuntimeFunction::OMPRTL___kmpc_target_deinit);
6311
6312 Builder.CreateCall(Fn, {});
6313
6314 if (!TeamsReductionBufferLength || !TeamsReductionDataSize)
6315 return;
6316
6317 Function *Kernel = Builder.GetInsertBlock()->getParent();
6318 // We need to strip the debug prefix to get the correct kernel name.
6319 StringRef KernelName = Kernel->getName();
6320 const std::string DebugPrefix = "_debug__";
6321 if (KernelName.ends_with(DebugPrefix))
6322 KernelName = KernelName.drop_back(DebugPrefix.length());
6323 auto *KernelEnvironmentGV =
6324 M.getNamedGlobal((KernelName + "_kernel_environment").str());
6325 assert(KernelEnvironmentGV && "Expected kernel environment global\n");
6326 auto *KernelEnvironmentInitializer = KernelEnvironmentGV->getInitializer();
6327 auto *NewInitializer = ConstantFoldInsertValueInstruction(
6328 KernelEnvironmentInitializer,
6329 ConstantInt::get(Int32, TeamsReductionDataSize), {0, 7});
6330 NewInitializer = ConstantFoldInsertValueInstruction(
6331 NewInitializer, ConstantInt::get(Int32, TeamsReductionBufferLength),
6332 {0, 8});
6333 KernelEnvironmentGV->setInitializer(NewInitializer);
6334}
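// Note: the insertvalue indices {0, 7} and {0, 8} select member 0 of the
// kernel environment (the ConfigurationEnvironment) and then its zero-based
// fields 7 and 8, which correspond to ReductionDataSize and
// ReductionBufferLength in the initializer built by createTargetInit above.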
6335
6336static MDNode *getNVPTXMDNode(Function &Kernel, StringRef Name) {
6337 Module &M = *Kernel.getParent();
6338 NamedMDNode *MD = M.getOrInsertNamedMetadata("nvvm.annotations");
6339 for (auto *Op : MD->operands()) {
6340 if (Op->getNumOperands() != 3)
6341 continue;
6342 auto *KernelOp = dyn_cast<ConstantAsMetadata>(Op->getOperand(0));
6343 if (!KernelOp || KernelOp->getValue() != &Kernel)
6344 continue;
6345 auto *Prop = dyn_cast<MDString>(Op->getOperand(1));
6346 if (!Prop || Prop->getString() != Name)
6347 continue;
6348 return Op;
6349 }
6350 return nullptr;
6351}
6352
6353static void updateNVPTXMetadata(Function &Kernel, StringRef Name, int32_t Value,
6354 bool Min) {
6355 // Update the "maxntidx" metadata for NVIDIA, or add it.
6356 MDNode *ExistingOp = getNVPTXMDNode(Kernel, Name);
6357 if (ExistingOp) {
6358 auto *OldVal = cast<ConstantAsMetadata>(ExistingOp->getOperand(2));
6359 int32_t OldLimit = cast<ConstantInt>(OldVal->getValue())->getZExtValue();
6360 ExistingOp->replaceOperandWith(
6361 2, ConstantAsMetadata::get(ConstantInt::get(
6362 OldVal->getValue()->getType(),
6363 Min ? std::min(OldLimit, Value) : std::max(OldLimit, Value))));
6364 } else {
6365 LLVMContext &Ctx = Kernel.getContext();
6366 Metadata *MDVals[] = {ConstantAsMetadata::get(&Kernel),
6367 MDString::get(Ctx, Name),
6368 ConstantAsMetadata::get(
6369 ConstantInt::get(Type::getInt32Ty(Ctx), Value))};
6370 // Append metadata to nvvm.annotations
6371 Module &M = *Kernel.getParent();
6372 NamedMDNode *MD = M.getOrInsertNamedMetadata("nvvm.annotations");
6373 MD->addOperand(MDNode::get(Ctx, MDVals));
6374 }
6375}
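// The resulting annotation looks roughly like the following, where 128 stands
// in for the requested bound:
//   !nvvm.annotations = !{!0}
//   !0 = !{ptr @kernel, !"maxntidx", i32 128}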
6376
6377std::pair<int32_t, int32_t>
6378OpenMPIRBuilder::readThreadBoundsForKernel(const Triple &T, Function &Kernel) {
6379 int32_t ThreadLimit =
6380 Kernel.getFnAttributeAsParsedInteger("omp_target_thread_limit");
6381
6382 if (T.isAMDGPU()) {
6383 const auto &Attr = Kernel.getFnAttribute("amdgpu-flat-work-group-size");
6384 if (!Attr.isValid() || !Attr.isStringAttribute())
6385 return {0, ThreadLimit};
6386 auto [LBStr, UBStr] = Attr.getValueAsString().split(',');
6387 int32_t LB, UB;
6388 if (!llvm::to_integer(UBStr, UB, 10))
6389 return {0, ThreadLimit};
6390 UB = ThreadLimit ? std::min(ThreadLimit, UB) : UB;
6391 if (!llvm::to_integer(LBStr, LB, 10))
6392 return {0, UB};
6393 return {LB, UB};
6394 }
6395
6396 if (MDNode *ExistingOp = getNVPTXMDNode(Kernel, "maxntidx")) {
6397 auto *OldVal = cast<ConstantAsMetadata>(ExistingOp->getOperand(2));
6398 int32_t UB = cast<ConstantInt>(OldVal->getValue())->getZExtValue();
6399 return {0, ThreadLimit ? std::min(ThreadLimit, UB) : UB};
6400 }
6401 return {0, ThreadLimit};
6402}
6403
6404void OpenMPIRBuilder::writeThreadBoundsForKernel(const Triple &T,
6405 Function &Kernel, int32_t LB,
6406 int32_t UB) {
6407 Kernel.addFnAttr("omp_target_thread_limit", std::to_string(UB));
6408
6409 if (T.isAMDGPU()) {
6410 Kernel.addFnAttr("amdgpu-flat-work-group-size",
6411 llvm::utostr(LB) + "," + llvm::utostr(UB));
6412 return;
6413 }
6414
6415 updateNVPTXMetadata(Kernel, "maxntidx", UB, true);
6416}
6417
6418std::pair<int32_t, int32_t>
6419OpenMPIRBuilder::readTeamBoundsForKernel(const Triple &, Function &Kernel) {
6420 // TODO: Read from backend annotations if available.
6421 return {0, Kernel.getFnAttributeAsParsedInteger("omp_target_num_teams")};
6422}
6423
6424void OpenMPIRBuilder::writeTeamsForKernel(const Triple &T, Function &Kernel,
6425 int32_t LB, int32_t UB) {
6426 if (T.isNVPTX())
6427 if (UB > 0)
6428 updateNVPTXMetadata(Kernel, "maxclusterrank", UB, true);
6429 if (T.isAMDGPU())
6430 Kernel.addFnAttr("amdgpu-max-num-workgroups", llvm::utostr(LB) + ",1,1");
6431
6432 Kernel.addFnAttr("omp_target_num_teams", std::to_string(LB));
6433}
6434
6435void OpenMPIRBuilder::setOutlinedTargetRegionFunctionAttributes(
6436 Function *OutlinedFn) {
6437 if (Config.isTargetDevice()) {
6438 OutlinedFn->setLinkage(GlobalValue::WeakODRLinkage);
6439 // TODO: Determine if DSO local can be set to true.
6440 OutlinedFn->setDSOLocal(false);
6441 OutlinedFn->setVisibility(GlobalValue::ProtectedVisibility);
6442 if (T.isAMDGCN())
6443 OutlinedFn->setCallingConv(CallingConv::AMDGPU_KERNEL);
6444 }
6445}
6446
6447Constant *OpenMPIRBuilder::createOutlinedFunctionID(Function *OutlinedFn,
6448 StringRef EntryFnIDName) {
6449 if (Config.isTargetDevice()) {
6450 assert(OutlinedFn && "The outlined function must exist if embedded");
6451 return OutlinedFn;
6452 }
6453
6454 return new GlobalVariable(
6455 M, Builder.getInt8Ty(), /*isConstant=*/true, GlobalValue::WeakAnyLinkage,
6456 Constant::getNullValue(Builder.getInt8Ty()), EntryFnIDName);
6457}
6458
6459Constant *OpenMPIRBuilder::createTargetRegionEntryAddr(Function *OutlinedFn,
6460 StringRef EntryFnName) {
6461 if (OutlinedFn)
6462 return OutlinedFn;
6463
6464 assert(!M.getGlobalVariable(EntryFnName, true) &&
6465 "Named kernel already exists?");
6466 return new GlobalVariable(
6467 M, Builder.getInt8Ty(), /*isConstant=*/true, GlobalValue::InternalLinkage,
6468 Constant::getNullValue(Builder.getInt8Ty()), EntryFnName);
6469}
6470
6471Error OpenMPIRBuilder::emitTargetRegionFunction(
6472 TargetRegionEntryInfo &EntryInfo,
6473 FunctionGenCallback &GenerateFunctionCallback, bool IsOffloadEntry,
6474 Function *&OutlinedFn, Constant *&OutlinedFnID) {
6475
6476 SmallString<64> EntryFnName;
6477 OffloadInfoManager.getTargetRegionEntryFnName(EntryFnName, EntryInfo);
6478
6479 if (Config.isTargetDevice() || !Config.openMPOffloadMandatory()) {
6480 Expected<Function *> CBResult = GenerateFunctionCallback(EntryFnName);
6481 if (!CBResult)
6482 return CBResult.takeError();
6483 OutlinedFn = *CBResult;
6484 } else {
6485 OutlinedFn = nullptr;
6486 }
6487
6488 // If this target outline function is not an offload entry, we don't need to
6489 // register it. This may be the case for a false if clause, or if there are
6490 // no OpenMP targets.
6491 if (!IsOffloadEntry)
6492 return Error::success();
6493
6494 std::string EntryFnIDName =
6495 Config.isTargetDevice()
6496 ? std::string(EntryFnName)
6497 : createPlatformSpecificName({EntryFnName, "region_id"});
6498
6499 OutlinedFnID = registerTargetRegionFunction(EntryInfo, OutlinedFn,
6500 EntryFnName, EntryFnIDName);
6501 return Error::success();
6502}
6503
6504Constant *OpenMPIRBuilder::registerTargetRegionFunction(
6505 TargetRegionEntryInfo &EntryInfo, Function *OutlinedFn,
6506 StringRef EntryFnName, StringRef EntryFnIDName) {
6507 if (OutlinedFn)
6508 setOutlinedTargetRegionFunctionAttributes(OutlinedFn);
6509 auto OutlinedFnID = createOutlinedFunctionID(OutlinedFn, EntryFnIDName);
6510 auto EntryAddr = createTargetRegionEntryAddr(OutlinedFn, EntryFnName);
6511 OffloadInfoManager.registerTargetRegionEntryInfo(
6512 EntryInfo, EntryAddr, OutlinedFnID,
6513 OffloadEntriesInfoManager::OMPTargetRegionEntryTargetRegion);
6514 return OutlinedFnID;
6515}
6516
6517OpenMPIRBuilder::InsertPointOrErrorTy OpenMPIRBuilder::createTargetData(
6518 const LocationDescription &Loc, InsertPointTy AllocaIP,
6519 InsertPointTy CodeGenIP, Value *DeviceID, Value *IfCond,
6520 TargetDataInfo &Info, GenMapInfoCallbackTy GenMapInfoCB,
6521 omp::RuntimeFunction *MapperFunc,
6522 function_ref<InsertPointOrErrorTy(InsertPointTy CodeGenIP,
6523 BodyGenTy BodyGenType)>
6524 BodyGenCB,
6525 function_ref<void(unsigned int, Value *)> DeviceAddrCB,
6526 function_ref<Value *(unsigned int)> CustomMapperCB, Value *SrcLocInfo) {
6527 if (!updateToLocation(Loc))
6528 return InsertPointTy();
6529
6530 Builder.restoreIP(CodeGenIP);
6531 // Disable TargetData CodeGen on Device pass.
6532 if (Config.IsTargetDevice.value_or(false)) {
6533 if (BodyGenCB) {
6534 InsertPointOrErrorTy AfterIP =
6535 BodyGenCB(Builder.saveIP(), BodyGenTy::NoPriv);
6536 if (!AfterIP)
6537 return AfterIP.takeError();
6538 Builder.restoreIP(*AfterIP);
6539 }
6540 return Builder.saveIP();
6541 }
6542
6543 bool IsStandAlone = !BodyGenCB;
6544 MapInfosTy *MapInfo;
6545 // Generate the code for the opening of the data environment. Capture all the
6546 // arguments of the runtime call by reference because they are used in the
6547 // closing of the region.
6548 auto BeginThenGen = [&](InsertPointTy AllocaIP,
6549 InsertPointTy CodeGenIP) -> Error {
6550 MapInfo = &GenMapInfoCB(Builder.saveIP());
6551 emitOffloadingArrays(AllocaIP, Builder.saveIP(), *MapInfo, Info,
6552 /*IsNonContiguous=*/true, DeviceAddrCB,
6553 CustomMapperCB);
6554
6555 TargetDataRTArgs RTArgs;
6556 emitOffloadingArraysArgument(Builder, RTArgs, Info);
6557
6558 // Emit the number of elements in the offloading arrays.
6559 Value *PointerNum = Builder.getInt32(Info.NumberOfPtrs);
6560
6561 // Source location for the ident struct
6562 if (!SrcLocInfo) {
6563 uint32_t SrcLocStrSize;
6564 Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
6565 SrcLocInfo = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
6566 }
6567
6568 SmallVector<llvm::Value *, 13> OffloadingArgs = {
6569 SrcLocInfo, DeviceID,
6570 PointerNum, RTArgs.BasePointersArray,
6571 RTArgs.PointersArray, RTArgs.SizesArray,
6572 RTArgs.MapTypesArray, RTArgs.MapNamesArray,
6573 RTArgs.MappersArray};
6574
6575 if (IsStandAlone) {
6576 assert(MapperFunc && "MapperFunc missing for standalone target data");
6577
6578 auto TaskBodyCB = [&](Value *, Value *,
6579 IRBuilderBase::InsertPoint) -> Error {
6580 if (Info.HasNoWait) {
6581 OffloadingArgs.append({llvm::Constant::getNullValue(Int32),
6582 llvm::Constant::getNullValue(VoidPtr),
6583 llvm::Constant::getNullValue(Int32),
6584 llvm::Constant::getNullValue(VoidPtr)});
6585 }
6586
6587 Builder.CreateCall(getOrCreateRuntimeFunctionPtr(*MapperFunc),
6588 OffloadingArgs);
6589
6590 if (Info.HasNoWait) {
6591 BasicBlock *OffloadContBlock =
6592 BasicBlock::Create(Builder.getContext(), "omp_offload.cont");
6593 Function *CurFn = Builder.GetInsertBlock()->getParent();
6594 emitBlock(OffloadContBlock, CurFn, /*IsFinished=*/true);
6596 }
6597 return Error::success();
6598 };
6599
6600 bool RequiresOuterTargetTask = Info.HasNoWait;
6601 if (!RequiresOuterTargetTask)
6602 cantFail(TaskBodyCB(/*DeviceID=*/nullptr, /*RTLoc=*/nullptr,
6603 /*TargetTaskAllocaIP=*/{}));
6604 else
6605 cantFail(emitTargetTask(TaskBodyCB, DeviceID, SrcLocInfo, AllocaIP,
6606 /*Dependencies=*/{}, Info.HasNoWait));
6607 } else {
6608 Function *BeginMapperFunc = getOrCreateRuntimeFunctionPtr(
6609 omp::OMPRTL___tgt_target_data_begin_mapper);
6610
6611 Builder.CreateCall(BeginMapperFunc, OffloadingArgs);
6612
6613 for (auto DeviceMap : Info.DevicePtrInfoMap) {
6614 if (isa<AllocaInst>(DeviceMap.second.second)) {
6615 auto *LI =
6616 Builder.CreateLoad(Builder.getPtrTy(), DeviceMap.second.first);
6617 Builder.CreateStore(LI, DeviceMap.second.second);
6618 }
6619 }
6620
6621 // If device pointer privatization is required, emit the body of the
6622 // region here. It will have to be duplicated: with and without
6623 // privatization.
6624 InsertPointOrErrorTy AfterIP =
6625 BodyGenCB(Builder.saveIP(), BodyGenTy::Priv);
6626 if (!AfterIP)
6627 return AfterIP.takeError();
6628 Builder.restoreIP(*AfterIP);
6629 }
6630 return Error::success();
6631 };
6632
6633 // If we need device pointer privatization, we need to emit the body of the
6634 // region with no privatization in the 'else' branch of the conditional.
6635 // Otherwise, we don't have to do anything.
6636 auto BeginElseGen = [&](InsertPointTy AllocaIP,
6637 InsertPointTy CodeGenIP) -> Error {
6638 InsertPointOrErrorTy AfterIP =
6639 BodyGenCB(Builder.saveIP(), BodyGenTy::DupNoPriv);
6640 if (!AfterIP)
6641 return AfterIP.takeError();
6642 Builder.restoreIP(*AfterIP);
6643 return Error::success();
6644 };
6645
6646 // Generate code for the closing of the data region.
6647 auto EndThenGen = [&](InsertPointTy AllocaIP, InsertPointTy CodeGenIP) {
6648 TargetDataRTArgs RTArgs;
6649 Info.EmitDebug = !MapInfo->Names.empty();
6650 emitOffloadingArraysArgument(Builder, RTArgs, Info, /*ForEndCall=*/true);
6651
6652 // Emit the number of elements in the offloading arrays.
6653 Value *PointerNum = Builder.getInt32(Info.NumberOfPtrs);
6654
6655 // Source location for the ident struct
6656 if (!SrcLocInfo) {
6657 uint32_t SrcLocStrSize;
6658 Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
6659 SrcLocInfo = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
6660 }
6661
6662 Value *OffloadingArgs[] = {SrcLocInfo, DeviceID,
6663 PointerNum, RTArgs.BasePointersArray,
6664 RTArgs.PointersArray, RTArgs.SizesArray,
6665 RTArgs.MapTypesArray, RTArgs.MapNamesArray,
6666 RTArgs.MappersArray};
6667 Function *EndMapperFunc =
6668 getOrCreateRuntimeFunctionPtr(omp::OMPRTL___tgt_target_data_end_mapper);
6669
6670 Builder.CreateCall(EndMapperFunc, OffloadingArgs);
6671 return Error::success();
6672 };
6673
6674 // We don't have to do anything to close the region if the if clause evaluates
6675 // to false.
6676 auto EndElseGen = [&](InsertPointTy AllocaIP, InsertPointTy CodeGenIP) {
6677 return Error::success();
6678 };
6679
6680 Error Err = [&]() -> Error {
6681 if (BodyGenCB) {
6682 Error Err = [&]() {
6683 if (IfCond)
6684 return emitIfClause(IfCond, BeginThenGen, BeginElseGen, AllocaIP);
6685 return BeginThenGen(AllocaIP, Builder.saveIP());
6686 }();
6687
6688 if (Err)
6689 return Err;
6690
6691 // If we don't require privatization of device pointers, we emit the body
6692 // in between the runtime calls. This avoids duplicating the body code.
6693 InsertPointOrErrorTy AfterIP =
6694 BodyGenCB(Builder.saveIP(), BodyGenTy::NoPriv);
6695 if (!AfterIP)
6696 return AfterIP.takeError();
6697 Builder.restoreIP(*AfterIP);
6698
6699 if (IfCond)
6700 return emitIfClause(IfCond, EndThenGen, EndElseGen, AllocaIP);
6701 return EndThenGen(AllocaIP, Builder.saveIP());
6702 }
6703 if (IfCond)
6704 return emitIfClause(IfCond, BeginThenGen, EndElseGen, AllocaIP);
6705 return BeginThenGen(AllocaIP, Builder.saveIP());
6706 }();
6707
6708 if (Err)
6709 return Err;
6710
6711 return Builder.saveIP();
6712}
6713
6714Function *OpenMPIRBuilder::createForStaticInitFunction(unsigned IVSize,
6715 bool IVSigned,
6716 bool IsGPUDistribute) {
6717 assert((IVSize == 32 || IVSize == 64) &&
6718 "IV size is not compatible with the omp runtime");
6719 RuntimeFunction Name;
6720 if (IsGPUDistribute)
6721 Name = IVSize == 32
6722 ? (IVSigned ? omp::OMPRTL___kmpc_distribute_static_init_4
6723 : omp::OMPRTL___kmpc_distribute_static_init_4u)
6724 : (IVSigned ? omp::OMPRTL___kmpc_distribute_static_init_8
6725 : omp::OMPRTL___kmpc_distribute_static_init_8u);
6726 else
6727 Name = IVSize == 32 ? (IVSigned ? omp::OMPRTL___kmpc_for_static_init_4
6728 : omp::OMPRTL___kmpc_for_static_init_4u)
6729 : (IVSigned ? omp::OMPRTL___kmpc_for_static_init_8
6730 : omp::OMPRTL___kmpc_for_static_init_8u);
6731
6732 return getOrCreateRuntimeFunction(M, Name);
6733}
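// For example, a signed 32-bit IV selects __kmpc_for_static_init_4 and an
// unsigned 64-bit IV selects __kmpc_for_static_init_8u; with IsGPUDistribute
// set, the __kmpc_distribute_static_init_* variants are chosen instead.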
6734
6735Function *OpenMPIRBuilder::createDispatchInitFunction(unsigned IVSize,
6736 bool IVSigned) {
6737 assert((IVSize == 32 || IVSize == 64) &&
6738 "IV size is not compatible with the omp runtime");
6739 RuntimeFunction Name = IVSize == 32
6740 ? (IVSigned ? omp::OMPRTL___kmpc_dispatch_init_4
6741 : omp::OMPRTL___kmpc_dispatch_init_4u)
6742 : (IVSigned ? omp::OMPRTL___kmpc_dispatch_init_8
6743 : omp::OMPRTL___kmpc_dispatch_init_8u);
6744
6745 return getOrCreateRuntimeFunction(M, Name);
6746}
6747
6748Function *OpenMPIRBuilder::createDispatchNextFunction(unsigned IVSize,
6749 bool IVSigned) {
6750 assert((IVSize == 32 || IVSize == 64) &&
6751 "IV size is not compatible with the omp runtime");
6752 RuntimeFunction Name = IVSize == 32
6753 ? (IVSigned ? omp::OMPRTL___kmpc_dispatch_next_4
6754 : omp::OMPRTL___kmpc_dispatch_next_4u)
6755 : (IVSigned ? omp::OMPRTL___kmpc_dispatch_next_8
6756 : omp::OMPRTL___kmpc_dispatch_next_8u);
6757
6758 return getOrCreateRuntimeFunction(M, Name);
6759}
6760
6761Function *OpenMPIRBuilder::createDispatchFiniFunction(unsigned IVSize,
6762 bool IVSigned) {
6763 assert((IVSize == 32 || IVSize == 64) &&
6764 "IV size is not compatible with the omp runtime");
6765 RuntimeFunction Name = IVSize == 32
6766 ? (IVSigned ? omp::OMPRTL___kmpc_dispatch_fini_4
6767 : omp::OMPRTL___kmpc_dispatch_fini_4u)
6768 : (IVSigned ? omp::OMPRTL___kmpc_dispatch_fini_8
6769 : omp::OMPRTL___kmpc_dispatch_fini_8u);
6770
6771 return getOrCreateRuntimeFunction(M, Name);
6772}
6773
6774Function *OpenMPIRBuilder::createDispatchDeinitFunction() {
6775 return getOrCreateRuntimeFunction(M, omp::OMPRTL___kmpc_dispatch_deinit);
6776}
6777
6778static Expected<Function *> createOutlinedFunction(
6779 OpenMPIRBuilder &OMPBuilder, IRBuilderBase &Builder,
6780 const OpenMPIRBuilder::TargetKernelDefaultAttrs &DefaultAttrs,
6781 StringRef FuncName, SmallVectorImpl<Value *> &Inputs,
6782 OpenMPIRBuilder::TargetBodyGenCallbackTy &CBFunc,
6783 OpenMPIRBuilder::TargetGenArgAccessorsCallbackTy &ArgAccessorFuncCB) {
6784 SmallVector<Type *> ParameterTypes;
6785 if (OMPBuilder.Config.isTargetDevice()) {
6786 // Add the "implicit" runtime argument we use to provide launch specific
6787 // information for target devices.
6788 auto *Int8PtrTy = PointerType::getUnqual(Builder.getContext());
6789 ParameterTypes.push_back(Int8PtrTy);
6790
6791 // All parameters to target devices are passed as pointers
6792 // or i64. This assumes 64-bit address spaces/pointers.
6793 for (auto &Arg : Inputs)
6794 ParameterTypes.push_back(Arg->getType()->isPointerTy()
6795 ? Arg->getType()
6796 : Type::getInt64Ty(Builder.getContext()));
6797 } else {
6798 for (auto &Arg : Inputs)
6799 ParameterTypes.push_back(Arg->getType());
6800 }
6801
6802 auto BB = Builder.GetInsertBlock();
6803 auto M = BB->getModule();
6804 auto FuncType = FunctionType::get(Builder.getVoidTy(), ParameterTypes,
6805 /*isVarArg*/ false);
6806 auto Func =
6807 Function::Create(FuncType, GlobalValue::InternalLinkage, FuncName, M);
6808
6809 // Forward target-cpu and target-features function attributes from the
6810 // original function to the new outlined function.
6811 Function *ParentFn = Builder.GetInsertBlock()->getParent();
6812
6813 auto TargetCpuAttr = ParentFn->getFnAttribute("target-cpu");
6814 if (TargetCpuAttr.isStringAttribute())
6815 Func->addFnAttr(TargetCpuAttr);
6816
6817 auto TargetFeaturesAttr = ParentFn->getFnAttribute("target-features");
6818 if (TargetFeaturesAttr.isStringAttribute())
6819 Func->addFnAttr(TargetFeaturesAttr);
6820
6821 if (OMPBuilder.Config.isTargetDevice()) {
6822 Value *ExecMode =
6823 OMPBuilder.emitKernelExecutionMode(FuncName, DefaultAttrs.ExecFlags);
6824 OMPBuilder.emitUsed("llvm.compiler.used", {ExecMode});
6825 }
6826
6827 // Save insert point.
6828 IRBuilder<>::InsertPointGuard IPG(Builder);
6829 // If there's a DISubprogram associated with current function, then
6830 // generate one for the outlined function.
6831 if (Function *ParentFunc = BB->getParent()) {
6832 if (DISubprogram *SP = ParentFunc->getSubprogram()) {
6833 DICompileUnit *CU = SP->getUnit();
6834 DIBuilder DB(*M, true, CU);
6835 DebugLoc DL = Builder.getCurrentDebugLocation();
6836 if (DL) {
6837 // TODO: We are using nullopt for arguments at the moment. This will
6838 // need to be updated when debug data is being generated for variables.
6839 DISubroutineType *Ty =
6840 DB.createSubroutineType(DB.getOrCreateTypeArray({}));
6841 DISubprogram::DISPFlags SPFlags = DISubprogram::SPFlagDefinition |
6842 DISubprogram::SPFlagOptimized |
6843 DISubprogram::SPFlagLocalToUnit;
6844
6845 DISubprogram *OutlinedSP = DB.createFunction(
6846 CU, FuncName, FuncName, SP->getFile(), DL.getLine(), Ty,
6847 DL.getLine(), DINode::DIFlags::FlagArtificial, SPFlags);
6848
6849 // Attach subprogram to the function.
6850 Func->setSubprogram(OutlinedSP);
6851 // Update the CurrentDebugLocation in the builder so that right scope
6852 // is used for things inside outlined function.
6853 Builder.SetCurrentDebugLocation(
6854 DILocation::get(Func->getContext(), DL.getLine(), DL.getCol(),
6855 OutlinedSP, DL.getInlinedAt()));
6856 }
6857 }
6858 }
6859
6860 // Generate the region into the function.
6861 BasicBlock *EntryBB = BasicBlock::Create(Builder.getContext(), "entry", Func);
6862 Builder.SetInsertPoint(EntryBB);
6863
6864 // Insert target init call in the device compilation pass.
6865 if (OMPBuilder.Config.isTargetDevice())
6866 Builder.restoreIP(OMPBuilder.createTargetInit(Builder, DefaultAttrs));
6867
6868 BasicBlock *UserCodeEntryBB = Builder.GetInsertBlock();
6869
6870 // As we embed the user code in the middle of our target region after we
6871 // generate entry code, we must move what allocas we can into the entry
6872 // block to avoid possibly breaking optimizations for the device.
6873 if (OMPBuilder.Config.isTargetDevice())
6874 OMPBuilder.ConstantAllocaRaiseCandidates.emplace_back(Func);
6875
6876 // Insert target deinit call in the device compilation pass.
6877 BasicBlock *OutlinedBodyBB =
6878 splitBB(Builder, /*CreateBranch=*/true, "outlined.body");
6879 llvm::OpenMPIRBuilder::InsertPointOrErrorTy AfterIP = CBFunc(
6880 Builder.saveIP(),
6881 OpenMPIRBuilder::InsertPointTy(OutlinedBodyBB, OutlinedBodyBB->begin()));
6882 if (!AfterIP)
6883 return AfterIP.takeError();
6884 Builder.restoreIP(*AfterIP);
6885 if (OMPBuilder.Config.isTargetDevice())
6886 OMPBuilder.createTargetDeinit(Builder);
6887
6888 // Insert return instruction.
6889 Builder.CreateRetVoid();
6890
6891 // New Alloca IP at entry point of created device function.
6892 Builder.SetInsertPoint(EntryBB->getFirstNonPHI());
6893 auto AllocaIP = Builder.saveIP();
6894
6895 Builder.SetInsertPoint(UserCodeEntryBB->getFirstNonPHIOrDbg());
6896
6897 // Skip the artificial dyn_ptr on the device.
6898 const auto &ArgRange =
6899 OMPBuilder.Config.isTargetDevice()
6900 ? make_range(Func->arg_begin() + 1, Func->arg_end())
6901 : Func->args();
6902
6903 auto ReplaceValue = [](Value *Input, Value *InputCopy, Function *Func) {
6904 // Things like GEP's can come in the form of Constants. Constants and
6905 // ConstantExpr's do not have access to the knowledge of what they're
6906 // contained in, so we must dig a little to find an instruction so we
6907 // can tell if they're used inside of the function we're outlining. We
6908 // also replace the original constant expression with a new instruction
6909 // equivalent; an instruction as it allows easy modification in the
6910 // following loop, as we can now know the constant (instruction) is
6911 // owned by our target function and replaceUsesOfWith can now be invoked
6912 // on it (cannot do this with constants it seems). A brand new one also
6913 // allows us to be cautious as it is perhaps possible the old expression
6914 // was used inside of the function but exists and is used externally
6915 // (unlikely by the nature of a Constant, but still).
6916 // NOTE: We cannot remove dead constants that have been rewritten to
6917 // instructions at this stage, we run the risk of breaking later lowering
6918 // by doing so as we could still be in the process of lowering the module
6919 // from MLIR to LLVM-IR and the MLIR lowering may still require the original
6920 // constants we have created rewritten versions of.
6921 if (auto *Const = dyn_cast<Constant>(Input))
6922 convertUsersOfConstantsToInstructions(Const, Func, false);
6923
6924 // Collect all the instructions
6925 for (User *User : make_early_inc_range(Input->users()))
6926 if (auto *Instr = dyn_cast<Instruction>(User))
6927 if (Instr->getFunction() == Func)
6928 Instr->replaceUsesOfWith(Input, InputCopy);
6929 };
6930
6931 SmallVector<std::pair<Value *, Value *>> DeferredReplacement;
6932
6933 // Rewrite uses of input values to parameters.
6934 for (auto InArg : zip(Inputs, ArgRange)) {
6935 Value *Input = std::get<0>(InArg);
6936 Argument &Arg = std::get<1>(InArg);
6937 Value *InputCopy = nullptr;
6938
6939 llvm::OpenMPIRBuilder::InsertPointOrErrorTy AfterIP =
6940 ArgAccessorFuncCB(Arg, Input, InputCopy, AllocaIP, Builder.saveIP());
6941 if (!AfterIP)
6942 return AfterIP.takeError();
6943 Builder.restoreIP(*AfterIP);
6944
6945 // In certain cases a Global may be set up for replacement. However, this
6946 // Global may be used in multiple arguments to the kernel, just segmented
6947 // apart. For example, if we have a global array that is sectioned into
6948 // multiple mappings (technically not legal in OpenMP, but there is a case
6949 // in Fortran for Common Blocks where this is necessary), we will end up
6950 // with GEPs into this array inside the kernel that refer to the Global
6951 // but are technically separate arguments to the kernel for all intents
6952 // and purposes. If we have mapped a segment that requires a GEP into the
6953 // 0-th index, it will fold into a direct reference to the Global. If we
6954 // then encounter this folded GEP during replacement, all references to the
6955 // Global in the kernel will be replaced with the argument we generated for
6956 // it, including any other GEPs that refer to the Global from other
6957 // arguments. This would invalidate all of the other preceding mapped
6958 // arguments that refer to the same global but are separate segments. To
6959 // prevent this, we defer global processing until all other processing has
6960 // been performed.
6961 if (llvm::isa<llvm::GlobalValue>(std::get<0>(InArg)) ||
6962 llvm::isa<llvm::GlobalObject>(std::get<0>(InArg)) ||
6963 llvm::isa<llvm::GlobalVariable>(std::get<0>(InArg))) {
6964 DeferredReplacement.push_back(std::make_pair(Input, InputCopy));
6965 continue;
6966 }
6967
6968 ReplaceValue(Input, InputCopy, Func);
6969 }
6970
6971 // Replace all of our deferred Input values, currently just Globals.
6972 for (auto Deferred : DeferredReplacement)
6973 ReplaceValue(std::get<0>(Deferred), std::get<1>(Deferred), Func);
6974
6975 return Func;
6976}
6977
6978/// Create an entry point for a target task, i.e. a function with the
6979/// following signature:
6980/// void @.omp_target_task_proxy_func(i32 %thread.id, ptr %task)
6981/// This function is called from emitTargetTask once the
6982/// code to launch the target kernel has been outlined already.
6983static Function *emitTargetTaskProxyFunction(OpenMPIRBuilder &OMPBuilder,
6984 IRBuilderBase &Builder,
6985 CallInst *StaleCI) {
6986 Module &M = OMPBuilder.M;
6987 // KernelLaunchFunction is the target launch function, i.e.
6988 // the function that sets up kernel arguments and calls
6989 // __tgt_target_kernel to launch the kernel on the device.
6990 //
6991 Function *KernelLaunchFunction = StaleCI->getCalledFunction();
6992
6993 // StaleCI is the CallInst which is the call to the outlined
6994 // target kernel launch function. If there are values that the
6995 // outlined function uses then these are aggregated into a structure
6996 // which is passed as the second argument. If not, then there's
6997 // only one argument, the threadID. So, StaleCI can be
6998 //
6999 // %structArg = alloca { ptr, ptr }, align 8
7000 // %gep_ = getelementptr { ptr, ptr }, ptr %structArg, i32 0, i32 0
7001 // store ptr %20, ptr %gep_, align 8
7002 // %gep_8 = getelementptr { ptr, ptr }, ptr %structArg, i32 0, i32 1
7003 // store ptr %21, ptr %gep_8, align 8
7004 // call void @_QQmain..omp_par.1(i32 %global.tid.val6, ptr %structArg)
7005 //
7006 // OR
7007 //
7008 // call void @_QQmain..omp_par.1(i32 %global.tid.val6)
7009 OpenMPIRBuilder::InsertPointTy IP(StaleCI->getParent(),
7010 StaleCI->getIterator());
7011 LLVMContext &Ctx = StaleCI->getParent()->getContext();
7012 Type *ThreadIDTy = Type::getInt32Ty(Ctx);
7013 Type *TaskPtrTy = OMPBuilder.TaskPtr;
7014 Type *TaskTy = OMPBuilder.Task;
7015 auto ProxyFnTy =
7016 FunctionType::get(Builder.getVoidTy(), {ThreadIDTy, TaskPtrTy},
7017 /* isVarArg */ false);
7018 auto ProxyFn = Function::Create(ProxyFnTy, GlobalValue::InternalLinkage,
7019 ".omp_target_task_proxy_func",
7020 Builder.GetInsertBlock()->getModule());
7021 ProxyFn->getArg(0)->setName("thread.id");
7022 ProxyFn->getArg(1)->setName("task");
7023
7024 BasicBlock *EntryBB =
7025 BasicBlock::Create(Builder.getContext(), "entry", ProxyFn);
7026 Builder.SetInsertPoint(EntryBB);
7027
7028 bool HasShareds = StaleCI->arg_size() > 1;
7029 // TODO: This is a temporary assert to prove to ourselves that
7030 // the outlined target launch function is always going to have
7031 // at most two arguments if there is any data shared between
7032 // host and device.
7033 assert((!HasShareds || (StaleCI->arg_size() == 2)) &&
7034 "StaleCI with shareds should have exactly two arguments.");
7035 if (HasShareds) {
7036 auto *ArgStructAlloca = dyn_cast<AllocaInst>(StaleCI->getArgOperand(1));
7037 assert(ArgStructAlloca &&
7038 "Unable to find the alloca instruction corresponding to arguments "
7039 "for extracted function");
7040 auto *ArgStructType = cast<StructType>(ArgStructAlloca->getAllocatedType());
7041
7042 AllocaInst *NewArgStructAlloca =
7043 Builder.CreateAlloca(ArgStructType, nullptr, "structArg");
7044 Value *TaskT = ProxyFn->getArg(1);
7045 Value *ThreadId = ProxyFn->getArg(0);
7046 Value *SharedsSize =
7047 Builder.getInt64(M.getDataLayout().getTypeStoreSize(ArgStructType));
7048
7049 Value *Shareds = Builder.CreateStructGEP(TaskTy, TaskT, 0);
7050 LoadInst *LoadShared =
7051 Builder.CreateLoad(PointerType::getUnqual(Ctx), Shareds);
7052
7053 Builder.CreateMemCpy(
7054 NewArgStructAlloca, NewArgStructAlloca->getAlign(), LoadShared,
7055 LoadShared->getPointerAlignment(M.getDataLayout()), SharedsSize);
7056
7057 Builder.CreateCall(KernelLaunchFunction, {ThreadId, NewArgStructAlloca});
7058 }
7059 Builder.CreateRetVoid();
7060 return ProxyFn;
7061}
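// A hedged sketch of the proxy emitted when shareds are present (types,
// sizes, and names are illustrative):
//   define internal void @.omp_target_task_proxy_func(i32 %thread.id,
//                                                     ptr %task) {
//     %structArg = alloca { ptr, ptr }, align 8
//     %shareds = load ptr, ptr %task             ; kmp_task_t field 0
//     call void @llvm.memcpy.p0.p0.i64(ptr %structArg, ptr %shareds,
//                                      i64 16, i1 false)
//     call void @kernel.launch(i32 %thread.id, ptr %structArg)
//     ret void
//   }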
7062
7063static Error emitTargetOutlinedFunction(
7064 OpenMPIRBuilder &OMPBuilder, IRBuilderBase &Builder, bool IsOffloadEntry,
7065 TargetRegionEntryInfo &EntryInfo,
7066 const OpenMPIRBuilder::TargetKernelDefaultAttrs &DefaultAttrs,
7067 Function *&OutlinedFn, Constant *&OutlinedFnID,
7068 SmallVectorImpl<Value *> &Inputs,
7069 OpenMPIRBuilder::TargetBodyGenCallbackTy &CBFunc,
7070 OpenMPIRBuilder::TargetGenArgAccessorsCallbackTy &ArgAccessorFuncCB) {
7071
7072 OpenMPIRBuilder::FunctionGenCallback &&GenerateOutlinedFunction =
7073 [&](StringRef EntryFnName) {
7074 return createOutlinedFunction(OMPBuilder, Builder, DefaultAttrs,
7075 EntryFnName, Inputs, CBFunc,
7076 ArgAccessorFuncCB);
7077 };
7078
7079 return OMPBuilder.emitTargetRegionFunction(
7080 EntryInfo, GenerateOutlinedFunction, IsOffloadEntry, OutlinedFn,
7081 OutlinedFnID);
7082}
7083
7084OpenMPIRBuilder::InsertPointOrErrorTy OpenMPIRBuilder::emitTargetTask(
7085 TargetTaskBodyCallbackTy TaskBodyCB, Value *DeviceID, Value *RTLoc,
7086 OpenMPIRBuilder::InsertPointTy AllocaIP,
7087 const SmallVector<llvm::OpenMPIRBuilder::DependData> &Dependencies,
7088 bool HasNoWait) {
7089
7090 // The following explains the code-gen scenario for the `target` directive. A
7091 // similar scenario is followed for other device-related directives (e.g.
7092 // `target enter data`), but in simpler fashion, since we only need to emit
7093 // a task that encapsulates the proper runtime call.
7094 //
7095 // When we arrive at this function, the target region itself has been
7096 // outlined into the function OutlinedFn.
7097 // So at this point, for
7098 // --------------------------------------------------
7099 // void user_code_that_offloads(...) {
7100 // omp target depend(..) map(from:a) map(to:b, c)
7101 // a = b + c
7102 // }
7103 //
7104 // --------------------------------------------------
7105 //
7106 // we have
7107 //
7108 // --------------------------------------------------
7109 //
7110 // void user_code_that_offloads(...) {
7111 // %.offload_baseptrs = alloca [3 x ptr], align 8
7112 // %.offload_ptrs = alloca [3 x ptr], align 8
7113 // %.offload_mappers = alloca [3 x ptr], align 8
7114 // ;; target region has been outlined and now we need to
7115 // ;; offload to it via a target task.
7116 // }
7117 // void outlined_device_function(ptr a, ptr b, ptr c) {
7118 // *a = *b + *c
7119 // }
7120 //
7121 // We have to now do the following
7122 // (i) Make an offloading call to outlined_device_function using the OpenMP
7123 // RTL. See 'kernel_launch_function' in the pseudo code below. This is
7124 // emitted by emitKernelLaunch
7125 // (ii) Create a task entry point function that calls kernel_launch_function
7126 // and is the entry point for the target task. See
7127 // '@.omp_target_task_proxy_func in the pseudocode below.
7128 // (iii) Create a task with the task entry point created in (ii)
7129 //
7130 // That is we create the following
7131 //
7132 // void user_code_that_offloads(...) {
7133 // %.offload_baseptrs = alloca [3 x ptr], align 8
7134 // %.offload_ptrs = alloca [3 x ptr], align 8
7135 // %.offload_mappers = alloca [3 x ptr], align 8
7136 //
7137 // %structArg = alloca { ptr, ptr, ptr }, align 8
7138 // %strucArg[0] = %.offload_baseptrs
7139 // %strucArg[1] = %.offload_ptrs
7140 // %strucArg[2] = %.offload_mappers
7141 // proxy_target_task = @__kmpc_omp_task_alloc(...,
7142 // @.omp_target_task_proxy_func)
7143 // memcpy(proxy_target_task->shareds, %structArg, sizeof(structArg))
7144 // dependencies_array = ...
7145 // ;; if nowait not present
7146 // call @__kmpc_omp_wait_deps(..., dependencies_array)
7147 // call @__kmpc_omp_task_begin_if0(...)
7148 // call @.omp_target_task_proxy_func(i32 thread_id, ptr %proxy_target_task)
7149 // call @__kmpc_omp_task_complete_if0(...)
7150 // }
7151 //
7152 // define internal void @.omp_target_task_proxy_func(i32 %thread.id,
7153 // ptr %task) {
7154 // %structArg = alloca {ptr, ptr, ptr}
7155 // %shared_data = load (getelementptr %task, 0, 0)
7156 // memcpy(%structArg, %shared_data, sizeof(structArg))
7157 // kernel_launch_function(%thread.id, %structArg)
7158 // }
7159 //
7160 // We need the proxy function because the signature of the task entry point
7161 // expected by kmpc_omp_task is always the same and will be different from
7162 // that of the kernel_launch function.
7163 //
7164 // kernel_launch_function is generated by emitKernelLaunch and has the
7165 // always_inline attribute.
7166 // void kernel_launch_function(thread_id,
7167 // structArg) alwaysinline {
7168 // %kernel_args = alloca %struct.__tgt_kernel_arguments, align 8
7169 // offload_baseptrs = load(getelementptr structArg, 0, 0)
7170 // offload_ptrs = load(getelementptr structArg, 0, 1)
7171 // offload_mappers = load(getelementptr structArg, 0, 2)
7172 // ; setup kernel_args using offload_baseptrs, offload_ptrs and
7173 // ; offload_mappers
7174 // call i32 @__tgt_target_kernel(...,
7175 // outlined_device_function,
7176 // ptr %kernel_args)
7177 // }
7178 // void outlined_device_function(ptr a, ptr b, ptr c) {
7179 // *a = *b + *c
7180 // }
7181 //
7182 BasicBlock *TargetTaskBodyBB =
7183 splitBB(Builder, /*CreateBranch=*/true, "target.task.body");
7184 BasicBlock *TargetTaskAllocaBB =
7185 splitBB(Builder, /*CreateBranch=*/true, "target.task.alloca");
7186
7187 InsertPointTy TargetTaskAllocaIP(TargetTaskAllocaBB,
7188 TargetTaskAllocaBB->begin());
7189 InsertPointTy TargetTaskBodyIP(TargetTaskBodyBB, TargetTaskBodyBB->begin());
7190
7191 OutlineInfo OI;
7192 OI.EntryBB = TargetTaskAllocaBB;
7193 OI.OuterAllocaBB = AllocaIP.getBlock();
7194
7195 // Add the thread ID argument.
7196 SmallVector<Instruction *, 4> ToBeDeleted;
7197 OI.ExcludeArgsFromAggregate.push_back(createFakeIntVal(
7198 Builder, AllocaIP, ToBeDeleted, TargetTaskAllocaIP, "global.tid", false));
7199
7200 Builder.restoreIP(TargetTaskBodyIP);
7201
7202 if (Error Err = TaskBodyCB(DeviceID, RTLoc, TargetTaskAllocaIP))
7203 return Err;
7204
7205 OI.ExitBB = Builder.saveIP().getBlock();
7206 OI.PostOutlineCB = [this, ToBeDeleted, Dependencies, HasNoWait,
7207 DeviceID](Function &OutlinedFn) mutable {
7208 assert(OutlinedFn.getNumUses() == 1 &&
7209 "there must be a single user for the outlined function");
7210
7211 CallInst *StaleCI = cast<CallInst>(OutlinedFn.user_back());
7212 bool HasShareds = StaleCI->arg_size() > 1;
7213
7214 Function *ProxyFn = emitTargetTaskProxyFunction(*this, Builder, StaleCI);
7215
7216 LLVM_DEBUG(dbgs() << "Proxy task entry function created: " << *ProxyFn
7217 << "\n");
7218
7219 Builder.SetInsertPoint(StaleCI);
7220
7221 // Gather the arguments for emitting the runtime call.
7222 uint32_t SrcLocStrSize;
7223 Constant *SrcLocStr =
7224 getOrCreateSrcLocStr(LocationDescription(Builder), SrcLocStrSize);
7225 Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
7226
7227 // @__kmpc_omp_task_alloc or @__kmpc_omp_target_task_alloc
7228 //
7229 // If `HasNoWait == true`, we call @__kmpc_omp_target_task_alloc to provide
7230 // the DeviceID to the deferred task, and also because
7231 // @__kmpc_omp_target_task_alloc creates an untied/async task.
7232 Function *TaskAllocFn =
7233 !HasNoWait ? getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_omp_task_alloc)
7234 : getOrCreateRuntimeFunctionPtr(
7235 OMPRTL___kmpc_omp_target_task_alloc);
7236
7237 // Arguments - `loc_ref` (Ident) and `gtid` (ThreadID)
7238 // call.
7239 Value *ThreadID = getOrCreateThreadID(Ident);
7240
7241 // Argument - `sizeof_kmp_task_t` (TaskSize)
7242 // Tasksize refers to the size in bytes of kmp_task_t data structure
7243 // including private vars accessed in task.
7244 // TODO: add kmp_task_t_with_privates (privates)
7245 Value *TaskSize =
7246 Builder.getInt64(M.getDataLayout().getTypeStoreSize(Task));
7247
7248 // Argument - `sizeof_shareds` (SharedsSize)
7249 // SharedsSize refers to the shareds array size in the kmp_task_t data
7250 // structure.
7251 Value *SharedsSize = Builder.getInt64(0);
7252 if (HasShareds) {
7253 auto *ArgStructAlloca = dyn_cast<AllocaInst>(StaleCI->getArgOperand(1));
7254 assert(ArgStructAlloca &&
7255 "Unable to find the alloca instruction corresponding to arguments "
7256 "for extracted function");
7257 auto *ArgStructType =
7258 dyn_cast<StructType>(ArgStructAlloca->getAllocatedType());
7259 assert(ArgStructType && "Unable to find struct type corresponding to "
7260 "arguments for extracted function");
7261 SharedsSize =
7262 Builder.getInt64(M.getDataLayout().getTypeStoreSize(ArgStructType));
7263 }
7264
7265 // Argument - `flags`
7266 // Task is tied iff (Flags & 1) == 1.
7267 // Task is untied iff (Flags & 1) == 0.
7268 // Task is final iff (Flags & 2) == 2.
7269 // Task is not final iff (Flags & 2) == 0.
7270 // A target task is not final and is untied.
7271 Value *Flags = Builder.getInt32(0);
7272
7273 // Emit the @__kmpc_omp_task_alloc runtime call
7274 // The runtime call returns a pointer to an area where the task captured
7275 // variables must be copied before the task is run (TaskData)
7276 CallInst *TaskData = nullptr;
7277
7278 SmallVector<llvm::Value *> TaskAllocArgs = {
7279 /*loc_ref=*/Ident, /*gtid=*/ThreadID,
7280 /*flags=*/Flags,
7281 /*sizeof_task=*/TaskSize, /*sizeof_shared=*/SharedsSize,
7282 /*task_func=*/ProxyFn};
7283
7284 if (HasNoWait)
7285 TaskAllocArgs.push_back(DeviceID);
7286
7287 TaskData = Builder.CreateCall(TaskAllocFn, TaskAllocArgs);
7288
7289 if (HasShareds) {
7290 Value *Shareds = StaleCI->getArgOperand(1);
7291 Align Alignment = TaskData->getPointerAlignment(M.getDataLayout());
7292 Value *TaskShareds = Builder.CreateLoad(VoidPtr, TaskData);
7293 Builder.CreateMemCpy(TaskShareds, Alignment, Shareds, Alignment,
7294 SharedsSize);
7295 }
7296
7297 Value *DepArray = emitTaskDependencies(*this, Dependencies);
7298
7299 // ---------------------------------------------------------------
7300 // V5.2 13.8 target construct
7301 // If the nowait clause is present, execution of the target task
7302 // may be deferred. If the nowait clause is not present, the target task is
7303 // an included task.
7304 // ---------------------------------------------------------------
7305 // The above means that the lack of a nowait on the target construct
7306 // translates to '#pragma omp task if(0)'
7307 if (!HasNoWait) {
7308 if (DepArray) {
7309 Function *TaskWaitFn =
7310 getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_omp_wait_deps);
7311 Builder.CreateCall(
7312 TaskWaitFn,
7313 {/*loc_ref=*/Ident, /*gtid=*/ThreadID,
7314 /*ndeps=*/Builder.getInt32(Dependencies.size()),
7315 /*dep_list=*/DepArray,
7316 /*ndeps_noalias=*/ConstantInt::get(Builder.getInt32Ty(), 0),
7317 /*noalias_dep_list=*/
7318 ConstantPointerNull::get(PointerType::getUnqual(M.getContext()))});
7319 }
7320 // Included task.
7321 Function *TaskBeginFn =
7322 getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_omp_task_begin_if0);
7323 Function *TaskCompleteFn =
7324 getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_omp_task_complete_if0);
7325 Builder.CreateCall(TaskBeginFn, {Ident, ThreadID, TaskData});
7326 CallInst *CI = Builder.CreateCall(ProxyFn, {ThreadID, TaskData});
7327 CI->setDebugLoc(StaleCI->getDebugLoc());
7328 Builder.CreateCall(TaskCompleteFn, {Ident, ThreadID, TaskData});
7329 } else if (DepArray) {
7330 // HasNoWait - meaning the task may be deferred. Call
7331 // __kmpc_omp_task_with_deps if there are dependencies,
7332 // else call __kmpc_omp_task
7333 Function *TaskFn =
7334 getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_omp_task_with_deps);
7335 Builder.CreateCall(
7336 TaskFn,
7337 {Ident, ThreadID, TaskData, Builder.getInt32(Dependencies.size()),
7338 DepArray, ConstantInt::get(Builder.getInt32Ty(), 0),
7339 ConstantPointerNull::get(PointerType::getUnqual(M.getContext()))});
7340 } else {
7341 // Emit the @__kmpc_omp_task runtime call to spawn the task
7342 Function *TaskFn = getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_omp_task);
7343 Builder.CreateCall(TaskFn, {Ident, ThreadID, TaskData});
7344 }
7345
7346 StaleCI->eraseFromParent();
7347 for (Instruction *I : llvm::reverse(ToBeDeleted))
7348 I->eraseFromParent();
7349 };
7350 addOutlineInfo(std::move(OI));
7351
7352 LLVM_DEBUG(dbgs() << "Insert block after emitKernelLaunch = \n"
7353 << *(Builder.GetInsertBlock()) << "\n");
7354 LLVM_DEBUG(dbgs() << "Module after emitKernelLaunch = \n"
7355 << *(Builder.GetInsertBlock()->getParent()->getParent())
7356 << "\n");
7357 return Builder.saveIP();
7358}
7359
7360void OpenMPIRBuilder::emitOffloadingArraysAndArgs(
7361 InsertPointTy AllocaIP, InsertPointTy CodeGenIP, TargetDataInfo &Info,
7362 TargetDataRTArgs &RTArgs, MapInfosTy &CombinedInfo, bool IsNonContiguous,
7363 bool ForEndCall, function_ref<void(unsigned int, Value *)> DeviceAddrCB,
7364 function_ref<Value *(unsigned int)> CustomMapperCB) {
7365 emitOffloadingArrays(AllocaIP, CodeGenIP, CombinedInfo, Info, IsNonContiguous,
7366 DeviceAddrCB, CustomMapperCB);
7367 emitOffloadingArraysArgument(Builder, RTArgs, Info, ForEndCall);
7368}
7369
7370static void
7371emitTargetCall(OpenMPIRBuilder &OMPBuilder, IRBuilderBase &Builder,
7372 OpenMPIRBuilder::InsertPointTy AllocaIP,
7373 const OpenMPIRBuilder::TargetKernelDefaultAttrs &DefaultAttrs,
7374 const OpenMPIRBuilder::TargetKernelRuntimeAttrs &RuntimeAttrs,
7375 Value *IfCond, Function *OutlinedFn, Constant *OutlinedFnID,
7376 SmallVectorImpl<Value *> &Args,
7377 OpenMPIRBuilder::GenMapInfoCallbackTy GenMapInfoCB,
7378 SmallVector<llvm::OpenMPIRBuilder::DependData> Dependencies = {},
7379 bool HasNoWait = false) {
7380 // Generate a function call to the host fallback implementation of the target
7381 // region. This is called by the host when no offload entry was generated for
7382 // the target region and when the offloading call fails at runtime.
7383 auto &&EmitTargetCallFallbackCB = [&](OpenMPIRBuilder::InsertPointTy IP)
7384 -> OpenMPIRBuilder::InsertPointOrErrorTy {
7385 Builder.restoreIP(IP);
7386 Builder.CreateCall(OutlinedFn, Args);
7387 return Builder.saveIP();
7388 };
7389
7390 bool HasDependencies = Dependencies.size() > 0;
7391 bool RequiresOuterTargetTask = HasNoWait || HasDependencies;
7392
7393 OpenMPIRBuilder::TargetKernelArgs KArgs;
7394
7395 auto TaskBodyCB =
7396 [&](Value *DeviceID, Value *RTLoc,
7397 IRBuilderBase::InsertPoint TargetTaskAllocaIP) -> Error {
7398 // Assume no error was returned because EmitTargetCallFallbackCB doesn't
7399 // produce any.
7400 OpenMPIRBuilder::InsertPointTy AfterIP = cantFail([&]() {
7401 // emitKernelLaunch makes the necessary runtime call to offload the
7402 // kernel. We then outline all that code into a separate function
7403 // ('kernel_launch_function' in the pseudo code above). This function is
7404 // then called by the target task proxy function (see
7405 // '@.omp_target_task_proxy_func' in the pseudo code above)
7406 // "@.omp_target_task_proxy_func' is generated by
7407 // emitTargetTaskProxyFunction.
7408 if (OutlinedFnID)
7409 return OMPBuilder.emitKernelLaunch(Builder, OutlinedFnID,
7410 EmitTargetCallFallbackCB, KArgs,
7411 DeviceID, RTLoc, TargetTaskAllocaIP);
7412 // When OutlinedFnID is set to nullptr, then it's not an offloading call.
7413 // In this case, we execute the host implementation directly.
7414 return EmitTargetCallFallbackCB(OMPBuilder.Builder.saveIP());
7415 }());
7416
7417 OMPBuilder.Builder.restoreIP(AfterIP);
7418 return Error::success();
7419 };
7420
7421 auto &&EmitTargetCallElse =
7422 [&](OpenMPIRBuilder::InsertPointTy AllocaIP,
7423 OpenMPIRBuilder::InsertPointTy CodeGenIP) -> Error {
7424 // Assume no error was returned because EmitTargetCallFallbackCB doesn't
7425 // produce any.
7426 OpenMPIRBuilder::InsertPointTy AfterIP = cantFail([&]() {
7427 if (RequiresOuterTargetTask) {
7428 // Arguments that are intended to be directly forwarded to an
7429 // emitKernelLaunch call are passed as nullptr, since
7430 // OutlinedFnID=nullptr results in that call not being done.
7431 return OMPBuilder.emitTargetTask(TaskBodyCB, /*DeviceID=*/nullptr,
7432 /*RTLoc=*/nullptr, AllocaIP,
7433 Dependencies, HasNoWait);
7434 }
7435 return EmitTargetCallFallbackCB(Builder.saveIP());
7436 }());
7437
7438 Builder.restoreIP(AfterIP);
7439 return Error::success();
7440 };
7441
7442 auto &&EmitTargetCallThen =
7443 [&](OpenMPIRBuilder::InsertPointTy AllocaIP,
7444 OpenMPIRBuilder::InsertPointTy CodeGenIP) -> Error {
7445 OpenMPIRBuilder::TargetDataInfo Info(
7446 /*RequiresDevicePointerInfo=*/false,
7447 /*SeparateBeginEndCalls=*/true);
7448
7449 OpenMPIRBuilder::MapInfosTy &MapInfo = GenMapInfoCB(Builder.saveIP());
7450 OpenMPIRBuilder::TargetDataRTArgs RTArgs;
7451 OMPBuilder.emitOffloadingArraysAndArgs(AllocaIP, Builder.saveIP(), Info,
7452 RTArgs, MapInfo,
7453 /*IsNonContiguous=*/true,
7454 /*ForEndCall=*/false);
7455
7456 SmallVector<Value *, 3> NumTeamsC;
7457 for (auto [DefaultVal, RuntimeVal] :
7458 zip_equal(DefaultAttrs.MaxTeams, RuntimeAttrs.MaxTeams))
7459 NumTeamsC.push_back(RuntimeVal ? RuntimeVal
7460 : Builder.getInt32(DefaultVal));
7461
7462 // Calculate number of threads: 0 if no clauses specified, otherwise it is
7463 // the minimum between optional THREAD_LIMIT and NUM_THREADS clauses.
7464 auto InitMaxThreadsClause = [&Builder](Value *Clause) {
7465 if (Clause)
7466 Clause = Builder.CreateIntCast(Clause, Builder.getInt32Ty(),
7467 /*isSigned=*/false);
7468 return Clause;
7469 };
7470 auto CombineMaxThreadsClauses = [&Builder](Value *Clause, Value *&Result) {
7471 if (Clause)
7472 Result =
7473 Result ? Builder.CreateSelect(Builder.CreateICmpULT(Result, Clause),
7474 Result, Clause)
7475 : Clause;
7476 };
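// For instance, if both thread_limit(32) and num_threads(64) are present at
// runtime, the generated icmp ult/select sequence keeps the smaller value,
// so the kernel is launched with an upper bound of 32 threads.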
7477
7478 // If a multi-dimensional THREAD_LIMIT is set, it is the OMPX_BARE case, so
7479 // the NUM_THREADS clause is overridden by THREAD_LIMIT.
7480 SmallVector<Value *, 3> NumThreadsC;
7481 Value *MaxThreadsClause =
7482 RuntimeAttrs.TeamsThreadLimit.size() == 1
7483 ? InitMaxThreadsClause(RuntimeAttrs.MaxThreads)
7484 : nullptr;
7485
7486 for (auto [TeamsVal, TargetVal] : zip_equal(
7487 RuntimeAttrs.TeamsThreadLimit, RuntimeAttrs.TargetThreadLimit)) {
7488 Value *TeamsThreadLimitClause = InitMaxThreadsClause(TeamsVal);
7489 Value *NumThreads = InitMaxThreadsClause(TargetVal);
7490
7491 CombineMaxThreadsClauses(TeamsThreadLimitClause, NumThreads);
7492 CombineMaxThreadsClauses(MaxThreadsClause, NumThreads);
7493
7494 NumThreadsC.push_back(NumThreads ? NumThreads : Builder.getInt32(0));
7495 }
7496
7497 unsigned NumTargetItems = Info.NumberOfPtrs;
7498 // TODO: Use correct device ID
7499 Value *DeviceID = Builder.getInt64(OMP_DEVICEID_UNDEF);
7500 uint32_t SrcLocStrSize;
7501 Constant *SrcLocStr = OMPBuilder.getOrCreateDefaultSrcLocStr(SrcLocStrSize);
7502 Value *RTLoc = OMPBuilder.getOrCreateIdent(SrcLocStr, SrcLocStrSize,
7503 llvm::omp::IdentFlag(0), 0);
7504
7505 Value *TripCount = RuntimeAttrs.LoopTripCount
7506 ? Builder.CreateIntCast(RuntimeAttrs.LoopTripCount,
7507 Builder.getInt64Ty(),
7508 /*isSigned=*/false)
7509 : Builder.getInt64(0);
7510
7511 // TODO: Use correct DynCGGroupMem
7512 Value *DynCGGroupMem = Builder.getInt32(0);
7513
7514 KArgs = OpenMPIRBuilder::TargetKernelArgs(NumTargetItems, RTArgs, TripCount,
7515 NumTeamsC, NumThreadsC,
7516 DynCGGroupMem, HasNoWait);
7517
7518 // Assume no error was returned because TaskBodyCB and
7519 // EmitTargetCallFallbackCB don't produce any.
7520 OpenMPIRBuilder::InsertPointTy AfterIP = cantFail([&]() {
7521 // The presence of certain clauses on the target directive requires the
7522 // explicit generation of the target task.
7523 if (RequiresOuterTargetTask)
7524 return OMPBuilder.emitTargetTask(TaskBodyCB, DeviceID, RTLoc, AllocaIP,
7525 Dependencies, HasNoWait);
7526
7527 return OMPBuilder.emitKernelLaunch(Builder, OutlinedFnID,
7528 EmitTargetCallFallbackCB, KArgs,
7529 DeviceID, RTLoc, AllocaIP);
7530 }());
7531
7532 Builder.restoreIP(AfterIP);
7533 return Error::success();
7534 };
7535
7536 // If we don't have an ID for the target region, it means an offload entry
7537 // wasn't created. In this case we just run the host fallback directly and
7538 // ignore any potential 'if' clauses.
7539 if (!OutlinedFnID) {
7540 cantFail(EmitTargetCallElse(AllocaIP, Builder.saveIP()));
7541 return;
7542 }
7543
7544 // If there's no 'if' clause, only generate the kernel launch code path.
7545 if (!IfCond) {
7546 cantFail(EmitTargetCallThen(AllocaIP, Builder.saveIP()));
7547 return;
7548 }
7549
7550 cantFail(OMPBuilder.emitIfClause(IfCond, EmitTargetCallThen,
7551 EmitTargetCallElse, AllocaIP));
7552}
7553
7554OpenMPIRBuilder::InsertPointOrErrorTy OpenMPIRBuilder::createTarget(
7555 const LocationDescription &Loc, bool IsOffloadEntry, InsertPointTy AllocaIP,
7556 InsertPointTy CodeGenIP, TargetRegionEntryInfo &EntryInfo,
7557 const TargetKernelDefaultAttrs &DefaultAttrs,
7558 const TargetKernelRuntimeAttrs &RuntimeAttrs, Value *IfCond,
7559 SmallVectorImpl<Value *> &Args, GenMapInfoCallbackTy GenMapInfoCB,
7560 OpenMPIRBuilder::TargetBodyGenCallbackTy CBFunc,
7561 OpenMPIRBuilder::TargetGenArgAccessorsCallbackTy ArgAccessorFuncCB,
7562 SmallVector<DependData> Dependencies, bool HasNowait) {
7563
7564 if (!updateToLocation(Loc))
7565 return InsertPointTy();
7566
7567 Builder.restoreIP(CodeGenIP);
7568
7569 Function *OutlinedFn;
7570 Constant *OutlinedFnID = nullptr;
7571 // The target region is outlined into its own function. The LLVM IR for
7572 // the target region itself is generated using the callbacks CBFunc
7573 // and ArgAccessorFuncCB
7574 if (Error Err = emitTargetOutlinedFunction(
7575 *this, Builder, IsOffloadEntry, EntryInfo, DefaultAttrs, OutlinedFn,
7576 OutlinedFnID, Args, CBFunc, ArgAccessorFuncCB))
7577 return Err;
7578
7579 // If we are not on the target device, then we need to generate code
7580 // to make a remote call (offload) to the previously outlined function
7581 // that represents the target region. Do that now.
7582 if (!Config.isTargetDevice())
7583 emitTargetCall(*this, Builder, AllocaIP, DefaultAttrs, RuntimeAttrs, IfCond,
7584 OutlinedFn, OutlinedFnID, Args, GenMapInfoCB, Dependencies,
7585 HasNowait);
7586 return Builder.saveIP();
7587}
7588
7589std::string OpenMPIRBuilder::getNameWithSeparators(ArrayRef<StringRef> Parts,
7590 StringRef FirstSeparator,
7591 StringRef Separator) {
7592 SmallString<128> Buffer;
7593 llvm::raw_svector_ostream OS(Buffer);
7594 StringRef Sep = FirstSeparator;
7595 for (StringRef Part : Parts) {
7596 OS << Sep << Part;
7597 Sep = Separator;
7598 }
7599 return OS.str().str();
7600}
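// For example, getNameWithSeparators({"a", "b"}, "__", ".") yields "__a.b":
// the first separator prefixes the first part and the regular separator
// joins the remaining parts.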
7601
7602std::string
7603OpenMPIRBuilder::createPlatformSpecificName(ArrayRef<StringRef> Parts) const {
7604 return OpenMPIRBuilder::getNameWithSeparators(Parts, Config.firstSeparator(),
7605 Config.separator());
7606}
7607
7608GlobalVariable *
7609OpenMPIRBuilder::getOrCreateInternalVariable(Type *Ty, const StringRef &Name,
7610 unsigned AddressSpace) {
7611 auto &Elem = *InternalVars.try_emplace(Name, nullptr).first;
7612 if (Elem.second) {
7613 assert(Elem.second->getValueType() == Ty &&
7614 "OMP internal variable has different type than requested");
7615 } else {
7616 // TODO: investigate the appropriate linkage type used for the global
7617 // variable for possibly changing that to internal or private, or maybe
7618 // create different versions of the function for different OMP internal
7619 // variables.
7620 auto Linkage = this->M.getTargetTriple().rfind("wasm32") == 0
7621 ? GlobalValue::InternalLinkage
7622 : GlobalValue::CommonLinkage;
7623 auto *GV = new GlobalVariable(M, Ty, /*IsConstant=*/false, Linkage,
7624 Constant::getNullValue(Ty), Elem.first(),
7625 /*InsertBefore=*/nullptr,
7626 GlobalValue::NotThreadLocal, AddressSpace);
7627 const DataLayout &DL = M.getDataLayout();
7628 const llvm::Align TypeAlign = DL.getABITypeAlign(Ty);
7629 const llvm::Align PtrAlign = DL.getPointerABIAlignment(AddressSpace);
7630 GV->setAlignment(std::max(TypeAlign, PtrAlign));
7631 Elem.second = GV;
7632 }
7633
7634 return Elem.second;
7635}
7636
7637Value *OpenMPIRBuilder::getOMPCriticalRegionLock(StringRef CriticalName) {
7638 std::string Prefix = Twine("gomp_critical_user_", CriticalName).str();
7639 std::string Name = getNameWithSeparators({Prefix, "var"}, ".", ".");
7640 return getOrCreateInternalVariable(KmpCriticalNameTy, Name);
7641 }
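// [Editor's note, illustrative] E.g. a critical region named "foo" maps to
// the runtime lock variable ".gomp_critical_user_foo.var", given the "."
// separators passed above.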
7642
7643 Value *OpenMPIRBuilder::getSizeInBytes(Value *BasePtr) {
7644 LLVMContext &Ctx = M.getContext();
7645 Value *Null =
7646 Constant::getNullValue(PointerType::getUnqual(BasePtr->getContext()));
7647 Value *SizeGep =
7648 Builder.CreateGEP(BasePtr->getType(), Null, Builder.getInt32(1));
7649 Value *SizePtrToInt = Builder.CreatePtrToInt(SizeGep, Type::getInt64Ty(Ctx));
7650 return SizePtrToInt;
7651 }
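// [Editor's note, illustrative] This is the classic null-GEP "sizeof" idiom;
// since BasePtr is itself a pointer, the emitted IR is roughly:
//   %gep = getelementptr ptr, ptr null, i32 1
//   %sz  = ptrtoint ptr %gep to i64
// i.e. the store size of a pointer on the target.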
7652
7653 GlobalVariable *
7654 OpenMPIRBuilder::createOffloadMaptypes(SmallVectorImpl<uint64_t> &Mappings,
7655 std::string VarName) {
7656 llvm::Constant *MaptypesArrayInit =
7657 llvm::ConstantDataArray::get(M.getContext(), Mappings);
7658 auto *MaptypesArrayGlobal = new llvm::GlobalVariable(
7659 M, MaptypesArrayInit->getType(),
7660 /*isConstant=*/true, llvm::GlobalValue::PrivateLinkage, MaptypesArrayInit,
7661 VarName);
7662 MaptypesArrayGlobal->setUnnamedAddr(llvm::GlobalValue::UnnamedAddr::Global);
7663 return MaptypesArrayGlobal;
7664}
7665
7666 void OpenMPIRBuilder::createMapperAllocas(const LocationDescription &Loc,
7667 InsertPointTy AllocaIP,
7668 unsigned NumOperands,
7669 struct MapperAllocas &MapperAllocas) {
7670 if (!updateToLocation(Loc))
7671 return;
7672
7673 auto *ArrI8PtrTy = ArrayType::get(Int8Ptr, NumOperands);
7674 auto *ArrI64Ty = ArrayType::get(Int64, NumOperands);
7675 Builder.restoreIP(AllocaIP);
7676 AllocaInst *ArgsBase = Builder.CreateAlloca(
7677 ArrI8PtrTy, /* ArraySize = */ nullptr, ".offload_baseptrs");
7678 AllocaInst *Args = Builder.CreateAlloca(ArrI8PtrTy, /* ArraySize = */ nullptr,
7679 ".offload_ptrs");
7680 AllocaInst *ArgSizes = Builder.CreateAlloca(
7681 ArrI64Ty, /* ArraySize = */ nullptr, ".offload_sizes");
7682 Builder.restoreIP(Loc.IP);
7683 MapperAllocas.ArgsBase = ArgsBase;
7684 MapperAllocas.Args = Args;
7685 MapperAllocas.ArgSizes = ArgSizes;
7686}
7687
7688 void OpenMPIRBuilder::emitMapperCall(const LocationDescription &Loc,
7689 Function *MapperFunc, Value *SrcLocInfo,
7690 Value *MaptypesArg, Value *MapnamesArg,
7691 struct MapperAllocas &MapperAllocas,
7692 int64_t DeviceID, unsigned NumOperands) {
7693 if (!updateToLocation(Loc))
7694 return;
7695
7696 auto *ArrI8PtrTy = ArrayType::get(Int8Ptr, NumOperands);
7697 auto *ArrI64Ty = ArrayType::get(Int64, NumOperands);
7698 Value *ArgsBaseGEP =
7699 Builder.CreateInBoundsGEP(ArrI8PtrTy, MapperAllocas.ArgsBase,
7700 {Builder.getInt32(0), Builder.getInt32(0)});
7701 Value *ArgsGEP =
7702 Builder.CreateInBoundsGEP(ArrI8PtrTy, MapperAllocas.Args,
7703 {Builder.getInt32(0), Builder.getInt32(0)});
7704 Value *ArgSizesGEP =
7705 Builder.CreateInBoundsGEP(ArrI64Ty, MapperAllocas.ArgSizes,
7706 {Builder.getInt32(0), Builder.getInt32(0)});
7707 Value *NullPtr =
7708 Constant::getNullValue(PointerType::getUnqual(Int8Ptr->getContext()));
7709 Builder.CreateCall(MapperFunc,
7710 {SrcLocInfo, Builder.getInt64(DeviceID),
7711 Builder.getInt32(NumOperands), ArgsBaseGEP, ArgsGEP,
7712 ArgSizesGEP, MaptypesArg, MapnamesArg, NullPtr});
7713 }
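// [Editor's note, illustrative] Assuming MapperFunc is one of the
// __tgt_target_data_*_mapper entry points, the call emitted above has the
// shape:
//   call void @__tgt_target_data_begin_mapper(ptr %srcloc, i64 %device_id,
//       i32 %num_operands, ptr %baseptrs, ptr %ptrs, ptr %sizes,
//       ptr %maptypes, ptr %mapnames, ptr null)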
7714
7715 void OpenMPIRBuilder::emitOffloadingArraysArgument(IRBuilderBase &Builder,
7716 TargetDataRTArgs &RTArgs,
7717 TargetDataInfo &Info,
7718 bool ForEndCall) {
7719 assert((!ForEndCall || Info.separateBeginEndCalls()) &&
7720 "expected region end call to runtime only when end call is separate");
7721 auto UnqualPtrTy = PointerType::getUnqual(M.getContext());
7722 auto VoidPtrTy = UnqualPtrTy;
7723 auto VoidPtrPtrTy = UnqualPtrTy;
7724 auto Int64Ty = Type::getInt64Ty(M.getContext());
7725 auto Int64PtrTy = UnqualPtrTy;
7726
7727 if (!Info.NumberOfPtrs) {
7728 RTArgs.BasePointersArray = ConstantPointerNull::get(VoidPtrPtrTy);
7729 RTArgs.PointersArray = ConstantPointerNull::get(VoidPtrPtrTy);
7730 RTArgs.SizesArray = ConstantPointerNull::get(Int64PtrTy);
7731 RTArgs.MapTypesArray = ConstantPointerNull::get(Int64PtrTy);
7732 RTArgs.MapNamesArray = ConstantPointerNull::get(VoidPtrPtrTy);
7733 RTArgs.MappersArray = ConstantPointerNull::get(VoidPtrPtrTy);
7734 return;
7735 }
7736
7737 RTArgs.BasePointersArray = Builder.CreateConstInBoundsGEP2_32(
7738 ArrayType::get(VoidPtrTy, Info.NumberOfPtrs),
7739 Info.RTArgs.BasePointersArray,
7740 /*Idx0=*/0, /*Idx1=*/0);
7741 RTArgs.PointersArray = Builder.CreateConstInBoundsGEP2_32(
7742 ArrayType::get(VoidPtrTy, Info.NumberOfPtrs), Info.RTArgs.PointersArray,
7743 /*Idx0=*/0,
7744 /*Idx1=*/0);
7745 RTArgs.SizesArray = Builder.CreateConstInBoundsGEP2_32(
7746 ArrayType::get(Int64Ty, Info.NumberOfPtrs), Info.RTArgs.SizesArray,
7747 /*Idx0=*/0, /*Idx1=*/0);
7748 RTArgs.MapTypesArray = Builder.CreateConstInBoundsGEP2_32(
7749 ArrayType::get(Int64Ty, Info.NumberOfPtrs),
7750 ForEndCall && Info.RTArgs.MapTypesArrayEnd ? Info.RTArgs.MapTypesArrayEnd
7751 : Info.RTArgs.MapTypesArray,
7752 /*Idx0=*/0,
7753 /*Idx1=*/0);
7754
7755 // Only emit the mapper information arrays if debug information is
7756 // requested.
7757 if (!Info.EmitDebug)
7758 RTArgs.MapNamesArray = ConstantPointerNull::get(VoidPtrPtrTy);
7759 else
7760 RTArgs.MapNamesArray = Builder.CreateConstInBoundsGEP2_32(
7761 ArrayType::get(VoidPtrTy, Info.NumberOfPtrs), Info.RTArgs.MapNamesArray,
7762 /*Idx0=*/0,
7763 /*Idx1=*/0);
7764 // If there is no user-defined mapper, set the mapper array to nullptr to
7765 // avoid an unnecessary data privatization
7766 if (!Info.HasMapper)
7767 RTArgs.MappersArray = ConstantPointerNull::get(VoidPtrPtrTy);
7768 else
7769 RTArgs.MappersArray =
7770 Builder.CreatePointerCast(Info.RTArgs.MappersArray, VoidPtrPtrTy);
7771}
7772
7773 void OpenMPIRBuilder::emitNonContiguousDescriptor(InsertPointTy AllocaIP,
7774 InsertPointTy CodeGenIP,
7775 MapInfosTy &CombinedInfo,
7776 TargetDataInfo &Info) {
7777 MapInfosTy::StructNonContiguousInfo &NonContigInfo =
7778 CombinedInfo.NonContigInfo;
7779
7780 // Build an array of struct descriptor_dim and then assign it to
7781 // offload_args.
7782 //
7783 // struct descriptor_dim {
7784 // uint64_t offset;
7785 // uint64_t count;
7786 // uint64_t stride
7787 // };
7788 Type *Int64Ty = Builder.getInt64Ty();
7789 StructType *DimTy = StructType::create(
7790 M.getContext(), ArrayRef<Type *>({Int64Ty, Int64Ty, Int64Ty}),
7791 "struct.descriptor_dim");
7792
7793 enum { OffsetFD = 0, CountFD, StrideFD };
7794 // We need two index variables here since the size of "Dims" is the same as
7795 // the size of Components; however, the size of offset, count, and stride is
7796 // equal to the size of the base declaration that is non-contiguous.
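// [Editor's note, illustrative] E.g. with Dims = {1, 3, 1, 2}, only I = 1 and
// I = 3 emit descriptors; L indexes Offsets/Counts/Strides and only takes the
// values 0 and 1, one per non-contiguous base declaration.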
7797 for (unsigned I = 0, L = 0, E = NonContigInfo.Dims.size(); I < E; ++I) {
7798 // Skip emitting IR if the dimension size is 1, since it cannot be
7799 // non-contiguous.
7800 if (NonContigInfo.Dims[I] == 1)
7801 continue;
7802 Builder.restoreIP(AllocaIP);
7803 ArrayType *ArrayTy = ArrayType::get(DimTy, NonContigInfo.Dims[I]);
7804 AllocaInst *DimsAddr =
7805 Builder.CreateAlloca(ArrayTy, /* ArraySize = */ nullptr, "dims");
7806 Builder.restoreIP(CodeGenIP);
7807 for (unsigned II = 0, EE = NonContigInfo.Dims[I]; II < EE; ++II) {
7808 unsigned RevIdx = EE - II - 1;
7809 Value *DimsLVal = Builder.CreateInBoundsGEP(
7810 DimsAddr->getAllocatedType(), DimsAddr,
7811 {Builder.getInt64(0), Builder.getInt64(II)});
7812 // Offset
7813 Value *OffsetLVal = Builder.CreateStructGEP(DimTy, DimsLVal, OffsetFD);
7814 Builder.CreateAlignedStore(
7815 NonContigInfo.Offsets[L][RevIdx], OffsetLVal,
7816 M.getDataLayout().getPrefTypeAlign(OffsetLVal->getType()));
7817 // Count
7818 Value *CountLVal = Builder.CreateStructGEP(DimTy, DimsLVal, CountFD);
7819 Builder.CreateAlignedStore(
7820 NonContigInfo.Counts[L][RevIdx], CountLVal,
7821 M.getDataLayout().getPrefTypeAlign(CountLVal->getType()));
7822 // Stride
7823 Value *StrideLVal = Builder.CreateStructGEP(DimTy, DimsLVal, StrideFD);
7824 Builder.CreateAlignedStore(
7825 NonContigInfo.Strides[L][RevIdx], StrideLVal,
7826 M.getDataLayout().getPrefTypeAlign(CountLVal->getType()));
7827 }
7828 // args[I] = &dims
7829 Builder.restoreIP(CodeGenIP);
7830 Value *DAddr = Builder.CreatePointerBitCastOrAddrSpaceCast(
7831 DimsAddr, Builder.getPtrTy());
7832 Value *P = Builder.CreateConstInBoundsGEP2_32(
7833 ArrayType::get(Builder.getPtrTy(), Info.NumberOfPtrs),
7834 Info.RTArgs.PointersArray, 0, I);
7835 Builder.CreateAlignedStore(
7836 DAddr, P, M.getDataLayout().getPrefTypeAlign(Builder.getPtrTy()));
7837 ++L;
7838 }
7839}
7840
7841void OpenMPIRBuilder::emitUDMapperArrayInitOrDel(
7842 Function *MapperFn, Value *MapperHandle, Value *Base, Value *Begin,
7843 Value *Size, Value *MapType, Value *MapName, TypeSize ElementSize,
7844 BasicBlock *ExitBB, bool IsInit) {
7845 StringRef Prefix = IsInit ? ".init" : ".del";
7846
7847 // Evaluate if this is an array section.
7848 BasicBlock *BodyBB = BasicBlock::Create(
7849 M.getContext(), createPlatformSpecificName({"omp.array", Prefix}));
7850 Value *IsArray =
7851 Builder.CreateICmpSGT(Size, Builder.getInt64(1), "omp.arrayinit.isarray");
7852 Value *DeleteBit = Builder.CreateAnd(
7853 MapType,
7854 Builder.getInt64(
7855 static_cast<std::underlying_type_t<OpenMPOffloadMappingFlags>>(
7856 OpenMPOffloadMappingFlags::OMP_MAP_DELETE)));
7857 Value *DeleteCond;
7858 Value *Cond;
7859 if (IsInit) {
7860 // base != begin?
7861 Value *BaseIsBegin = Builder.CreateICmpNE(Base, Begin);
7862 // IsPtrAndObj?
7863 Value *PtrAndObjBit = Builder.CreateAnd(
7864 MapType,
7865 Builder.getInt64(
7866 static_cast<std::underlying_type_t<OpenMPOffloadMappingFlags>>(
7867 OpenMPOffloadMappingFlags::OMP_MAP_PTR_AND_OBJ)));
7868 PtrAndObjBit = Builder.CreateIsNotNull(PtrAndObjBit);
7869 BaseIsBegin = Builder.CreateAnd(BaseIsBegin, PtrAndObjBit);
7870 Cond = Builder.CreateOr(IsArray, BaseIsBegin);
7871 DeleteCond = Builder.CreateIsNull(
7872 DeleteBit,
7873 createPlatformSpecificName({"omp.array", Prefix, ".delete"}));
7874 } else {
7875 Cond = IsArray;
7876 DeleteCond = Builder.CreateIsNotNull(
7877 DeleteBit,
7878 createPlatformSpecificName({"omp.array", Prefix, ".delete"}));
7879 }
7880 Cond = Builder.CreateAnd(Cond, DeleteCond);
7881 Builder.CreateCondBr(Cond, BodyBB, ExitBB);
7882
7883 emitBlock(BodyBB, MapperFn);
7884 // Get the array size by multiplying element size and element number (i.e., \p
7885 // Size).
7886 Value *ArraySize = Builder.CreateNUWMul(Size, Builder.getInt64(ElementSize));
7887 // Remove OMP_MAP_TO and OMP_MAP_FROM from the map type, so that it achieves
7888 // memory allocation/deletion purpose only.
7889 Value *MapTypeArg = Builder.CreateAnd(
7890 MapType,
7891 Builder.getInt64(
7892 ~static_cast<std::underlying_type_t<OpenMPOffloadMappingFlags>>(
7893 OpenMPOffloadMappingFlags::OMP_MAP_TO |
7894 OpenMPOffloadMappingFlags::OMP_MAP_FROM)));
7895 MapTypeArg = Builder.CreateOr(
7896 MapTypeArg,
7897 Builder.getInt64(
7898 static_cast<std::underlying_type_t<OpenMPOffloadMappingFlags>>(
7899 OpenMPOffloadMappingFlags::OMP_MAP_IMPLICIT)));
7900
7901 // Call the runtime API __tgt_push_mapper_component to fill up the runtime
7902 // data structure.
7903 Value *OffloadingArgs[] = {MapperHandle, Base, Begin,
7904 ArraySize, MapTypeArg, MapName};
7905 Builder.CreateCall(
7906 getOrCreateRuntimeFunction(M, OMPRTL___tgt_push_mapper_component),
7907 OffloadingArgs);
7908 }
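// [Editor's note, illustrative] Sketch of the control flow emitted above:
//   %cond = (IsArray || (Base != Begin && IsPtrAndObj)) && <delete-bit test>
//   br i1 %cond, label %omp.array<.init|.del>, label %ExitBB
//   omp.array<.init|.del>:
//     call void @__tgt_push_mapper_component(...)  ; allocation/deletion only
//     ; falls through to ExitBB, which the caller emits next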
7909
7910 Expected<Function *> OpenMPIRBuilder::emitUserDefinedMapper(
7911 function_ref<MapInfosTy &(InsertPointTy CodeGenIP, llvm::Value *PtrPHI,
7912 llvm::Value *BeginArg)>
7913 GenMapInfoCB,
7914 Type *ElemTy, StringRef FuncName,
7915 function_ref<bool(unsigned int, Function **)> CustomMapperCB) {
7916 SmallVector<Type *> Params;
7917 Params.emplace_back(Builder.getPtrTy());
7918 Params.emplace_back(Builder.getPtrTy());
7919 Params.emplace_back(Builder.getPtrTy());
7920 Params.emplace_back(Builder.getInt64Ty());
7921 Params.emplace_back(Builder.getInt64Ty());
7922 Params.emplace_back(Builder.getPtrTy());
7923
7924 auto *FnTy =
7925 FunctionType::get(Builder.getVoidTy(), Params, /* IsVarArg */ false);
7926
7927 SmallString<64> TyStr;
7928 raw_svector_ostream Out(TyStr);
7929 Function *MapperFn =
7930 Function::Create(FnTy, GlobalValue::InternalLinkage, FuncName, M);
7931 MapperFn->addFnAttr(Attribute::NoInline);
7932 MapperFn->addFnAttr(Attribute::NoUnwind);
7933 MapperFn->addParamAttr(0, Attribute::NoUndef);
7934 MapperFn->addParamAttr(1, Attribute::NoUndef);
7935 MapperFn->addParamAttr(2, Attribute::NoUndef);
7936 MapperFn->addParamAttr(3, Attribute::NoUndef);
7937 MapperFn->addParamAttr(4, Attribute::NoUndef);
7938 MapperFn->addParamAttr(5, Attribute::NoUndef);
7939
7940 // Start the mapper function code generation.
7941 BasicBlock *EntryBB = BasicBlock::Create(M.getContext(), "entry", MapperFn);
7942 auto SavedIP = Builder.saveIP();
7943 Builder.SetInsertPoint(EntryBB);
7944
7945 Value *MapperHandle = MapperFn->getArg(0);
7946 Value *BaseIn = MapperFn->getArg(1);
7947 Value *BeginIn = MapperFn->getArg(2);
7948 Value *Size = MapperFn->getArg(3);
7949 Value *MapType = MapperFn->getArg(4);
7950 Value *MapName = MapperFn->getArg(5);
7951
7952 // Compute the starting and end addresses of array elements.
7953 // Prepare common arguments for array initialization and deletion.
7954 // Convert the size in bytes into the number of array elements.
7955 TypeSize ElementSize = M.getDataLayout().getTypeStoreSize(ElemTy);
7956 Size = Builder.CreateExactUDiv(Size, Builder.getInt64(ElementSize));
7957 Value *PtrBegin = Builder.CreateBitCast(BeginIn, Builder.getPtrTy());
7958 Value *PtrEnd = Builder.CreateGEP(ElemTy, PtrBegin, Size);
7959
7960 // Emit array initialization if this is an array section and \p MapType indicates
7961 // that memory allocation is required.
7962 BasicBlock *HeadBB = BasicBlock::Create(M.getContext(), "omp.arraymap.head");
7963 emitUDMapperArrayInitOrDel(MapperFn, MapperHandle, BaseIn, BeginIn, Size,
7964 MapType, MapName, ElementSize, HeadBB,
7965 /*IsInit=*/true);
7966
7967 // Emit a for loop to iterate through Size elements and map all of them.
7968
7969 // Emit the loop header block.
7970 emitBlock(HeadBB, MapperFn);
7971 BasicBlock *BodyBB = BasicBlock::Create(M.getContext(), "omp.arraymap.body");
7972 BasicBlock *DoneBB = BasicBlock::Create(M.getContext(), "omp.done");
7973 // Evaluate whether the initial condition is satisfied.
7974 Value *IsEmpty =
7975 Builder.CreateICmpEQ(PtrBegin, PtrEnd, "omp.arraymap.isempty");
7976 Builder.CreateCondBr(IsEmpty, DoneBB, BodyBB);
7977
7978 // Emit the loop body block.
7979 emitBlock(BodyBB, MapperFn);
7980 BasicBlock *LastBB = BodyBB;
7981 PHINode *PtrPHI =
7982 Builder.CreatePHI(PtrBegin->getType(), 2, "omp.arraymap.ptrcurrent");
7983 PtrPHI->addIncoming(PtrBegin, HeadBB);
7984
7985 // Get map clause information. Fill up the arrays with all mapped variables.
7986 MapInfosTy &Info = GenMapInfoCB(Builder.saveIP(), PtrPHI, BeginIn);
7987
7988 // Call the runtime API __tgt_mapper_num_components to get the number of
7989 // pre-existing components.
7990 Value *OffloadingArgs[] = {MapperHandle};
7991 Value *PreviousSize = Builder.CreateCall(
7992 getOrCreateRuntimeFunction(M, OMPRTL___tgt_mapper_num_components),
7993 OffloadingArgs);
7994 Value *ShiftedPreviousSize =
7995 Builder.CreateShl(PreviousSize, Builder.getInt64(getFlagMemberOffset()));
7996
7997 // Fill up the runtime mapper handle for all components.
7998 for (unsigned I = 0; I < Info.BasePointers.size(); ++I) {
7999 Value *CurBaseArg =
8000 Builder.CreateBitCast(Info.BasePointers[I], Builder.getPtrTy());
8001 Value *CurBeginArg =
8002 Builder.CreateBitCast(Info.Pointers[I], Builder.getPtrTy());
8003 Value *CurSizeArg = Info.Sizes[I];
8004 Value *CurNameArg = Info.Names.size()
8005 ? Info.Names[I]
8006 : Constant::getNullValue(Builder.getPtrTy());
8007
8008 // Extract the MEMBER_OF field from the map type.
8009 Value *OriMapType = Builder.getInt64(
8010 static_cast<std::underlying_type_t<OpenMPOffloadMappingFlags>>(
8011 Info.Types[I]));
8012 Value *MemberMapType =
8013 Builder.CreateNUWAdd(OriMapType, ShiftedPreviousSize);
8014
8015 // Combine the map type inherited from user-defined mapper with that
8016 // specified in the program. According to the OMP_MAP_TO and OMP_MAP_FROM
8017 // bits of the \a MapType, which is the input argument of the mapper
8018 // function, the following code will set the OMP_MAP_TO and OMP_MAP_FROM
8019 // bits of MemberMapType.
8020 // [OpenMP 5.0], 1.2.6. map-type decay.
8021 // | alloc | to | from | tofrom | release | delete
8022 // ----------------------------------------------------------
8023 // alloc | alloc | alloc | alloc | alloc | release | delete
8024 // to | alloc | to | alloc | to | release | delete
8025 // from | alloc | alloc | from | from | release | delete
8026 // tofrom | alloc | to | from | tofrom | release | delete
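// [Editor's note, illustrative] E.g. if this mapper invocation's \p MapType
// has only OMP_MAP_TO set, the IsTo branch below clears OMP_MAP_FROM from
// MemberMapType, so a member the mapper declared 'tofrom' decays to 'to',
// matching the table above.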
8027 Value *LeftToFrom = Builder.CreateAnd(
8028 MapType,
8029 Builder.getInt64(
8030 static_cast<std::underlying_type_t<OpenMPOffloadMappingFlags>>(
8031 OpenMPOffloadMappingFlags::OMP_MAP_TO |
8032 OpenMPOffloadMappingFlags::OMP_MAP_FROM)));
8033 BasicBlock *AllocBB = BasicBlock::Create(M.getContext(), "omp.type.alloc");
8034 BasicBlock *AllocElseBB =
8035 BasicBlock::Create(M.getContext(), "omp.type.alloc.else");
8036 BasicBlock *ToBB = BasicBlock::Create(M.getContext(), "omp.type.to");
8037 BasicBlock *ToElseBB =
8038 BasicBlock::Create(M.getContext(), "omp.type.to.else");
8039 BasicBlock *FromBB = BasicBlock::Create(M.getContext(), "omp.type.from");
8040 BasicBlock *EndBB = BasicBlock::Create(M.getContext(), "omp.type.end");
8041 Value *IsAlloc = Builder.CreateIsNull(LeftToFrom);
8042 Builder.CreateCondBr(IsAlloc, AllocBB, AllocElseBB);
8043 // In case of alloc, clear OMP_MAP_TO and OMP_MAP_FROM.
8044 emitBlock(AllocBB, MapperFn);
8045 Value *AllocMapType = Builder.CreateAnd(
8046 MemberMapType,
8047 Builder.getInt64(
8048 ~static_cast<std::underlying_type_t<OpenMPOffloadMappingFlags>>(
8049 OpenMPOffloadMappingFlags::OMP_MAP_TO |
8050 OpenMPOffloadMappingFlags::OMP_MAP_FROM)));
8051 Builder.CreateBr(EndBB);
8052 emitBlock(AllocElseBB, MapperFn);
8053 Value *IsTo = Builder.CreateICmpEQ(
8054 LeftToFrom,
8055 Builder.getInt64(
8056 static_cast<std::underlying_type_t<OpenMPOffloadMappingFlags>>(
8057 OpenMPOffloadMappingFlags::OMP_MAP_TO)));
8058 Builder.CreateCondBr(IsTo, ToBB, ToElseBB);
8059 // In case of to, clear OMP_MAP_FROM.
8060 emitBlock(ToBB, MapperFn);
8061 Value *ToMapType = Builder.CreateAnd(
8062 MemberMapType,
8063 Builder.getInt64(
8064 ~static_cast<std::underlying_type_t<OpenMPOffloadMappingFlags>>(
8065 OpenMPOffloadMappingFlags::OMP_MAP_FROM)));
8066 Builder.CreateBr(EndBB);
8067 emitBlock(ToElseBB, MapperFn);
8068 Value *IsFrom = Builder.CreateICmpEQ(
8069 LeftToFrom,
8070 Builder.getInt64(
8071 static_cast<std::underlying_type_t<OpenMPOffloadMappingFlags>>(
8072 OpenMPOffloadMappingFlags::OMP_MAP_FROM)));
8073 Builder.CreateCondBr(IsFrom, FromBB, EndBB);
8074 // In case of from, clear OMP_MAP_TO.
8075 emitBlock(FromBB, MapperFn);
8076 Value *FromMapType = Builder.CreateAnd(
8077 MemberMapType,
8078 Builder.getInt64(
8079 ~static_cast<std::underlying_type_t<OpenMPOffloadMappingFlags>>(
8080 OpenMPOffloadMappingFlags::OMP_MAP_TO)));
8081 // In case of tofrom, do nothing.
8082 emitBlock(EndBB, MapperFn);
8083 LastBB = EndBB;
8084 PHINode *CurMapType =
8085 Builder.CreatePHI(Builder.getInt64Ty(), 4, "omp.maptype");
8086 CurMapType->addIncoming(AllocMapType, AllocBB);
8087 CurMapType->addIncoming(ToMapType, ToBB);
8088 CurMapType->addIncoming(FromMapType, FromBB);
8089 CurMapType->addIncoming(MemberMapType, ToElseBB);
8090
8091 Value *OffloadingArgs[] = {MapperHandle, CurBaseArg, CurBeginArg,
8092 CurSizeArg, CurMapType, CurNameArg};
8093 Function *ChildMapperFn = nullptr;
8094 if (CustomMapperCB && CustomMapperCB(I, &ChildMapperFn)) {
8095 // Call the corresponding mapper function.
8096 Builder.CreateCall(ChildMapperFn, OffloadingArgs)->setDoesNotThrow();
8097 } else {
8098 // Call the runtime API __tgt_push_mapper_component to fill up the runtime
8099 // data structure.
8100 Builder.CreateCall(
8101 getOrCreateRuntimeFunction(M, OMPRTL___tgt_push_mapper_component),
8102 OffloadingArgs);
8103 }
8104 }
8105
8106 // Update the pointer to point to the next element that needs to be mapped,
8107 // and check whether we have mapped all elements.
8108 Value *PtrNext = Builder.CreateConstGEP1_32(ElemTy, PtrPHI, /*Idx0=*/1,
8109 "omp.arraymap.next");
8110 PtrPHI->addIncoming(PtrNext, LastBB);
8111 Value *IsDone = Builder.CreateICmpEQ(PtrNext, PtrEnd, "omp.arraymap.isdone");
8112 BasicBlock *ExitBB = BasicBlock::Create(M.getContext(), "omp.arraymap.exit");
8113 Builder.CreateCondBr(IsDone, ExitBB, BodyBB);
8114
8115 emitBlock(ExitBB, MapperFn);
8116 // Emit array deletion if this is an array section and \p MapType indicates
8117 // that deletion is required.
8118 emitUDMapperArrayInitOrDel(MapperFn, MapperHandle, BaseIn, BeginIn, Size,
8119 MapType, MapName, ElementSize, DoneBB,
8120 /*IsInit=*/false);
8121
8122 // Emit the function exit block.
8123 emitBlock(DoneBB, MapperFn, /*IsFinished=*/true);
8124
8126 Builder.restoreIP(SavedIP);
8127 return MapperFn;
8128}
8129
8130 void OpenMPIRBuilder::emitOffloadingArrays(
8131 InsertPointTy AllocaIP, InsertPointTy CodeGenIP, MapInfosTy &CombinedInfo,
8132 TargetDataInfo &Info, bool IsNonContiguous,
8133 function_ref<void(unsigned int, Value *)> DeviceAddrCB,
8134 function_ref<Value *(unsigned int)> CustomMapperCB) {
8135
8136 // Reset the array information.
8137 Info.clearArrayInfo();
8138 Info.NumberOfPtrs = CombinedInfo.BasePointers.size();
8139
8140 if (Info.NumberOfPtrs == 0)
8141 return;
8142
8143 Builder.restoreIP(AllocaIP);
8144 // Detect if we have any capture size requiring runtime evaluation of the
8145 // size so that a constant array could be eventually used.
8146 ArrayType *PointerArrayType =
8147 ArrayType::get(Builder.getPtrTy(), Info.NumberOfPtrs);
8148
8149 Info.RTArgs.BasePointersArray = Builder.CreateAlloca(
8150 PointerArrayType, /* ArraySize = */ nullptr, ".offload_baseptrs");
8151
8152 Info.RTArgs.PointersArray = Builder.CreateAlloca(
8153 PointerArrayType, /* ArraySize = */ nullptr, ".offload_ptrs");
8154 AllocaInst *MappersArray = Builder.CreateAlloca(
8155 PointerArrayType, /* ArraySize = */ nullptr, ".offload_mappers");
8156 Info.RTArgs.MappersArray = MappersArray;
8157
8158 // If we don't have any VLA types or other types that require runtime
8159 // evaluation, we can use a constant array for the map sizes, otherwise we
8160 // need to fill up the arrays as we do for the pointers.
8161 Type *Int64Ty = Builder.getInt64Ty();
8162 SmallVector<Constant *> ConstSizes(CombinedInfo.Sizes.size(),
8163 ConstantInt::get(Int64Ty, 0));
8164 SmallBitVector RuntimeSizes(CombinedInfo.Sizes.size());
8165 for (unsigned I = 0, E = CombinedInfo.Sizes.size(); I < E; ++I) {
8166 if (auto *CI = dyn_cast<Constant>(CombinedInfo.Sizes[I])) {
8167 if (!isa<ConstantExpr>(CI) && !isa<GlobalValue>(CI)) {
8168 if (IsNonContiguous &&
8169 static_cast<std::underlying_type_t<OpenMPOffloadMappingFlags>>(
8170 CombinedInfo.Types[I] &
8171 OpenMPOffloadMappingFlags::OMP_MAP_NON_CONTIG))
8172 ConstSizes[I] =
8173 ConstantInt::get(Int64Ty, CombinedInfo.NonContigInfo.Dims[I]);
8174 else
8175 ConstSizes[I] = CI;
8176 continue;
8177 }
8178 }
8179 RuntimeSizes.set(I);
8180 }
8181
8182 if (RuntimeSizes.all()) {
8183 ArrayType *SizeArrayType = ArrayType::get(Int64Ty, Info.NumberOfPtrs);
8184 Info.RTArgs.SizesArray = Builder.CreateAlloca(
8185 SizeArrayType, /* ArraySize = */ nullptr, ".offload_sizes");
8186 Builder.restoreIP(CodeGenIP);
8187 } else {
8188 auto *SizesArrayInit = ConstantArray::get(
8189 ArrayType::get(Int64Ty, ConstSizes.size()), ConstSizes);
8190 std::string Name = createPlatformSpecificName({"offload_sizes"});
8191 auto *SizesArrayGbl =
8192 new GlobalVariable(M, SizesArrayInit->getType(), /*isConstant=*/true,
8193 GlobalValue::PrivateLinkage, SizesArrayInit, Name);
8194 SizesArrayGbl->setUnnamedAddr(GlobalValue::UnnamedAddr::Global);
8195
8196 if (!RuntimeSizes.any()) {
8197 Info.RTArgs.SizesArray = SizesArrayGbl;
8198 } else {
8199 unsigned IndexSize = M.getDataLayout().getIndexSizeInBits(0);
8200 Align OffloadSizeAlign = M.getDataLayout().getABIIntegerTypeAlignment(64);
8201 ArrayType *SizeArrayType = ArrayType::get(Int64Ty, Info.NumberOfPtrs);
8202 AllocaInst *Buffer = Builder.CreateAlloca(
8203 SizeArrayType, /* ArraySize = */ nullptr, ".offload_sizes");
8204 Buffer->setAlignment(OffloadSizeAlign);
8205 Builder.restoreIP(CodeGenIP);
8206 Builder.CreateMemCpy(
8207 Buffer, M.getDataLayout().getPrefTypeAlign(Buffer->getType()),
8208 SizesArrayGbl, OffloadSizeAlign,
8209 Builder.getIntN(
8210 IndexSize,
8211 Buffer->getAllocationSize(M.getDataLayout())->getFixedValue()));
8212
8213 Info.RTArgs.SizesArray = Buffer;
8214 }
8215 Builder.restoreIP(CodeGenIP);
8216 }
8217
8218 // The map types are always constant so we don't need to generate code to
8219 // fill arrays. Instead, we create an array constant.
8220 SmallVector<uint64_t, 4> Mapping;
8221 for (auto mapFlag : CombinedInfo.Types)
8222 Mapping.push_back(
8223 static_cast<std::underlying_type_t<OpenMPOffloadMappingFlags>>(
8224 mapFlag));
8225 std::string MaptypesName = createPlatformSpecificName({"offload_maptypes"});
8226 auto *MapTypesArrayGbl = createOffloadMaptypes(Mapping, MaptypesName);
8227 Info.RTArgs.MapTypesArray = MapTypesArrayGbl;
8228
8229 // The information types are only built if provided.
8230 if (!CombinedInfo.Names.empty()) {
8231 std::string MapnamesName = createPlatformSpecificName({"offload_mapnames"});
8232 auto *MapNamesArrayGbl =
8233 createOffloadMapnames(CombinedInfo.Names, MapnamesName);
8234 Info.RTArgs.MapNamesArray = MapNamesArrayGbl;
8235 Info.EmitDebug = true;
8236 } else {
8237 Info.RTArgs.MapNamesArray =
8238 Constant::getNullValue(PointerType::getUnqual(Builder.getContext()));
8239 Info.EmitDebug = false;
8240 }
8241
8242 // If there's a present map type modifier, it must not be applied to the end
8243 // of a region, so generate a separate map type array in that case.
8244 if (Info.separateBeginEndCalls()) {
8245 bool EndMapTypesDiffer = false;
8246 for (uint64_t &Type : Mapping) {
8247 if (Type & static_cast<std::underlying_type_t<OpenMPOffloadMappingFlags>>(
8248 OpenMPOffloadMappingFlags::OMP_MAP_PRESENT)) {
8249 Type &= ~static_cast<std::underlying_type_t<OpenMPOffloadMappingFlags>>(
8250 OpenMPOffloadMappingFlags::OMP_MAP_PRESENT);
8251 EndMapTypesDiffer = true;
8252 }
8253 }
8254 if (EndMapTypesDiffer) {
8255 MapTypesArrayGbl = createOffloadMaptypes(Mapping, MaptypesName);
8256 Info.RTArgs.MapTypesArrayEnd = MapTypesArrayGbl;
8257 }
8258 }
8259
8260 PointerType *PtrTy = Builder.getPtrTy();
8261 for (unsigned I = 0; I < Info.NumberOfPtrs; ++I) {
8262 Value *BPVal = CombinedInfo.BasePointers[I];
8263 Value *BP = Builder.CreateConstInBoundsGEP2_32(
8264 ArrayType::get(PtrTy, Info.NumberOfPtrs), Info.RTArgs.BasePointersArray,
8265 0, I);
8266 Builder.CreateAlignedStore(BPVal, BP,
8267 M.getDataLayout().getPrefTypeAlign(PtrTy));
8268
8269 if (Info.requiresDevicePointerInfo()) {
8270 if (CombinedInfo.DevicePointers[I] == DeviceInfoTy::Pointer) {
8271 CodeGenIP = Builder.saveIP();
8272 Builder.restoreIP(AllocaIP);
8273 Info.DevicePtrInfoMap[BPVal] = {BP, Builder.CreateAlloca(PtrTy)};
8274 Builder.restoreIP(CodeGenIP);
8275 if (DeviceAddrCB)
8276 DeviceAddrCB(I, Info.DevicePtrInfoMap[BPVal].second);
8277 } else if (CombinedInfo.DevicePointers[I] == DeviceInfoTy::Address) {
8278 Info.DevicePtrInfoMap[BPVal] = {BP, BP};
8279 if (DeviceAddrCB)
8280 DeviceAddrCB(I, BP);
8281 }
8282 }
8283
8284 Value *PVal = CombinedInfo.Pointers[I];
8285 Value *P = Builder.CreateConstInBoundsGEP2_32(
8286 ArrayType::get(PtrTy, Info.NumberOfPtrs), Info.RTArgs.PointersArray, 0,
8287 I);
8288 // TODO: Check that the alignment is correct.
8289 Builder.CreateAlignedStore(PVal, P,
8290 M.getDataLayout().getPrefTypeAlign(PtrTy));
8291
8292 if (RuntimeSizes.test(I)) {
8293 Value *S = Builder.CreateConstInBoundsGEP2_32(
8294 ArrayType::get(Int64Ty, Info.NumberOfPtrs), Info.RTArgs.SizesArray,
8295 /*Idx0=*/0,
8296 /*Idx1=*/I);
8297 Builder.CreateAlignedStore(Builder.CreateIntCast(CombinedInfo.Sizes[I],
8298 Int64Ty,
8299 /*isSigned=*/true),
8300 S, M.getDataLayout().getPrefTypeAlign(PtrTy));
8301 }
8302 // Fill up the mapper array.
8303 unsigned IndexSize = M.getDataLayout().getIndexSizeInBits(0);
8304 Value *MFunc = ConstantPointerNull::get(PtrTy);
8305 if (CustomMapperCB)
8306 if (Value *CustomMFunc = CustomMapperCB(I))
8307 MFunc = Builder.CreatePointerCast(CustomMFunc, PtrTy);
8308 Value *MAddr = Builder.CreateInBoundsGEP(
8309 MappersArray->getAllocatedType(), MappersArray,
8310 {Builder.getIntN(IndexSize, 0), Builder.getIntN(IndexSize, I)});
8311 Builder.CreateAlignedStore(
8312 MFunc, MAddr, M.getDataLayout().getPrefTypeAlign(MAddr->getType()));
8313 }
8314
8315 if (!IsNonContiguous || CombinedInfo.NonContigInfo.Offsets.empty() ||
8316 Info.NumberOfPtrs == 0)
8317 return;
8318 emitNonContiguousDescriptor(AllocaIP, CodeGenIP, CombinedInfo, Info);
8319}
8320
8321 void OpenMPIRBuilder::emitBranch(BasicBlock *Target) {
8322 BasicBlock *CurBB = Builder.GetInsertBlock();
8323
8324 if (!CurBB || CurBB->getTerminator()) {
8325 // If there is no insert point or the previous block is already
8326 // terminated, don't touch it.
8327 } else {
8328 // Otherwise, create a fall-through branch.
8329 Builder.CreateBr(Target);
8330 }
8331
8332 Builder.ClearInsertionPoint();
8333 }
8334
8335 void OpenMPIRBuilder::emitBlock(BasicBlock *BB, Function *CurFn,
8336 bool IsFinished) {
8337 BasicBlock *CurBB = Builder.GetInsertBlock();
8338
8339 // Fall out of the current block (if necessary).
8340 emitBranch(BB);
8341
8342 if (IsFinished && BB->use_empty()) {
8343 BB->eraseFromParent();
8344 return;
8345 }
8346
8347 // Place the block after the current block, if possible, or else at
8348 // the end of the function.
8349 if (CurBB && CurBB->getParent())
8350 CurFn->insert(std::next(CurBB->getIterator()), BB);
8351 else
8352 CurFn->insert(CurFn->end(), BB);
8353 Builder.SetInsertPoint(BB);
8354 }
8355
8356 Error OpenMPIRBuilder::emitIfClause(Value *Cond, BodyGenCallbackTy ThenGen,
8357 BodyGenCallbackTy ElseGen,
8358 InsertPointTy AllocaIP) {
8359 // If the condition constant folds and can be elided, try to avoid emitting
8360 // the condition and the dead arm of the if/else.
8361 if (auto *CI = dyn_cast<ConstantInt>(Cond)) {
8362 auto CondConstant = CI->getSExtValue();
8363 if (CondConstant)
8364 return ThenGen(AllocaIP, Builder.saveIP());
8365
8366 return ElseGen(AllocaIP, Builder.saveIP());
8367 }
8368
8369 Function *CurFn = Builder.GetInsertBlock()->getParent();
8370
8371 // Otherwise, the condition did not fold, or we couldn't elide it. Just
8372 // emit the conditional branch.
8373 BasicBlock *ThenBlock = BasicBlock::Create(M.getContext(), "omp_if.then");
8374 BasicBlock *ElseBlock = BasicBlock::Create(M.getContext(), "omp_if.else");
8375 BasicBlock *ContBlock = BasicBlock::Create(M.getContext(), "omp_if.end");
8376 Builder.CreateCondBr(Cond, ThenBlock, ElseBlock);
8377 // Emit the 'then' code.
8378 emitBlock(ThenBlock, CurFn);
8379 if (Error Err = ThenGen(AllocaIP, Builder.saveIP()))
8380 return Err;
8381 emitBranch(ContBlock);
8382 // Emit the 'else' code if present.
8383 // There is no need to emit line number for unconditional branch.
8384 emitBlock(ElseBlock, CurFn);
8385 if (Error Err = ElseGen(AllocaIP, Builder.saveIP()))
8386 return Err;
8387 // There is no need to emit line number for unconditional branch.
8388 emitBranch(ContBlock);
8389 // Emit the continuation block for code after the if.
8390 emitBlock(ContBlock, CurFn, /*IsFinished=*/true);
8391 return Error::success();
8392}
8393
8394bool OpenMPIRBuilder::checkAndEmitFlushAfterAtomic(
8395 const LocationDescription &Loc, llvm::AtomicOrdering AO, AtomicKind AK) {
8396 assert(!(AO == AtomicOrdering::NotAtomic ||
8397 AO == AtomicOrdering::Unordered) &&
8398 "Unexpected Atomic Ordering.");
8399
8400 bool Flush = false;
8401 AtomicOrdering FlushAO = AtomicOrdering::Monotonic;
8402
8403 switch (AK) {
8404 case Read:
8405 if (AO == AtomicOrdering::Acquire || AO == AtomicOrdering::AcquireRelease ||
8406 AO == AtomicOrdering::SequentiallyConsistent) {
8407 FlushAO = AtomicOrdering::Acquire;
8408 Flush = true;
8409 }
8410 break;
8411 case Write:
8412 case Compare:
8413 case Update:
8414 if (AO == AtomicOrdering::Release || AO == AtomicOrdering::AcquireRelease ||
8415 AO == AtomicOrdering::SequentiallyConsistent) {
8416 FlushAO = AtomicOrdering::Release;
8417 Flush = true;
8418 }
8419 break;
8420 case Capture:
8421 switch (AO) {
8422 case AtomicOrdering::Acquire:
8423 FlushAO = AtomicOrdering::Acquire;
8424 Flush = true;
8425 break;
8426 case AtomicOrdering::Release:
8427 FlushAO = AtomicOrdering::Release;
8428 Flush = true;
8429 break;
8430 case AtomicOrdering::AcquireRelease:
8431 case AtomicOrdering::SequentiallyConsistent:
8432 FlushAO = AtomicOrdering::AcquireRelease;
8433 Flush = true;
8434 break;
8435 default:
8436 // do nothing - leave silently.
8437 break;
8438 }
8439 }
8440
8441 if (Flush) {
8442 // The Flush RT call currently does not take a memory_ordering argument, so
8443 // until that is supported, this resolves which atomic ordering the flush
8444 // would need, but still issues the plain flush call.
8445 // TODO: pass `FlushAO` after memory ordering support is added
8446 (void)FlushAO;
8447 emitFlush(Loc);
8448 }
8449
8450 // for AO == AtomicOrdering::Monotonic and all other case combinations
8451 // do nothing
8452 return Flush;
8453}
8454
8455 OpenMPIRBuilder::InsertPointTy
8456 OpenMPIRBuilder::createAtomicRead(const LocationDescription &Loc,
8457 AtomicOpValue &X, AtomicOpValue &V,
8458 AtomicOrdering AO) {
8459 if (!updateToLocation(Loc))
8460 return Loc.IP;
8461
8462 assert(X.Var->getType()->isPointerTy() &&
8463 "OMP Atomic expects a pointer to target memory");
8464 Type *XElemTy = X.ElemTy;
8465 assert((XElemTy->isFloatingPointTy() || XElemTy->isIntegerTy() ||
8466 XElemTy->isPointerTy() || XElemTy->isStructTy()) &&
8467 "OMP atomic read expected a scalar type");
8468
8469 Value *XRead = nullptr;
8470
8471 if (XElemTy->isIntegerTy()) {
8472 LoadInst *XLD =
8473 Builder.CreateLoad(XElemTy, X.Var, X.IsVolatile, "omp.atomic.read");
8474 XLD->setAtomic(AO);
8475 XRead = cast<Value>(XLD);
8476 } else if (XElemTy->isStructTy()) {
8477 // FIXME: Add checks to ensure __atomic_load is emitted iff the
8478 // target does not support `atomicrmw` of the size of the struct
8479 LoadInst *OldVal = Builder.CreateLoad(XElemTy, X.Var, "omp.atomic.read");
8480 OldVal->setAtomic(AO);
8481 const DataLayout &LoadDL = OldVal->getModule()->getDataLayout();
8482 unsigned LoadSize =
8483 LoadDL.getTypeStoreSize(OldVal->getPointerOperand()->getType());
8484 OpenMPIRBuilder::AtomicInfo atomicInfo(
8485 &Builder, XElemTy, LoadSize * 8, LoadSize * 8, OldVal->getAlign(),
8486 OldVal->getAlign(), true /* UseLibcall */, X.Var);
8487 auto AtomicLoadRes = atomicInfo.EmitAtomicLoadLibcall(AO);
8488 XRead = AtomicLoadRes.first;
8489 OldVal->eraseFromParent();
8490 } else {
8491 // We need to perform atomic op as integer
8492 IntegerType *IntCastTy =
8493 IntegerType::get(M.getContext(), XElemTy->getScalarSizeInBits());
8494 LoadInst *XLoad =
8495 Builder.CreateLoad(IntCastTy, X.Var, X.IsVolatile, "omp.atomic.load");
8496 XLoad->setAtomic(AO);
8497 if (XElemTy->isFloatingPointTy()) {
8498 XRead = Builder.CreateBitCast(XLoad, XElemTy, "atomic.flt.cast");
8499 } else {
8500 XRead = Builder.CreateIntToPtr(XLoad, XElemTy, "atomic.ptr.cast");
8501 }
8502 }
8503 checkAndEmitFlushAfterAtomic(Loc, AO, AtomicKind::Read);
8504 Builder.CreateStore(XRead, V.Var, V.IsVolatile);
8505 return Builder.saveIP();
8506}
8507
8508 OpenMPIRBuilder::InsertPointTy
8509 OpenMPIRBuilder::createAtomicWrite(const LocationDescription &Loc,
8510 AtomicOpValue &X, Value *Expr,
8511 AtomicOrdering AO) {
8512 if (!updateToLocation(Loc))
8513 return Loc.IP;
8514
8515 assert(X.Var->getType()->isPointerTy() &&
8516 "OMP Atomic expects a pointer to target memory");
8517 Type *XElemTy = X.ElemTy;
8518 assert((XElemTy->isFloatingPointTy() || XElemTy->isIntegerTy() ||
8519 XElemTy->isPointerTy()) &&
8520 "OMP atomic write expected a scalar type");
8521
8522 if (XElemTy->isIntegerTy()) {
8523 StoreInst *XSt = Builder.CreateStore(Expr, X.Var, X.IsVolatile);
8524 XSt->setAtomic(AO);
8525 } else {
8526 // We need to bitcast and perform atomic op as integers
8527 IntegerType *IntCastTy =
8528 IntegerType::get(M.getContext(), XElemTy->getScalarSizeInBits());
8529 Value *ExprCast =
8530 Builder.CreateBitCast(Expr, IntCastTy, "atomic.src.int.cast");
8531 StoreInst *XSt = Builder.CreateStore(ExprCast, X.Var, X.IsVolatile);
8532 XSt->setAtomic(AO);
8533 }
8534
8535 checkAndEmitFlushAfterAtomic(Loc, AO, AtomicKind::Write);
8536 return Builder.saveIP();
8537}
8538
8539 OpenMPIRBuilder::InsertPointOrErrorTy OpenMPIRBuilder::createAtomicUpdate(
8540 const LocationDescription &Loc, InsertPointTy AllocaIP, AtomicOpValue &X,
8541 Value *Expr, AtomicOrdering AO, AtomicRMWInst::BinOp RMWOp,
8542 AtomicUpdateCallbackTy &UpdateOp, bool IsXBinopExpr) {
8543 assert(!isConflictIP(Loc.IP, AllocaIP) && "IPs must not be ambiguous");
8544 if (!updateToLocation(Loc))
8545 return Loc.IP;
8546
8547 LLVM_DEBUG({
8548 Type *XTy = X.Var->getType();
8549 assert(XTy->isPointerTy() &&
8550 "OMP Atomic expects a pointer to target memory");
8551 Type *XElemTy = X.ElemTy;
8552 assert((XElemTy->isFloatingPointTy() || XElemTy->isIntegerTy() ||
8553 XElemTy->isPointerTy()) &&
8554 "OMP atomic update expected a scalar type");
8555 assert((RMWOp != AtomicRMWInst::Max) && (RMWOp != AtomicRMWInst::Min) &&
8556 (RMWOp != AtomicRMWInst::UMax) && (RMWOp != AtomicRMWInst::UMin) &&
8557 "OpenMP atomic does not support LT or GT operations");
8558 });
8559
8560 Expected<std::pair<Value *, Value *>> AtomicResult =
8561 emitAtomicUpdate(AllocaIP, X.Var, X.ElemTy, Expr, AO, RMWOp, UpdateOp,
8562 X.IsVolatile, IsXBinopExpr);
8563 if (!AtomicResult)
8564 return AtomicResult.takeError();
8565 checkAndEmitFlushAfterAtomic(Loc, AO, AtomicKind::Update);
8566 return Builder.saveIP();
8567}
8568
8569// FIXME: Duplicating AtomicExpand
8570Value *OpenMPIRBuilder::emitRMWOpAsInstruction(Value *Src1, Value *Src2,
8571 AtomicRMWInst::BinOp RMWOp) {
8572 switch (RMWOp) {
8573 case AtomicRMWInst::Add:
8574 return Builder.CreateAdd(Src1, Src2);
8575 case AtomicRMWInst::Sub:
8576 return Builder.CreateSub(Src1, Src2);
8577 case AtomicRMWInst::And:
8578 return Builder.CreateAnd(Src1, Src2);
8579 case AtomicRMWInst::Nand:
8580 return Builder.CreateNeg(Builder.CreateAnd(Src1, Src2));
8581 case AtomicRMWInst::Or:
8582 return Builder.CreateOr(Src1, Src2);
8583 case AtomicRMWInst::Xor:
8584 return Builder.CreateXor(Src1, Src2);
8585 case AtomicRMWInst::Xchg:
8586 case AtomicRMWInst::FAdd:
8587 case AtomicRMWInst::FSub:
8588 case AtomicRMWInst::BAD_BINOP:
8589 case AtomicRMWInst::Max:
8590 case AtomicRMWInst::Min:
8591 case AtomicRMWInst::UMax:
8592 case AtomicRMWInst::UMin:
8593 case AtomicRMWInst::FMax:
8594 case AtomicRMWInst::FMin:
8595 case AtomicRMWInst::UIncWrap:
8596 case AtomicRMWInst::UDecWrap:
8597 case AtomicRMWInst::USubCond:
8598 case AtomicRMWInst::USubSat:
8599 llvm_unreachable("Unsupported atomic update operation");
8600 }
8601 llvm_unreachable("Unsupported atomic update operation");
8602}
8603
8604Expected<std::pair<Value *, Value *>> OpenMPIRBuilder::emitAtomicUpdate(
8605 InsertPointTy AllocaIP, Value *X, Type *XElemTy, Value *Expr,
8606 AtomicOrdering AO, AtomicRMWInst::BinOp RMWOp,
8607 AtomicUpdateCallbackTy &UpdateOp, bool VolatileX, bool IsXBinopExpr) {
8608 // TODO: handle the case where XElemTy is not byte-sized or not a power of 2
8609 // or a complex datatype.
8610 bool emitRMWOp = false;
8611 switch (RMWOp) {
8612 case AtomicRMWInst::Add:
8613 case AtomicRMWInst::And:
8614 case AtomicRMWInst::Nand:
8615 case AtomicRMWInst::Or:
8616 case AtomicRMWInst::Xor:
8617 case AtomicRMWInst::Xchg:
8618 emitRMWOp = XElemTy;
8619 break;
8620 case AtomicRMWInst::Sub:
8621 emitRMWOp = (IsXBinopExpr && XElemTy);
8622 break;
8623 default:
8624 emitRMWOp = false;
8625 }
8626 emitRMWOp &= XElemTy->isIntegerTy();
8627
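// [Editor's note, illustrative] E.g. an integer 'x += expr' (RMWOp == Add)
// takes the fast path below and lowers to a single 'atomicrmw add' on %x
// instead of the compare-exchange loop emitted by the generic branch.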
8628 std::pair<Value *, Value *> Res;
8629 if (emitRMWOp) {
8630 Res.first = Builder.CreateAtomicRMW(RMWOp, X, Expr, llvm::MaybeAlign(), AO);
8631 // Not needed except in case of postfix captures. Generated anyway for
8632 // consistency with the else part; will be removed by any DCE pass.
8633 // AtomicRMWInst::Xchg does not have a corresponding instruction.
8634 if (RMWOp == AtomicRMWInst::Xchg)
8635 Res.second = Res.first;
8636 else
8637 Res.second = emitRMWOpAsInstruction(Res.first, Expr, RMWOp);
8638 } else if (RMWOp == llvm::AtomicRMWInst::BinOp::BAD_BINOP &&
8639 XElemTy->isStructTy()) {
8640 LoadInst *OldVal =
8641 Builder.CreateLoad(XElemTy, X, X->getName() + ".atomic.load");
8642 OldVal->setAtomic(AO);
8643 const DataLayout &LoadDL = OldVal->getModule()->getDataLayout();
8644 unsigned LoadSize =
8645 LoadDL.getTypeStoreSize(OldVal->getPointerOperand()->getType());
8646
8647 OpenMPIRBuilder::AtomicInfo atomicInfo(
8648 &Builder, XElemTy, LoadSize * 8, LoadSize * 8, OldVal->getAlign(),
8649 OldVal->getAlign(), true /* UseLibcall */, X);
8650 auto AtomicLoadRes = atomicInfo.EmitAtomicLoadLibcall(AO);
8651 BasicBlock *CurBB = Builder.GetInsertBlock();
8652 Instruction *CurBBTI = CurBB->getTerminator();
8653 CurBBTI = CurBBTI ? CurBBTI : Builder.CreateUnreachable();
8654 BasicBlock *ExitBB =
8655 CurBB->splitBasicBlock(CurBBTI, X->getName() + ".atomic.exit");
8656 BasicBlock *ContBB = CurBB->splitBasicBlock(CurBB->getTerminator(),
8657 X->getName() + ".atomic.cont");
8658 ContBB->getTerminator()->eraseFromParent();
8659 Builder.restoreIP(AllocaIP);
8660 AllocaInst *NewAtomicAddr = Builder.CreateAlloca(XElemTy);
8661 NewAtomicAddr->setName(X->getName() + "x.new.val");
8662 Builder.SetInsertPoint(ContBB);
8663 llvm::PHINode *PHI = Builder.CreatePHI(OldVal->getType(), 2);
8664 PHI->addIncoming(AtomicLoadRes.first, CurBB);
8665 Value *OldExprVal = PHI;
8666 Expected<Value *> CBResult = UpdateOp(OldExprVal, Builder);
8667 if (!CBResult)
8668 return CBResult.takeError();
8669 Value *Upd = *CBResult;
8670 Builder.CreateStore(Upd, NewAtomicAddr);
8671 AtomicOrdering Failure =
8672 llvm::AtomicCmpXchgInst::getStrongestFailureOrdering(AO);
8673 auto Result = atomicInfo.EmitAtomicCompareExchangeLibcall(
8674 AtomicLoadRes.second, NewAtomicAddr, AO, Failure);
8675 LoadInst *PHILoad = Builder.CreateLoad(XElemTy, Result.first);
8676 PHI->addIncoming(PHILoad, Builder.GetInsertBlock());
8677 Builder.CreateCondBr(Result.second, ExitBB, ContBB);
8678 OldVal->eraseFromParent();
8679 Res.first = OldExprVal;
8680 Res.second = Upd;
8681
8682 if (UnreachableInst *ExitTI =
8683 dyn_cast<UnreachableInst>(ExitBB->getTerminator())) {
8684 CurBBTI->eraseFromParent();
8685 Builder.SetInsertPoint(ExitBB);
8686 } else {
8687 Builder.SetInsertPoint(ExitTI);
8688 }
8689 } else {
8690 IntegerType *IntCastTy =
8691 IntegerType::get(M.getContext(), XElemTy->getScalarSizeInBits());
8692 LoadInst *OldVal =
8693 Builder.CreateLoad(IntCastTy, X, X->getName() + ".atomic.load");
8694 OldVal->setAtomic(AO);
8695 // CurBB
8696 // | /---\
8697 // ContBB |
8698 // | \---/
8699 // ExitBB
8700 BasicBlock *CurBB = Builder.GetInsertBlock();
8701 Instruction *CurBBTI = CurBB->getTerminator();
8702 CurBBTI = CurBBTI ? CurBBTI : Builder.CreateUnreachable();
8703 BasicBlock *ExitBB =
8704 CurBB->splitBasicBlock(CurBBTI, X->getName() + ".atomic.exit");
8705 BasicBlock *ContBB = CurBB->splitBasicBlock(CurBB->getTerminator(),
8706 X->getName() + ".atomic.cont");
8707 ContBB->getTerminator()->eraseFromParent();
8708 Builder.restoreIP(AllocaIP);
8709 AllocaInst *NewAtomicAddr = Builder.CreateAlloca(XElemTy);
8710 NewAtomicAddr->setName(X->getName() + "x.new.val");
8711 Builder.SetInsertPoint(ContBB);
8712 llvm::PHINode *PHI = Builder.CreatePHI(OldVal->getType(), 2);
8713 PHI->addIncoming(OldVal, CurBB);
8714 bool IsIntTy = XElemTy->isIntegerTy();
8715 Value *OldExprVal = PHI;
8716 if (!IsIntTy) {
8717 if (XElemTy->isFloatingPointTy()) {
8718 OldExprVal = Builder.CreateBitCast(PHI, XElemTy,
8719 X->getName() + ".atomic.fltCast");
8720 } else {
8721 OldExprVal = Builder.CreateIntToPtr(PHI, XElemTy,
8722 X->getName() + ".atomic.ptrCast");
8723 }
8724 }
8725
8726 Expected<Value *> CBResult = UpdateOp(OldExprVal, Builder);
8727 if (!CBResult)
8728 return CBResult.takeError();
8729 Value *Upd = *CBResult;
8730 Builder.CreateStore(Upd, NewAtomicAddr);
8731 LoadInst *DesiredVal = Builder.CreateLoad(IntCastTy, NewAtomicAddr);
8732 AtomicOrdering Failure =
8733 llvm::AtomicCmpXchgInst::getStrongestFailureOrdering(AO);
8734 AtomicCmpXchgInst *Result = Builder.CreateAtomicCmpXchg(
8735 X, PHI, DesiredVal, llvm::MaybeAlign(), AO, Failure);
8736 Result->setVolatile(VolatileX);
8737 Value *PreviousVal = Builder.CreateExtractValue(Result, /*Idxs=*/0);
8738 Value *SuccessFailureVal = Builder.CreateExtractValue(Result, /*Idxs=*/1);
8739 PHI->addIncoming(PreviousVal, Builder.GetInsertBlock());
8740 Builder.CreateCondBr(SuccessFailureVal, ExitBB, ContBB);
8741
8742 Res.first = OldExprVal;
8743 Res.second = Upd;
8744
8745 // set Insertion point in exit block
8746 if (UnreachableInst *ExitTI =
8747 dyn_cast<UnreachableInst>(ExitBB->getTerminator())) {
8748 CurBBTI->eraseFromParent();
8749 Builder.SetInsertPoint(ExitBB);
8750 } else {
8751 Builder.SetInsertPoint(ExitTI);
8752 }
8753 }
8754
8755 return Res;
8756}
8757
8758 OpenMPIRBuilder::InsertPointOrErrorTy OpenMPIRBuilder::createAtomicCapture(
8759 const LocationDescription &Loc, InsertPointTy AllocaIP, AtomicOpValue &X,
8760 AtomicOpValue &V, Value *Expr, AtomicOrdering AO,
8761 AtomicRMWInst::BinOp RMWOp, AtomicUpdateCallbackTy &UpdateOp,
8762 bool UpdateExpr, bool IsPostfixUpdate, bool IsXBinopExpr) {
8763 if (!updateToLocation(Loc))
8764 return Loc.IP;
8765
8766 LLVM_DEBUG({
8767 Type *XTy = X.Var->getType();
8768 assert(XTy->isPointerTy() &&
8769 "OMP Atomic expects a pointer to target memory");
8770 Type *XElemTy = X.ElemTy;
8771 assert((XElemTy->isFloatingPointTy() || XElemTy->isIntegerTy() ||
8772 XElemTy->isPointerTy()) &&
8773 "OMP atomic capture expected a scalar type");
8774 assert((RMWOp != AtomicRMWInst::Max) && (RMWOp != AtomicRMWInst::Min) &&
8775 "OpenMP atomic does not support LT or GT operations");
8776 });
8777
8778 // If UpdateExpr is 'x' updated with some `expr` not based on 'x',
8779 // 'x' is simply atomically rewritten with 'expr'.
8780 AtomicRMWInst::BinOp AtomicOp = (UpdateExpr ? RMWOp : AtomicRMWInst::Xchg);
8781 Expected<std::pair<Value *, Value *>> AtomicResult =
8782 emitAtomicUpdate(AllocaIP, X.Var, X.ElemTy, Expr, AO, AtomicOp, UpdateOp,
8783 X.IsVolatile, IsXBinopExpr);
8784 if (!AtomicResult)
8785 return AtomicResult.takeError();
8786 Value *CapturedVal =
8787 (IsPostfixUpdate ? AtomicResult->first : AtomicResult->second);
8788 Builder.CreateStore(CapturedVal, V.Var, V.IsVolatile);
8789
8790 checkAndEmitFlushAfterAtomic(Loc, AO, AtomicKind::Capture);
8791 return Builder.saveIP();
8792}
8793
8794 OpenMPIRBuilder::InsertPointTy OpenMPIRBuilder::createAtomicCompare(
8795 const LocationDescription &Loc, AtomicOpValue &X, AtomicOpValue &V,
8796 AtomicOpValue &R, Value *E, Value *D, AtomicOrdering AO,
8797 omp::OMPAtomicCompareOp Op, bool IsXBinopExpr, bool IsPostfixUpdate,
8798 bool IsFailOnly) {
8799
8800 AtomicOrdering Failure = AtomicCmpXchgInst::getStrongestFailureOrdering(AO);
8801 return createAtomicCompare(Loc, X, V, R, E, D, AO, Op, IsXBinopExpr,
8802 IsPostfixUpdate, IsFailOnly, Failure);
8803}
8804
8805 OpenMPIRBuilder::InsertPointTy OpenMPIRBuilder::createAtomicCompare(
8806 const LocationDescription &Loc, AtomicOpValue &X, AtomicOpValue &V,
8807 AtomicOpValue &R, Value *E, Value *D, AtomicOrdering AO,
8808 omp::OMPAtomicCompareOp Op, bool IsXBinopExpr, bool IsPostfixUpdate,
8809 bool IsFailOnly, AtomicOrdering Failure) {
8810
8811 if (!updateToLocation(Loc))
8812 return Loc.IP;
8813
8814 assert(X.Var->getType()->isPointerTy() &&
8815 "OMP atomic expects a pointer to target memory");
8816 // compare capture
8817 if (V.Var) {
8818 assert(V.Var->getType()->isPointerTy() && "v.var must be of pointer type");
8819 assert(V.ElemTy == X.ElemTy && "x and v must be of same type");
8820 }
8821
8822 bool IsInteger = E->getType()->isIntegerTy();
8823
8824 if (Op == OMPAtomicCompareOp::EQ) {
8825 AtomicCmpXchgInst *Result = nullptr;
8826 if (!IsInteger) {
8827 IntegerType *IntCastTy =
8828 IntegerType::get(M.getContext(), X.ElemTy->getScalarSizeInBits());
8829 Value *EBCast = Builder.CreateBitCast(E, IntCastTy);
8830 Value *DBCast = Builder.CreateBitCast(D, IntCastTy);
8831 Result = Builder.CreateAtomicCmpXchg(X.Var, EBCast, DBCast, MaybeAlign(),
8832 AO, Failure);
8833 } else {
8834 Result =
8835 Builder.CreateAtomicCmpXchg(X.Var, E, D, MaybeAlign(), AO, Failure);
8836 }
8837
8838 if (V.Var) {
8839 Value *OldValue = Builder.CreateExtractValue(Result, /*Idxs=*/0);
8840 if (!IsInteger)
8841 OldValue = Builder.CreateBitCast(OldValue, X.ElemTy);
8842 assert(OldValue->getType() == V.ElemTy &&
8843 "OldValue and V must be of same type");
8844 if (IsPostfixUpdate) {
8845 Builder.CreateStore(OldValue, V.Var, V.IsVolatile);
8846 } else {
8847 Value *SuccessOrFail = Builder.CreateExtractValue(Result, /*Idxs=*/1);
8848 if (IsFailOnly) {
8849 // CurBB----
8850 // | |
8851 // v |
8852 // ContBB |
8853 // | |
8854 // v |
8855 // ExitBB <-
8856 //
8857 // where ContBB only contains the store of old value to 'v'.
8858 BasicBlock *CurBB = Builder.GetInsertBlock();
8859 Instruction *CurBBTI = CurBB->getTerminator();
8860 CurBBTI = CurBBTI ? CurBBTI : Builder.CreateUnreachable();
8861 BasicBlock *ExitBB = CurBB->splitBasicBlock(
8862 CurBBTI, X.Var->getName() + ".atomic.exit");
8863 BasicBlock *ContBB = CurBB->splitBasicBlock(
8864 CurBB->getTerminator(), X.Var->getName() + ".atomic.cont");
8865 ContBB->getTerminator()->eraseFromParent();
8866 CurBB->getTerminator()->eraseFromParent();
8867
8868 Builder.CreateCondBr(SuccessOrFail, ExitBB, ContBB);
8869
8870 Builder.SetInsertPoint(ContBB);
8871 Builder.CreateStore(OldValue, V.Var);
8872 Builder.CreateBr(ExitBB);
8873
8874 if (UnreachableInst *ExitTI =
8875 dyn_cast<UnreachableInst>(ExitBB->getTerminator())) {
8876 CurBBTI->eraseFromParent();
8877 Builder.SetInsertPoint(ExitBB);
8878 } else {
8879 Builder.SetInsertPoint(ExitTI);
8880 }
8881 } else {
8882 Value *CapturedValue =
8883 Builder.CreateSelect(SuccessOrFail, E, OldValue);
8884 Builder.CreateStore(CapturedValue, V.Var, V.IsVolatile);
8885 }
8886 }
8887 }
8888 // The comparison result has to be stored.
8889 if (R.Var) {
8890 assert(R.Var->getType()->isPointerTy() &&
8891 "r.var must be of pointer type");
8892 assert(R.ElemTy->isIntegerTy() && "r must be of integral type");
8893
8894 Value *SuccessFailureVal = Builder.CreateExtractValue(Result, /*Idxs=*/1);
8895 Value *ResultCast = R.IsSigned
8896 ? Builder.CreateSExt(SuccessFailureVal, R.ElemTy)
8897 : Builder.CreateZExt(SuccessFailureVal, R.ElemTy);
8898 Builder.CreateStore(ResultCast, R.Var, R.IsVolatile);
8899 }
8900 } else {
8901 assert((Op == OMPAtomicCompareOp::MAX || Op == OMPAtomicCompareOp::MIN) &&
8902 "Op should be either max or min at this point");
8903 assert(!IsFailOnly && "IsFailOnly is only valid when the comparison is ==");
8904
8905 // Reverse the ordop as the OpenMP forms are different from LLVM forms.
8906 // Let's take max as example.
8907 // OpenMP form:
8908 // x = x > expr ? expr : x;
8909 // LLVM form:
8910 // *ptr = *ptr > val ? *ptr : val;
8911 // We need to transform to LLVM form.
8912 // x = x <= expr ? x : expr;
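// [Editor's note, illustrative] For signed integers, 'x = x > e ? e : x' --
// an OpenMP MAX form with IsXBinopExpr set -- stores the smaller value, so it
// is emitted as 'atomicrmw min' per the selection below.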
8913 AtomicRMWInst::BinOp NewOp;
8914 if (IsXBinopExpr) {
8915 if (IsInteger) {
8916 if (X.IsSigned)
8917 NewOp = Op == OMPAtomicCompareOp::MAX ? AtomicRMWInst::Min
8918 : AtomicRMWInst::Max;
8919 else
8920 NewOp = Op == OMPAtomicCompareOp::MAX ? AtomicRMWInst::UMin
8921 : AtomicRMWInst::UMax;
8922 } else {
8923 NewOp = Op == OMPAtomicCompareOp::MAX ? AtomicRMWInst::FMin
8924 : AtomicRMWInst::FMax;
8925 }
8926 } else {
8927 if (IsInteger) {
8928 if (X.IsSigned)
8929 NewOp = Op == OMPAtomicCompareOp::MAX ? AtomicRMWInst::Max
8930 : AtomicRMWInst::Min;
8931 else
8932 NewOp = Op == OMPAtomicCompareOp::MAX ? AtomicRMWInst::UMax
8933 : AtomicRMWInst::UMin;
8934 } else {
8935 NewOp = Op == OMPAtomicCompareOp::MAX ? AtomicRMWInst::FMax
8936 : AtomicRMWInst::FMin;
8937 }
8938 }
8939
8940 AtomicRMWInst *OldValue =
8941 Builder.CreateAtomicRMW(NewOp, X.Var, E, MaybeAlign(), AO);
8942 if (V.Var) {
8943 Value *CapturedValue = nullptr;
8944 if (IsPostfixUpdate) {
8945 CapturedValue = OldValue;
8946 } else {
8947 CmpInst::Predicate Pred;
8948 switch (NewOp) {
8949 case AtomicRMWInst::Max:
8950 Pred = CmpInst::ICMP_SGT;
8951 break;
8952 case AtomicRMWInst::UMax:
8953 Pred = CmpInst::ICMP_UGT;
8954 break;
8955 case AtomicRMWInst::FMax:
8956 Pred = CmpInst::FCMP_OGT;
8957 break;
8958 case AtomicRMWInst::Min:
8959 Pred = CmpInst::ICMP_SLT;
8960 break;
8961 case AtomicRMWInst::UMin:
8962 Pred = CmpInst::ICMP_ULT;
8963 break;
8964 case AtomicRMWInst::FMin:
8965 Pred = CmpInst::FCMP_OLT;
8966 break;
8967 default:
8968 llvm_unreachable("unexpected comparison op");
8969 }
8970 Value *NonAtomicCmp = Builder.CreateCmp(Pred, OldValue, E);
8971 CapturedValue = Builder.CreateSelect(NonAtomicCmp, E, OldValue);
8972 }
8973 Builder.CreateStore(CapturedValue, V.Var, V.IsVolatile);
8974 }
8975 }
8976
8977 checkAndEmitFlushAfterAtomic(Loc, AO, AtomicKind::Compare);
8978
8979 return Builder.saveIP();
8980}
8981
8982 OpenMPIRBuilder::InsertPointOrErrorTy
8983 OpenMPIRBuilder::createTeams(const LocationDescription &Loc,
8984 BodyGenCallbackTy BodyGenCB, Value *NumTeamsLower,
8985 Value *NumTeamsUpper, Value *ThreadLimit,
8986 Value *IfExpr) {
8987 if (!updateToLocation(Loc))
8988 return InsertPointTy();
8989
8990 uint32_t SrcLocStrSize;
8991 Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
8992 Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
8993 Function *CurrentFunction = Builder.GetInsertBlock()->getParent();
8994
8995 // The outer allocation basic block is the entry block of the current function.
8996 BasicBlock &OuterAllocaBB = CurrentFunction->getEntryBlock();
8997 if (&OuterAllocaBB == Builder.GetInsertBlock()) {
8998 BasicBlock *BodyBB = splitBB(Builder, /*CreateBranch=*/true, "teams.entry");
8999 Builder.SetInsertPoint(BodyBB, BodyBB->begin());
9000 }
9001
9002 // The current basic block is split into four basic blocks. After outlining,
9003 // they will be mapped as follows:
9004 // ```
9005 // def current_fn() {
9006 // current_basic_block:
9007 // br label %teams.exit
9008 // teams.exit:
9009 // ; instructions after teams
9010 // }
9011 //
9012 // def outlined_fn() {
9013 // teams.alloca:
9014 // br label %teams.body
9015 // teams.body:
9016 // ; instructions within teams body
9017 // }
9018 // ```
9019 BasicBlock *ExitBB = splitBB(Builder, /*CreateBranch=*/true, "teams.exit");
9020 BasicBlock *BodyBB = splitBB(Builder, /*CreateBranch=*/true, "teams.body");
9021 BasicBlock *AllocaBB =
9022 splitBB(Builder, /*CreateBranch=*/true, "teams.alloca");
9023
9024 bool SubClausesPresent =
9025 (NumTeamsLower || NumTeamsUpper || ThreadLimit || IfExpr);
9026 // Push num_teams
9027 if (!Config.isTargetDevice() && SubClausesPresent) {
9028 assert((NumTeamsLower == nullptr || NumTeamsUpper != nullptr) &&
9029 "if lowerbound is non-null, then upperbound must also be non-null "
9030 "for bounds on num_teams");
9031
9032 if (NumTeamsUpper == nullptr)
9033 NumTeamsUpper = Builder.getInt32(0);
9034
9035 if (NumTeamsLower == nullptr)
9036 NumTeamsLower = NumTeamsUpper;
9037
9038 if (IfExpr) {
9039 assert(IfExpr->getType()->isIntegerTy() &&
9040 "argument to if clause must be an integer value");
9041
9042 // upper = ifexpr ? upper : 1
9043 if (IfExpr->getType() != Int1)
9044 IfExpr = Builder.CreateICmpNE(IfExpr,
9045 ConstantInt::get(IfExpr->getType(), 0));
9046 NumTeamsUpper = Builder.CreateSelect(
9047 IfExpr, NumTeamsUpper, Builder.getInt32(1), "numTeamsUpper");
9048
9049 // lower = ifexpr ? lower : 1
9050 NumTeamsLower = Builder.CreateSelect(
9051 IfExpr, NumTeamsLower, Builder.getInt32(1), "numTeamsLower");
9052 }
9053
9054 if (ThreadLimit == nullptr)
9055 ThreadLimit = Builder.getInt32(0);
9056
9057 Value *ThreadNum = getOrCreateThreadID(Ident);
9058 Builder.CreateCall(
9059 getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_push_num_teams_51),
9060 {Ident, ThreadNum, NumTeamsLower, NumTeamsUpper, ThreadLimit});
9061 }
9062 // Generate the body of teams.
9063 InsertPointTy AllocaIP(AllocaBB, AllocaBB->begin());
9064 InsertPointTy CodeGenIP(BodyBB, BodyBB->begin());
9065 if (Error Err = BodyGenCB(AllocaIP, CodeGenIP))
9066 return Err;
9067
9068 OutlineInfo OI;
9069 OI.EntryBB = AllocaBB;
9070 OI.ExitBB = ExitBB;
9071 OI.OuterAllocaBB = &OuterAllocaBB;
9072
9073 // Insert fake values for global tid and bound tid.
9074 SmallVector<Instruction *, 8> ToBeDeleted;
9075 InsertPointTy OuterAllocaIP(&OuterAllocaBB, OuterAllocaBB.begin());
9076 OI.ExcludeArgsFromAggregate.push_back(createFakeIntVal(
9077 Builder, OuterAllocaIP, ToBeDeleted, AllocaIP, "gid", true));
9078 OI.ExcludeArgsFromAggregate.push_back(createFakeIntVal(
9079 Builder, OuterAllocaIP, ToBeDeleted, AllocaIP, "tid", true));
9080
9081 auto HostPostOutlineCB = [this, Ident,
9082 ToBeDeleted](Function &OutlinedFn) mutable {
9083 // The stale call instruction will be replaced with a new call instruction
9084 // for runtime call with the outlined function.
9085
9086 assert(OutlinedFn.getNumUses() == 1 &&
9087 "there must be a single user for the outlined function");
9088 CallInst *StaleCI = cast<CallInst>(OutlinedFn.user_back());
9089 ToBeDeleted.push_back(StaleCI);
9090
9091 assert((OutlinedFn.arg_size() == 2 || OutlinedFn.arg_size() == 3) &&
9092 "Outlined function must have two or three arguments only");
9093
9094 bool HasShared = OutlinedFn.arg_size() == 3;
9095
9096 OutlinedFn.getArg(0)->setName("global.tid.ptr");
9097 OutlinedFn.getArg(1)->setName("bound.tid.ptr");
9098 if (HasShared)
9099 OutlinedFn.getArg(2)->setName("data");
9100
9101 // Call to the runtime function for teams in the current function.
9102 assert(StaleCI && "Error while outlining - no CallInst user found for the "
9103 "outlined function.");
9104 Builder.SetInsertPoint(StaleCI);
9105 SmallVector<Value *> Args = {
9106 Ident, Builder.getInt32(StaleCI->arg_size() - 2), &OutlinedFn};
9107 if (HasShared)
9108 Args.push_back(StaleCI->getArgOperand(2));
9109 Builder.CreateCall(getOrCreateRuntimeFunctionPtr(
9110 omp::RuntimeFunction::OMPRTL___kmpc_fork_teams),
9111 Args);
9112
9113 for (Instruction *I : llvm::reverse(ToBeDeleted))
9114 I->eraseFromParent();
9115 };
9116
9117 if (!Config.isTargetDevice())
9118 OI.PostOutlineCB = HostPostOutlineCB;
9119
9120 addOutlineInfo(std::move(OI));
9121
9122 Builder.SetInsertPoint(ExitBB, ExitBB->begin());
9123
9124 return Builder.saveIP();
9125}
9126
9127 GlobalVariable *
9128 OpenMPIRBuilder::createOffloadMapnames(SmallVectorImpl<llvm::Constant *> &Names,
9129 std::string VarName) {
9130 llvm::Constant *MapNamesArrayInit = llvm::ConstantArray::get(
9131 llvm::ArrayType::get(llvm::PointerType::getUnqual(M.getContext()),
9132 Names.size()),
9133 Names);
9134 auto *MapNamesArrayGlobal = new llvm::GlobalVariable(
9135 M, MapNamesArrayInit->getType(),
9136 /*isConstant=*/true, llvm::GlobalValue::PrivateLinkage, MapNamesArrayInit,
9137 VarName);
9138 return MapNamesArrayGlobal;
9139}
9140
9141// Create all simple and struct types exposed by the runtime and remember
9142// the llvm::PointerTypes of them for easy access later.
9143void OpenMPIRBuilder::initializeTypes(Module &M) {
9144 LLVMContext &Ctx = M.getContext();
9145 StructType *T;
9146#define OMP_TYPE(VarName, InitValue) VarName = InitValue;
9147#define OMP_ARRAY_TYPE(VarName, ElemTy, ArraySize) \
9148 VarName##Ty = ArrayType::get(ElemTy, ArraySize); \
9149 VarName##PtrTy = PointerType::getUnqual(VarName##Ty);
9150#define OMP_FUNCTION_TYPE(VarName, IsVarArg, ReturnType, ...) \
9151 VarName = FunctionType::get(ReturnType, {__VA_ARGS__}, IsVarArg); \
9152 VarName##Ptr = PointerType::getUnqual(VarName);
9153#define OMP_STRUCT_TYPE(VarName, StructName, Packed, ...) \
9154 T = StructType::getTypeByName(Ctx, StructName); \
9155 if (!T) \
9156 T = StructType::create(Ctx, {__VA_ARGS__}, StructName, Packed); \
9157 VarName = T; \
9158 VarName##Ptr = PointerType::getUnqual(T);
9159#include "llvm/Frontend/OpenMP/OMPKinds.def"
9160}
9161
9162 void OpenMPIRBuilder::OutlineInfo::collectBlocks(
9163 SmallPtrSetImpl<BasicBlock *> &BlockSet,
9164 SmallVectorImpl<BasicBlock *> &BlockVector) {
9165 SmallVector<BasicBlock *, 32> Worklist;
9166 BlockSet.insert(EntryBB);
9167 BlockSet.insert(ExitBB);
9168
9169 Worklist.push_back(EntryBB);
9170 while (!Worklist.empty()) {
9171 BasicBlock *BB = Worklist.pop_back_val();
9172 BlockVector.push_back(BB);
9173 for (BasicBlock *SuccBB : successors(BB))
9174 if (BlockSet.insert(SuccBB).second)
9175 Worklist.push_back(SuccBB);
9176 }
9177}
9178
9179 void OpenMPIRBuilder::createOffloadEntry(Constant *ID, Constant *Addr,
9180 uint64_t Size, int32_t Flags,
9181 GlobalValue::LinkageTypes,
9182 StringRef Name) {
9183 if (!Config.isGPU()) {
9184 llvm::offloading::emitOffloadingEntry(
9185 M, ID, Name.empty() ? Addr->getName() : Name, Size, Flags, /*Data=*/0,
9186 "omp_offloading_entries");
9187 return;
9188 }
9189 // TODO: Add support for global variables on the device after declare target
9190 // support.
9191 Function *Fn = dyn_cast<Function>(Addr);
9192 if (!Fn)
9193 return;
9194
9195 Module &M = *(Fn->getParent());
9196 LLVMContext &Ctx = M.getContext();
9197
9198 // Get "nvvm.annotations" metadata node.
9199 NamedMDNode *MD = M.getOrInsertNamedMetadata("nvvm.annotations");
9200
9201 Metadata *MDVals[] = {
9202 ConstantAsMetadata::get(Fn), MDString::get(Ctx, "kernel"),
9203 ConstantAsMetadata::get(ConstantInt::get(Type::getInt32Ty(Ctx), 1))};
9204 // Append metadata to nvvm.annotations.
9205 MD->addOperand(MDNode::get(Ctx, MDVals));
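// [Editor's note, illustrative] The resulting annotation has the shape:
//   !{ptr @<kernel_fn>, !"kernel", i32 1}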
9206
9207 // Add a function attribute for the kernel.
9208 Fn->addFnAttr(Attribute::get(Ctx, "kernel"));
9209 if (T.isAMDGCN())
9210 Fn->addFnAttr("uniform-work-group-size", "true");
9211 Fn->addFnAttr(Attribute::MustProgress);
9212}
9213
9214 // We only generate metadata for functions that contain target regions.
9215 void OpenMPIRBuilder::createOffloadEntriesAndInfoMetadata(
9216 EmitMetadataErrorReportFunctionTy &ErrorFn) {
9217
9218 // If there are no entries, we don't need to do anything.
9219 if (OffloadInfoManager.empty())
9220 return;
9221
9222 LLVMContext &C = M.getContext();
9223 SmallVector<std::pair<const OffloadEntriesInfoManager::OffloadEntryInfo *,
9224 TargetRegionEntryInfo>,
9225 16>
9226 OrderedEntries(OffloadInfoManager.size());
9227
9228 // Auxiliary methods to create metadata values and strings.
9229 auto &&GetMDInt = [this](unsigned V) {
9230 return ConstantAsMetadata::get(ConstantInt::get(Builder.getInt32Ty(), V));
9231 };
9232
9233 auto &&GetMDString = [&C](StringRef V) { return MDString::get(C, V); };
9234
9235 // Create the offloading info metadata node.
9236 NamedMDNode *MD = M.getOrInsertNamedMetadata("omp_offload.info");
9237 auto &&TargetRegionMetadataEmitter =
9238 [&C, MD, &OrderedEntries, &GetMDInt, &GetMDString](
9239 const TargetRegionEntryInfo &EntryInfo,
9240 const OffloadEntriesInfoManager::OffloadEntryInfoTargetRegion &E) {
9241 // Generate metadata for target regions. Each entry of this metadata
9242 // contains:
9243 // - Entry 0 -> Kind of this type of metadata (0).
9244 // - Entry 1 -> Device ID of the file where the entry was identified.
9245 // - Entry 2 -> File ID of the file where the entry was identified.
9246 // - Entry 3 -> Mangled name of the function where the entry was
9247 // identified.
9248 // - Entry 4 -> Line in the file where the entry was identified.
9249 // - Entry 5 -> Count of regions at this DeviceID/FilesID/Line.
9250 // - Entry 6 -> Order the entry was created.
9251 // The first element of the metadata node is the kind.
9252 Metadata *Ops[] = {
9253 GetMDInt(E.getKind()), GetMDInt(EntryInfo.DeviceID),
9254 GetMDInt(EntryInfo.FileID), GetMDString(EntryInfo.ParentName),
9255 GetMDInt(EntryInfo.Line), GetMDInt(EntryInfo.Count),
9256 GetMDInt(E.getOrder())};
9257
9258 // Save this entry in the right position of the ordered entries array.
9259 OrderedEntries[E.getOrder()] = std::make_pair(&E, EntryInfo);
9260
9261 // Add metadata to the named metadata node.
9262 MD->addOperand(MDNode::get(C, Ops));
9263 };
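// [Editor's note, illustrative] An emitted target-region node therefore looks
// like:
//   !{i32 0, i32 <DeviceID>, i32 <FileID>, !"<ParentName>", i32 <Line>,
//     i32 <Count>, i32 <Order>}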
9264
9265 OffloadInfoManager.actOnTargetRegionEntriesInfo(TargetRegionMetadataEmitter);
9266
9267 // Create a function that emits metadata for each device global variable entry.
9268 auto &&DeviceGlobalVarMetadataEmitter =
9269 [&C, &OrderedEntries, &GetMDInt, &GetMDString, MD](
9270 StringRef MangledName,
9272 // Generate metadata for global variables. Each entry of this metadata
9273 // contains:
9274 // - Entry 0 -> Kind of this type of metadata (1).
9275 // - Entry 1 -> Mangled name of the variable.
9276 // - Entry 2 -> Declare target kind.
9277 // - Entry 3 -> Order the entry was created.
9278 // The first element of the metadata node is the kind.
9279 Metadata *Ops[] = {GetMDInt(E.getKind()), GetMDString(MangledName),
9280 GetMDInt(E.getFlags()), GetMDInt(E.getOrder())};
9281
9282 // Save this entry in the right position of the ordered entries array.
9283 TargetRegionEntryInfo varInfo(MangledName, 0, 0, 0);
9284 OrderedEntries[E.getOrder()] = std::make_pair(&E, varInfo);
9285
9286 // Add metadata to the named metadata node.
9287 MD->addOperand(MDNode::get(C, Ops));
9288 };
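// Illustration (not part of the original source): the corresponding operand
// for a device global variable uses kind 1, e.g.
//   !{i32 1, !"<MangledName>", i32 <DeclareTargetKind>, i32 <Order>}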
9289
9290 OffloadInfoManager.actOnDeviceGlobalVarEntriesInfo(
9291 DeviceGlobalVarMetadataEmitter);
9292
9293 for (const auto &E : OrderedEntries) {
9294 assert(E.first && "All ordered entries must exist!");
9295 if (const auto *CE =
9296 dyn_cast<OffloadEntriesInfoManager::OffloadEntryInfoTargetRegion>(
9297 E.first)) {
9298 if (!CE->getID() || !CE->getAddress()) {
9299 // Do not blame the entry if the parent function is not emitted.
9300 TargetRegionEntryInfo EntryInfo = E.second;
9301 StringRef FnName = EntryInfo.ParentName;
9302 if (!M.getNamedValue(FnName))
9303 continue;
9304 ErrorFn(EMIT_MD_TARGET_REGION_ERROR, EntryInfo);
9305 continue;
9306 }
9307 createOffloadEntry(CE->getID(), CE->getAddress(),
9308 /*Size=*/0, CE->getFlags(),
9309 GlobalValue::WeakAnyLinkage);
9310 } else if (const auto *CE = dyn_cast<
9311 OffloadEntriesInfoManager::OffloadEntryInfoDeviceGlobalVar>(
9312 E.first)) {
9313 OffloadEntriesInfoManager::OMPTargetGlobalVarEntryKind Flags =
9314 static_cast<OffloadEntriesInfoManager::OMPTargetGlobalVarEntryKind>(
9315 CE->getFlags());
9316 switch (Flags) {
9317 case OffloadEntriesInfoManager::OMPTargetGlobalVarEntryEnter:
9318 case OffloadEntriesInfoManager::OMPTargetGlobalVarEntryTo:
9319 if (Config.isTargetDevice() && Config.hasRequiresUnifiedSharedMemory())
9320 continue;
9321 if (!CE->getAddress()) {
9322 ErrorFn(EMIT_MD_DECLARE_TARGET_ERROR, E.second);
9323 continue;
9324 }
9325 // The variable has no definition - no need to add the entry.
9326 if (CE->getVarSize() == 0)
9327 continue;
9328 break;
9329 case OffloadEntriesInfoManager::OMPTargetGlobalVarEntryLink:
9330 assert(((Config.isTargetDevice() && !CE->getAddress()) ||
9331 (!Config.isTargetDevice() && CE->getAddress())) &&
9332 "Declare target link address is set.");
9333 if (Config.isTargetDevice())
9334 continue;
9335 if (!CE->getAddress()) {
9336 ErrorFn(EMIT_MD_DECLARE_TARGET_ERROR, TargetRegionEntryInfo());
9337 continue;
9338 }
9339 break;
9340 default:
9341 break;
9342 }
9343
9344 // Hidden or internal symbols on the device are not externally visible.
9345 // We should not attempt to register them by creating an offloading
9346 // entry. Indirect variables are handled separately on the device.
9347 if (auto *GV = dyn_cast<GlobalValue>(CE->getAddress()))
9348 if ((GV->hasLocalLinkage() || GV->hasHiddenVisibility()) &&
9349 Flags != OffloadEntriesInfoManager::OMPTargetGlobalVarEntryIndirect)
9350 continue;
9351
9352 // Indirect globals need to use a special name that doesn't match the name
9353 // of the associated host global.
9354 if (Flags == OffloadEntriesInfoManager::OMPTargetGlobalVarEntryIndirect)
9355 createOffloadEntry(CE->getAddress(), CE->getAddress(), CE->getVarSize(),
9356 Flags, CE->getLinkage(), CE->getVarName());
9357 else
9358 createOffloadEntry(CE->getAddress(), CE->getAddress(), CE->getVarSize(),
9359 Flags, CE->getLinkage());
9360
9361 } else {
9362 llvm_unreachable("Unsupported entry kind.");
9363 }
9364 }
9365
9366 // Emit requires directive globals to a special entry so the runtime can
9367 // register them when the device image is loaded.
9368 // TODO: This reduces the offloading entries to a 32-bit integer. Offloading
9369 // entries should be redesigned to better suit this use-case.
9370 if (Config.hasRequiresFlags() && !Config.isTargetDevice())
9371 offloading::emitOffloadingEntry(
9372 M, Constant::getNullValue(PointerType::getUnqual(M.getContext())),
9373 /*Name=*/"",
9374 /*Size=*/0, OffloadEntriesInfoManager::OMPTargetGlobalRegisterRequires,
9375 Config.getRequiresFlags(), "omp_offloading_entries");
9376}
9377
9378void TargetRegionEntryInfo::getTargetRegionEntryFnName(
9379 SmallVectorImpl<char> &Name, StringRef ParentName, unsigned DeviceID,
9380 unsigned FileID, unsigned Line, unsigned Count) {
9381 raw_svector_ostream OS(Name);
9382 OS << KernelNamePrefix << llvm::format("%x", DeviceID)
9383 << llvm::format("_%x_", FileID) << ParentName << "_l" << Line;
9384 if (Count)
9385 OS << "_" << Count;
9386}
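// Illustration (not part of the original source, assuming the usual
// "__omp_offloading_" KernelNamePrefix): DeviceID 0x42, FileID 0x7, parent
// "foo", line 10 and count 0 yield "__omp_offloading_42_7_foo_l10"; a second
// region at the same location (count 1) gets a trailing "_1".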
9387
9388void OffloadEntriesInfoManager::getTargetRegionEntryFnName(
9389 SmallVectorImpl<char> &Name, const TargetRegionEntryInfo &EntryInfo) {
9390 unsigned NewCount = getTargetRegionEntryInfoCount(EntryInfo);
9391 TargetRegionEntryInfo::getTargetRegionEntryFnName(
9392 Name, EntryInfo.ParentName, EntryInfo.DeviceID, EntryInfo.FileID,
9393 EntryInfo.Line, NewCount);
9394}
9395
9396TargetRegionEntryInfo
9397OpenMPIRBuilder::getTargetEntryUniqueInfo(FileIdentifierInfoCallbackTy CallBack,
9398 StringRef ParentName) {
9399 sys::fs::UniqueID ID;
9400 auto FileIDInfo = CallBack();
9401 if (auto EC = sys::fs::getUniqueID(std::get<0>(FileIDInfo), ID)) {
9402 report_fatal_error(("Unable to get unique ID for file, during "
9403 "getTargetEntryUniqueInfo, error message: " +
9404 EC.message())
9405 .c_str());
9406 }
9407
9408 return TargetRegionEntryInfo(ParentName, ID.getDevice(), ID.getFile(),
9409 std::get<1>(FileIDInfo));
9410}
9411
9412unsigned OpenMPIRBuilder::getFlagMemberOffset() {
9413 unsigned Offset = 0;
9414 for (uint64_t Remain =
9415 static_cast<std::underlying_type_t<omp::OpenMPOffloadMappingFlags>>(
9416 omp::OpenMPOffloadMappingFlags::OMP_MAP_MEMBER_OF);
9417 !(Remain & 1); Remain = Remain >> 1)
9418 Offset++;
9419 return Offset;
9420}
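// Illustration (not part of the original source): with the MEMBER_OF field
// occupying the top 16 bits of the 64-bit flag word (mask 0xFFFF000000000000),
// the loop above counts the 48 trailing zero bits and returns 48.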
9421
9422omp::OpenMPOffloadMappingFlags
9423OpenMPIRBuilder::getMemberOfFlag(unsigned Position) {
9424 // Rotate by getFlagMemberOffset() bits.
9425 return static_cast<omp::OpenMPOffloadMappingFlags>(((uint64_t)Position + 1)
9426 << getFlagMemberOffset());
9427}
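// Illustration (not part of the original source): Position 0 maps to
// 1 << getFlagMemberOffset(), i.e. 0x0001000000000000 for a 48-bit offset, so
// member positions are stored one-based in the MEMBER_OF field.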
9428
9429void OpenMPIRBuilder::setCorrectMemberOfFlag(
9430 omp::OpenMPOffloadMappingFlags &Flags,
9431 omp::OpenMPOffloadMappingFlags MemberOfFlag) {
9432 // If the entry is PTR_AND_OBJ but has not been marked with the special
9433 // placeholder value 0xFFFF in the MEMBER_OF field, then it should not be
9434 // marked as MEMBER_OF.
9435 if (static_cast<std::underlying_type_t<omp::OpenMPOffloadMappingFlags>>(
9436 Flags & omp::OpenMPOffloadMappingFlags::OMP_MAP_PTR_AND_OBJ) &&
9437 static_cast<std::underlying_type_t<omp::OpenMPOffloadMappingFlags>>(
9438 (Flags & omp::OpenMPOffloadMappingFlags::OMP_MAP_MEMBER_OF) !=
9439 omp::OpenMPOffloadMappingFlags::OMP_MAP_MEMBER_OF))
9440 return;
9441
9442 // Reset the placeholder value to prepare the flag for the assignment of the
9443 // proper MEMBER_OF value.
9444 Flags &= ~omp::OpenMPOffloadMappingFlags::OMP_MAP_MEMBER_OF;
9445 Flags |= MemberOfFlag;
9446}
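// Illustration (not part of the original source): a PTR_AND_OBJ entry whose
// MEMBER_OF field holds the 0xFFFF placeholder passes the check above, has the
// placeholder cleared, and receives the real value from getMemberOfFlag().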
9447
9448Constant *OpenMPIRBuilder::getAddrOfDeclareTargetVar(
9449 OffloadEntriesInfoManager::OMPTargetGlobalVarEntryKind CaptureClause,
9450 OffloadEntriesInfoManager::OMPTargetDeviceClauseKind DeviceClause,
9451 bool IsDeclaration, bool IsExternallyVisible,
9452 TargetRegionEntryInfo EntryInfo, StringRef MangledName,
9453 std::vector<GlobalVariable *> &GeneratedRefs, bool OpenMPSIMD,
9454 std::vector<Triple> TargetTriple, Type *LlvmPtrTy,
9455 std::function<Constant *()> GlobalInitializer,
9456 std::function<GlobalValue::LinkageTypes()> VariableLinkage) {
9457 // TODO: convert this to utilise the IRBuilder Config rather than
9458 // a passed down argument.
9459 if (OpenMPSIMD)
9460 return nullptr;
9461
9462 if (CaptureClause == OffloadEntriesInfoManager::OMPTargetGlobalVarEntryLink ||
9463 ((CaptureClause == OffloadEntriesInfoManager::OMPTargetGlobalVarEntryTo ||
9464 CaptureClause ==
9465 OffloadEntriesInfoManager::OMPTargetGlobalVarEntryEnter) &&
9466 Config.hasRequiresUnifiedSharedMemory())) {
9467 SmallString<64> PtrName;
9468 {
9469 raw_svector_ostream OS(PtrName);
9470 OS << MangledName;
9471 if (!IsExternallyVisible)
9472 OS << format("_%x", EntryInfo.FileID);
9473 OS << "_decl_tgt_ref_ptr";
9474 }
9475
9476 Value *Ptr = M.getNamedValue(PtrName);
9477
9478 if (!Ptr) {
9479 GlobalValue *GlobalValue = M.getNamedValue(MangledName);
9480 Ptr = getOrCreateInternalVariable(LlvmPtrTy, PtrName);
9481
9482 auto *GV = cast<GlobalVariable>(Ptr);
9483 GV->setLinkage(GlobalValue::WeakAnyLinkage);
9484
9485 if (!Config.isTargetDevice()) {
9486 if (GlobalInitializer)
9487 GV->setInitializer(GlobalInitializer());
9488 else
9489 GV->setInitializer(GlobalValue);
9490 }
9491
9492 registerTargetGlobalVariable(
9493 CaptureClause, DeviceClause, IsDeclaration, IsExternallyVisible,
9494 EntryInfo, MangledName, GeneratedRefs, OpenMPSIMD, TargetTriple,
9495 GlobalInitializer, VariableLinkage, LlvmPtrTy, cast<Constant>(Ptr));
9496 }
9497
9498 return cast<Constant>(Ptr);
9499 }
9500
9501 return nullptr;
9502}
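// Illustration (not part of the original source, hypothetical variable "x"):
// under the link clause, or to/enter with unified shared memory, this returns
// a weak global named "x_decl_tgt_ref_ptr"; if "x" is not externally visible,
// the file ID is folded in first, e.g. "x_<FileID-hex>_decl_tgt_ref_ptr".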
9503
9504void OpenMPIRBuilder::registerTargetGlobalVariable(
9505 OffloadEntriesInfoManager::OMPTargetGlobalVarEntryKind CaptureClause,
9506 OffloadEntriesInfoManager::OMPTargetDeviceClauseKind DeviceClause,
9507 bool IsDeclaration, bool IsExternallyVisible,
9508 TargetRegionEntryInfo EntryInfo, StringRef MangledName,
9509 std::vector<GlobalVariable *> &GeneratedRefs, bool OpenMPSIMD,
9510 std::vector<Triple> TargetTriple,
9511 std::function<Constant *()> GlobalInitializer,
9512 std::function<GlobalValue::LinkageTypes()> VariableLinkage, Type *LlvmPtrTy,
9513 Constant *Addr) {
9514 if (DeviceClause != OffloadEntriesInfoManager::OMPTargetDeviceClauseAny ||
9515 (TargetTriple.empty() && !Config.isTargetDevice()))
9516 return;
9517
9518 OffloadEntriesInfoManager::OMPTargetGlobalVarEntryKind Flags;
9519 StringRef VarName;
9520 int64_t VarSize;
9521 GlobalValue::LinkageTypes Linkage;
9522
9523 if ((CaptureClause == OffloadEntriesInfoManager::OMPTargetGlobalVarEntryTo ||
9524 CaptureClause ==
9525 OffloadEntriesInfoManager::OMPTargetGlobalVarEntryEnter) &&
9526 !Config.hasRequiresUnifiedSharedMemory()) {
9527 Flags = OffloadEntriesInfoManager::OMPTargetGlobalVarEntryTo;
9528 VarName = MangledName;
9529 GlobalValue *LlvmVal = M.getNamedValue(VarName);
9530
9531 if (!IsDeclaration)
9532 VarSize = divideCeil(
9533 M.getDataLayout().getTypeSizeInBits(LlvmVal->getValueType()), 8);
9534 else
9535 VarSize = 0;
9536 Linkage = (VariableLinkage) ? VariableLinkage() : LlvmVal->getLinkage();
9537
9538 // This is a workaround carried over from Clang which prevents undesired
9539 // optimisation of internal variables.
9540 if (Config.isTargetDevice() &&
9541 (!IsExternallyVisible || Linkage == GlobalValue::LinkOnceODRLinkage)) {
9542 // Do not create a "ref-variable" if the original is not also available
9543 // on the host.
9544 if (!OffloadInfoManager.hasDeviceGlobalVarEntryInfo(VarName))
9545 return;
9546
9547 std::string RefName = createPlatformSpecificName({VarName, "ref"});
9548
9549 if (!M.getNamedValue(RefName)) {
9550 Constant *AddrRef =
9551 getOrCreateInternalVariable(Addr->getType(), RefName);
9552 auto *GvAddrRef = cast<GlobalVariable>(AddrRef);
9553 GvAddrRef->setConstant(true);
9554 GvAddrRef->setLinkage(GlobalValue::InternalLinkage);
9555 GvAddrRef->setInitializer(Addr);
9556 GeneratedRefs.push_back(GvAddrRef);
9557 }
9558 }
9559 } else {
9560 if (CaptureClause == OffloadEntriesInfoManager::OMPTargetGlobalVarEntryLink)
9561 Flags = OffloadEntriesInfoManager::OMPTargetGlobalVarEntryLink;
9562 else
9563 Flags = OffloadEntriesInfoManager::OMPTargetGlobalVarEntryTo;
9564
9565 if (Config.isTargetDevice()) {
9566 VarName = (Addr) ? Addr->getName() : "";
9567 Addr = nullptr;
9568 } else {
9569 Addr = getAddrOfDeclareTargetVar(
9570 CaptureClause, DeviceClause, IsDeclaration, IsExternallyVisible,
9571 EntryInfo, MangledName, GeneratedRefs, OpenMPSIMD, TargetTriple,
9572 LlvmPtrTy, GlobalInitializer, VariableLinkage);
9573 VarName = (Addr) ? Addr->getName() : "";
9574 }
9575 VarSize = M.getDataLayout().getPointerSize();
9576 Linkage = GlobalValue::WeakAnyLinkage;
9577 }
9578
9579 OffloadInfoManager.registerDeviceGlobalVarEntryInfo(VarName, Addr, VarSize,
9580 Flags, Linkage);
9581}
9582
9583/// Loads all the offload entries information from the host IR
9584/// metadata.
9585void OpenMPIRBuilder::loadOffloadInfoMetadata(Module &M) {
9586 // If we are in target mode, load the metadata from the host IR. This code has
9587 // to match the metadata creation in createOffloadEntriesAndInfoMetadata().
9588
9589 NamedMDNode *MD = M.getNamedMetadata(ompOffloadInfoName);
9590 if (!MD)
9591 return;
9592
9593 for (MDNode *MN : MD->operands()) {
9594 auto &&GetMDInt = [MN](unsigned Idx) {
9595 auto *V = cast<ConstantAsMetadata>(MN->getOperand(Idx));
9596 return cast<ConstantInt>(V->getValue())->getZExtValue();
9597 };
9598
9599 auto &&GetMDString = [MN](unsigned Idx) {
9600 auto *V = cast<MDString>(MN->getOperand(Idx));
9601 return V->getString();
9602 };
9603
9604 switch (GetMDInt(0)) {
9605 default:
9606 llvm_unreachable("Unexpected metadata!");
9607 break;
9608 case OffloadEntriesInfoManager::OffloadEntryInfo::
9609 OffloadingEntryInfoTargetRegion: {
9610 TargetRegionEntryInfo EntryInfo(/*ParentName=*/GetMDString(3),
9611 /*DeviceID=*/GetMDInt(1),
9612 /*FileID=*/GetMDInt(2),
9613 /*Line=*/GetMDInt(4),
9614 /*Count=*/GetMDInt(5));
9615 OffloadInfoManager.initializeTargetRegionEntryInfo(EntryInfo,
9616 /*Order=*/GetMDInt(6));
9617 break;
9618 }
9619 case OffloadEntriesInfoManager::OffloadEntryInfo::
9620 OffloadingEntryInfoDeviceGlobalVar:
9621 OffloadInfoManager.initializeDeviceGlobalVarEntryInfo(
9622 /*MangledName=*/GetMDString(1),
9623 static_cast<OffloadEntriesInfoManager::OMPTargetGlobalVarEntryKind>(
9624 /*Flags=*/GetMDInt(2)),
9625 /*Order=*/GetMDInt(3));
9626 break;
9627 }
9628 }
9629}
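// Note (not part of the original source): the GetMDInt/GetMDString accessors
// above decode exactly the operand layout produced by
// createOffloadEntriesAndInfoMetadata(), so a host module's !omp_offload.info
// metadata can be replayed into OffloadInfoManager during device compilation.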
9630
9631void OpenMPIRBuilder::loadOffloadInfoMetadata(StringRef HostFilePath) {
9632 if (HostFilePath.empty())
9633 return;
9634
9635 auto Buf = MemoryBuffer::getFile(HostFilePath);
9636 if (std::error_code Err = Buf.getError()) {
9637 report_fatal_error(("error opening host file from host file path inside of "
9638 "OpenMPIRBuilder: " +
9639 Err.message())
9640 .c_str());
9641 }
9642
9643 LLVMContext Ctx;
9644 auto M = expectedToErrorOrAndEmitErrors(
9645 Ctx, parseBitcodeFile(Buf.get()->getMemBufferRef(), Ctx));
9646 if (std::error_code Err = M.getError()) {
9647 report_fatal_error(
9648 ("error parsing host file inside of OpenMPIRBuilder: " + Err.message())
9649 .c_str());
9650 }
9651
9652 loadOffloadInfoMetadata(*M.get());
9653}
9654
9655//===----------------------------------------------------------------------===//
9656// OffloadEntriesInfoManager
9657//===----------------------------------------------------------------------===//
9658
9659bool OffloadEntriesInfoManager::empty() const {
9660 return OffloadEntriesTargetRegion.empty() &&
9661 OffloadEntriesDeviceGlobalVar.empty();
9662}
9663
9664unsigned OffloadEntriesInfoManager::getTargetRegionEntryInfoCount(
9665 const TargetRegionEntryInfo &EntryInfo) const {
9666 auto It = OffloadEntriesTargetRegionCount.find(
9667 getTargetRegionEntryCountKey(EntryInfo));
9668 if (It == OffloadEntriesTargetRegionCount.end())
9669 return 0;
9670 return It->second;
9671}
9672
9673void OffloadEntriesInfoManager::incrementTargetRegionEntryInfoCount(
9674 const TargetRegionEntryInfo &EntryInfo) {
9675 OffloadEntriesTargetRegionCount[getTargetRegionEntryCountKey(EntryInfo)] =
9676 EntryInfo.Count + 1;
9677}
9678
9679/// Initialize target region entry.
9680void OffloadEntriesInfoManager::initializeTargetRegionEntryInfo(
9681 const TargetRegionEntryInfo &EntryInfo, unsigned Order) {
9682 OffloadEntriesTargetRegion[EntryInfo] =
9683 OffloadEntryInfoTargetRegion(Order, /*Addr=*/nullptr, /*ID=*/nullptr,
9684 OMPTargetRegionEntryTargetRegion);
9685 ++OffloadingEntriesNum;
9686}
9687
9688void OffloadEntriesInfoManager::registerTargetRegionEntryInfo(
9689 TargetRegionEntryInfo EntryInfo, Constant *Addr, Constant *ID,
9690 OMPTargetRegionEntryKind Flags) {
9691 assert(EntryInfo.Count == 0 && "expected default EntryInfo");
9692
9693 // Update the EntryInfo with the next available count for this location.
9694 EntryInfo.Count = getTargetRegionEntryInfoCount(EntryInfo);
9695
9696 // If we are emitting code for a target, the entry is already initialized,
9697 // only has to be registered.
9698 if (OMPBuilder->Config.isTargetDevice()) {
9699 // This could happen if the device compilation is invoked standalone.
9700 if (!hasTargetRegionEntryInfo(EntryInfo)) {
9701 return;
9702 }
9703 auto &Entry = OffloadEntriesTargetRegion[EntryInfo];
9704 Entry.setAddress(Addr);
9705 Entry.setID(ID);
9706 Entry.setFlags(Flags);
9707 } else {
9708 if (Flags == OffloadEntriesInfoManager::OMPTargetRegionEntryTargetRegion &&
9709 hasTargetRegionEntryInfo(EntryInfo, /*IgnoreAddressId*/ true))
9710 return;
9711 assert(!hasTargetRegionEntryInfo(EntryInfo) &&
9712 "Target region entry already registered!");
9713 OffloadEntryInfoTargetRegion Entry(OffloadingEntriesNum, Addr, ID, Flags);
9714 OffloadEntriesTargetRegion[EntryInfo] = Entry;
9715 ++OffloadingEntriesNum;
9716 }
9717 incrementTargetRegionEntryInfoCount(EntryInfo);
9718}
9719
9720bool OffloadEntriesInfoManager::hasTargetRegionEntryInfo(
9721 TargetRegionEntryInfo EntryInfo, bool IgnoreAddressId) const {
9722
9723 // Update the EntryInfo with the next available count for this location.
9724 EntryInfo.Count = getTargetRegionEntryInfoCount(EntryInfo);
9725
9726 auto It = OffloadEntriesTargetRegion.find(EntryInfo);
9727 if (It == OffloadEntriesTargetRegion.end()) {
9728 return false;
9729 }
9730 // Fail if this entry is already registered.
9731 if (!IgnoreAddressId && (It->second.getAddress() || It->second.getID()))
9732 return false;
9733 return true;
9734}
9735
9736void OffloadEntriesInfoManager::actOnTargetRegionEntriesInfo(
9737 const OffloadTargetRegionEntryInfoActTy &Action) {
9738 // Scan all target region entries and perform the provided action.
9739 for (const auto &It : OffloadEntriesTargetRegion) {
9740 Action(It.first, It.second);
9741 }
9742}
9743
9744void OffloadEntriesInfoManager::initializeDeviceGlobalVarEntryInfo(
9745 StringRef Name, OMPTargetGlobalVarEntryKind Flags, unsigned Order) {
9746 OffloadEntriesDeviceGlobalVar.try_emplace(Name, Order, Flags);
9747 ++OffloadingEntriesNum;
9748}
9749
9750void OffloadEntriesInfoManager::registerDeviceGlobalVarEntryInfo(
9751 StringRef VarName, Constant *Addr, int64_t VarSize,
9752 OMPTargetGlobalVarEntryKind Flags, GlobalValue::LinkageTypes Linkage) {
9753 if (OMPBuilder->Config.isTargetDevice()) {
9754 // This could happen if the device compilation is invoked standalone.
9755 if (!hasDeviceGlobalVarEntryInfo(VarName))
9756 return;
9757 auto &Entry = OffloadEntriesDeviceGlobalVar[VarName];
9758 if (Entry.getAddress() && hasDeviceGlobalVarEntryInfo(VarName)) {
9759 if (Entry.getVarSize() == 0) {
9760 Entry.setVarSize(VarSize);
9761 Entry.setLinkage(Linkage);
9762 }
9763 return;
9764 }
9765 Entry.setVarSize(VarSize);
9766 Entry.setLinkage(Linkage);
9767 Entry.setAddress(Addr);
9768 } else {
9769 if (hasDeviceGlobalVarEntryInfo(VarName)) {
9770 auto &Entry = OffloadEntriesDeviceGlobalVar[VarName];
9771 assert(Entry.isValid() && Entry.getFlags() == Flags &&
9772 "Entry not initialized!");
9773 if (Entry.getVarSize() == 0) {
9774 Entry.setVarSize(VarSize);
9775 Entry.setLinkage(Linkage);
9776 }
9777 return;
9778 }
9779 if (Flags == OffloadEntriesInfoManager::OMPTargetGlobalVarEntryIndirect)
9780 OffloadEntriesDeviceGlobalVar.try_emplace(VarName, OffloadingEntriesNum,
9781 Addr, VarSize, Flags, Linkage,
9782 VarName.str());
9783 else
9784 OffloadEntriesDeviceGlobalVar.try_emplace(
9785 VarName, OffloadingEntriesNum, Addr, VarSize, Flags, Linkage, "");
9786 ++OffloadingEntriesNum;
9787 }
9788}
9789
9790void OffloadEntriesInfoManager::actOnDeviceGlobalVarEntriesInfo(
9791 const OffloadDeviceGlobalVarEntryInfoActTy &Action) {
9792 // Scan all device global variable entries and perform the provided action.
9793 for (const auto &E : OffloadEntriesDeviceGlobalVar)
9794 Action(E.getKey(), E.getValue());
9795}
9796
9797//===----------------------------------------------------------------------===//
9798// CanonicalLoopInfo
9799//===----------------------------------------------------------------------===//
9800
9801void CanonicalLoopInfo::collectControlBlocks(
9802 SmallVectorImpl<BasicBlock *> &BBs) {
9803 // We only count those BBs as control blocks for which we do not need to
9804 // reverse the CFG, i.e. not the loop body which can contain arbitrary control
9805 // flow. For consistency, this also means we do not add the Body block, which
9806 // is just the entry to the body code.
9807 BBs.reserve(BBs.size() + 6);
9808 BBs.append({getPreheader(), Header, Cond, Latch, Exit, getAfter()});
9809}
9810
9811BasicBlock *CanonicalLoopInfo::getPreheader() const {
9812 assert(isValid() && "Requires a valid canonical loop");
9813 for (BasicBlock *Pred : predecessors(Header)) {
9814 if (Pred != Latch)
9815 return Pred;
9816 }
9817 llvm_unreachable("Missing preheader");
9818}
9819
9820void CanonicalLoopInfo::setTripCount(Value *TripCount) {
9821 assert(isValid() && "Requires a valid canonical loop");
9822
9823 Instruction *CmpI = &getCond()->front();
9824 assert(isa<CmpInst>(CmpI) && "First inst must compare IV with TripCount");
9825 CmpI->setOperand(1, TripCount);
9826
9827#ifndef NDEBUG
9828 assertOK();
9829#endif
9830}
9831
9832void CanonicalLoopInfo::mapIndVar(
9833 llvm::function_ref<Value *(Instruction *)> Updater) {
9834 assert(isValid() && "Requires a valid canonical loop");
9835
9836 Instruction *OldIV = getIndVar();
9837
9838 // Record all uses excluding those introduced by the updater. Uses by the
9839 // CanonicalLoopInfo itself to keep track of the number of iterations are
9840 // excluded.
9841 SmallVector<Use *> ReplacableUses;
9842 for (Use &U : OldIV->uses()) {
9843 auto *User = dyn_cast<Instruction>(U.getUser());
9844 if (!User)
9845 continue;
9846 if (User->getParent() == getCond())
9847 continue;
9848 if (User->getParent() == getLatch())
9849 continue;
9850 ReplacableUses.push_back(&U);
9851 }
9852
9853 // Run the updater that may introduce new uses
9854 Value *NewIV = Updater(OldIV);
9855
9856 // Replace the old uses with the value returned by the updater.
9857 for (Use *U : ReplacableUses)
9858 U->set(NewIV);
9859
9860#ifndef NDEBUG
9861 assertOK();
9862#endif
9863}
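// Usage sketch (not part of the original source; mirrors how the worksharing
// lowering shifts the logical IV by a dynamic lower bound):
//   CLI->mapIndVar([&](Instruction *OldIV) -> Value * {
//     Builder.SetInsertPoint(CLI->getBody(),
//                            CLI->getBody()->getFirstInsertionPt());
//     return Builder.CreateAdd(OldIV, LowerBound);
//   });
// Uses of the IV in the condition and latch blocks are intentionally kept.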
9864
9865void CanonicalLoopInfo::assertOK() const {
9866#ifndef NDEBUG
9867 // No constraints if this object currently does not describe a loop.
9868 if (!isValid())
9869 return;
9870
9871 BasicBlock *Preheader = getPreheader();
9872 BasicBlock *Body = getBody();
9873 BasicBlock *After = getAfter();
9874
9875 // Verify standard control-flow we use for OpenMP loops.
9876 assert(Preheader);
9877 assert(isa<BranchInst>(Preheader->getTerminator()) &&
9878 "Preheader must terminate with unconditional branch");
9879 assert(Preheader->getSingleSuccessor() == Header &&
9880 "Preheader must jump to header");
9881
9882 assert(Header);
9883 assert(isa<BranchInst>(Header->getTerminator()) &&
9884 "Header must terminate with unconditional branch");
9885 assert(Header->getSingleSuccessor() == Cond &&
9886 "Header must jump to exiting block");
9887
9888 assert(Cond);
9889 assert(Cond->getSinglePredecessor() == Header &&
9890 "Exiting block only reachable from header");
9891
9892 assert(isa<BranchInst>(Cond->getTerminator()) &&
9893 "Exiting block must terminate with conditional branch");
9894 assert(size(successors(Cond)) == 2 &&
9895 "Exiting block must have two successors");
9896 assert(cast<BranchInst>(Cond->getTerminator())->getSuccessor(0) == Body &&
9897 "Exiting block's first successor must jump to the body");
9898 assert(cast<BranchInst>(Cond->getTerminator())->getSuccessor(1) == Exit &&
9899 "Exiting block's second successor must exit the loop");
9900
9901 assert(Body);
9902 assert(Body->getSinglePredecessor() == Cond &&
9903 "Body only reachable from exiting block");
9904 assert(!isa<PHINode>(Body->front()));
9905
9906 assert(Latch);
9907 assert(isa<BranchInst>(Latch->getTerminator()) &&
9908 "Latch must terminate with unconditional branch");
9909 assert(Latch->getSingleSuccessor() == Header && "Latch must jump to header");
9910 // TODO: To support simple redirecting of the end of the body code when it has
9911 // multiple exits, introduce another auxiliary basic block like preheader and after.
9912 assert(Latch->getSinglePredecessor() != nullptr);
9913 assert(!isa<PHINode>(Latch->front()));
9914
9915 assert(Exit);
9916 assert(isa<BranchInst>(Exit->getTerminator()) &&
9917 "Exit block must terminate with unconditional branch");
9918 assert(Exit->getSingleSuccessor() == After &&
9919 "Exit block must jump to after block");
9920
9921 assert(After);
9922 assert(After->getSinglePredecessor() == Exit &&
9923 "After block only reachable from exit block");
9924 assert(After->empty() || !isa<PHINode>(After->front()));
9925
9926 Instruction *IndVar = getIndVar();
9927 assert(IndVar && "Canonical induction variable not found?");
9928 assert(isa<IntegerType>(IndVar->getType()) &&
9929 "Induction variable must be an integer");
9930 assert(cast<PHINode>(IndVar)->getParent() == Header &&
9931 "Induction variable must be a PHI in the loop header");
9932 assert(cast<PHINode>(IndVar)->getIncomingBlock(0) == Preheader);
9933 assert(
9934 cast<ConstantInt>(cast<PHINode>(IndVar)->getIncomingValue(0))->isZero());
9935 assert(cast<PHINode>(IndVar)->getIncomingBlock(1) == Latch);
9936
9937 auto *NextIndVar = cast<PHINode>(IndVar)->getIncomingValue(1);
9938 assert(cast<Instruction>(NextIndVar)->getParent() == Latch);
9939 assert(cast<BinaryOperator>(NextIndVar)->getOpcode() == BinaryOperator::Add);
9940 assert(cast<BinaryOperator>(NextIndVar)->getOperand(0) == IndVar);
9941 assert(cast<ConstantInt>(cast<BinaryOperator>(NextIndVar)->getOperand(1))
9942 ->isOne());
9943
9944 Value *TripCount = getTripCount();
9945 assert(TripCount && "Loop trip count not found?");
9946 assert(IndVar->getType() == TripCount->getType() &&
9947 "Trip count and induction variable must have the same type");
9948
9949 auto *CmpI = cast<CmpInst>(&Cond->front());
9950 assert(CmpI->getPredicate() == CmpInst::ICMP_ULT &&
9951 "Exit condition must be an unsigned less-than comparison");
9952 assert(CmpI->getOperand(0) == IndVar &&
9953 "Exit condition must compare the induction variable");
9954 assert(CmpI->getOperand(1) == TripCount &&
9955 "Exit condition must compare with the trip count");
9956#endif
9957}
9958
9959void CanonicalLoopInfo::invalidate() {
9960 Header = nullptr;
9961 Cond = nullptr;
9962 Latch = nullptr;
9963 Exit = nullptr;
9964}
Rewrite undef for PHI
MachineBasicBlock MachineBasicBlock::iterator DebugLoc DL
static cl::opt< ITMode > IT(cl::desc("IT block support"), cl::Hidden, cl::init(DefaultIT), cl::values(clEnumValN(DefaultIT, "arm-default-it", "Generate any type of IT block"), clEnumValN(RestrictedIT, "arm-restrict-it", "Disallow complex IT blocks")))
This file contains the simple types necessary to represent the attributes associated with functions a...
static const Function * getParent(const Value *V)
#define LLVM_ENABLE_BITMASK_ENUMS_IN_NAMESPACE()
LLVM_ENABLE_BITMASK_ENUMS_IN_NAMESPACE() pulls the operator overloads used by LLVM_MARK_AS_BITMASK_EN...
Definition: BitmaskEnum.h:83
#define LLVM_MARK_AS_BITMASK_ENUM(LargestValue)
LLVM_MARK_AS_BITMASK_ENUM lets you opt in an individual enum type so you can perform bitwise operatio...
Definition: BitmaskEnum.h:42
BlockVerifier::State From
static GCRegistry::Add< StatepointGC > D("statepoint-example", "an example strategy for statepoint")
Analysis containing CSE Info
Definition: CSEInfo.cpp:27
This file contains the declarations for the subclasses of Constant, which represent the different fla...
Returns the sub type a function will return at a given Idx Should correspond to the result type of an ExtractValue instruction executed with just that one unsigned Idx
#define LLVM_DEBUG(...)
Definition: Debug.h:106
uint64_t Addr
std::string Name
uint64_t Size
bool End
Definition: ELF_riscv.cpp:480
DenseMap< Block *, BlockRelaxAux > Blocks
Definition: ELF_riscv.cpp:507
static GCMetadataPrinterRegistry::Add< ErlangGCPrinter > X("erlang", "erlang-compatible garbage collector")
Hexagon Hardware Loops
This file provides various utilities for inspecting and working with the control flow graph in LLVM I...
This header defines various interfaces for pass management in LLVM.
iv Induction Variable Users
Definition: IVUsers.cpp:48
static LVOptions Options
Definition: LVOptions.cpp:25
static bool isZero(Value *V, const DataLayout &DL, DominatorTree *DT, AssumptionCache *AC)
Definition: Lint.cpp:557
static cl::opt< unsigned > TileSize("fuse-matrix-tile-size", cl::init(4), cl::Hidden, cl::desc("Tile size for matrix instruction fusion using square-shaped tiles."))
#define F(x, y, z)
Definition: MD5.cpp:55
#define I(x, y, z)
Definition: MD5.cpp:58
This file contains the declarations for metadata subclasses.
uint64_t IntrinsicInst * II
#define OMP_KERNEL_ARG_VERSION
Definition: OMPConstants.h:75
Provides definitions for Target specific Grid Values.
static OMPScheduleType getOpenMPBaseScheduleType(llvm::omp::ScheduleKind ClauseKind, bool HasChunks, bool HasSimdModifier)
Determine which scheduling algorithm to use, determined from schedule clause arguments.
static void redirectTo(BasicBlock *Source, BasicBlock *Target, DebugLoc DL)
Make Source branch to Target.
Value * createFakeIntVal(IRBuilderBase &Builder, OpenMPIRBuilder::InsertPointTy OuterAllocaIP, llvm::SmallVectorImpl< Instruction * > &ToBeDeleted, OpenMPIRBuilder::InsertPointTy InnerAllocaIP, const Twine &Name="", bool AsPtr=true)
static FunctionCallee getKmpcForDynamicFiniForType(Type *Ty, Module &M, OpenMPIRBuilder &OMPBuilder)
Returns an LLVM function to call for finalizing the dynamic loop using depending on type.
static Expected< Function * > createOutlinedFunction(OpenMPIRBuilder &OMPBuilder, IRBuilderBase &Builder, const OpenMPIRBuilder::TargetKernelDefaultAttrs &DefaultAttrs, StringRef FuncName, SmallVectorImpl< Value * > &Inputs, OpenMPIRBuilder::TargetBodyGenCallbackTy &CBFunc, OpenMPIRBuilder::TargetGenArgAccessorsCallbackTy &ArgAccessorFuncCB)
static Function * emitTargetTaskProxyFunction(OpenMPIRBuilder &OMPBuilder, IRBuilderBase &Builder, CallInst *StaleCI)
Create an entry point for a target task with the following.
static void updateNVPTXMetadata(Function &Kernel, StringRef Name, int32_t Value, bool Min)
static OMPScheduleType getOpenMPOrderingScheduleType(OMPScheduleType BaseScheduleType, bool HasOrderedClause)
Adds ordering modifier flags to schedule type.
static OMPScheduleType getOpenMPMonotonicityScheduleType(OMPScheduleType ScheduleType, bool HasSimdModifier, bool HasMonotonic, bool HasNonmonotonic, bool HasOrderedClause)
Adds monotonicity modifier flags to schedule type.
static void addSimdMetadata(BasicBlock *Block, MDNode *AccessGroup, LoopInfo &LI)
Attach llvm.access.group metadata to the memref instructions of Block.
static OMPScheduleType computeOpenMPScheduleType(ScheduleKind ClauseKind, bool HasChunks, bool HasSimdModifier, bool HasMonotonicModifier, bool HasNonmonotonicModifier, bool HasOrderedClause)
Determine the schedule type using schedule and ordering clause arguments.
static bool isValidWorkshareLoopScheduleType(OMPScheduleType SchedType)
static Function * getFreshReductionFunc(Module &M)
static void raiseUserConstantDataAllocasToEntryBlock(IRBuilderBase &Builder, Function *Function)
static MDNode * getNVPTXMDNode(Function &Kernel, StringRef Name)
static FunctionCallee getKmpcForDynamicNextForType(Type *Ty, Module &M, OpenMPIRBuilder &OMPBuilder)
Returns an LLVM function to call for updating the next loop using OpenMP dynamic scheduling depending...
static bool isConflictIP(IRBuilder<>::InsertPoint IP1, IRBuilder<>::InsertPoint IP2)
Return whether IP1 and IP2 are ambiguous, i.e.
static void checkReductionInfos(ArrayRef< OpenMPIRBuilder::ReductionInfo > ReductionInfos, bool IsGPU)
static FunctionCallee getKmpcForDynamicInitForType(Type *Ty, Module &M, OpenMPIRBuilder &OMPBuilder)
Returns an LLVM function to call for initializing loop bounds using OpenMP dynamic scheduling dependi...
static cl::opt< double > UnrollThresholdFactor("openmp-ir-builder-unroll-threshold-factor", cl::Hidden, cl::desc("Factor for the unroll threshold to account for code " "simplifications still taking place"), cl::init(1.5))
static int32_t computeHeuristicUnrollFactor(CanonicalLoopInfo *CLI)
Heuristically determine the best-performant unroll factor for CLI.
static Value * emitTaskDependencies(OpenMPIRBuilder &OMPBuilder, const SmallVectorImpl< OpenMPIRBuilder::DependData > &Dependencies)
static void emitTargetCall(OpenMPIRBuilder &OMPBuilder, IRBuilderBase &Builder, OpenMPIRBuilder::InsertPointTy AllocaIP, const OpenMPIRBuilder::TargetKernelDefaultAttrs &DefaultAttrs, const OpenMPIRBuilder::TargetKernelRuntimeAttrs &RuntimeAttrs, Value *IfCond, Function *OutlinedFn, Constant *OutlinedFnID, SmallVectorImpl< Value * > &Args, OpenMPIRBuilder::GenMapInfoCallbackTy GenMapInfoCB, SmallVector< llvm::OpenMPIRBuilder::DependData > Dependencies={}, bool HasNoWait=false)
static void workshareLoopTargetCallback(OpenMPIRBuilder *OMPIRBuilder, CanonicalLoopInfo *CLI, Value *Ident, Function &OutlinedFn, Type *ParallelTaskPtr, const SmallVector< Instruction *, 4 > &ToBeDeleted, WorksharingLoopType LoopType)
static Error emitTargetOutlinedFunction(OpenMPIRBuilder &OMPBuilder, IRBuilderBase &Builder, bool IsOffloadEntry, TargetRegionEntryInfo &EntryInfo, const OpenMPIRBuilder::TargetKernelDefaultAttrs &DefaultAttrs, Function *&OutlinedFn, Constant *&OutlinedFnID, SmallVectorImpl< Value * > &Inputs, OpenMPIRBuilder::TargetBodyGenCallbackTy &CBFunc, OpenMPIRBuilder::TargetGenArgAccessorsCallbackTy &ArgAccessorFuncCB)
static OpenMPIRBuilder::InsertPointTy getInsertPointAfterInstr(Instruction *I)
static void redirectAllPredecessorsTo(BasicBlock *OldTarget, BasicBlock *NewTarget, DebugLoc DL)
Redirect all edges that branch to OldTarget to NewTarget.
static std::unique_ptr< TargetMachine > createTargetMachine(Function *F, CodeGenOptLevel OptLevel)
Create the TargetMachine object to query the backend for optimization preferences.
static FunctionCallee getKmpcForStaticInitForType(Type *Ty, Module &M, OpenMPIRBuilder &OMPBuilder)
static void addBasicBlockMetadata(BasicBlock *BB, ArrayRef< Metadata * > Properties)
Attach metadata Properties to the basic block described by BB.
static cl::opt< bool > OptimisticAttributes("openmp-ir-builder-optimistic-attributes", cl::Hidden, cl::desc("Use optimistic attributes describing " "'as-if' properties of runtime calls."), cl::init(false))
static FunctionCallee getKmpcForStaticLoopForType(Type *Ty, OpenMPIRBuilder *OMPBuilder, WorksharingLoopType LoopType)
static void createTargetLoopWorkshareCall(OpenMPIRBuilder *OMPBuilder, WorksharingLoopType LoopType, BasicBlock *InsertBlock, Value *Ident, Value *LoopBodyArg, Type *ParallelTaskPtr, Value *TripCount, Function &LoopBodyFn)
static const omp::GV & getGridValue(const Triple &T, Function *Kernel)
static void addLoopMetadata(CanonicalLoopInfo *Loop, ArrayRef< Metadata * > Properties)
Attach loop metadata Properties to the loop described by Loop.
static void removeUnusedBlocksFromParent(ArrayRef< BasicBlock * > BBs)
Determine which blocks in BBs are reachable from outside and remove the ones that are not reachable f...
static void targetParallelCallback(OpenMPIRBuilder *OMPIRBuilder, Function &OutlinedFn, Function *OuterFn, BasicBlock *OuterAllocaBB, Value *Ident, Value *IfCondition, Value *NumThreads, Instruction *PrivTID, AllocaInst *PrivTIDAddr, Value *ThreadID, const SmallVector< Instruction *, 4 > &ToBeDeleted)
static void hostParallelCallback(OpenMPIRBuilder *OMPIRBuilder, Function &OutlinedFn, Function *OuterFn, Value *Ident, Value *IfCondition, Instruction *PrivTID, AllocaInst *PrivTIDAddr, const SmallVector< Instruction *, 4 > &ToBeDeleted)
#define P(N)
FunctionAnalysisManager FAM
This file defines the Pass Instrumentation classes that provide instrumentation points into the pass ...
const SmallVectorImpl< MachineOperand > & Cond
Basic Register Allocator
Remove Loads Into Fake Uses
static bool isValid(const char C)
Returns true if C is a valid mangled character: <0-9a-zA-Z_>.
assert(ImpDefSCC.getReg()==AMDGPU::SCC &&ImpDefSCC.isDef())
unsigned unsigned DefaultVal
raw_pwrite_stream & OS
This file implements the SmallBitVector class.
This file defines the SmallSet class.
This file contains some functions that are useful when dealing with strings.
static std::optional< unsigned > getOpcode(ArrayRef< VPValue * > Values)
Returns the opcode of Values or ~0 if they do not all agree.
Definition: VPlanSLP.cpp:191
Value * RHS
Value * LHS
static cl::opt< unsigned > MaxThreads("xcore-max-threads", cl::Optional, cl::desc("Maximum number of threads (for emulation thread-local storage)"), cl::Hidden, cl::value_desc("number"), cl::init(8))
static const uint32_t IV[8]
Definition: blake3_impl.h:78
Class for arbitrary precision integers.
Definition: APInt.h:78
This class represents a conversion between pointers from one address space to another.
an instruction to allocate memory on the stack
Definition: Instructions.h:63
Align getAlign() const
Return the alignment of the memory that is being allocated by the instruction.
Definition: Instructions.h:124
PointerType * getType() const
Overload to return most specific pointer type.
Definition: Instructions.h:99
Type * getAllocatedType() const
Return the type that is being allocated by the instruction.
Definition: Instructions.h:117
unsigned getAddressSpace() const
Return the address space for the allocation.
Definition: Instructions.h:104
std::optional< TypeSize > getAllocationSize(const DataLayout &DL) const
Get allocation size in bytes.
void setAlignment(Align Align)
Definition: Instructions.h:128
const Value * getArraySize() const
Get the number of elements allocated.
Definition: Instructions.h:95
A container for analyses that lazily runs them and caches their results.
Definition: PassManager.h:253
bool registerPass(PassBuilderT &&PassBuilder)
Register an analysis pass with the manager.
Definition: PassManager.h:471
This class represents an incoming formal argument to a Function.
Definition: Argument.h:31
ArrayRef - Represent a constant reference to an array (0 or more elements consecutively in memory),...
Definition: ArrayRef.h:41
iterator end() const
Definition: ArrayRef.h:157
size_t size() const
size - Get the array size.
Definition: ArrayRef.h:168
iterator begin() const
Definition: ArrayRef.h:156
bool empty() const
empty - Check if the array is empty.
Definition: ArrayRef.h:163
Class to represent array types.
Definition: DerivedTypes.h:395
static ArrayType * get(Type *ElementType, uint64_t NumElements)
This static method is the primary way to construct an ArrayType.
A function analysis which provides an AssumptionCache.
AssumptionCache run(Function &F, FunctionAnalysisManager &)
A cache of @llvm.assume calls within a function.
An instruction that atomically checks whether a specified value is in a memory location,...
Definition: Instructions.h:501
static AtomicOrdering getStrongestFailureOrdering(AtomicOrdering SuccessOrdering)
Returns the strongest permitted ordering on failure, given the desired ordering on success.
Definition: Instructions.h:652
std::pair< LoadInst *, AllocaInst * > EmitAtomicLoadLibcall(AtomicOrdering AO)
Definition: Atomic.cpp:107
an instruction that atomically reads a memory location, combines it with another value,...
Definition: Instructions.h:704
BinOp
This enumeration lists the possible modifications atomicrmw can make.
Definition: Instructions.h:716
@ Add
*p = old + v
Definition: Instructions.h:720
@ FAdd
*p = old + v
Definition: Instructions.h:741
@ USubCond
Subtract only if no unsigned overflow.
Definition: Instructions.h:764
@ Min
*p = old <signed v ? old : v
Definition: Instructions.h:734
@ Or
*p = old | v
Definition: Instructions.h:728
@ Sub
*p = old - v
Definition: Instructions.h:722
@ And
*p = old & v
Definition: Instructions.h:724
@ Xor
*p = old ^ v
Definition: Instructions.h:730
@ USubSat
*p = usub.sat(old, v) usub.sat matches the behavior of llvm.usub.sat.
Definition: Instructions.h:768
@ FSub
*p = old - v
Definition: Instructions.h:744
@ UIncWrap
Increment one up to a maximum value.
Definition: Instructions.h:756
@ Max
*p = old >signed v ? old : v
Definition: Instructions.h:732
@ UMin
*p = old <unsigned v ? old : v
Definition: Instructions.h:738
@ FMin
*p = minnum(old, v) minnum matches the behavior of llvm.minnum.
Definition: Instructions.h:752
@ UMax
*p = old >unsigned v ? old : v
Definition: Instructions.h:736
@ FMax
*p = maxnum(old, v) maxnum matches the behavior of llvm.maxnum.
Definition: Instructions.h:748
@ UDecWrap
Decrement one until a minimum value or zero.
Definition: Instructions.h:760
@ Nand
*p = ~(old & v)
Definition: Instructions.h:726
AttrBuilder & addAttribute(Attribute::AttrKind Val)
Add an attribute to the builder.
AttrBuilder & removeAttribute(Attribute::AttrKind Val)
Remove an attribute from the builder.
AttributeSet getFnAttrs() const
The function attributes are returned.
AttributeList addFnAttributes(LLVMContext &C, const AttrBuilder &B) const
Add function attribute to the list.
Definition: Attributes.h:599
AttributeSet addAttributes(LLVMContext &C, AttributeSet AS) const
Add attributes to the attribute set.
Definition: Attributes.cpp:933
AttributeSet addAttribute(LLVMContext &C, Attribute::AttrKind Kind) const
Add an argument attribute.
Definition: Attributes.cpp:918
static Attribute get(LLVMContext &Context, AttrKind Kind, uint64_t Val=0)
Return a uniquified Attribute object.
Definition: Attributes.cpp:95
StringRef getValueAsString() const
Return the attribute's value as a string.
Definition: Attributes.cpp:392
LLVM Basic Block Representation.
Definition: BasicBlock.h:61
void replaceSuccessorsPhiUsesWith(BasicBlock *Old, BasicBlock *New)
Update all phi nodes in this basic block's successors to refer to basic block New instead of basic bl...
Definition: BasicBlock.cpp:662
iterator end()
Definition: BasicBlock.h:461
iterator begin()
Instruction iterator methods.
Definition: BasicBlock.h:448
const_iterator getFirstInsertionPt() const
Returns an iterator to the first instruction in this block that is suitable for inserting a non-PHI i...
Definition: BasicBlock.cpp:416
reverse_iterator rbegin()
Definition: BasicBlock.h:464
bool empty() const
Definition: BasicBlock.h:470
const Instruction * getFirstNonPHI() const
Returns a pointer to the first instruction in this block that is not a PHINode instruction.
Definition: BasicBlock.cpp:367
const Instruction & front() const
Definition: BasicBlock.h:471
static BasicBlock * Create(LLVMContext &Context, const Twine &Name="", Function *Parent=nullptr, BasicBlock *InsertBefore=nullptr)
Creates a new BasicBlock.
Definition: BasicBlock.h:212
BasicBlock * splitBasicBlock(iterator I, const Twine &BBName="", bool Before=false)
Split the basic block into two basic blocks at the specified instruction.
Definition: BasicBlock.cpp:577
const BasicBlock * getUniqueSuccessor() const
Return the successor of this block if it has a unique successor.
Definition: BasicBlock.cpp:497
const BasicBlock * getSinglePredecessor() const
Return the predecessor of this block if it has a single predecessor block.
Definition: BasicBlock.cpp:459
InstListType::reverse_iterator reverse_iterator
Definition: BasicBlock.h:179
const BasicBlock * getUniquePredecessor() const
Return the predecessor of this block if it has a unique predecessor block.
Definition: BasicBlock.cpp:467
const BasicBlock * getSingleSuccessor() const
Return the successor of this block if it has a single successor.
Definition: BasicBlock.cpp:489
const Function * getParent() const
Return the enclosing method, or null if none.
Definition: BasicBlock.h:219
SymbolTableList< BasicBlock >::iterator eraseFromParent()
Unlink 'this' from the containing function and delete it.
Definition: BasicBlock.cpp:279
reverse_iterator rend()
Definition: BasicBlock.h:466
const Instruction * getFirstNonPHIOrDbg(bool SkipPseudoOp=true) const
Returns a pointer to the first instruction in this block that is not a PHINode or a debug intrinsic,...
Definition: BasicBlock.cpp:386
InstListType::iterator iterator
Instruction iterators...
Definition: BasicBlock.h:177
LLVMContext & getContext() const
Get the context in which this basic block lives.
Definition: BasicBlock.cpp:168
void moveBefore(BasicBlock *MovePos)
Unlink this basic block from its current function and insert it into the function that MovePos lives ...
Definition: BasicBlock.h:376
const Instruction * getTerminator() const LLVM_READONLY
Returns the terminator instruction if the block is well formed or null if the block is not well forme...
Definition: BasicBlock.h:239
void splice(BasicBlock::iterator ToIt, BasicBlock *FromBB)
Transfer all instructions from FromBB to this basic block at ToIt.
Definition: BasicBlock.h:631
const Instruction & back() const
Definition: BasicBlock.h:473
const Module * getModule() const
Return the module owning the function this basic block belongs to, or nullptr if the function does no...
Definition: BasicBlock.cpp:292
void removePredecessor(BasicBlock *Pred, bool KeepOneInputPHIs=false)
Update PHI nodes in this BasicBlock before removal of predecessor Pred.
Definition: BasicBlock.cpp:516
Conditional or Unconditional Branch instruction.
static BranchInst * Create(BasicBlock *IfTrue, InsertPosition InsertBefore=nullptr)
void setDoesNotThrow()
Definition: InstrTypes.h:1926
Function * getCalledFunction() const
Returns the function called, or null if this is an indirect function invocation or the function signa...
Definition: InstrTypes.h:1341
User::op_iterator arg_begin()
Return the iterator pointing to the beginning of the argument list.
Definition: InstrTypes.h:1261
Value * getArgOperand(unsigned i) const
Definition: InstrTypes.h:1286
User::op_iterator arg_end()
Return the iterator pointing to the end of the argument list.
Definition: InstrTypes.h:1267
unsigned arg_size() const
Definition: InstrTypes.h:1284
This class represents a function call, abstracting a target machine's calling convention.
Class to represented the control flow structure of an OpenMP canonical loop.
Value * getTripCount() const
Returns the llvm::Value containing the number of loop iterations.
BasicBlock * getHeader() const
The header is the entry for each iteration.
void assertOK() const
Consistency self-check.
Type * getIndVarType() const
Return the type of the induction variable (and the trip count).
BasicBlock * getBody() const
The body block is the single entry for a loop iteration and not controlled by CanonicalLoopInfo.
bool isValid() const
Returns whether this object currently represents the IR of a loop.
OpenMPIRBuilder::InsertPointTy getAfterIP() const
Return the insertion point for user code after the loop.
OpenMPIRBuilder::InsertPointTy getBodyIP() const
Return the insertion point for user code in the body.
BasicBlock * getAfter() const
The after block is intended for clean-up code such as lifetime end markers.
Function * getFunction() const
void invalidate()
Invalidate this loop.
BasicBlock * getLatch() const
Reaching the latch indicates the end of the loop body code.
OpenMPIRBuilder::InsertPointTy getPreheaderIP() const
Return the insertion point for user code before the loop.
BasicBlock * getCond() const
The condition block computes whether there is another loop iteration.
BasicBlock * getExit() const
Reaching the exit indicates no more iterations are being executed.
BasicBlock * getPreheader() const
The preheader ensures that there is only a single edge entering the loop.
Instruction * getIndVar() const
Returns the instruction representing the current logical induction variable.
Predicate
This enumeration lists the possible predicates for CmpInst subclasses.
Definition: InstrTypes.h:673
@ ICMP_SLT
signed less than
Definition: InstrTypes.h:702
@ ICMP_SLE
signed less or equal
Definition: InstrTypes.h:703
@ FCMP_OLT
0 1 0 0 True if ordered and less than
Definition: InstrTypes.h:679
@ FCMP_OGT
0 0 1 0 True if ordered and greater than
Definition: InstrTypes.h:677
@ ICMP_UGT
unsigned greater than
Definition: InstrTypes.h:696
@ ICMP_SGT
signed greater than
Definition: InstrTypes.h:700
@ ICMP_ULT
unsigned less than
Definition: InstrTypes.h:698
@ ICMP_NE
not equal
Definition: InstrTypes.h:695
@ ICMP_ULE
unsigned less or equal
Definition: InstrTypes.h:699
A cache for the CodeExtractor analysis.
Definition: CodeExtractor.h:46
Utility class for extracting code into a new function.
Definition: CodeExtractor.h:85
void findAllocas(const CodeExtractorAnalysisCache &CEAC, ValueSet &SinkCands, ValueSet &HoistCands, BasicBlock *&ExitBlock) const
Find the set of allocas whose life ranges are contained within the outlined region.
Function * extractCodeRegion(const CodeExtractorAnalysisCache &CEAC)
Perform the extraction, returning the new function.
void findInputsOutputs(ValueSet &Inputs, ValueSet &Outputs, const ValueSet &Allocas, bool CollectGlobalInputs=false) const
Compute the set of input values and output values for the code.
bool isEligible() const
Test whether this code extractor is eligible.
void excludeArgFromAggregate(Value *Arg)
Exclude a value from aggregate argument passing when extracting a code region, passing it instead as ...
static Constant * get(ArrayType *T, ArrayRef< Constant * > V)
Definition: Constants.cpp:1312
static ConstantAsMetadata * get(Constant *C)
Definition: Metadata.h:528
static Constant * getString(LLVMContext &Context, StringRef Initializer, bool AddNull=true)
This method constructs a CDS and initializes it with a text string.
Definition: Constants.cpp:2990
static Constant * get(LLVMContext &Context, ArrayRef< ElementTy > Elts)
get() constructor - Return a constant with array type with an element count and element type matching...
Definition: Constants.h:709
static Constant * getPointerCast(Constant *C, Type *Ty)
Create a BitCast, AddrSpaceCast, or a PtrToInt cast constant expression.
Definition: Constants.cpp:2253
static Constant * getPointerBitCastOrAddrSpaceCast(Constant *C, Type *Ty)
Create a BitCast or AddrSpaceCast for a pointer type depending on the address space.
Definition: Constants.cpp:2268
static Constant * getAddrSpaceCast(Constant *C, Type *Ty, bool OnlyIfReduced=false)
Definition: Constants.cpp:2333
This is the shared class of boolean and integer constants.
Definition: Constants.h:83
static ConstantInt * getTrue(LLVMContext &Context)
Definition: Constants.cpp:866
static ConstantInt * getSigned(IntegerType *Ty, int64_t V)
Return a ConstantInt with the specified value for the specified type.
Definition: Constants.h:126
static ConstantInt * getFalse(LLVMContext &Context)
Definition: Constants.cpp:873
static ConstantPointerNull * get(PointerType *T)
Static factory methods - Return objects of the specified value.
Definition: Constants.cpp:1826
static Constant * get(StructType *T, ArrayRef< Constant * > V)
Definition: Constants.cpp:1378
This is an important base class in LLVM.
Definition: Constant.h:42
static Constant * getAllOnesValue(Type *Ty)
Definition: Constants.cpp:420
static Constant * getNullValue(Type *Ty)
Constructor to create a '0' constant of arbitrary type.
Definition: Constants.cpp:373
DISubprogram * getSubprogram() const
Get the subprogram for this scope.
Debug location.
Subprogram description.
DISPFlags
Debug info subprogram flags.
Type array for a subprogram.
This class represents an Operation in the Expression.
A parsed version of the target data layout string in and methods for querying it.
Definition: DataLayout.h:63
unsigned getDefaultGlobalsAddressSpace() const
Definition: DataLayout.h:247
Align getABIIntegerTypeAlignment(unsigned BitWidth) const
Returns the minimum ABI-required alignment for an integer type of the specified bitwidth.
Definition: DataLayout.h:486
unsigned getAllocaAddrSpace() const
Definition: DataLayout.h:229
TypeSize getTypeAllocSize(Type *Ty) const
Returns the offset in bytes between successive objects of the specified type, including alignment pad...
Definition: DataLayout.h:457
unsigned getPointerSize(unsigned AS=0) const
Layout pointer size in bytes, rounded up to a whole number of bytes.
Definition: DataLayout.cpp:739
unsigned getIndexSizeInBits(unsigned AS) const
Size in bits of index used for address calculation in getelementptr.
Definition: DataLayout.h:369
TypeSize getTypeSizeInBits(Type *Ty) const
Size examples:
Definition: DataLayout.h:617
TypeSize getTypeStoreSize(Type *Ty) const
Returns the maximum number of bytes that may be overwritten by storing the specified type.
Definition: DataLayout.h:421
Align getPrefTypeAlign(Type *Ty) const
Returns the preferred stack/global alignment for the specified type.
Definition: DataLayout.cpp:847
A debug info location.
Definition: DebugLoc.h:33
Analysis pass which computes a DominatorTree.
Definition: Dominators.h:279
DominatorTree run(Function &F, FunctionAnalysisManager &)
Run the analysis pass over a function and produce a dominator tree.
Definition: Dominators.cpp:371
Concrete subclass of DominatorTreeBase that is used to compute a normal dominator tree.
Definition: Dominators.h:162
Lightweight error class with error context and mandatory checking.
Definition: Error.h:160
static ErrorSuccess success()
Create a success value.
Definition: Error.h:337
Tagged union holding either a T or a Error.
Definition: Error.h:481
Error takeError()
Take ownership of the stored error.
Definition: Error.h:608
A handy container for a FunctionType+Callee-pointer pair, which can be passed around as a single enti...
Definition: DerivedTypes.h:170
Class to represent function types.
Definition: DerivedTypes.h:105
static FunctionType * get(Type *Result, ArrayRef< Type * > Params, bool isVarArg)
This static method is the primary way of constructing a FunctionType.
void addFnAttr(Attribute::AttrKind Kind)
Add function attributes to this function.
Definition: Function.cpp:641
static Function * Create(FunctionType *Ty, LinkageTypes Linkage, unsigned AddrSpace, const Twine &N="", Module *M=nullptr)
Definition: Function.h:173
const BasicBlock & getEntryBlock() const
Definition: Function.h:809
bool empty() const
Definition: Function.h:859
FunctionType * getFunctionType() const
Returns the FunctionType for me.
Definition: Function.h:216
void removeFromParent()
removeFromParent - This method unlinks 'this' from the containing module, but does not delete it.
Definition: Function.cpp:454
const DataLayout & getDataLayout() const
Get the data layout of the module this function belongs to.
Definition: Function.cpp:373
Attribute getFnAttribute(Attribute::AttrKind Kind) const
Return the attribute for the given attribute kind.
Definition: Function.cpp:766
uint64_t getFnAttributeAsParsedInteger(StringRef Kind, uint64_t Default=0) const
For a string attribute Kind, parse attribute as an integer.
Definition: Function.cpp:778
AttributeList getAttributes() const
Return the attribute list for this Function.
Definition: Function.h:353
const Function & getFunction() const
Definition: Function.h:171
iterator begin()
Definition: Function.h:853
arg_iterator arg_begin()
Definition: Function.h:868
void setAttributes(AttributeList Attrs)
Set the attribute list for this Function.
Definition: Function.h:356
LLVMContext & getContext() const
getContext - Return a reference to the LLVMContext associated with this function.
Definition: Function.cpp:369
void addParamAttr(unsigned ArgNo, Attribute::AttrKind Kind)
adds the attribute to the list of attributes for the given arg.
Definition: Function.cpp:669
Function::iterator insert(Function::iterator Position, BasicBlock *BB)
Insert BB in the basic block list at Position.
Definition: Function.h:754
size_t arg_size() const
Definition: Function.h:901
Type * getReturnType() const
Returns the type of the ret val.
Definition: Function.h:221
iterator end()
Definition: Function.h:855
void setCallingConv(CallingConv::ID CC)
Definition: Function.h:281
Argument * getArg(unsigned i) const
Definition: Function.h:886
bool hasMetadata() const
Return true if this value has any metadata attached to it.
Definition: Value.h:589
void addMetadata(unsigned KindID, MDNode &MD)
Add a metadata attachment.
Definition: Metadata.cpp:1565
LinkageTypes getLinkage() const
Definition: GlobalValue.h:546
void setLinkage(LinkageTypes LT)
Definition: GlobalValue.h:537
Module * getParent()
Get the module that this global value is contained inside of...
Definition: GlobalValue.h:656
void setDSOLocal(bool Local)
Definition: GlobalValue.h:303
PointerType * getType() const
Global values are always pointers.
Definition: GlobalValue.h:294
@ HiddenVisibility
The GV is hidden.
Definition: GlobalValue.h:68
@ ProtectedVisibility
The GV is protected.
Definition: GlobalValue.h:69
void setVisibility(VisibilityTypes V)
Definition: GlobalValue.h:254
LinkageTypes
An enumeration for the kinds of linkage for global values.
Definition: GlobalValue.h:51
@ PrivateLinkage
Like Internal, but omit from symbol table.
Definition: GlobalValue.h:60
@ CommonLinkage
Tentative definitions.
Definition: GlobalValue.h:62
@ InternalLinkage
Rename collisions when linking (static functions).
Definition: GlobalValue.h:59
@ WeakODRLinkage
Same, but only replaced by something equivalent.
Definition: GlobalValue.h:57
@ WeakAnyLinkage
Keep one copy of named function when linking (weak)
Definition: GlobalValue.h:56
@ AppendingLinkage
Special purpose, only applies to global arrays.
Definition: GlobalValue.h:58
@ LinkOnceODRLinkage
Same, but only replaced by something equivalent.
Definition: GlobalValue.h:55
Type * getValueType() const
Definition: GlobalValue.h:296
InsertPoint - A saved insertion point.
Definition: IRBuilder.h:276
BasicBlock * getBlock() const
Definition: IRBuilder.h:291
bool isSet() const
Returns true if this insert point is set.
Definition: IRBuilder.h:289
BasicBlock::iterator getPoint() const
Definition: IRBuilder.h:292
Common base class shared among various IRBuilders.
Definition: IRBuilder.h:113
Value * CreateNUWMul(Value *LHS, Value *RHS, const Twine &Name="")
Definition: IRBuilder.h:1417
Value * CreatePtrDiff(Type *ElemTy, Value *LHS, Value *RHS, const Twine &Name="")
Return the i64 difference between two pointer values, dividing out the size of the pointed-to objects...
Definition: IRBuilder.cpp:1075
Value * CreateICmpULT(Value *LHS, Value *RHS, const Twine &Name="")
Definition: IRBuilder.h:2286
AtomicCmpXchgInst * CreateAtomicCmpXchg(Value *Ptr, Value *Cmp, Value *New, MaybeAlign Align, AtomicOrdering SuccessOrdering, AtomicOrdering FailureOrdering, SyncScope::ID SSID=SyncScope::System)
Definition: IRBuilder.h:1849
Value * CreateConstGEP1_32(Type *Ty, Value *Ptr, unsigned Idx0, const Twine &Name="")
Definition: IRBuilder.h:1887
AllocaInst * CreateAlloca(Type *Ty, unsigned AddrSpace, Value *ArraySize=nullptr, const Twine &Name="")
Definition: IRBuilder.h:1781
Value * CreateInsertValue(Value *Agg, Value *Val, ArrayRef< unsigned > Idxs, const Twine &Name="")
Definition: IRBuilder.h:2562
IntegerType * getIntNTy(unsigned N)
Fetch the type representing an N-bit integer.
Definition: IRBuilder.h:558
Value * CreateICmpSGT(Value *LHS, Value *RHS, const Twine &Name="")
Definition: IRBuilder.h:2294
LoadInst * CreateAlignedLoad(Type *Ty, Value *Ptr, MaybeAlign Align, const char *Name)
Definition: IRBuilder.h:1815
Value * CreateZExtOrTrunc(Value *V, Type *DestTy, const Twine &Name="")
Create a ZExt or Trunc from the integer value V to DestTy.
Definition: IRBuilder.h:2051
UnreachableInst * CreateUnreachable()
Definition: IRBuilder.h:1306
Value * CreatePointerCast(Value *V, Type *DestTy, const Twine &Name="")
Definition: IRBuilder.h:2199
Value * CreateExtractValue(Value *Agg, ArrayRef< unsigned > Idxs, const Twine &Name="")
Definition: IRBuilder.h:2555
CallInst * CreateAlignmentAssumption(const DataLayout &DL, Value *PtrValue, unsigned Alignment, Value *OffsetValue=nullptr)
Create an assume intrinsic call that represents an alignment assumption on the provided pointer.
Definition: IRBuilder.cpp:1265
Value * CreateSelect(Value *C, Value *True, Value *False, const Twine &Name="", Instruction *MDFrom=nullptr)
Definition: IRBuilder.cpp:1053
BasicBlock::iterator GetInsertPoint() const
Definition: IRBuilder.h:194
Value * CreateStructGEP(Type *Ty, Value *Ptr, unsigned Idx, const Twine &Name="")
Definition: IRBuilder.h:1980
IntegerType * getIndexTy(const DataLayout &DL, unsigned AddrSpace)
Fetch the type of an integer that should be used to index GEP operations within AddressSpace.
Definition: IRBuilder.h:600
Value * CreateSExt(Value *V, Type *DestTy, const Twine &Name="")
Definition: IRBuilder.h:2045
Value * CreateIntToPtr(Value *V, Type *DestTy, const Twine &Name="")
Definition: IRBuilder.h:2147
IntegerType * getInt32Ty()
Fetch the type representing a 32-bit integer.
Definition: IRBuilder.h:545
Value * CreateNSWAdd(Value *LHS, Value *RHS, const Twine &Name="")
Definition: IRBuilder.h:1379
BasicBlock * GetInsertBlock() const
Definition: IRBuilder.h:193
void SetCurrentDebugLocation(DebugLoc L)
Set location information used by debugging information.
Definition: IRBuilder.h:239
IntegerType * getInt64Ty()
Fetch the type representing a 64-bit integer.
Definition: IRBuilder.h:550
Value * CreateInBoundsGEP(Type *Ty, Value *Ptr, ArrayRef< Value * > IdxList, const Twine &Name="")
Definition: IRBuilder.h:1882
Value * CreatePointerBitCastOrAddrSpaceCast(Value *V, Type *DestTy, const Twine &Name="")
Definition: IRBuilder.h:2211
Value * CreateUDiv(Value *LHS, Value *RHS, const Twine &Name="", bool isExact=false)
Definition: IRBuilder.h:1421
Value * CreateICmpNE(Value *LHS, Value *RHS, const Twine &Name="")
Definition: IRBuilder.h:2274
Value * CreateNUWAdd(Value *LHS, Value *RHS, const Twine &Name="")
Definition: IRBuilder.h:1383
IntegerType * getInt16Ty()
Fetch the type representing a 16-bit integer.
Definition: IRBuilder.h:540
Value * CreateGEP(Type *Ty, Value *Ptr, ArrayRef< Value * > IdxList, const Twine &Name="", GEPNoWrapFlags NW=GEPNoWrapFlags::none())
Definition: IRBuilder.h:1874
ConstantInt * getInt64(uint64_t C)
Get a constant 64-bit value.
Definition: IRBuilder.h:510
Value * CreateNeg(Value *V, const Twine &Name="", bool HasNSW=false)
Definition: IRBuilder.h:1733
InsertPoint saveIP() const
Returns the current insert point.
Definition: IRBuilder.h:296
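saveIP pairs with restoreIP to emit code somewhere else and then come back; a sketch assuming a hypothetical block AllocaBB:
  // Emit an alloca at the top of AllocaBB, then resume at the saved point.
  IRBuilder<>::InsertPoint SavedIP = Builder.saveIP();
  Builder.SetInsertPoint(AllocaBB, AllocaBB->begin());
  AllocaInst *Tmp = Builder.CreateAlloca(Builder.getInt32Ty());
  Builder.restoreIP(SavedIP);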
ConstantInt * getInt32(uint32_t C)
Get a constant 32-bit value.
Definition: IRBuilder.h:505
Value * CreateCmp(CmpInst::Predicate Pred, Value *LHS, Value *RHS, const Twine &Name="", MDNode *FPMathTag=nullptr)
Definition: IRBuilder.h:2404
PHINode * CreatePHI(Type *Ty, unsigned NumReservedValues, const Twine &Name="")
Definition: IRBuilder.h:2435
SwitchInst * CreateSwitch(Value *V, BasicBlock *Dest, unsigned NumCases=10, MDNode *BranchWeights=nullptr, MDNode *Unpredictable=nullptr)
Create a switch instruction with the specified value, default dest, and with a hint for the number of...
Definition: IRBuilder.h:1187
Value * CreateICmpEQ(Value *LHS, Value *RHS, const Twine &Name="")
Definition: IRBuilder.h:2270
InstTy * Insert(InstTy *I, const Twine &Name="") const
Insert and return the specified instruction.
Definition: IRBuilder.h:164
DebugLoc getCurrentDebugLocation() const
Get location information used by debugging information.
Definition: IRBuilder.cpp:64
Value * CreateSub(Value *LHS, Value *RHS, const Twine &Name="", bool HasNUW=false, bool HasNSW=false)
Definition: IRBuilder.h:1387
Value * CreateBitCast(Value *V, Type *DestTy, const Twine &Name="")
Definition: IRBuilder.h:2152
ConstantInt * getIntN(unsigned N, uint64_t C)
Get a constant N-bit value, zero extended or truncated from a 64-bit value.
Definition: IRBuilder.h:516
BranchInst * CreateCondBr(Value *Cond, BasicBlock *True, BasicBlock *False, MDNode *BranchWeights=nullptr, MDNode *Unpredictable=nullptr)
Create a conditional 'br Cond, TrueDest, FalseDest' instruction.
Definition: IRBuilder.h:1164
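A sketch of the common if/else diamond built around CreateCondBr, assuming a function F, an i1 value Cond, and a positioned Builder:
  LLVMContext &Ctx = F->getContext();
  BasicBlock *ThenBB = BasicBlock::Create(Ctx, "then", F);
  BasicBlock *ElseBB = BasicBlock::Create(Ctx, "else", F);
  BasicBlock *ContBB = BasicBlock::Create(Ctx, "cont", F);
  Builder.CreateCondBr(Cond, ThenBB, ElseBB);
  Builder.SetInsertPoint(ThenBB);   // then-side code goes here
  Builder.CreateBr(ContBB);
  Builder.SetInsertPoint(ElseBB);   // else-side code goes here
  Builder.CreateBr(ContBB);
  Builder.SetInsertPoint(ContBB);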
LoadInst * CreateLoad(Type *Ty, Value *Ptr, const char *Name)
Provided to resolve 'CreateLoad(Ty, Ptr, "...")' correctly, instead of converting the string to 'bool...
Definition: IRBuilder.h:1798
Value * CreateShl(Value *LHS, Value *RHS, const Twine &Name="", bool HasNUW=false, bool HasNSW=false)
Definition: IRBuilder.h:1459
Value * CreateZExt(Value *V, Type *DestTy, const Twine &Name="", bool IsNonNeg=false)
Definition: IRBuilder.h:2033
LLVMContext & getContext() const
Definition: IRBuilder.h:195
Value * CreateAnd(Value *LHS, Value *RHS, const Twine &Name="")
Definition: IRBuilder.h:1518
ReturnInst * CreateRetVoid()
Create a 'ret void' instruction.
Definition: IRBuilder.h:1134
Value * CreateConstInBoundsGEP2_32(Type *Ty, Value *Ptr, unsigned Idx0, unsigned Idx1, const Twine &Name="")
Definition: IRBuilder.h:1921
Value * CreateConstInBoundsGEP2_64(Type *Ty, Value *Ptr, uint64_t Idx0, uint64_t Idx1, const Twine &Name="")
Definition: IRBuilder.h:1967
StoreInst * CreateStore(Value *Val, Value *Ptr, bool isVolatile=false)
Definition: IRBuilder.h:1811
Value * CreateAdd(Value *LHS, Value *RHS, const Twine &Name="", bool HasNUW=false, bool HasNSW=false)
Definition: IRBuilder.h:1370
Value * CreatePtrToInt(Value *V, Type *DestTy, const Twine &Name="")
Definition: IRBuilder.h:2142
Value * CreateExactUDiv(Value *LHS, Value *RHS, const Twine &Name="")
Definition: IRBuilder.h:1430
Value * CreateIsNotNull(Value *Arg, const Twine &Name="")
Return a boolean value testing if Arg != 0.
Definition: IRBuilder.h:2588
CallInst * CreateCall(FunctionType *FTy, Value *Callee, ArrayRef< Value * > Args={}, const Twine &Name="", MDNode *FPMathTag=nullptr)
Definition: IRBuilder.h:2449
AtomicRMWInst * CreateAtomicRMW(AtomicRMWInst::BinOp Op, Value *Ptr, Value *Val, MaybeAlign Align, AtomicOrdering Ordering, SyncScope::ID SSID=SyncScope::System)
Definition: IRBuilder.h:1862
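For instance, an atomic fetch-add of 1 on an assumed i32 location Ptr (sketch):
  Builder.CreateAtomicRMW(AtomicRMWInst::Add, Ptr, Builder.getInt32(1),
                          MaybeAlign(4), AtomicOrdering::Monotonic);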
Value * CreateTrunc(Value *V, Type *DestTy, const Twine &Name="", bool IsNUW=false, bool IsNSW=false)
Definition: IRBuilder.h:2019
Value * CreateOr(Value *LHS, Value *RHS, const Twine &Name="")
Definition: IRBuilder.h:1540
PointerType * getPtrTy(unsigned AddrSpace=0)
Fetch the type representing a pointer.
Definition: IRBuilder.h:588
BranchInst * CreateBr(BasicBlock *Dest)
Create an unconditional 'br label X' instruction.
Definition: IRBuilder.h:1158
void ClearInsertionPoint()
Clear the insertion point: created instructions will not be inserted into a block.
Definition: IRBuilder.h:188
Value * CreateICmpSLT(Value *LHS, Value *RHS, const Twine &Name="")
Definition: IRBuilder.h:2302
ConstantInt * getInt16(uint16_t C)
Get a constant 16-bit value.
Definition: IRBuilder.h:500
Value * CreateICmpUGE(Value *LHS, Value *RHS, const Twine &Name="")
Definition: IRBuilder.h:2282
Value * CreateIntCast(Value *V, Type *DestTy, bool isSigned, const Twine &Name="")
Definition: IRBuilder.h:2225
void restoreIP(InsertPoint IP)
Sets the current insert point to a previously-saved location.
Definition: IRBuilder.h:308
Value * CreateIsNull(Value *Arg, const Twine &Name="")
Return a boolean value testing if Arg == 0.
Definition: IRBuilder.h:2583
void SetInsertPoint(BasicBlock *TheBB)
This specifies that created instructions should be appended to the end of the specified block.
Definition: IRBuilder.h:199
Type * getVoidTy()
Fetch the type representing void.
Definition: IRBuilder.h:583
StoreInst * CreateAlignedStore(Value *Val, Value *Ptr, MaybeAlign Align, bool isVolatile=false)
Definition: IRBuilder.h:1834
Value * CreateAShr(Value *LHS, Value *RHS, const Twine &Name="", bool isExact=false)
Definition: IRBuilder.h:1499
Value * CreateXor(Value *LHS, Value *RHS, const Twine &Name="")
Definition: IRBuilder.h:1562
Value * CreateICmp(CmpInst::Predicate P, Value *LHS, Value *RHS, const Twine &Name="")
Definition: IRBuilder.h:2380
IntegerType * getInt8Ty()
Fetch the type representing an 8-bit integer.
Definition: IRBuilder.h:535
Value * CreateURem(Value *LHS, Value *RHS, const Twine &Name="")
Definition: IRBuilder.h:1447
CallInst * CreateMemCpy(Value *Dst, MaybeAlign DstAlign, Value *Src, MaybeAlign SrcAlign, uint64_t Size, bool isVolatile=false, MDNode *TBAATag=nullptr, MDNode *TBAAStructTag=nullptr, MDNode *ScopeTag=nullptr, MDNode *NoAliasTag=nullptr)
Create and insert a memcpy between the specified pointers.
Definition: IRBuilder.h:677
Value * CreateSExtOrTrunc(Value *V, Type *DestTy, const Twine &Name="")
Create a SExt or Trunc from the integer value V to DestTy.
Definition: IRBuilder.h:2066
Value * CreateAddrSpaceCast(Value *V, Type *DestTy, const Twine &Name="")
Definition: IRBuilder.h:2157
Value * CreateMul(Value *LHS, Value *RHS, const Twine &Name="", bool HasNUW=false, bool HasNSW=false)
Definition: IRBuilder.h:1404
GlobalVariable * CreateGlobalString(StringRef Str, const Twine &Name="", unsigned AddressSpace=0, Module *M=nullptr, bool AddNull=true)
Make a new global variable with initializer type i8*.
Definition: IRBuilder.cpp:44
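A sketch: materialize a constant C string and take a pointer to its first character.
  GlobalVariable *Str = Builder.CreateGlobalString("hello", ".str");
  Value *FirstChar =
      Builder.CreateConstInBoundsGEP2_32(Str->getValueType(), Str, 0, 0);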
This provides a uniform API for creating instructions and inserting them into a basic block: either a...
Definition: IRBuilder.h:2705
void removeFromParent()
This method unlinks 'this' from the containing basic block, but does not delete it.
Definition: Instruction.cpp:80
void moveBeforePreserving(Instruction *MovePos)
Perform a moveBefore operation, while signalling that the caller intends to preserve the original ord...
unsigned getNumSuccessors() const LLVM_READONLY
Return the number of successors that this instruction has.
const DebugLoc & getDebugLoc() const
Return the debug location for this node as a DebugLoc.
Definition: Instruction.h:475
const Module * getModule() const
Return the module owning the function this instruction belongs to, or nullptr if the function does not...
Definition: Instruction.cpp:68
InstListType::iterator eraseFromParent()
This method unlinks 'this' from the containing basic block and deletes it.
Definition: Instruction.cpp:94
MDNode * getMetadata(unsigned KindID) const
Get the metadata of given kind attached to this Instruction.
Definition: Instruction.h:390
BasicBlock * getSuccessor(unsigned Idx) const LLVM_READONLY
Return the specified successor. This instruction must be a terminator.
void setMetadata(unsigned KindID, MDNode *Node)
Set the metadata of the specified kind to the specified node.
Definition: Metadata.cpp:1679
void setDebugLoc(DebugLoc Loc)
Set the debug location information for this instruction.
Definition: Instruction.h:472
void insertAfter(Instruction *InsertPos)
Insert an unlinked instruction into a basic block immediately after the specified instruction.
Class to represent integer types.
Definition: DerivedTypes.h:42
static IntegerType * get(LLVMContext &C, unsigned NumBits)
This static method is the primary way of constructing an IntegerType.
Definition: Type.cpp:311
This is an important class for using LLVM in a threaded context.
Definition: LLVMContext.h:67
An instruction for reading from memory.
Definition: Instructions.h:176
Value * getPointerOperand()
Definition: Instructions.h:255
void setAtomic(AtomicOrdering Ordering, SyncScope::ID SSID=SyncScope::System)
Sets the ordering constraint and the synchronization scope ID of this load instruction.
Definition: Instructions.h:241
Align getAlign() const
Return the alignment of the access that is being performed.
Definition: Instructions.h:211
Analysis pass that exposes the LoopInfo for a function.
Definition: LoopInfo.h:566
LoopInfo run(Function &F, FunctionAnalysisManager &AM)
Definition: LoopInfo.cpp:969
LoopT * getLoopFor(const BlockT *BB) const
Return the inner most loop that BB lives in.
This class represents a loop nest and can be used to query its properties.
Represents a single loop in the control flow graph.
Definition: LoopInfo.h:39
MDNode * createCallbackEncoding(unsigned CalleeArgNo, ArrayRef< int > Arguments, bool VarArgsArePassed)
Return metadata describing a callback (see llvm::AbstractCallSite).
Definition: MDBuilder.cpp:118
Metadata node.
Definition: Metadata.h:1069
void replaceOperandWith(unsigned I, Metadata *New)
Replace a specific operand.
Definition: Metadata.cpp:1077
static MDTuple * getDistinct(LLVMContext &Context, ArrayRef< Metadata * > MDs)
Definition: Metadata.h:1553
const MDOperand & getOperand(unsigned I) const
Definition: Metadata.h:1430
ArrayRef< MDOperand > operands() const
Definition: Metadata.h:1428
static MDTuple * get(LLVMContext &Context, ArrayRef< Metadata * > MDs)
Definition: Metadata.h:1545
static MDString * get(LLVMContext &Context, StringRef Str)
Definition: Metadata.cpp:606
This class implements a map that also provides access to all stored values in a deterministic order.
Definition: MapVector.h:36
size_type size() const
Definition: MapVector.h:60
static ErrorOr< std::unique_ptr< MemoryBuffer > > getFile(const Twine &Filename, bool IsText=false, bool RequiresNullTerminator=true, bool IsVolatile=false, std::optional< Align > Alignment=std::nullopt)
Open the specified file as a MemoryBuffer, returning a new MemoryBuffer if successful,...
Root of the metadata hierarchy.
Definition: Metadata.h:62
A Module instance is used to store all the information related to an LLVM module.
Definition: Module.h:65
NamedMDNode * getNamedMetadata(StringRef Name) const
Return the first NamedMDNode in the module with the specified name.
Definition: Module.cpp:297
LLVMContext & getContext() const
Get the global data context.
Definition: Module.h:302
Function * getFunction(StringRef Name) const
Look up the specified function in the module symbol table.
Definition: Module.cpp:228
StringRef getName() const
Get a short "name" for the module.
Definition: Module.h:285
const std::string & getTargetTriple() const
Get the target triple which is a string describing the target host.
Definition: Module.h:298
iterator_range< global_iterator > globals()
Definition: Module.h:702
const FunctionListType & getFunctionList() const
Get the Module's list of functions (constant).
Definition: Module.h:614
GlobalVariable * getGlobalVariable(StringRef Name) const
Look up the specified global variable in the module symbol table.
Definition: Module.h:447
GlobalValue * getNamedValue(StringRef Name) const
Return the global value in the module with the specified name, of arbitrary type.
Definition: Module.cpp:170
NamedMDNode * getOrInsertNamedMetadata(StringRef Name)
Return the named MDNode in the module with the specified name.
Definition: Module.cpp:304
const GlobalVariable * getNamedGlobal(StringRef Name) const
Return the global variable in the module with the specified name, of arbitrary type.
Definition: Module.h:462
const DataLayout & getDataLayout() const
Get the data layout for the module's target platform.
Definition: Module.h:294
A tuple of MDNodes.
Definition: Metadata.h:1733
iterator_range< op_iterator > operands()
Definition: Metadata.h:1829
void addOperand(MDNode *M)
Definition: Metadata.cpp:1431
@ OffloadingEntryInfoTargetRegion
Entry is a target region.
Definition: OMPIRBuilder.h:244
@ OffloadingEntryInfoDeviceGlobalVar
Entry is a declare target variable.
Definition: OMPIRBuilder.h:246
OMPTargetDeviceClauseKind
Kind of device clause for declare target variables and functions. NOTE: Currently not used as a part o...
Definition: OMPIRBuilder.h:377
@ OMPTargetDeviceClauseAny
The target is marked for all devices.
Definition: OMPIRBuilder.h:379
void registerDeviceGlobalVarEntryInfo(StringRef VarName, Constant *Addr, int64_t VarSize, OMPTargetGlobalVarEntryKind Flags, GlobalValue::LinkageTypes Linkage)
Register device global variable entry.
void initializeDeviceGlobalVarEntryInfo(StringRef Name, OMPTargetGlobalVarEntryKind Flags, unsigned Order)
Initialize device global variable entry.
void actOnDeviceGlobalVarEntriesInfo(const OffloadDeviceGlobalVarEntryInfoActTy &Action)
OMPTargetRegionEntryKind
Kind of the target registry entry.
Definition: OMPIRBuilder.h:297
@ OMPTargetRegionEntryTargetRegion
Mark the entry as target region.
Definition: OMPIRBuilder.h:299
void getTargetRegionEntryFnName(SmallVectorImpl< char > &Name, const TargetRegionEntryInfo &EntryInfo)
bool hasTargetRegionEntryInfo(TargetRegionEntryInfo EntryInfo, bool IgnoreAddressId=false) const
Return true if a target region entry with the provided information exists.
void registerTargetRegionEntryInfo(TargetRegionEntryInfo EntryInfo, Constant *Addr, Constant *ID, OMPTargetRegionEntryKind Flags)
Register target region entry.
void actOnTargetRegionEntriesInfo(const OffloadTargetRegionEntryInfoActTy &Action)
unsigned size() const
Return number of entries defined so far.
Definition: OMPIRBuilder.h:288
void initializeTargetRegionEntryInfo(const TargetRegionEntryInfo &EntryInfo, unsigned Order)
Initialize target region entry.
OMPTargetGlobalVarEntryKind
Kind of the global variable entry.
Definition: OMPIRBuilder.h:357
@ OMPTargetGlobalVarEntryEnter
Mark the entry as a declare target enter.
Definition: OMPIRBuilder.h:363
@ OMPTargetGlobalRegisterRequires
Mark the entry as a register requires global.
Definition: OMPIRBuilder.h:369
@ OMPTargetGlobalVarEntryIndirect
Mark the entry as a declare target indirect global.
Definition: OMPIRBuilder.h:367
@ OMPTargetGlobalVarEntryLink
Mark the entry as a 'declare target link' entry.
Definition: OMPIRBuilder.h:361
@ OMPTargetGlobalVarEntryTo
Mark the entry as a 'declare target to' entry.
Definition: OMPIRBuilder.h:359
bool hasDeviceGlobalVarEntryInfo(StringRef VarName) const
Checks if the variable with the given name has been registered already.
Definition: OMPIRBuilder.h:433
bool empty() const
Return true if there are no entries defined.
std::optional< bool > IsTargetDevice
Flag to define whether to generate code for the role of the OpenMP host (if set to false) or device (...
Definition: OMPIRBuilder.h:93
void setGridValue(omp::GV G)
Definition: OMPIRBuilder.h:189
StringRef separator() const
Definition: OMPIRBuilder.h:175
int64_t getRequiresFlags() const
Returns requires directive clauses as flags compatible with those expected by libomptarget.
StringRef firstSeparator() const
Definition: OMPIRBuilder.h:165
std::optional< bool > EmitLLVMUsedMetaInfo
Flag for specifying if LLVMUsed information should be emitted.
Definition: OMPIRBuilder.h:106
omp::GV getGridValue() const
Definition: OMPIRBuilder.h:148
void setHasRequiresReverseOffload(bool Value)
bool hasRequiresUnifiedSharedMemory() const
void setHasRequiresUnifiedSharedMemory(bool Value)
bool hasRequiresDynamicAllocators() const
bool openMPOffloadMandatory() const
Definition: OMPIRBuilder.h:142
void setHasRequiresUnifiedAddress(bool Value)
void setHasRequiresDynamicAllocators(bool Value)
void setEmitLLVMUsed(bool Value=true)
Definition: OMPIRBuilder.h:185
bool hasRequiresReverseOffload() const
bool hasRequiresUnifiedAddress() const
Struct that keeps the information that should be kept throughout a 'target data' region.
An interface to create LLVM-IR for OpenMP directives.
Definition: OMPIRBuilder.h:474
InsertPointOrErrorTy createOrderedThreadsSimd(const LocationDescription &Loc, BodyGenCallbackTy BodyGenCB, FinalizeCallbackTy FiniCB, bool IsThreads)
Generator for '#omp ordered [threads | simd]'.
Constant * getOrCreateIdent(Constant *SrcLocStr, uint32_t SrcLocStrSize, omp::IdentFlag Flags=omp::IdentFlag(0), unsigned Reserve2Flags=0)
Return an ident_t* encoding the source location SrcLocStr and Flags.
FunctionCallee getOrCreateRuntimeFunction(Module &M, omp::RuntimeFunction FnID)
Return the function declaration for the runtime function with FnID.
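A sketch of fetching and calling one runtime entry point; the Ident and ThreadID values are assumed to be available already:
  FunctionCallee Barrier =
      OMPBuilder.getOrCreateRuntimeFunction(M, omp::OMPRTL___kmpc_barrier);
  Builder.CreateCall(Barrier, {Ident, ThreadID});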
InsertPointOrErrorTy createCancel(const LocationDescription &Loc, Value *IfCondition, omp::Directive CanceledDirective)
Generator for '#omp cancel'.
ReductionGenCBKind
Enum class for the ReductionGen callback type to be used.
CanonicalLoopInfo * collapseLoops(DebugLoc DL, ArrayRef< CanonicalLoopInfo * > Loops, InsertPointTy ComputeIP)
Collapse a loop nest into a single loop.
InsertPointOrErrorTy createTask(const LocationDescription &Loc, InsertPointTy AllocaIP, BodyGenCallbackTy BodyGenCB, bool Tied=true, Value *Final=nullptr, Value *IfCondition=nullptr, SmallVector< DependData > Dependencies={}, bool Mergeable=false, Value *EventHandle=nullptr, Value *Priority=nullptr)
Generator for '#omp task'.
void createTaskyield(const LocationDescription &Loc)
Generator for '#omp taskyield'.
std::function< Error(InsertPointTy CodeGenIP)> FinalizeCallbackTy
Callback type for variable finalization (think destructors).
Definition: OMPIRBuilder.h:543
void emitBranch(BasicBlock *Target)
InsertPointTy createAtomicWrite(const LocationDescription &Loc, AtomicOpValue &X, Value *Expr, AtomicOrdering AO)
Emit atomic write for: X = Expr (only scalar data types).
static void writeThreadBoundsForKernel(const Triple &T, Function &Kernel, int32_t LB, int32_t UB)
static TargetRegionEntryInfo getTargetEntryUniqueInfo(FileIdentifierInfoCallbackTy CallBack, StringRef ParentName="")
Creates a unique info for a target entry when provided a filename and line number from.
void emitTaskwaitImpl(const LocationDescription &Loc)
Generate a taskwait runtime call.
Constant * registerTargetRegionFunction(TargetRegionEntryInfo &EntryInfo, Function *OutlinedFunction, StringRef EntryFnName, StringRef EntryFnIDName)
Registers the given function and sets up the attributes of the function. Returns the FunctionID.
GlobalVariable * emitKernelExecutionMode(StringRef KernelName, omp::OMPTgtExecModeFlags Mode)
Emit the kernel execution mode.
void initialize()
Initialize the internal state; this will put structure types and potentially other helpers into the ...
void createTargetDeinit(const LocationDescription &Loc, int32_t TeamsReductionDataSize=0, int32_t TeamsReductionBufferLength=1024)
Create a runtime call for kmpc_target_deinit.
InsertPointOrErrorTy createTaskgroup(const LocationDescription &Loc, InsertPointTy AllocaIP, BodyGenCallbackTy BodyGenCB)
Generator for the taskgroup construct.
void loadOffloadInfoMetadata(Module &M)
Loads all the offload entries information from the host IR metadata.
InsertPointOrErrorTy emitTargetTask(TargetTaskBodyCallbackTy TaskBodyCB, Value *DeviceID, Value *RTLoc, OpenMPIRBuilder::InsertPointTy AllocaIP, const SmallVector< llvm::OpenMPIRBuilder::DependData > &Dependencies, bool HasNoWait)
Generate a target-task for the target construct.
void unrollLoopFull(DebugLoc DL, CanonicalLoopInfo *Loop)
Fully unroll a loop.
void emitFlush(const LocationDescription &Loc)
Generate a flush runtime call.
static std::pair< int32_t, int32_t > readThreadBoundsForKernel(const Triple &T, Function &Kernel)
OpenMPIRBuilderConfig Config
The OpenMPIRBuilder Configuration.
CallInst * createOMPInteropDestroy(const LocationDescription &Loc, Value *InteropVar, Value *Device, Value *NumDependences, Value *DependenceAddress, bool HaveNowaitClause)
Create a runtime call for __tgt_interop_destroy.
InsertPointTy createAtomicRead(const LocationDescription &Loc, AtomicOpValue &X, AtomicOpValue &V, AtomicOrdering AO)
Emit atomic read for: V = X (only scalar data types).
Error emitIfClause(Value *Cond, BodyGenCallbackTy ThenGen, BodyGenCallbackTy ElseGen, InsertPointTy AllocaIP={})
Emits code for OpenMP 'if' clause using specified BodyGenCallbackTy Here is the logic: if (Cond) { Th...
std::function< void(EmitMetadataErrorKind, TargetRegionEntryInfo)> EmitMetadataErrorReportFunctionTy
Callback function type.
void emitUsed(StringRef Name, ArrayRef< llvm::WeakTrackingVH > List)
Emit the llvm.used metadata.
InsertPointOrErrorTy createSingle(const LocationDescription &Loc, BodyGenCallbackTy BodyGenCB, FinalizeCallbackTy FiniCB, bool IsNowait, ArrayRef< llvm::Value * > CPVars={}, ArrayRef< llvm::Function * > CPFuncs={})
Generator for '#omp single'.
InsertPointOrErrorTy createTeams(const LocationDescription &Loc, BodyGenCallbackTy BodyGenCB, Value *NumTeamsLower=nullptr, Value *NumTeamsUpper=nullptr, Value *ThreadLimit=nullptr, Value *IfExpr=nullptr)
Generator for '#omp teams'.
std::forward_list< CanonicalLoopInfo > LoopInfos
Collection of owned canonical loop objects that eventually need to be freed.
void createTaskwait(const LocationDescription &Loc)
Generator for '#omp taskwait'.
CanonicalLoopInfo * createLoopSkeleton(DebugLoc DL, Value *TripCount, Function *F, BasicBlock *PreInsertBefore, BasicBlock *PostInsertBefore, const Twine &Name={})
Create the control flow structure of a canonical OpenMP loop.
std::string createPlatformSpecificName(ArrayRef< StringRef > Parts) const
Create a name using the platform-specific separators.
FunctionCallee createDispatchNextFunction(unsigned IVSize, bool IVSigned)
Returns __kmpc_dispatch_next_* runtime function for the specified size IVSize and sign IVSigned.
static void getKernelArgsVector(TargetKernelArgs &KernelArgs, IRBuilderBase &Builder, SmallVector< Value * > &ArgsVector)
Create the kernel args vector used by emitTargetKernel.
void unrollLoopHeuristic(DebugLoc DL, CanonicalLoopInfo *Loop)
Fully or partially unroll a loop.
InsertPointOrErrorTy createParallel(const LocationDescription &Loc, InsertPointTy AllocaIP, BodyGenCallbackTy BodyGenCB, PrivatizeCallbackTy PrivCB, FinalizeCallbackTy FiniCB, Value *IfCondition, Value *NumThreads, omp::ProcBindKind ProcBind, bool IsCancellable)
Generator for '#omp parallel'.
omp::OpenMPOffloadMappingFlags getMemberOfFlag(unsigned Position)
Get OMP_MAP_MEMBER_OF flag with extra bits reserved based on the position given.
void addAttributes(omp::RuntimeFunction FnID, Function &Fn)
Add attributes known for FnID to Fn.
Module & M
The underlying LLVM-IR module.
StringMap< Constant * > SrcLocStrMap
Map to remember source location strings.
void createMapperAllocas(const LocationDescription &Loc, InsertPointTy AllocaIP, unsigned NumOperands, struct MapperAllocas &MapperAllocas)
Create the alloca instructions used in calls to mapper functions.
Constant * getOrCreateSrcLocStr(StringRef LocStr, uint32_t &SrcLocStrSize)
Return the (LLVM-IR) string describing the source location LocStr.
void addOutlineInfo(OutlineInfo &&OI)
Add a new region that will be outlined later.
Error emitTargetRegionFunction(TargetRegionEntryInfo &EntryInfo, FunctionGenCallback &GenerateFunctionCallback, bool IsOffloadEntry, Function *&OutlinedFn, Constant *&OutlinedFnID)
Create a unique name for the entry function using the source location information of the current targ...
FunctionCallee createDispatchFiniFunction(unsigned IVSize, bool IVSigned)
Returns __kmpc_dispatch_fini_* runtime function for the specified size IVSize and sign IVSigned.
void unrollLoopPartial(DebugLoc DL, CanonicalLoopInfo *Loop, int32_t Factor, CanonicalLoopInfo **UnrolledCLI)
Partially unroll a loop.
void emitTaskyieldImpl(const LocationDescription &Loc)
Generate a taskyield runtime call.
void emitMapperCall(const LocationDescription &Loc, Function *MapperFunc, Value *SrcLocInfo, Value *MaptypesArg, Value *MapnamesArg, struct MapperAllocas &MapperAllocas, int64_t DeviceID, unsigned NumOperands)
Create the call for the target mapper function.
InsertPointTy createAtomicCompare(const LocationDescription &Loc, AtomicOpValue &X, AtomicOpValue &V, AtomicOpValue &R, Value *E, Value *D, AtomicOrdering AO, omp::OMPAtomicCompareOp Op, bool IsXBinopExpr, bool IsPostfixUpdate, bool IsFailOnly)
Emit atomic compare for constructs: — Only scalar data types cond-expr-stmt: x = x ordop expr ?...
InsertPointOrErrorTy createAtomicCapture(const LocationDescription &Loc, InsertPointTy AllocaIP, AtomicOpValue &X, AtomicOpValue &V, Value *Expr, AtomicOrdering AO, AtomicRMWInst::BinOp RMWOp, AtomicUpdateCallbackTy &UpdateOp, bool UpdateExpr, bool IsPostfixUpdate, bool IsXBinopExpr)
Emit atomic update for constructs: — Only Scalar data types V = X; X = X BinOp Expr ,...
InsertPointTy createOrderedDepend(const LocationDescription &Loc, InsertPointTy AllocaIP, unsigned NumLoops, ArrayRef< llvm::Value * > StoreValues, const Twine &Name, bool IsDependSource)
Generator for '#omp ordered depend (source | sink)'.
InsertPointTy createCopyinClauseBlocks(InsertPointTy IP, Value *MasterAddr, Value *PrivateAddr, llvm::IntegerType *IntPtrTy, bool BranchtoEnd=true)
Generate conditional branch and relevant BasicBlocks through which private threads copy the 'copyin' ...
void emitOffloadingArrays(InsertPointTy AllocaIP, InsertPointTy CodeGenIP, MapInfosTy &CombinedInfo, TargetDataInfo &Info, bool IsNonContiguous=false, function_ref< void(unsigned int, Value *)> DeviceAddrCB=nullptr, function_ref< Value *(unsigned int)> CustomMapperCB=nullptr)
Emit the arrays used to pass the captures and map information to the offloading runtime library.
SmallVector< FinalizationInfo, 8 > FinalizationStack
The finalization stack made up of finalize callbacks currently in-flight, wrapped into FinalizationIn...
std::vector< CanonicalLoopInfo * > tileLoops(DebugLoc DL, ArrayRef< CanonicalLoopInfo * > Loops, ArrayRef< Value * > TileSizes)
Tile a loop nest.
CallInst * createOMPInteropInit(const LocationDescription &Loc, Value *InteropVar, omp::OMPInteropType InteropType, Value *Device, Value *NumDependences, Value *DependenceAddress, bool HaveNowaitClause)
Create a runtime call for __tgt_interop_init.
void finalize(Function *Fn=nullptr)
Finalize the underlying module, e.g., by outlining regions.
SmallVector< OutlineInfo, 16 > OutlineInfos
Collection of regions that need to be outlined during finalization.
Function * getOrCreateRuntimeFunctionPtr(omp::RuntimeFunction FnID)
InsertPointTy createTargetInit(const LocationDescription &Loc, const llvm::OpenMPIRBuilder::TargetKernelDefaultAttrs &Attrs)
The omp target interface.
const Triple T
The target triple of the underlying module.
DenseMap< std::pair< Constant *, uint64_t >, Constant * > IdentMap
Map to remember existing ident_t*.
CallInst * createOMPFree(const LocationDescription &Loc, Value *Addr, Value *Allocator, std::string Name="")
Create a runtime call for kmpc_free.
FunctionCallee createForStaticInitFunction(unsigned IVSize, bool IVSigned, bool IsGPUDistribute)
Returns __kmpc_for_static_init_* runtime function for the specified size IVSize and sign IVSigned.
CallInst * createOMPAlloc(const LocationDescription &Loc, Value *Size, Value *Allocator, std::string Name="")
Create a runtime call for kmpc_alloc.
void emitNonContiguousDescriptor(InsertPointTy AllocaIP, InsertPointTy CodeGenIP, MapInfosTy &CombinedInfo, TargetDataInfo &Info)
Emit an array of struct descriptors to be assigned to the offload args.
InsertPointOrErrorTy createSection(const LocationDescription &Loc, BodyGenCallbackTy BodyGenCB, FinalizeCallbackTy FiniCB)
Generator for '#omp section'.
void emitBlock(BasicBlock *BB, Function *CurFn, bool IsFinished=false)
Value * getOrCreateThreadID(Value *Ident)
Return the current thread ID.
void emitOffloadingArraysAndArgs(InsertPointTy AllocaIP, InsertPointTy CodeGenIP, TargetDataInfo &Info, TargetDataRTArgs &RTArgs, MapInfosTy &CombinedInfo, bool IsNonContiguous=false, bool ForEndCall=false, function_ref< void(unsigned int, Value *)> DeviceAddrCB=nullptr, function_ref< Value *(unsigned int)> CustomMapperCB=nullptr)
Allocates memory for and populates the arrays required for offloading (offload_{baseptrs|ptrs|mappers...
InsertPointOrErrorTy createMaster(const LocationDescription &Loc, BodyGenCallbackTy BodyGenCB, FinalizeCallbackTy FiniCB)
Generator for '#omp master'.
Error emitCancelationCheckImpl(Value *CancelFlag, omp::Directive CanceledDirective, FinalizeCallbackTy ExitCB={})
Generate control flow and cleanup for cancellation.
InsertPointOrErrorTy emitKernelLaunch(const LocationDescription &Loc, Value *OutlinedFnID, EmitFallbackCallbackTy EmitTargetCallFallbackCB, TargetKernelArgs &Args, Value *DeviceID, Value *RTLoc, InsertPointTy AllocaIP)
Generate a target region entry call and host fallback call.
InsertPointOrErrorTy createTarget(const LocationDescription &Loc, bool IsOffloadEntry, OpenMPIRBuilder::InsertPointTy AllocaIP, OpenMPIRBuilder::InsertPointTy CodeGenIP, TargetRegionEntryInfo &EntryInfo, const TargetKernelDefaultAttrs &DefaultAttrs, const TargetKernelRuntimeAttrs &RuntimeAttrs, Value *IfCond, SmallVectorImpl< Value * > &Inputs, GenMapInfoCallbackTy GenMapInfoCB, TargetBodyGenCallbackTy BodyGenCB, TargetGenArgAccessorsCallbackTy ArgAccessorFuncCB, SmallVector< DependData > Dependencies={}, bool HasNowait=false)
Generator for '#omp target'.
StringMap< GlobalVariable *, BumpPtrAllocator > InternalVars
An ordered map of auto-generated variables to their unique names.
GlobalVariable * getOrCreateInternalVariable(Type *Ty, const StringRef &Name, unsigned AddressSpace=0)
Gets (if a variable with the given name already exists) or creates an internal global variable with the spe...
InsertPointOrErrorTy createReductionsGPU(const LocationDescription &Loc, InsertPointTy AllocaIP, InsertPointTy CodeGenIP, ArrayRef< ReductionInfo > ReductionInfos, bool IsNoWait=false, bool IsTeamsReduction=false, bool HasDistribute=false, ReductionGenCBKind ReductionGenCBKind=ReductionGenCBKind::MLIR, std::optional< omp::GV > GridValue={}, unsigned ReductionBufNum=1024, Value *SrcLocInfo=nullptr)
Design of OpenMP reductions on the GPU.
FunctionCallee createDispatchInitFunction(unsigned IVSize, bool IVSigned)
Returns __kmpc_dispatch_init_* runtime function for the specified size IVSize and sign IVSigned.
Function * emitUserDefinedMapper(function_ref< MapInfosTy &(InsertPointTy CodeGenIP, llvm::Value *PtrPHI, llvm::Value *BeginArg)> PrivAndGenMapInfoCB, llvm::Type *ElemTy, StringRef FuncName, function_ref< bool(unsigned int, Function **)> CustomMapperCB=nullptr)
Emit the user-defined mapper function.
CallInst * createOMPInteropUse(const LocationDescription &Loc, Value *InteropVar, Value *Device, Value *NumDependences, Value *DependenceAddress, bool HaveNowaitClause)
Create a runtime call for __tgt_interop_use.
IRBuilder<>::InsertPoint InsertPointTy
Type used throughout for insertion points.
Definition: OMPIRBuilder.h:520
InsertPointOrErrorTy createReductions(const LocationDescription &Loc, InsertPointTy AllocaIP, ArrayRef< ReductionInfo > ReductionInfos, ArrayRef< bool > IsByRef, bool IsNoWait=false)
Generator for '#omp reduction'.
GlobalVariable * createOffloadMapnames(SmallVectorImpl< llvm::Constant * > &Names, std::string VarName)
Create the global variable holding the offload names information.
std::function< Expected< Function * >(StringRef FunctionName)> FunctionGenCallback
Functions used to generate a function with the given name.
static void writeTeamsForKernel(const Triple &T, Function &Kernel, int32_t LB, int32_t UB)
InsertPointOrErrorTy createBarrier(const LocationDescription &Loc, omp::Directive Kind, bool ForceSimpleCall=false, bool CheckCancelFlag=true)
Emitter methods for OpenMP directives.
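A sketch of driving createBarrier from the Builder's current position; the returned InsertPointOrErrorTy must be checked before resuming:
  OpenMPIRBuilder::LocationDescription Loc(Builder.saveIP(),
                                           Builder.getCurrentDebugLocation());
  auto AfterIP = OMPBuilder.createBarrier(Loc, omp::Directive::OMPD_barrier);
  if (!AfterIP)
    report_fatal_error(AfterIP.takeError());
  Builder.restoreIP(*AfterIP);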
void setCorrectMemberOfFlag(omp::OpenMPOffloadMappingFlags &Flags, omp::OpenMPOffloadMappingFlags MemberOfFlag)
Given an initial flag set, this function modifies it to contain the passed in MemberOfFlag generated ...
Constant * getOrCreateDefaultSrcLocStr(uint32_t &SrcLocStrSize)
Return the (LLVM-IR) string describing the default source location.
InsertPointOrErrorTy createCritical(const LocationDescription &Loc, BodyGenCallbackTy BodyGenCB, FinalizeCallbackTy FiniCB, StringRef CriticalName, Value *HintInst)
Generator for '#omp critical'.
void createOffloadEntry(Constant *ID, Constant *Addr, uint64_t Size, int32_t Flags, GlobalValue::LinkageTypes, StringRef Name="")
Creates offloading entry for the provided entry ID ID, address Addr, size Size, and flags Flags.
static unsigned getOpenMPDefaultSimdAlign(const Triple &TargetTriple, const StringMap< bool > &Features)
Get the default alignment value for given target.
unsigned getFlagMemberOffset()
Get the offset of the OMP_MAP_MEMBER_OF field.
void createOffloadEntriesAndInfoMetadata(EmitMetadataErrorReportFunctionTy &ErrorReportFunction)
void applySimd(CanonicalLoopInfo *Loop, MapVector< Value *, Value * > AlignedVars, Value *IfCond, omp::OrderKind Order, ConstantInt *Simdlen, ConstantInt *Safelen)
Add metadata to simd-ize a loop.
bool isLastFinalizationInfoCancellable(omp::Directive DK)
Return true if the last entry in the finalization stack is of kind DK and cancellable.
InsertPointTy emitTargetKernel(const LocationDescription &Loc, InsertPointTy AllocaIP, Value *&Return, Value *Ident, Value *DeviceID, Value *NumTeams, Value *NumThreads, Value *HostPtr, ArrayRef< Value * > KernelArgs)
Generate a target region entry call.
GlobalVariable * createOffloadMaptypes(SmallVectorImpl< uint64_t > &Mappings, std::string VarName)
Create the global variable holding the offload mappings information.
CallInst * createCachedThreadPrivate(const LocationDescription &Loc, llvm::Value *Pointer, llvm::ConstantInt *Size, const llvm::Twine &Name=Twine(""))
Create a runtime call for kmpc_threadprivate_cached.
IRBuilder Builder
The LLVM-IR Builder used to create IR.
GlobalValue * createGlobalFlag(unsigned Value, StringRef Name)
Create a hidden global flag Name in the module with initial value Value.
InsertPointOrErrorTy applyWorkshareLoop(DebugLoc DL, CanonicalLoopInfo *CLI, InsertPointTy AllocaIP, bool NeedsBarrier, llvm::omp::ScheduleKind SchedKind=llvm::omp::OMP_SCHEDULE_Default, Value *ChunkSize=nullptr, bool HasSimdModifier=false, bool HasMonotonicModifier=false, bool HasNonmonotonicModifier=false, bool HasOrderedClause=false, omp::WorksharingLoopType LoopType=omp::WorksharingLoopType::ForStaticLoop)
Modifies the canonical loop to be a workshare loop.
void emitOffloadingArraysArgument(IRBuilderBase &Builder, OpenMPIRBuilder::TargetDataRTArgs &RTArgs, OpenMPIRBuilder::TargetDataInfo &Info, bool ForEndCall=false)
Emit the arguments to be passed to the runtime library based on the arrays of base pointers,...
InsertPointOrErrorTy createMasked(const LocationDescription &Loc, BodyGenCallbackTy BodyGenCB, FinalizeCallbackTy FiniCB, Value *Filter)
Generator for '#omp masked'.
Expected< CanonicalLoopInfo * > createCanonicalLoop(const LocationDescription &Loc, LoopBodyGenCallbackTy BodyGenCB, Value *TripCount, const Twine &Name="loop")
Generator for the control flow structure of an OpenMP canonical loop.
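A rough sketch of the callback-driven style, assuming the Error-returning body callback this builder uses plus preexisting Loc and TripCount values:
  auto BodyGenCB = [&](OpenMPIRBuilder::InsertPointTy CodeGenIP,
                       Value *IV) -> Error {
    Builder.restoreIP(CodeGenIP);
    // ... emit one iteration's body here, indexed by IV ...
    return Error::success();
  };
  CanonicalLoopInfo *CLI =
      cantFail(OMPBuilder.createCanonicalLoop(Loc, BodyGenCB, TripCount));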
Value * getSizeInBytes(Value *BasePtr)
Computes the size of type in bytes.
FunctionCallee createDispatchDeinitFunction()
Returns __kmpc_dispatch_deinit runtime function.
void registerTargetGlobalVariable(OffloadEntriesInfoManager::OMPTargetGlobalVarEntryKind CaptureClause, OffloadEntriesInfoManager::OMPTargetDeviceClauseKind DeviceClause, bool IsDeclaration, bool IsExternallyVisible, TargetRegionEntryInfo EntryInfo, StringRef MangledName, std::vector< GlobalVariable * > &GeneratedRefs, bool OpenMPSIMD, std::vector< Triple > TargetTriple, std::function< Constant *()> GlobalInitializer, std::function< GlobalValue::LinkageTypes()> VariableLinkage, Type *LlvmPtrTy, Constant *Addr)
Registers a target variable for device or host.
InsertPointOrErrorTy createTargetData(const LocationDescription &Loc, InsertPointTy AllocaIP, InsertPointTy CodeGenIP, Value *DeviceID, Value *IfCond, TargetDataInfo &Info, GenMapInfoCallbackTy GenMapInfoCB, omp::RuntimeFunction *MapperFunc=nullptr, function_ref< InsertPointOrErrorTy(InsertPointTy CodeGenIP, BodyGenTy BodyGenType)> BodyGenCB=nullptr, function_ref< void(unsigned int, Value *)> DeviceAddrCB=nullptr, function_ref< Value *(unsigned int)> CustomMapperCB=nullptr, Value *SrcLocInfo=nullptr)
Generator for '#omp target data'.
BodyGenTy
Type of BodyGen to use for region codegen.
InsertPointOrErrorTy createAtomicUpdate(const LocationDescription &Loc, InsertPointTy AllocaIP, AtomicOpValue &X, Value *Expr, AtomicOrdering AO, AtomicRMWInst::BinOp RMWOp, AtomicUpdateCallbackTy &UpdateOp, bool IsXBinopExpr)
Emit atomic update for constructs: X = X BinOp Expr ,or X = Expr BinOp X For complex Operations: X = ...
SmallVector< llvm::Function *, 16 > ConstantAllocaRaiseCandidates
A collection of candidate target functions whose constant allocas will attempt to be raised on a cal...
OffloadEntriesInfoManager OffloadInfoManager
Info manager to keep track of target regions.
static std::pair< int32_t, int32_t > readTeamBoundsForKernel(const Triple &T, Function &Kernel)
Read/write the bounds on teams for Kernel.
std::function< std::tuple< std::string, uint64_t >()> FileIdentifierInfoCallbackTy
const std::string ompOffloadInfoName
OMP Offload Info Metadata name string.
Expected< InsertPointTy > InsertPointOrErrorTy
Type used to represent an insertion point or an error value.
Definition: OMPIRBuilder.h:523
InsertPointTy createCopyPrivate(const LocationDescription &Loc, llvm::Value *BufSize, llvm::Value *CpyBuf, llvm::Value *CpyFn, llvm::Value *DidIt)
Generator for __kmpc_copyprivate.
InsertPointOrErrorTy createSections(const LocationDescription &Loc, InsertPointTy AllocaIP, ArrayRef< StorableBodyGenCallbackTy > SectionCBs, PrivatizeCallbackTy PrivCB, FinalizeCallbackTy FiniCB, bool IsCancellable, bool IsNowait)
Generator for '#omp sections'.
bool updateToLocation(const LocationDescription &Loc)
Update the internal location to Loc.
void createFlush(const LocationDescription &Loc)
Generator for '#omp flush'.
Constant * getAddrOfDeclareTargetVar(OffloadEntriesInfoManager::OMPTargetGlobalVarEntryKind CaptureClause, OffloadEntriesInfoManager::OMPTargetDeviceClauseKind DeviceClause, bool IsDeclaration, bool IsExternallyVisible, TargetRegionEntryInfo EntryInfo, StringRef MangledName, std::vector< GlobalVariable * > &GeneratedRefs, bool OpenMPSIMD, std::vector< Triple > TargetTriple, Type *LlvmPtrTy, std::function< Constant *()> GlobalInitializer, std::function< GlobalValue::LinkageTypes()> VariableLinkage)
Retrieve (or create if non-existent) the address of a declare target variable, used in conjunction wi...
EmitMetadataErrorKind
The kind of errors that can occur when emitting the offload entries and metadata.
The optimization diagnostic interface.
void addIncoming(Value *V, BasicBlock *BB)
Add an incoming value to the end of the PHI list.
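A sketch of merging a value after a two-way diamond; ThenVal/ElseVal and ThenBB/ElseBB are assumed, and the Builder must sit at the top of the merge block:
  PHINode *Merged =
      Builder.CreatePHI(Builder.getInt32Ty(), /*NumReservedValues=*/2);
  Merged->addIncoming(ThenVal, ThenBB);
  Merged->addIncoming(ElseVal, ElseBB);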
Pseudo-analysis pass that exposes the PassInstrumentation to pass managers.
Class to represent pointers.
Definition: DerivedTypes.h:670
static PointerType * getUnqual(Type *ElementType)
This constructs a pointer to an object of the specified type in the default address space (address sp...
Definition: DerivedTypes.h:686
Analysis pass that exposes the ScalarEvolution for a function.
ScalarEvolution run(Function &F, FunctionAnalysisManager &AM)
The main scalar evolution driver.
A vector that has set insertion semantics.
Definition: SetVector.h:57
bool remove_if(UnaryPredicate P)
Remove items from the set vector based on a predicate function.
Definition: SetVector.h:237
bool empty() const
Determine if the SetVector is empty or not.
Definition: SetVector.h:93
This is a 'bitvector' (really, a variable-sized bit array), optimized for the case when the array is ...
SmallBitVector & set()
bool test(unsigned Idx) const
bool all() const
Returns true if all bits are set.
bool any() const
Returns true if any bit is set.
A templated base class for SmallPtrSet which provides the typesafe interface that is common across al...
Definition: SmallPtrSet.h:363
size_type count(ConstPtrType Ptr) const
count - Return 1 if the specified pointer is in the set, 0 otherwise.
Definition: SmallPtrSet.h:452
iterator end() const
Definition: SmallPtrSet.h:477
std::pair< iterator, bool > insert(PtrType Ptr)
Inserts Ptr if and only if there is no element in the container equal to Ptr.
Definition: SmallPtrSet.h:384
iterator begin() const
Definition: SmallPtrSet.h:472
SmallPtrSet - This class implements a set which is optimized for holding SmallSize or less elements.
Definition: SmallPtrSet.h:519
SmallSet - This maintains a set of unique values, optimizing for the case when the set is small (less...
Definition: SmallSet.h:132
std::pair< const_iterator, bool > insert(const T &V)
insert - Insert an element into the set if it isn't already there.
Definition: SmallSet.h:181
SmallString - A SmallString is just a SmallVector with methods and accessors that make it work better...
Definition: SmallString.h:26
void append(StringRef RHS)
Append from a StringRef.
Definition: SmallString.h:68
StringRef str() const
Explicit conversion to StringRef.
Definition: SmallString.h:254
bool empty() const
Definition: SmallVector.h:81
size_t size() const
Definition: SmallVector.h:78
This class consists of common code factored out of the SmallVector class to reduce code duplication b...
Definition: SmallVector.h:573
reference emplace_back(ArgTypes &&... Args)
Definition: SmallVector.h:937
void reserve(size_type N)
Definition: SmallVector.h:663
void append(ItTy in_start, ItTy in_end)
Add the specified range to the end of the SmallVector.
Definition: SmallVector.h:683
void resize(size_type N)
Definition: SmallVector.h:638
void push_back(const T &Elt)
Definition: SmallVector.h:413
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small.
Definition: SmallVector.h:1196
An instruction for storing to memory.
Definition: Instructions.h:292
void setAlignment(Align Align)
Definition: Instructions.h:337
void setAtomic(AtomicOrdering Ordering, SyncScope::ID SSID=SyncScope::System)
Sets the ordering constraint and the synchronization scope ID of this store instruction.
Definition: Instructions.h:364
StringMap - This is an unconventional map that is specialized for handling keys that are "strings",...
Definition: StringMap.h:128
ValueTy lookup(StringRef Key) const
lookup - Return the entry for the specified key, or a default constructed value if no such entry exis...
Definition: StringMap.h:253
StringRef - Represent a constant reference to a string, i.e.
Definition: StringRef.h:51
std::pair< StringRef, StringRef > split(char Separator) const
Split into two substrings around the first occurrence of a separator character.
Definition: StringRef.h:700
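For example (sketch), splitting a "file;line" style key:
  StringRef Key("file.c;42");
  auto [File, Line] = Key.split(';'); // File == "file.c", Line == "42"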
constexpr bool empty() const
empty - Check if the string is empty.
Definition: StringRef.h:147
constexpr size_t size() const
size - Get the string size.
Definition: StringRef.h:150
size_t count(char C) const
Return the number of occurrences of C in the string.
Definition: StringRef.h:451
bool ends_with(StringRef Suffix) const
Check if this string ends with the given Suffix.
Definition: StringRef.h:277
StringRef drop_back(size_t N=1) const
Return a StringRef equal to 'this' but with the last N elements dropped.
Definition: StringRef.h:616
Class to represent struct types.
Definition: DerivedTypes.h:218
static StructType * get(LLVMContext &Context, ArrayRef< Type * > Elements, bool isPacked=false)
This static method is the primary way to create a literal StructType.
Definition: Type.cpp:406
static StructType * create(LLVMContext &Context, StringRef Name)
This creates an identified struct.
Definition: Type.cpp:612
Multiway switch.
void addCase(ConstantInt *OnVal, BasicBlock *Dest)
Add an entry to the switch instruction.
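A sketch of a three-way dispatch on an assumed i32 selector Sel with hypothetical target blocks:
  SwitchInst *SI = Builder.CreateSwitch(Sel, DefaultBB, /*NumCases=*/2);
  SI->addCase(Builder.getInt32(0), CaseZeroBB);
  SI->addCase(Builder.getInt32(1), CaseOneBB);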
Analysis pass providing the TargetTransformInfo.
Result run(const Function &F, FunctionAnalysisManager &)
Analysis pass providing the TargetLibraryInfo.
This pass provides access to the codegen interfaces that are needed for IR-level transformations.
Target - Wrapper for Target specific information.
TargetMachine * createTargetMachine(StringRef TT, StringRef CPU, StringRef Features, const TargetOptions &Options, std::optional< Reloc::Model > RM, std::optional< CodeModel::Model > CM=std::nullopt, CodeGenOptLevel OL=CodeGenOptLevel::Default, bool JIT=false) const
createTargetMachine - Create a target specific machine implementation for the specified Triple.
Triple - Helper class for working with autoconf configuration names.
Definition: Triple.h:44
bool isPPC() const
Tests whether the target is PowerPC (32- or 64-bit LE or BE).
Definition: Triple.h:990
bool isX86() const
Tests whether the target is x86 (32- or 64-bit).
Definition: Triple.h:1048
bool isWasm() const
Tests whether the target is wasm (32- and 64-bit).
Definition: Triple.h:1058
Twine - A lightweight data structure for efficiently representing the concatenation of temporary valu...
Definition: Twine.h:81
std::string str() const
Return the twine contents as a std::string.
Definition: Twine.cpp:17
The instances of the Type class are immutable: once they are created, they are never changed.
Definition: Type.h:45
unsigned getIntegerBitWidth() const
Type * getStructElementType(unsigned N) const
bool isPointerTy() const
True if this is an instance of PointerType.
Definition: Type.h:264
static IntegerType * getInt1Ty(LLVMContext &C)
unsigned getScalarSizeInBits() const LLVM_READONLY
If this is a vector type, return the getPrimitiveSizeInBits value for the element type.
static Type * getVoidTy(LLVMContext &C)
bool isStructTy() const
True if this is an instance of StructType.
Definition: Type.h:258
bool isFloatingPointTy() const
Return true if this is one of the floating-point types.
Definition: Type.h:184
static IntegerType * getInt32Ty(LLVMContext &C)
static IntegerType * getInt64Ty(LLVMContext &C)
bool isIntegerTy() const
True if this is an instance of IntegerType.
Definition: Type.h:237
bool isVoidTy() const
Return true if this is 'void'.
Definition: Type.h:139
static UndefValue * get(Type *T)
Static factory methods - Return an 'undef' object of the specified type.
Definition: Constants.cpp:1859
This function has undefined behavior.
Produce an estimate of the unrolled cost of the specified loop.
Definition: UnrollLoop.h:128
bool canUnroll() const
Whether it is legal to unroll this loop.
uint64_t getRolledLoopSize() const
Definition: UnrollLoop.h:144
A Use represents the edge between a Value definition and its users.
Definition: Use.h:43
void setOperand(unsigned i, Value *Val)
Definition: User.h:233
Value * getOperand(unsigned i) const
Definition: User.h:228
ValueT lookup(const KeyT &Val) const
lookup - Return the entry for the specified key, or a default constructed value if no such entry exis...
Definition: ValueMap.h:164
LLVM Value Representation.
Definition: Value.h:74
Type * getType() const
All values are typed, get the type of this value.
Definition: Value.h:255
user_iterator user_begin()
Definition: Value.h:397
void setName(const Twine &Name)
Change the name of the value.
Definition: Value.cpp:377
void replaceAllUsesWith(Value *V)
Change all uses of this to point to a new Value.
Definition: Value.cpp:534
iterator_range< user_iterator > users()
Definition: Value.h:421
User * user_back()
Definition: Value.h:407
Align getPointerAlignment(const DataLayout &DL) const
Returns an alignment of the pointer value.
Definition: Value.cpp:927
void replaceUsesWithIf(Value *New, llvm::function_ref< bool(Use &U)> ShouldReplace)
Go through the uses list for this definition and make each use point to "V" if the callback ShouldRep...
Definition: Value.cpp:542
User * getUniqueUndroppableUser()
Return the unique user of this value that cannot be dropped, if there is exactly one (that user can h...
Definition: Value.cpp:179
bool use_empty() const
Definition: Value.h:344
user_iterator user_end()
Definition: Value.h:405
unsigned getNumUses() const
This method computes the number of uses of this Value.
Definition: Value.cpp:255
iterator_range< use_iterator > uses()
Definition: Value.h:376
StringRef getName() const
Return a constant reference to the value's name.
Definition: Value.cpp:309
An efficient, type-erasing, non-owning reference to a callable.
const ParentTy * getParent() const
Definition: ilist_node.h:32
self_iterator getIterator()
Definition: ilist_node.h:132
NodeTy * getNextNode()
Get the next node, or nullptr for the list tail.
Definition: ilist_node.h:353
iterator insertAfter(iterator where, pointer New)
Definition: ilist.h:174
A raw_ostream that writes to an SmallVector or SmallString.
Definition: raw_ostream.h:691
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
@ Exit
Definition: COFF.h:845
@ AMDGPU_KERNEL
Used for AMDGPU code object kernels.
Definition: CallingConv.h:200
@ C
The default llvm calling convention, compatible with C.
Definition: CallingConv.h:34
unsigned ID
LLVM IR allows the use of arbitrary numbers as calling convention identifiers.
Definition: CallingConv.h:24
initializer< Ty > init(const Ty &Val)
Definition: CommandLine.h:443
void emitOffloadingEntry(Module &M, Constant *Addr, StringRef Name, uint64_t Size, int32_t Flags, int32_t Data, StringRef SectionName)
Create an offloading section struct used to register this global at runtime.
Definition: Utility.cpp:77
OpenMPOffloadMappingFlags
Values for bit flags used to specify the mapping type for offloading.
Definition: OMPConstants.h:195
@ OMP_MAP_PTR_AND_OBJ
The element being mapped is a pointer-pointee pair; both the pointer and the pointee should be mapped...
@ OMP_MAP_MEMBER_OF
The 16 MSBs of the flags indicate whether the entry is member of some struct/class.
@ OMP_DEVICEID_UNDEF
Device ID if the device was not defined, runtime should get it from environment variables in the spec...
Definition: OMPConstants.h:252
IdentFlag
IDs for all omp runtime library ident_t flag encodings (see their definition in openmp/runtime/src/kmp...
Definition: OMPConstants.h:65
RuntimeFunction
IDs for all omp runtime library (RTL) functions.
Definition: OMPConstants.h:45
static constexpr GV NVPTXGridValues
For Nvidia GPUs.
WorksharingLoopType
A type of worksharing loop construct.
Definition: OMPConstants.h:283
OMPAtomicCompareOp
Atomic compare operations. Currently OpenMP only supports ==, >, and <.
Definition: OMPConstants.h:267
NodeAddr< PhiNode * > Phi
Definition: RDFGraph.h:390
std::error_code getUniqueID(const Twine Path, UniqueID &Result)
Definition: Path.cpp:787
This is an optimization pass for GlobalISel generic memory operations.
Definition: AddressRanges.h:18
auto drop_begin(T &&RangeOrContainer, size_t N=1)
Return a range covering RangeOrContainer with the first N elements excluded.
Definition: STLExtras.h:329
BasicBlock * splitBBWithSuffix(IRBuilderBase &Builder, bool CreateBranch, llvm::Twine Suffix=".split")
Like splitBB, but reuses the current block's name for the new name.
@ Offset
Definition: DWP.cpp:480
detail::zippy< detail::zip_shortest, T, U, Args... > zip(T &&t, U &&u, Args &&...args)
zip iterator for two or more iteratable types.
Definition: STLExtras.h:854
bool all_of(R &&range, UnaryPredicate P)
Provide wrappers to std::all_of which take ranges instead of having to pass begin/end explicitly.
Definition: STLExtras.h:1739
auto size(R &&Range, std::enable_if_t< std::is_base_of< std::random_access_iterator_tag, typename std::iterator_traits< decltype(Range.begin())>::iterator_category >::value, void > *=nullptr)
Get the size of a range.
Definition: STLExtras.h:1697
Expected< std::unique_ptr< Module > > parseBitcodeFile(MemoryBufferRef Buffer, LLVMContext &Context, ParserCallbacks Callbacks={})
Read the specified bitcode file, returning the module.
detail::zippy< detail::zip_first, T, U, Args... > zip_equal(T &&t, U &&u, Args &&...args)
zip iterator that assumes that all iteratees have the same length.
Definition: STLExtras.h:864
auto enumerate(FirstRange &&First, RestRanges &&...Rest)
Given two or more input ranges, returns a new range whose values are tuples (A, B,...
Definition: STLExtras.h:2448
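A small sketch of the single-range form:
  SmallVector<int, 4> Vals = {3, 5, 7};
  for (auto [Idx, V] : enumerate(Vals))
    dbgs() << Idx << " -> " << V << "\n";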
decltype(auto) dyn_cast(const From &Val)
dyn_cast<X> - Return the argument parameter cast to the specified type.
Definition: Casting.h:649
auto successors(const MachineBasicBlock *BB)
testing::Matcher< const detail::ErrorHolder & > Failed()
Definition: Error.h:198
iterator_range< T > make_range(T x, T y)
Convenience function for iterating over sub-ranges.
void append_range(Container &C, Range &&R)
Wrapper function to append range R to container C.
Definition: STLExtras.h:2115
iterator_range< early_inc_iterator_impl< detail::IterOfRange< RangeT > > > make_early_inc_range(RangeT &&Range)
Make a range that does early increment to allow mutation of the underlying range without disrupting i...
Definition: STLExtras.h:657
ErrorOr< T > expectedToErrorOrAndEmitErrors(LLVMContext &Ctx, Expected< T > Val)
Definition: BitcodeReader.h:66
bool convertUsersOfConstantsToInstructions(ArrayRef< Constant * > Consts, Function *RestrictToFunc=nullptr, bool RemoveDeadConstants=true, bool IncludeSelf=false)
Replace constant-expression users of the given constants with instructions.
unsigned Log2_32(uint32_t Value)
Return the floor log base 2 of the specified value, -1 if the value is zero.
Definition: MathExtras.h:340
auto reverse(ContainerTy &&C)
Definition: STLExtras.h:420
TargetTransformInfo::PeelingPreferences gatherPeelingPreferences(Loop *L, ScalarEvolution &SE, const TargetTransformInfo &TTI, std::optional< bool > UserAllowPeeling, std::optional< bool > UserAllowProfileBasedPeeling, bool UnrollingSpecficValues=false)
Definition: LoopPeel.cpp:870
void spliceBB(IRBuilderBase::InsertPoint IP, BasicBlock *New, bool CreateBranch)
Move the instruction after an InsertPoint to the beginning of another BasicBlock.
void SplitBlockAndInsertIfThenElse(Value *Cond, BasicBlock::iterator SplitBefore, Instruction **ThenTerm, Instruction **ElseTerm, MDNode *BranchWeights=nullptr, DomTreeUpdater *DTU=nullptr, LoopInfo *LI=nullptr)
SplitBlockAndInsertIfThenElse is similar to SplitBlockAndInsertIfThen, but also creates the ElseBlock...
raw_ostream & dbgs()
dbgs() - This returns a reference to a raw_ostream for debugging messages.
Definition: Debug.cpp:163
bool none_of(R &&Range, UnaryPredicate P)
Provide wrappers to std::none_of which take ranges instead of having to pass begin/end explicitly.
Definition: STLExtras.h:1753
void report_fatal_error(Error Err, bool gen_crash_diag=true)
Report a serious error, calling any installed error handler.
Definition: Error.cpp:167
BasicBlock * splitBB(IRBuilderBase::InsertPoint IP, bool CreateBranch, llvm::Twine Name={})
Split a BasicBlock at an InsertPoint, even if the block is degenerate (missing the terminator).
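A sketch: split at the Builder's current position, keep control flow via the generated branch, and continue emitting in the tail block:
  BasicBlock *Tail = splitBB(Builder.saveIP(), /*CreateBranch=*/true, "tail");
  Builder.SetInsertPoint(Tail, Tail->begin());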
CodeGenOptLevel
Code generation optimization level.
Definition: CodeGen.h:54
bool computeUnrollCount(Loop *L, const TargetTransformInfo &TTI, DominatorTree &DT, LoopInfo *LI, AssumptionCache *AC, ScalarEvolution &SE, const SmallPtrSetImpl< const Value * > &EphValues, OptimizationRemarkEmitter *ORE, unsigned TripCount, unsigned MaxTripCount, bool MaxOrZero, unsigned TripMultiple, const UnrollCostEstimator &UCE, TargetTransformInfo::UnrollingPreferences &UP, TargetTransformInfo::PeelingPreferences &PP, bool &UseUpperBound)
format_object< Ts... > format(const char *Fmt, const Ts &... Vals)
These are helper functions used to produce formatted output.
Definition: Format.h:125
raw_fd_ostream & errs()
This returns a reference to a raw_ostream for standard error.
AtomicOrdering
Atomic ordering for LLVM's memory model.
constexpr T divideCeil(U Numerator, V Denominator)
Returns the integer ceil(Numerator / Denominator).
Definition: MathExtras.h:403
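A quick numeric sketch pairing divideCeil with Log2_32 (documented above):
  uint64_t Slots = divideCeil(100, 16); // 7 sixteen-byte slots cover 100 bytes
  unsigned Shift = Log2_32(16);         // 4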
BasicBlock * CloneBasicBlock(const BasicBlock *BB, ValueToValueMapTy &VMap, const Twine &NameSuffix="", Function *F=nullptr, ClonedCodeInfo *CodeInfo=nullptr)
Return a copy of the specified basic block, but without embedding the block into a particular function.
void cantFail(Error Err, const char *Msg=nullptr)
Report a fatal error if Err is a failure value.
Definition: Error.h:756
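A minimal sketch; mayFail() is a hypothetical function returning llvm::Error:

  #include "llvm/Support/Error.h"

  llvm::Error mayFail(); // hypothetical; defined elsewhere

  void mustSucceed() {
    // Aborts with the given message if mayFail() actually returns an error.
    llvm::cantFail(mayFail(), "mayFail was expected to succeed");
  }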
bool MergeBlockIntoPredecessor(BasicBlock *BB, DomTreeUpdater *DTU=nullptr, LoopInfo *LI=nullptr, MemorySSAUpdater *MSSAU=nullptr, MemoryDependenceResults *MemDep=nullptr, bool PredecessorWithTwoSuccessors=false, DominatorTree *DT=nullptr)
Attempts to merge a block into its predecessor, if possible.
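For example:

  #include "llvm/Transforms/Utils/BasicBlockUtils.h"

  // Folds BB into its predecessor when the merge is legal (e.g. a single
  // predecessor ending in an unconditional branch); BB is erased on success.
  bool tryFoldIntoPredecessor(llvm::BasicBlock *BB) {
    return llvm::MergeBlockIntoPredecessor(BB);
  }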
DWARFExpression::Operation Op
void remapInstructionsInBlocks(ArrayRef< BasicBlock * > Blocks, ValueToValueMapTy &VMap)
Remaps instructions in Blocks using the mapping in VMap.
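CloneBasicBlock (above) and remapInstructionsInBlocks usually pair up when duplicating a block; a hedged sketch:

  #include "llvm/Transforms/Utils/Cloning.h"
  #include "llvm/Transforms/Utils/ValueMapper.h"

  llvm::BasicBlock *cloneBlock(llvm::BasicBlock *BB) {
    llvm::ValueToValueMapTy VMap;
    llvm::BasicBlock *Copy =
        llvm::CloneBasicBlock(BB, VMap, ".clone", BB->getParent());
    // The clone's instructions still reference values from BB; rewrite them
    // through VMap so the copy is self-consistent.
    llvm::remapInstructionsInBlocks({Copy}, VMap);
    return Copy;
  }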
TargetTransformInfo::UnrollingPreferences gatherUnrollingPreferences(Loop *L, ScalarEvolution &SE, const TargetTransformInfo &TTI, BlockFrequencyInfo *BFI, ProfileSummaryInfo *PSI, llvm::OptimizationRemarkEmitter &ORE, int OptLevel, std::optional< unsigned > UserThreshold, std::optional< unsigned > UserCount, std::optional< bool > UserAllowPartial, std::optional< bool > UserRuntime, std::optional< bool > UserUpperBound, std::optional< unsigned > UserFullUnrollMaxCount)
Gather the various unrolling parameters based on the defaults, compiler flags, TTI overrides and user specified parameters.
BasicBlock * SplitBlock(BasicBlock *Old, BasicBlock::iterator SplitPt, DominatorTree *DT, LoopInfo *LI=nullptr, MemorySSAUpdater *MSSAU=nullptr, const Twine &BBName="", bool Before=false)
Split the specified block at the specified instruction.
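For example (analysis updaters omitted, so nullptr is passed for DT):

  #include "llvm/IR/Instruction.h"
  #include "llvm/Transforms/Utils/BasicBlockUtils.h"

  // Everything from SplitPt to the end of BB moves into the returned block.
  llvm::BasicBlock *splitAt(llvm::BasicBlock *BB, llvm::Instruction *SplitPt) {
    return llvm::SplitBlock(BB, SplitPt->getIterator(), /*DT=*/nullptr);
  }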
auto predecessors(const MachineBasicBlock *BB)
Constant * ConstantFoldInsertValueInstruction(Constant *Agg, Constant *Val, ArrayRef< unsigned > Idxs)
ConstantFoldInsertValueInstruction - Attempt to constant fold an insertvalue instruction with the specified operands and indices.
@ Continue
Definition: DWP.h:21
void DeleteDeadBlocks(ArrayRef< BasicBlock * > BBs, DomTreeUpdater *DTU=nullptr, bool KeepOneInputPHIs=false)
Delete the specified blocks from their parent function.
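A minimal sketch; DeadBBs is a hypothetical set of blocks already known to be dead:

  #include "llvm/ADT/ArrayRef.h"
  #include "llvm/Transforms/Utils/BasicBlockUtils.h"

  // Precondition: every predecessor of a block in DeadBBs is itself in DeadBBs.
  void dropDeadBlocks(llvm::ArrayRef<llvm::BasicBlock *> DeadBBs) {
    llvm::DeleteDeadBlocks(DeadBBs);
  }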
This struct is a compact representation of a valid (non-zero power of two) alignment.
Definition: Alignment.h:39
static void collectEphemeralValues(const Loop *L, AssumptionCache *AC, SmallPtrSetImpl< const Value * > &EphValues)
Collect a loop's ephemeral values (those used only by an assume or similar intrinsics in the loop).
Definition: CodeMetrics.cpp:71
This struct is a compact representation of a valid (power of two) or undefined (0) alignment.
Definition: Alignment.h:117
A struct to pack relevant information while generating atomic Ops.
A struct to pack the relevant information for an OpenMP depend clause.
Description of an LLVM-IR insertion point (IP) and a debug/source location (filename, line, column, ...).
Definition: OMPIRBuilder.h:645
This structure contains combined information generated for mappable clauses, including base pointers, pointers, sizes, map types, user-defined mappers, and non-contiguous information.
MapDeviceInfoArrayTy DevicePointers
StructNonContiguousInfo NonContigInfo
Helper that contains information about regions we need to outline during finalization.
void collectBlocks(SmallPtrSetImpl< BasicBlock * > &BlockSet, SmallVectorImpl< BasicBlock * > &BlockVector)
Collect all blocks in between EntryBB and ExitBB in both the given vector and set.
SmallVector< Value *, 2 > ExcludeArgsFromAggregate
Information about an OpenMP reduction.
EvalKind EvaluationKind
Reduction evaluation kind - scalar, complex or aggregate.
ReductionGenAtomicCBTy AtomicReductionGen
Callback for generating the atomic reduction body; may be null.
ReductionGenCBTy ReductionGen
Callback for generating the reduction body.
Value * Variable
Reduction variable of pointer type.
Value * PrivateVariable
Thread-private partial reduction variable.
ReductionGenClangCBTy ReductionGenClang
Clang callback for generating the reduction body.
Type * ElementType
Reduction element type, must match pointee type of variable.
Container for the arguments used to pass data to the runtime library.
Value * SizesArray
The array of sizes passed to the runtime library.
Value * PointersArray
The array of section pointers passed to the runtime library.
Value * MappersArray
The array of user-defined mappers passed to the runtime library.
Value * BasePointersArray
The array of base pointers passed to the runtime library.
Value * MapTypesArray
The array of map types passed to the runtime library for the beginning of the region or for the entire region when there are no separate map types for the region end.
Value * MapNamesArray
The array of original declaration names of mapped pointers sent to the runtime library for debugging.
Data structure that contains the needed information to construct the kernel args vector.
Value * DynCGGroupMem
The size of the dynamic shared memory.
ArrayRef< Value * > NumThreads
The number of threads.
TargetDataRTArgs RTArgs
Arguments passed to the runtime library.
Value * NumIterations
The number of iterations.
unsigned NumTargetItems
Number of arguments passed to the runtime library.
bool HasNoWait
True if the kernel has 'no wait' clause.
ArrayRef< Value * > NumTeams
The number of teams.
Container to pass the default attributes with which a kernel must be launched, used to set kernel attributes and populate associated static structures.
Container to pass LLVM IR runtime values or constants related to the number of teams and threads with which the kernel must be launched, as well as the trip count of the loop.
Value * MaxThreads
'parallel' construct 'num_threads' clause value, if present and it is an SPMD kernel.
Value * LoopTripCount
Total number of iterations of the SPMD or Generic-SPMD kernel or null if it is a generic kernel.
Data structure to contain the information needed to uniquely identify a target entry.
Definition: OMPIRBuilder.h:203
static void getTargetRegionEntryFnName(SmallVectorImpl< char > &Name, StringRef ParentName, unsigned DeviceID, unsigned FileID, unsigned Line, unsigned Count)
static const Target * lookupTarget(StringRef Triple, std::string &Error)
lookupTarget - Lookup a target based on a target triple.
Parameters that control the generic loop unrolling transformation.
unsigned Count
A forced unrolling factor (the number of concatenated bodies of the original loop in the unrolled loop body).
unsigned Threshold
The cost threshold for the unrolled loop.
bool Force
Apply loop unroll on any kind of loop (mainly to loops that fail runtime unrolling).
unsigned PartialOptSizeThreshold
The cost threshold for the unrolled loop when optimizing for size, like OptSizeThreshold, but used for partial/runtime unrolling (set to UINT_MAX to disable).
unsigned PartialThreshold
The cost threshold for the unrolled loop, like Threshold, but used for partial/runtime unrolling (set to UINT_MAX to disable).
unsigned OptSizeThreshold
The cost threshold for the unrolled loop when optimizing for size (set to UINT_MAX to disable).
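A sketch of how these fields are typically obtained and then overridden, following the gatherUnrollingPreferences signature listed above; the analysis references are assumed to come from the surrounding pass:

  #include "llvm/Analysis/TargetTransformInfo.h"
  #include "llvm/Transforms/Utils/UnrollLoop.h"
  #include <optional>

  llvm::TargetTransformInfo::UnrollingPreferences
  forceUnrollByFour(llvm::Loop *L, llvm::ScalarEvolution &SE,
                    const llvm::TargetTransformInfo &TTI,
                    llvm::OptimizationRemarkEmitter &ORE) {
    llvm::TargetTransformInfo::UnrollingPreferences UP =
        llvm::gatherUnrollingPreferences(
            L, SE, TTI, /*BFI=*/nullptr, /*PSI=*/nullptr, ORE, /*OptLevel=*/2,
            /*UserThreshold=*/std::nullopt, /*UserCount=*/std::nullopt,
            /*UserAllowPartial=*/std::nullopt, /*UserRuntime=*/std::nullopt,
            /*UserUpperBound=*/std::nullopt,
            /*UserFullUnrollMaxCount=*/std::nullopt);
    UP.Count = 4;    // request four concatenated loop bodies
    UP.Force = true; // unroll even where the heuristics would decline
    return UP;
  }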
Defines various target-specific GPU grid values that must be consistent between host RTL (plugin), device RTL, and clang.
Definition: OMPGridValues.h:57
unsigned GV_Warp_Size
The default maximum number of threads in a worker warp.
Definition: OMPGridValues.h:61