LLVM 20.0.0git
OMPIRBuilder.cpp
Go to the documentation of this file.
1//===- OpenMPIRBuilder.cpp - Builder for LLVM-IR for OpenMP directives ----===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8/// \file
9///
10/// This file implements the OpenMPIRBuilder class, which is used as a
11/// convenient way to create LLVM instructions for OpenMP directives.
12///
13//===----------------------------------------------------------------------===//
14
17#include "llvm/ADT/SmallSet.h"
19#include "llvm/ADT/StringRef.h"
29#include "llvm/IR/Attributes.h"
30#include "llvm/IR/BasicBlock.h"
31#include "llvm/IR/CFG.h"
32#include "llvm/IR/CallingConv.h"
33#include "llvm/IR/Constant.h"
34#include "llvm/IR/Constants.h"
35#include "llvm/IR/DIBuilder.h"
38#include "llvm/IR/Function.h"
40#include "llvm/IR/IRBuilder.h"
41#include "llvm/IR/LLVMContext.h"
42#include "llvm/IR/MDBuilder.h"
43#include "llvm/IR/Metadata.h"
45#include "llvm/IR/PassManager.h"
47#include "llvm/IR/Value.h"
59
60#include <cstdint>
61#include <optional>
62
63#define DEBUG_TYPE "openmp-ir-builder"
64
65using namespace llvm;
66using namespace omp;
67
68static cl::opt<bool>
69 OptimisticAttributes("openmp-ir-builder-optimistic-attributes", cl::Hidden,
70 cl::desc("Use optimistic attributes describing "
71 "'as-if' properties of runtime calls."),
72 cl::init(false));
73
75 "openmp-ir-builder-unroll-threshold-factor", cl::Hidden,
76 cl::desc("Factor for the unroll threshold to account for code "
77 "simplifications still taking place"),
78 cl::init(1.5));
79
80#ifndef NDEBUG
81/// Return whether IP1 and IP2 are ambiguous, i.e. that inserting instructions
82/// at position IP1 may change the meaning of IP2 or vice-versa. This is because
83/// an InsertPoint stores the instruction before something is inserted. For
84/// instance, if both point to the same instruction, two IRBuilders alternating
85/// creating instruction will cause the instructions to be interleaved.
88 if (!IP1.isSet() || !IP2.isSet())
89 return false;
90 return IP1.getBlock() == IP2.getBlock() && IP1.getPoint() == IP2.getPoint();
91}
92
94 // Valid ordered/unordered and base algorithm combinations.
95 switch (SchedType & ~OMPScheduleType::MonotonicityMask) {
96 case OMPScheduleType::UnorderedStaticChunked:
97 case OMPScheduleType::UnorderedStatic:
98 case OMPScheduleType::UnorderedDynamicChunked:
99 case OMPScheduleType::UnorderedGuidedChunked:
100 case OMPScheduleType::UnorderedRuntime:
101 case OMPScheduleType::UnorderedAuto:
102 case OMPScheduleType::UnorderedTrapezoidal:
103 case OMPScheduleType::UnorderedGreedy:
104 case OMPScheduleType::UnorderedBalanced:
105 case OMPScheduleType::UnorderedGuidedIterativeChunked:
106 case OMPScheduleType::UnorderedGuidedAnalyticalChunked:
107 case OMPScheduleType::UnorderedSteal:
108 case OMPScheduleType::UnorderedStaticBalancedChunked:
109 case OMPScheduleType::UnorderedGuidedSimd:
110 case OMPScheduleType::UnorderedRuntimeSimd:
111 case OMPScheduleType::OrderedStaticChunked:
112 case OMPScheduleType::OrderedStatic:
113 case OMPScheduleType::OrderedDynamicChunked:
114 case OMPScheduleType::OrderedGuidedChunked:
115 case OMPScheduleType::OrderedRuntime:
116 case OMPScheduleType::OrderedAuto:
117 case OMPScheduleType::OrderdTrapezoidal:
118 case OMPScheduleType::NomergeUnorderedStaticChunked:
119 case OMPScheduleType::NomergeUnorderedStatic:
120 case OMPScheduleType::NomergeUnorderedDynamicChunked:
121 case OMPScheduleType::NomergeUnorderedGuidedChunked:
122 case OMPScheduleType::NomergeUnorderedRuntime:
123 case OMPScheduleType::NomergeUnorderedAuto:
124 case OMPScheduleType::NomergeUnorderedTrapezoidal:
125 case OMPScheduleType::NomergeUnorderedGreedy:
126 case OMPScheduleType::NomergeUnorderedBalanced:
127 case OMPScheduleType::NomergeUnorderedGuidedIterativeChunked:
128 case OMPScheduleType::NomergeUnorderedGuidedAnalyticalChunked:
129 case OMPScheduleType::NomergeUnorderedSteal:
130 case OMPScheduleType::NomergeOrderedStaticChunked:
131 case OMPScheduleType::NomergeOrderedStatic:
132 case OMPScheduleType::NomergeOrderedDynamicChunked:
133 case OMPScheduleType::NomergeOrderedGuidedChunked:
134 case OMPScheduleType::NomergeOrderedRuntime:
135 case OMPScheduleType::NomergeOrderedAuto:
136 case OMPScheduleType::NomergeOrderedTrapezoidal:
137 break;
138 default:
139 return false;
140 }
141
142 // Must not set both monotonicity modifiers at the same time.
143 OMPScheduleType MonotonicityFlags =
144 SchedType & OMPScheduleType::MonotonicityMask;
145 if (MonotonicityFlags == OMPScheduleType::MonotonicityMask)
146 return false;
147
148 return true;
149}
150#endif
151
152static const omp::GV &getGridValue(const Triple &T, Function *Kernel) {
153 if (T.isAMDGPU()) {
154 StringRef Features =
155 Kernel->getFnAttribute("target-features").getValueAsString();
156 if (Features.count("+wavefrontsize64"))
157 return omp::getAMDGPUGridValues<64>();
158 return omp::getAMDGPUGridValues<32>();
159 }
160 if (T.isNVPTX())
162 llvm_unreachable("No grid value available for this architecture!");
163}
164
165/// Determine which scheduling algorithm to use, determined from schedule clause
166/// arguments.
167static OMPScheduleType
168getOpenMPBaseScheduleType(llvm::omp::ScheduleKind ClauseKind, bool HasChunks,
169 bool HasSimdModifier) {
170 // Currently, the default schedule it static.
171 switch (ClauseKind) {
172 case OMP_SCHEDULE_Default:
173 case OMP_SCHEDULE_Static:
174 return HasChunks ? OMPScheduleType::BaseStaticChunked
175 : OMPScheduleType::BaseStatic;
176 case OMP_SCHEDULE_Dynamic:
177 return OMPScheduleType::BaseDynamicChunked;
178 case OMP_SCHEDULE_Guided:
179 return HasSimdModifier ? OMPScheduleType::BaseGuidedSimd
180 : OMPScheduleType::BaseGuidedChunked;
181 case OMP_SCHEDULE_Auto:
183 case OMP_SCHEDULE_Runtime:
184 return HasSimdModifier ? OMPScheduleType::BaseRuntimeSimd
185 : OMPScheduleType::BaseRuntime;
186 }
187 llvm_unreachable("unhandled schedule clause argument");
188}
189
190/// Adds ordering modifier flags to schedule type.
191static OMPScheduleType
193 bool HasOrderedClause) {
194 assert((BaseScheduleType & OMPScheduleType::ModifierMask) ==
195 OMPScheduleType::None &&
196 "Must not have ordering nor monotonicity flags already set");
197
198 OMPScheduleType OrderingModifier = HasOrderedClause
199 ? OMPScheduleType::ModifierOrdered
200 : OMPScheduleType::ModifierUnordered;
201 OMPScheduleType OrderingScheduleType = BaseScheduleType | OrderingModifier;
202
203 // Unsupported combinations
204 if (OrderingScheduleType ==
205 (OMPScheduleType::BaseGuidedSimd | OMPScheduleType::ModifierOrdered))
206 return OMPScheduleType::OrderedGuidedChunked;
207 else if (OrderingScheduleType == (OMPScheduleType::BaseRuntimeSimd |
208 OMPScheduleType::ModifierOrdered))
209 return OMPScheduleType::OrderedRuntime;
210
211 return OrderingScheduleType;
212}
213
214/// Adds monotonicity modifier flags to schedule type.
215static OMPScheduleType
217 bool HasSimdModifier, bool HasMonotonic,
218 bool HasNonmonotonic, bool HasOrderedClause) {
219 assert((ScheduleType & OMPScheduleType::MonotonicityMask) ==
220 OMPScheduleType::None &&
221 "Must not have monotonicity flags already set");
222 assert((!HasMonotonic || !HasNonmonotonic) &&
223 "Monotonic and Nonmonotonic are contradicting each other");
224
225 if (HasMonotonic) {
226 return ScheduleType | OMPScheduleType::ModifierMonotonic;
227 } else if (HasNonmonotonic) {
228 return ScheduleType | OMPScheduleType::ModifierNonmonotonic;
229 } else {
230 // OpenMP 5.1, 2.11.4 Worksharing-Loop Construct, Description.
231 // If the static schedule kind is specified or if the ordered clause is
232 // specified, and if the nonmonotonic modifier is not specified, the
233 // effect is as if the monotonic modifier is specified. Otherwise, unless
234 // the monotonic modifier is specified, the effect is as if the
235 // nonmonotonic modifier is specified.
236 OMPScheduleType BaseScheduleType =
237 ScheduleType & ~OMPScheduleType::ModifierMask;
238 if ((BaseScheduleType == OMPScheduleType::BaseStatic) ||
239 (BaseScheduleType == OMPScheduleType::BaseStaticChunked) ||
240 HasOrderedClause) {
241 // The monotonic is used by default in openmp runtime library, so no need
242 // to set it.
243 return ScheduleType;
244 } else {
245 return ScheduleType | OMPScheduleType::ModifierNonmonotonic;
246 }
247 }
248}
249
250/// Determine the schedule type using schedule and ordering clause arguments.
251static OMPScheduleType
252computeOpenMPScheduleType(ScheduleKind ClauseKind, bool HasChunks,
253 bool HasSimdModifier, bool HasMonotonicModifier,
254 bool HasNonmonotonicModifier, bool HasOrderedClause) {
255 OMPScheduleType BaseSchedule =
256 getOpenMPBaseScheduleType(ClauseKind, HasChunks, HasSimdModifier);
257 OMPScheduleType OrderedSchedule =
258 getOpenMPOrderingScheduleType(BaseSchedule, HasOrderedClause);
260 OrderedSchedule, HasSimdModifier, HasMonotonicModifier,
261 HasNonmonotonicModifier, HasOrderedClause);
262
264 return Result;
265}
266
267/// Make \p Source branch to \p Target.
268///
269/// Handles two situations:
270/// * \p Source already has an unconditional branch.
271/// * \p Source is a degenerate block (no terminator because the BB is
272/// the current head of the IR construction).
274 if (Instruction *Term = Source->getTerminator()) {
275 auto *Br = cast<BranchInst>(Term);
276 assert(!Br->isConditional() &&
277 "BB's terminator must be an unconditional branch (or degenerate)");
278 BasicBlock *Succ = Br->getSuccessor(0);
279 Succ->removePredecessor(Source, /*KeepOneInputPHIs=*/true);
280 Br->setSuccessor(0, Target);
281 return;
282 }
283
284 auto *NewBr = BranchInst::Create(Target, Source);
285 NewBr->setDebugLoc(DL);
286}
287
289 bool CreateBranch) {
290 assert(New->getFirstInsertionPt() == New->begin() &&
291 "Target BB must not have PHI nodes");
292
293 // Move instructions to new block.
294 BasicBlock *Old = IP.getBlock();
295 New->splice(New->begin(), Old, IP.getPoint(), Old->end());
296
297 if (CreateBranch)
298 BranchInst::Create(New, Old);
299}
300
301void llvm::spliceBB(IRBuilder<> &Builder, BasicBlock *New, bool CreateBranch) {
303 BasicBlock *Old = Builder.GetInsertBlock();
304
305 spliceBB(Builder.saveIP(), New, CreateBranch);
306 if (CreateBranch)
307 Builder.SetInsertPoint(Old->getTerminator());
308 else
309 Builder.SetInsertPoint(Old);
310
311 // SetInsertPoint also updates the Builder's debug location, but we want to
312 // keep the one the Builder was configured to use.
314}
315
318 BasicBlock *Old = IP.getBlock();
320 Old->getContext(), Name.isTriviallyEmpty() ? Old->getName() : Name,
321 Old->getParent(), Old->getNextNode());
322 spliceBB(IP, New, CreateBranch);
323 New->replaceSuccessorsPhiUsesWith(Old, New);
324 return New;
325}
326
327BasicBlock *llvm::splitBB(IRBuilderBase &Builder, bool CreateBranch,
330 BasicBlock *New = splitBB(Builder.saveIP(), CreateBranch, Name);
331 if (CreateBranch)
332 Builder.SetInsertPoint(Builder.GetInsertBlock()->getTerminator());
333 else
334 Builder.SetInsertPoint(Builder.GetInsertBlock());
335 // SetInsertPoint also updates the Builder's debug location, but we want to
336 // keep the one the Builder was configured to use.
338 return New;
339}
340
341BasicBlock *llvm::splitBB(IRBuilder<> &Builder, bool CreateBranch,
344 BasicBlock *New = splitBB(Builder.saveIP(), CreateBranch, Name);
345 if (CreateBranch)
346 Builder.SetInsertPoint(Builder.GetInsertBlock()->getTerminator());
347 else
348 Builder.SetInsertPoint(Builder.GetInsertBlock());
349 // SetInsertPoint also updates the Builder's debug location, but we want to
350 // keep the one the Builder was configured to use.
352 return New;
353}
354
356 llvm::Twine Suffix) {
357 BasicBlock *Old = Builder.GetInsertBlock();
358 return splitBB(Builder, CreateBranch, Old->getName() + Suffix);
359}
360
361// This function creates a fake integer value and a fake use for the integer
362// value. It returns the fake value created. This is useful in modeling the
363// extra arguments to the outlined functions.
365 OpenMPIRBuilder::InsertPointTy OuterAllocaIP,
367 OpenMPIRBuilder::InsertPointTy InnerAllocaIP,
368 const Twine &Name = "", bool AsPtr = true) {
369 Builder.restoreIP(OuterAllocaIP);
370 Instruction *FakeVal;
371 AllocaInst *FakeValAddr =
372 Builder.CreateAlloca(Builder.getInt32Ty(), nullptr, Name + ".addr");
373 ToBeDeleted.push_back(FakeValAddr);
374
375 if (AsPtr) {
376 FakeVal = FakeValAddr;
377 } else {
378 FakeVal =
379 Builder.CreateLoad(Builder.getInt32Ty(), FakeValAddr, Name + ".val");
380 ToBeDeleted.push_back(FakeVal);
381 }
382
383 // Generate a fake use of this value
384 Builder.restoreIP(InnerAllocaIP);
385 Instruction *UseFakeVal;
386 if (AsPtr) {
387 UseFakeVal =
388 Builder.CreateLoad(Builder.getInt32Ty(), FakeVal, Name + ".use");
389 } else {
390 UseFakeVal =
391 cast<BinaryOperator>(Builder.CreateAdd(FakeVal, Builder.getInt32(10)));
392 }
393 ToBeDeleted.push_back(UseFakeVal);
394 return FakeVal;
395}
396
397//===----------------------------------------------------------------------===//
398// OpenMPIRBuilderConfig
399//===----------------------------------------------------------------------===//
400
401namespace {
403/// Values for bit flags for marking which requires clauses have been used.
404enum OpenMPOffloadingRequiresDirFlags {
405 /// flag undefined.
406 OMP_REQ_UNDEFINED = 0x000,
407 /// no requires directive present.
408 OMP_REQ_NONE = 0x001,
409 /// reverse_offload clause.
410 OMP_REQ_REVERSE_OFFLOAD = 0x002,
411 /// unified_address clause.
412 OMP_REQ_UNIFIED_ADDRESS = 0x004,
413 /// unified_shared_memory clause.
414 OMP_REQ_UNIFIED_SHARED_MEMORY = 0x008,
415 /// dynamic_allocators clause.
416 OMP_REQ_DYNAMIC_ALLOCATORS = 0x010,
417 LLVM_MARK_AS_BITMASK_ENUM(/*LargestValue=*/OMP_REQ_DYNAMIC_ALLOCATORS)
418};
419
420} // anonymous namespace
421
423 : RequiresFlags(OMP_REQ_UNDEFINED) {}
424
426 bool IsTargetDevice, bool IsGPU, bool OpenMPOffloadMandatory,
427 bool HasRequiresReverseOffload, bool HasRequiresUnifiedAddress,
428 bool HasRequiresUnifiedSharedMemory, bool HasRequiresDynamicAllocators)
429 : IsTargetDevice(IsTargetDevice), IsGPU(IsGPU),
430 OpenMPOffloadMandatory(OpenMPOffloadMandatory),
431 RequiresFlags(OMP_REQ_UNDEFINED) {
432 if (HasRequiresReverseOffload)
433 RequiresFlags |= OMP_REQ_REVERSE_OFFLOAD;
434 if (HasRequiresUnifiedAddress)
435 RequiresFlags |= OMP_REQ_UNIFIED_ADDRESS;
436 if (HasRequiresUnifiedSharedMemory)
437 RequiresFlags |= OMP_REQ_UNIFIED_SHARED_MEMORY;
438 if (HasRequiresDynamicAllocators)
439 RequiresFlags |= OMP_REQ_DYNAMIC_ALLOCATORS;
440}
441
443 return RequiresFlags & OMP_REQ_REVERSE_OFFLOAD;
444}
445
447 return RequiresFlags & OMP_REQ_UNIFIED_ADDRESS;
448}
449
451 return RequiresFlags & OMP_REQ_UNIFIED_SHARED_MEMORY;
452}
453
455 return RequiresFlags & OMP_REQ_DYNAMIC_ALLOCATORS;
456}
457
459 return hasRequiresFlags() ? RequiresFlags
460 : static_cast<int64_t>(OMP_REQ_NONE);
461}
462
464 if (Value)
465 RequiresFlags |= OMP_REQ_REVERSE_OFFLOAD;
466 else
467 RequiresFlags &= ~OMP_REQ_REVERSE_OFFLOAD;
468}
469
471 if (Value)
472 RequiresFlags |= OMP_REQ_UNIFIED_ADDRESS;
473 else
474 RequiresFlags &= ~OMP_REQ_UNIFIED_ADDRESS;
475}
476
478 if (Value)
479 RequiresFlags |= OMP_REQ_UNIFIED_SHARED_MEMORY;
480 else
481 RequiresFlags &= ~OMP_REQ_UNIFIED_SHARED_MEMORY;
482}
483
485 if (Value)
486 RequiresFlags |= OMP_REQ_DYNAMIC_ALLOCATORS;
487 else
488 RequiresFlags &= ~OMP_REQ_DYNAMIC_ALLOCATORS;
489}
490
491//===----------------------------------------------------------------------===//
492// OpenMPIRBuilder
493//===----------------------------------------------------------------------===//
494
496 IRBuilderBase &Builder,
497 SmallVector<Value *> &ArgsVector) {
499 Value *PointerNum = Builder.getInt32(KernelArgs.NumTargetItems);
500 auto Int32Ty = Type::getInt32Ty(Builder.getContext());
501 constexpr const size_t MaxDim = 3;
502 Value *ZeroArray = Constant::getNullValue(ArrayType::get(Int32Ty, MaxDim));
503 Value *Flags = Builder.getInt64(KernelArgs.HasNoWait);
504
505 assert(!KernelArgs.NumTeams.empty() && !KernelArgs.NumThreads.empty());
506
507 Value *NumTeams3D =
508 Builder.CreateInsertValue(ZeroArray, KernelArgs.NumTeams[0], {0});
509 Value *NumThreads3D =
510 Builder.CreateInsertValue(ZeroArray, KernelArgs.NumThreads[0], {0});
511 for (unsigned I :
512 seq<unsigned>(1, std::min(KernelArgs.NumTeams.size(), MaxDim)))
513 NumTeams3D =
514 Builder.CreateInsertValue(NumTeams3D, KernelArgs.NumTeams[I], {I});
515 for (unsigned I :
516 seq<unsigned>(1, std::min(KernelArgs.NumThreads.size(), MaxDim)))
517 NumThreads3D =
518 Builder.CreateInsertValue(NumThreads3D, KernelArgs.NumThreads[I], {I});
519
520 ArgsVector = {Version,
521 PointerNum,
522 KernelArgs.RTArgs.BasePointersArray,
523 KernelArgs.RTArgs.PointersArray,
524 KernelArgs.RTArgs.SizesArray,
525 KernelArgs.RTArgs.MapTypesArray,
526 KernelArgs.RTArgs.MapNamesArray,
527 KernelArgs.RTArgs.MappersArray,
528 KernelArgs.NumIterations,
529 Flags,
530 NumTeams3D,
531 NumThreads3D,
532 KernelArgs.DynCGGroupMem};
533}
534
536 LLVMContext &Ctx = Fn.getContext();
537
538 // Get the function's current attributes.
539 auto Attrs = Fn.getAttributes();
540 auto FnAttrs = Attrs.getFnAttrs();
541 auto RetAttrs = Attrs.getRetAttrs();
543 for (size_t ArgNo = 0; ArgNo < Fn.arg_size(); ++ArgNo)
544 ArgAttrs.emplace_back(Attrs.getParamAttrs(ArgNo));
545
546 // Add AS to FnAS while taking special care with integer extensions.
547 auto addAttrSet = [&](AttributeSet &FnAS, const AttributeSet &AS,
548 bool Param = true) -> void {
549 bool HasSignExt = AS.hasAttribute(Attribute::SExt);
550 bool HasZeroExt = AS.hasAttribute(Attribute::ZExt);
551 if (HasSignExt || HasZeroExt) {
552 assert(AS.getNumAttributes() == 1 &&
553 "Currently not handling extension attr combined with others.");
554 if (Param) {
555 if (auto AK = TargetLibraryInfo::getExtAttrForI32Param(T, HasSignExt))
556 FnAS = FnAS.addAttribute(Ctx, AK);
557 } else if (auto AK =
558 TargetLibraryInfo::getExtAttrForI32Return(T, HasSignExt))
559 FnAS = FnAS.addAttribute(Ctx, AK);
560 } else {
561 FnAS = FnAS.addAttributes(Ctx, AS);
562 }
563 };
564
565#define OMP_ATTRS_SET(VarName, AttrSet) AttributeSet VarName = AttrSet;
566#include "llvm/Frontend/OpenMP/OMPKinds.def"
567
568 // Add attributes to the function declaration.
569 switch (FnID) {
570#define OMP_RTL_ATTRS(Enum, FnAttrSet, RetAttrSet, ArgAttrSets) \
571 case Enum: \
572 FnAttrs = FnAttrs.addAttributes(Ctx, FnAttrSet); \
573 addAttrSet(RetAttrs, RetAttrSet, /*Param*/ false); \
574 for (size_t ArgNo = 0; ArgNo < ArgAttrSets.size(); ++ArgNo) \
575 addAttrSet(ArgAttrs[ArgNo], ArgAttrSets[ArgNo]); \
576 Fn.setAttributes(AttributeList::get(Ctx, FnAttrs, RetAttrs, ArgAttrs)); \
577 break;
578#include "llvm/Frontend/OpenMP/OMPKinds.def"
579 default:
580 // Attributes are optional.
581 break;
582 }
583}
584
587 FunctionType *FnTy = nullptr;
588 Function *Fn = nullptr;
589
590 // Try to find the declation in the module first.
591 switch (FnID) {
592#define OMP_RTL(Enum, Str, IsVarArg, ReturnType, ...) \
593 case Enum: \
594 FnTy = FunctionType::get(ReturnType, ArrayRef<Type *>{__VA_ARGS__}, \
595 IsVarArg); \
596 Fn = M.getFunction(Str); \
597 break;
598#include "llvm/Frontend/OpenMP/OMPKinds.def"
599 }
600
601 if (!Fn) {
602 // Create a new declaration if we need one.
603 switch (FnID) {
604#define OMP_RTL(Enum, Str, ...) \
605 case Enum: \
606 Fn = Function::Create(FnTy, GlobalValue::ExternalLinkage, Str, M); \
607 break;
608#include "llvm/Frontend/OpenMP/OMPKinds.def"
609 }
610
611 // Add information if the runtime function takes a callback function
612 if (FnID == OMPRTL___kmpc_fork_call || FnID == OMPRTL___kmpc_fork_teams) {
613 if (!Fn->hasMetadata(LLVMContext::MD_callback)) {
614 LLVMContext &Ctx = Fn->getContext();
615 MDBuilder MDB(Ctx);
616 // Annotate the callback behavior of the runtime function:
617 // - The callback callee is argument number 2 (microtask).
618 // - The first two arguments of the callback callee are unknown (-1).
619 // - All variadic arguments to the runtime function are passed to the
620 // callback callee.
621 Fn->addMetadata(
622 LLVMContext::MD_callback,
624 2, {-1, -1}, /* VarArgsArePassed */ true)}));
625 }
626 }
627
628 LLVM_DEBUG(dbgs() << "Created OpenMP runtime function " << Fn->getName()
629 << " with type " << *Fn->getFunctionType() << "\n");
630 addAttributes(FnID, *Fn);
631
632 } else {
633 LLVM_DEBUG(dbgs() << "Found OpenMP runtime function " << Fn->getName()
634 << " with type " << *Fn->getFunctionType() << "\n");
635 }
636
637 assert(Fn && "Failed to create OpenMP runtime function");
638
639 return {FnTy, Fn};
640}
641
644 auto *Fn = dyn_cast<llvm::Function>(RTLFn.getCallee());
645 assert(Fn && "Failed to create OpenMP runtime function pointer");
646 return Fn;
647}
648
649void OpenMPIRBuilder::initialize() { initializeTypes(M); }
650
653 BasicBlock &EntryBlock = Function->getEntryBlock();
654 Instruction *MoveLocInst = EntryBlock.getFirstNonPHI();
655
656 // Loop over blocks looking for constant allocas, skipping the entry block
657 // as any allocas there are already in the desired location.
658 for (auto Block = std::next(Function->begin(), 1); Block != Function->end();
659 Block++) {
660 for (auto Inst = Block->getReverseIterator()->begin();
661 Inst != Block->getReverseIterator()->end();) {
662 if (auto *AllocaInst = dyn_cast_if_present<llvm::AllocaInst>(Inst)) {
663 Inst++;
664 if (!isa<ConstantData>(AllocaInst->getArraySize()))
665 continue;
666 AllocaInst->moveBeforePreserving(MoveLocInst);
667 } else {
668 Inst++;
669 }
670 }
671 }
672}
673
675 SmallPtrSet<BasicBlock *, 32> ParallelRegionBlockSet;
677 SmallVector<OutlineInfo, 16> DeferredOutlines;
678 for (OutlineInfo &OI : OutlineInfos) {
679 // Skip functions that have not finalized yet; may happen with nested
680 // function generation.
681 if (Fn && OI.getFunction() != Fn) {
682 DeferredOutlines.push_back(OI);
683 continue;
684 }
685
686 ParallelRegionBlockSet.clear();
687 Blocks.clear();
688 OI.collectBlocks(ParallelRegionBlockSet, Blocks);
689
690 Function *OuterFn = OI.getFunction();
691 CodeExtractorAnalysisCache CEAC(*OuterFn);
692 // If we generate code for the target device, we need to allocate
693 // struct for aggregate params in the device default alloca address space.
694 // OpenMP runtime requires that the params of the extracted functions are
695 // passed as zero address space pointers. This flag ensures that
696 // CodeExtractor generates correct code for extracted functions
697 // which are used by OpenMP runtime.
698 bool ArgsInZeroAddressSpace = Config.isTargetDevice();
699 CodeExtractor Extractor(Blocks, /* DominatorTree */ nullptr,
700 /* AggregateArgs */ true,
701 /* BlockFrequencyInfo */ nullptr,
702 /* BranchProbabilityInfo */ nullptr,
703 /* AssumptionCache */ nullptr,
704 /* AllowVarArgs */ true,
705 /* AllowAlloca */ true,
706 /* AllocaBlock*/ OI.OuterAllocaBB,
707 /* Suffix */ ".omp_par", ArgsInZeroAddressSpace);
708
709 LLVM_DEBUG(dbgs() << "Before outlining: " << *OuterFn << "\n");
710 LLVM_DEBUG(dbgs() << "Entry " << OI.EntryBB->getName()
711 << " Exit: " << OI.ExitBB->getName() << "\n");
712 assert(Extractor.isEligible() &&
713 "Expected OpenMP outlining to be possible!");
714
715 for (auto *V : OI.ExcludeArgsFromAggregate)
716 Extractor.excludeArgFromAggregate(V);
717
718 Function *OutlinedFn = Extractor.extractCodeRegion(CEAC);
719
720 // Forward target-cpu, target-features attributes to the outlined function.
721 auto TargetCpuAttr = OuterFn->getFnAttribute("target-cpu");
722 if (TargetCpuAttr.isStringAttribute())
723 OutlinedFn->addFnAttr(TargetCpuAttr);
724
725 auto TargetFeaturesAttr = OuterFn->getFnAttribute("target-features");
726 if (TargetFeaturesAttr.isStringAttribute())
727 OutlinedFn->addFnAttr(TargetFeaturesAttr);
728
729 LLVM_DEBUG(dbgs() << "After outlining: " << *OuterFn << "\n");
730 LLVM_DEBUG(dbgs() << " Outlined function: " << *OutlinedFn << "\n");
731 assert(OutlinedFn->getReturnType()->isVoidTy() &&
732 "OpenMP outlined functions should not return a value!");
733
734 // For compability with the clang CG we move the outlined function after the
735 // one with the parallel region.
736 OutlinedFn->removeFromParent();
737 M.getFunctionList().insertAfter(OuterFn->getIterator(), OutlinedFn);
738
739 // Remove the artificial entry introduced by the extractor right away, we
740 // made our own entry block after all.
741 {
742 BasicBlock &ArtificialEntry = OutlinedFn->getEntryBlock();
743 assert(ArtificialEntry.getUniqueSuccessor() == OI.EntryBB);
744 assert(OI.EntryBB->getUniquePredecessor() == &ArtificialEntry);
745 // Move instructions from the to-be-deleted ArtificialEntry to the entry
746 // basic block of the parallel region. CodeExtractor generates
747 // instructions to unwrap the aggregate argument and may sink
748 // allocas/bitcasts for values that are solely used in the outlined region
749 // and do not escape.
750 assert(!ArtificialEntry.empty() &&
751 "Expected instructions to add in the outlined region entry");
752 for (BasicBlock::reverse_iterator It = ArtificialEntry.rbegin(),
753 End = ArtificialEntry.rend();
754 It != End;) {
755 Instruction &I = *It;
756 It++;
757
758 if (I.isTerminator())
759 continue;
760
761 I.moveBeforePreserving(*OI.EntryBB, OI.EntryBB->getFirstInsertionPt());
762 }
763
764 OI.EntryBB->moveBefore(&ArtificialEntry);
765 ArtificialEntry.eraseFromParent();
766 }
767 assert(&OutlinedFn->getEntryBlock() == OI.EntryBB);
768 assert(OutlinedFn && OutlinedFn->getNumUses() == 1);
769
770 // Run a user callback, e.g. to add attributes.
771 if (OI.PostOutlineCB)
772 OI.PostOutlineCB(*OutlinedFn);
773 }
774
775 // Remove work items that have been completed.
776 OutlineInfos = std::move(DeferredOutlines);
777
778 // The createTarget functions embeds user written code into
779 // the target region which may inject allocas which need to
780 // be moved to the entry block of our target or risk malformed
781 // optimisations by later passes, this is only relevant for
782 // the device pass which appears to be a little more delicate
783 // when it comes to optimisations (however, we do not block on
784 // that here, it's up to the inserter to the list to do so).
785 // This notbaly has to occur after the OutlinedInfo candidates
786 // have been extracted so we have an end product that will not
787 // be implicitly adversely affected by any raises unless
788 // intentionally appended to the list.
789 // NOTE: This only does so for ConstantData, it could be extended
790 // to ConstantExpr's with further effort, however, they should
791 // largely be folded when they get here. Extending it to runtime
792 // defined/read+writeable allocation sizes would be non-trivial
793 // (need to factor in movement of any stores to variables the
794 // allocation size depends on, as well as the usual loads,
795 // otherwise it'll yield the wrong result after movement) and
796 // likely be more suitable as an LLVM optimisation pass.
799
800 EmitMetadataErrorReportFunctionTy &&ErrorReportFn =
801 [](EmitMetadataErrorKind Kind,
802 const TargetRegionEntryInfo &EntryInfo) -> void {
803 errs() << "Error of kind: " << Kind
804 << " when emitting offload entries and metadata during "
805 "OMPIRBuilder finalization \n";
806 };
807
810
811 if (Config.EmitLLVMUsedMetaInfo.value_or(false)) {
812 std::vector<WeakTrackingVH> LLVMCompilerUsed = {
813 M.getGlobalVariable("__openmp_nvptx_data_transfer_temporary_storage")};
814 emitUsed("llvm.compiler.used", LLVMCompilerUsed);
815 }
816}
817
819 assert(OutlineInfos.empty() && "There must be no outstanding outlinings");
820}
821
824 auto *GV =
825 new GlobalVariable(M, I32Ty,
826 /* isConstant = */ true, GlobalValue::WeakODRLinkage,
827 ConstantInt::get(I32Ty, Value), Name);
828 GV->setVisibility(GlobalValue::HiddenVisibility);
829
830 return GV;
831}
832
834 uint32_t SrcLocStrSize,
835 IdentFlag LocFlags,
836 unsigned Reserve2Flags) {
837 // Enable "C-mode".
838 LocFlags |= OMP_IDENT_FLAG_KMPC;
839
840 Constant *&Ident =
841 IdentMap[{SrcLocStr, uint64_t(LocFlags) << 31 | Reserve2Flags}];
842 if (!Ident) {
844 Constant *IdentData[] = {I32Null,
845 ConstantInt::get(Int32, uint32_t(LocFlags)),
846 ConstantInt::get(Int32, Reserve2Flags),
847 ConstantInt::get(Int32, SrcLocStrSize), SrcLocStr};
848 Constant *Initializer =
849 ConstantStruct::get(OpenMPIRBuilder::Ident, IdentData);
850
851 // Look for existing encoding of the location + flags, not needed but
852 // minimizes the difference to the existing solution while we transition.
853 for (GlobalVariable &GV : M.globals())
854 if (GV.getValueType() == OpenMPIRBuilder::Ident && GV.hasInitializer())
855 if (GV.getInitializer() == Initializer)
856 Ident = &GV;
857
858 if (!Ident) {
859 auto *GV = new GlobalVariable(
860 M, OpenMPIRBuilder::Ident,
861 /* isConstant = */ true, GlobalValue::PrivateLinkage, Initializer, "",
864 GV->setUnnamedAddr(GlobalValue::UnnamedAddr::Global);
865 GV->setAlignment(Align(8));
866 Ident = GV;
867 }
868 }
869
871}
872
874 uint32_t &SrcLocStrSize) {
875 SrcLocStrSize = LocStr.size();
876 Constant *&SrcLocStr = SrcLocStrMap[LocStr];
877 if (!SrcLocStr) {
878 Constant *Initializer =
880
881 // Look for existing encoding of the location, not needed but minimizes the
882 // difference to the existing solution while we transition.
883 for (GlobalVariable &GV : M.globals())
884 if (GV.isConstant() && GV.hasInitializer() &&
885 GV.getInitializer() == Initializer)
886 return SrcLocStr = ConstantExpr::getPointerCast(&GV, Int8Ptr);
887
888 SrcLocStr = Builder.CreateGlobalString(LocStr, /* Name */ "",
889 /* AddressSpace */ 0, &M);
890 }
891 return SrcLocStr;
892}
893
895 StringRef FileName,
896 unsigned Line, unsigned Column,
897 uint32_t &SrcLocStrSize) {
898 SmallString<128> Buffer;
899 Buffer.push_back(';');
900 Buffer.append(FileName);
901 Buffer.push_back(';');
902 Buffer.append(FunctionName);
903 Buffer.push_back(';');
904 Buffer.append(std::to_string(Line));
905 Buffer.push_back(';');
906 Buffer.append(std::to_string(Column));
907 Buffer.push_back(';');
908 Buffer.push_back(';');
909 return getOrCreateSrcLocStr(Buffer.str(), SrcLocStrSize);
910}
911
912Constant *
914 StringRef UnknownLoc = ";unknown;unknown;0;0;;";
915 return getOrCreateSrcLocStr(UnknownLoc, SrcLocStrSize);
916}
917
919 uint32_t &SrcLocStrSize,
920 Function *F) {
921 DILocation *DIL = DL.get();
922 if (!DIL)
923 return getOrCreateDefaultSrcLocStr(SrcLocStrSize);
924 StringRef FileName = M.getName();
925 if (DIFile *DIF = DIL->getFile())
926 if (std::optional<StringRef> Source = DIF->getSource())
927 FileName = *Source;
928 StringRef Function = DIL->getScope()->getSubprogram()->getName();
929 if (Function.empty() && F)
930 Function = F->getName();
931 return getOrCreateSrcLocStr(Function, FileName, DIL->getLine(),
932 DIL->getColumn(), SrcLocStrSize);
933}
934
936 uint32_t &SrcLocStrSize) {
937 return getOrCreateSrcLocStr(Loc.DL, SrcLocStrSize,
938 Loc.IP.getBlock()->getParent());
939}
940
942 return Builder.CreateCall(
943 getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_global_thread_num), Ident,
944 "omp_global_thread_num");
945}
946
949 bool ForceSimpleCall, bool CheckCancelFlag) {
950 if (!updateToLocation(Loc))
951 return Loc.IP;
952
953 // Build call __kmpc_cancel_barrier(loc, thread_id) or
954 // __kmpc_barrier(loc, thread_id);
955
956 IdentFlag BarrierLocFlags;
957 switch (Kind) {
958 case OMPD_for:
959 BarrierLocFlags = OMP_IDENT_FLAG_BARRIER_IMPL_FOR;
960 break;
961 case OMPD_sections:
962 BarrierLocFlags = OMP_IDENT_FLAG_BARRIER_IMPL_SECTIONS;
963 break;
964 case OMPD_single:
965 BarrierLocFlags = OMP_IDENT_FLAG_BARRIER_IMPL_SINGLE;
966 break;
967 case OMPD_barrier:
968 BarrierLocFlags = OMP_IDENT_FLAG_BARRIER_EXPL;
969 break;
970 default:
971 BarrierLocFlags = OMP_IDENT_FLAG_BARRIER_IMPL;
972 break;
973 }
974
975 uint32_t SrcLocStrSize;
976 Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
977 Value *Args[] = {
978 getOrCreateIdent(SrcLocStr, SrcLocStrSize, BarrierLocFlags),
979 getOrCreateThreadID(getOrCreateIdent(SrcLocStr, SrcLocStrSize))};
980
981 // If we are in a cancellable parallel region, barriers are cancellation
982 // points.
983 // TODO: Check why we would force simple calls or to ignore the cancel flag.
984 bool UseCancelBarrier =
985 !ForceSimpleCall && isLastFinalizationInfoCancellable(OMPD_parallel);
986
987 Value *Result =
989 UseCancelBarrier ? OMPRTL___kmpc_cancel_barrier
990 : OMPRTL___kmpc_barrier),
991 Args);
992
993 if (UseCancelBarrier && CheckCancelFlag)
994 if (Error Err = emitCancelationCheckImpl(Result, OMPD_parallel))
995 return Err;
996
997 return Builder.saveIP();
998}
999
1002 Value *IfCondition,
1003 omp::Directive CanceledDirective) {
1004 if (!updateToLocation(Loc))
1005 return Loc.IP;
1006
1007 // LLVM utilities like blocks with terminators.
1008 auto *UI = Builder.CreateUnreachable();
1009
1010 Instruction *ThenTI = UI, *ElseTI = nullptr;
1011 if (IfCondition)
1012 SplitBlockAndInsertIfThenElse(IfCondition, UI, &ThenTI, &ElseTI);
1013 Builder.SetInsertPoint(ThenTI);
1014
1015 Value *CancelKind = nullptr;
1016 switch (CanceledDirective) {
1017#define OMP_CANCEL_KIND(Enum, Str, DirectiveEnum, Value) \
1018 case DirectiveEnum: \
1019 CancelKind = Builder.getInt32(Value); \
1020 break;
1021#include "llvm/Frontend/OpenMP/OMPKinds.def"
1022 default:
1023 llvm_unreachable("Unknown cancel kind!");
1024 }
1025
1026 uint32_t SrcLocStrSize;
1027 Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
1028 Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
1029 Value *Args[] = {Ident, getOrCreateThreadID(Ident), CancelKind};
1030 Value *Result = Builder.CreateCall(
1031 getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_cancel), Args);
1032 auto ExitCB = [this, CanceledDirective, Loc](InsertPointTy IP) -> Error {
1033 if (CanceledDirective == OMPD_parallel) {
1035 Builder.restoreIP(IP);
1037 omp::Directive::OMPD_unknown,
1038 /* ForceSimpleCall */ false,
1039 /* CheckCancelFlag */ false)
1040 .takeError();
1041 }
1042 return Error::success();
1043 };
1044
1045 // The actual cancel logic is shared with others, e.g., cancel_barriers.
1046 if (Error Err = emitCancelationCheckImpl(Result, CanceledDirective, ExitCB))
1047 return Err;
1048
1049 // Update the insertion point and remove the terminator we introduced.
1050 Builder.SetInsertPoint(UI->getParent());
1051 UI->eraseFromParent();
1052
1053 return Builder.saveIP();
1054}
1055
1057 const LocationDescription &Loc, InsertPointTy AllocaIP, Value *&Return,
1058 Value *Ident, Value *DeviceID, Value *NumTeams, Value *NumThreads,
1059 Value *HostPtr, ArrayRef<Value *> KernelArgs) {
1060 if (!updateToLocation(Loc))
1061 return Loc.IP;
1062
1063 Builder.restoreIP(AllocaIP);
1064 auto *KernelArgsPtr =
1065 Builder.CreateAlloca(OpenMPIRBuilder::KernelArgs, nullptr, "kernel_args");
1066 Builder.restoreIP(Loc.IP);
1067
1068 for (unsigned I = 0, Size = KernelArgs.size(); I != Size; ++I) {
1069 llvm::Value *Arg =
1070 Builder.CreateStructGEP(OpenMPIRBuilder::KernelArgs, KernelArgsPtr, I);
1072 KernelArgs[I], Arg,
1073 M.getDataLayout().getPrefTypeAlign(KernelArgs[I]->getType()));
1074 }
1075
1076 SmallVector<Value *> OffloadingArgs{Ident, DeviceID, NumTeams,
1077 NumThreads, HostPtr, KernelArgsPtr};
1078
1079 Return = Builder.CreateCall(
1080 getOrCreateRuntimeFunction(M, OMPRTL___tgt_target_kernel),
1081 OffloadingArgs);
1082
1083 return Builder.saveIP();
1084}
1085
1087 const LocationDescription &Loc, Value *OutlinedFnID,
1088 EmitFallbackCallbackTy EmitTargetCallFallbackCB, TargetKernelArgs &Args,
1089 Value *DeviceID, Value *RTLoc, InsertPointTy AllocaIP) {
1090
1091 if (!updateToLocation(Loc))
1092 return Loc.IP;
1093
1094 Builder.restoreIP(Loc.IP);
1095 // On top of the arrays that were filled up, the target offloading call
1096 // takes as arguments the device id as well as the host pointer. The host
1097 // pointer is used by the runtime library to identify the current target
1098 // region, so it only has to be unique and not necessarily point to
1099 // anything. It could be the pointer to the outlined function that
1100 // implements the target region, but we aren't using that so that the
1101 // compiler doesn't need to keep that, and could therefore inline the host
1102 // function if proven worthwhile during optimization.
1103
1104 // From this point on, we need to have an ID of the target region defined.
1105 assert(OutlinedFnID && "Invalid outlined function ID!");
1106 (void)OutlinedFnID;
1107
1108 // Return value of the runtime offloading call.
1109 Value *Return = nullptr;
1110
1111 // Arguments for the target kernel.
1112 SmallVector<Value *> ArgsVector;
1113 getKernelArgsVector(Args, Builder, ArgsVector);
1114
1115 // The target region is an outlined function launched by the runtime
1116 // via calls to __tgt_target_kernel().
1117 //
1118 // Note that on the host and CPU targets, the runtime implementation of
1119 // these calls simply call the outlined function without forking threads.
1120 // The outlined functions themselves have runtime calls to
1121 // __kmpc_fork_teams() and __kmpc_fork() for this purpose, codegen'd by
1122 // the compiler in emitTeamsCall() and emitParallelCall().
1123 //
1124 // In contrast, on the NVPTX target, the implementation of
1125 // __tgt_target_teams() launches a GPU kernel with the requested number
1126 // of teams and threads so no additional calls to the runtime are required.
1127 // Check the error code and execute the host version if required.
1129 Builder, AllocaIP, Return, RTLoc, DeviceID, Args.NumTeams.front(),
1130 Args.NumThreads.front(), OutlinedFnID, ArgsVector));
1131
1132 BasicBlock *OffloadFailedBlock =
1133 BasicBlock::Create(Builder.getContext(), "omp_offload.failed");
1134 BasicBlock *OffloadContBlock =
1135 BasicBlock::Create(Builder.getContext(), "omp_offload.cont");
1137 Builder.CreateCondBr(Failed, OffloadFailedBlock, OffloadContBlock);
1138
1139 auto CurFn = Builder.GetInsertBlock()->getParent();
1140 emitBlock(OffloadFailedBlock, CurFn);
1141 InsertPointOrErrorTy AfterIP = EmitTargetCallFallbackCB(Builder.saveIP());
1142 if (!AfterIP)
1143 return AfterIP.takeError();
1144 Builder.restoreIP(*AfterIP);
1145 emitBranch(OffloadContBlock);
1146 emitBlock(OffloadContBlock, CurFn, /*IsFinished=*/true);
1147 return Builder.saveIP();
1148}
1149
1151 Value *CancelFlag, omp::Directive CanceledDirective,
1152 FinalizeCallbackTy ExitCB) {
1153 assert(isLastFinalizationInfoCancellable(CanceledDirective) &&
1154 "Unexpected cancellation!");
1155
1156 // For a cancel barrier we create two new blocks.
1158 BasicBlock *NonCancellationBlock;
1159 if (Builder.GetInsertPoint() == BB->end()) {
1160 // TODO: This branch will not be needed once we moved to the
1161 // OpenMPIRBuilder codegen completely.
1162 NonCancellationBlock = BasicBlock::Create(
1163 BB->getContext(), BB->getName() + ".cont", BB->getParent());
1164 } else {
1165 NonCancellationBlock = SplitBlock(BB, &*Builder.GetInsertPoint());
1168 }
1169 BasicBlock *CancellationBlock = BasicBlock::Create(
1170 BB->getContext(), BB->getName() + ".cncl", BB->getParent());
1171
1172 // Jump to them based on the return value.
1173 Value *Cmp = Builder.CreateIsNull(CancelFlag);
1174 Builder.CreateCondBr(Cmp, NonCancellationBlock, CancellationBlock,
1175 /* TODO weight */ nullptr, nullptr);
1176
1177 // From the cancellation block we finalize all variables and go to the
1178 // post finalization block that is known to the FiniCB callback.
1179 Builder.SetInsertPoint(CancellationBlock);
1180 if (ExitCB)
1181 if (Error Err = ExitCB(Builder.saveIP()))
1182 return Err;
1183 auto &FI = FinalizationStack.back();
1184 if (Error Err = FI.FiniCB(Builder.saveIP()))
1185 return Err;
1186
1187 // The continuation block is where code generation continues.
1188 Builder.SetInsertPoint(NonCancellationBlock, NonCancellationBlock->begin());
1189 return Error::success();
1190}
1191
1192// Callback used to create OpenMP runtime calls to support
1193// omp parallel clause for the device.
1194// We need to use this callback to replace call to the OutlinedFn in OuterFn
1195// by the call to the OpenMP DeviceRTL runtime function (kmpc_parallel_51)
1197 OpenMPIRBuilder *OMPIRBuilder, Function &OutlinedFn, Function *OuterFn,
1198 BasicBlock *OuterAllocaBB, Value *Ident, Value *IfCondition,
1199 Value *NumThreads, Instruction *PrivTID, AllocaInst *PrivTIDAddr,
1200 Value *ThreadID, const SmallVector<Instruction *, 4> &ToBeDeleted) {
1201 // Add some known attributes.
1202 IRBuilder<> &Builder = OMPIRBuilder->Builder;
1203 OutlinedFn.addParamAttr(0, Attribute::NoAlias);
1204 OutlinedFn.addParamAttr(1, Attribute::NoAlias);
1205 OutlinedFn.addParamAttr(0, Attribute::NoUndef);
1206 OutlinedFn.addParamAttr(1, Attribute::NoUndef);
1207 OutlinedFn.addFnAttr(Attribute::NoUnwind);
1208
1209 assert(OutlinedFn.arg_size() >= 2 &&
1210 "Expected at least tid and bounded tid as arguments");
1211 unsigned NumCapturedVars = OutlinedFn.arg_size() - /* tid & bounded tid */ 2;
1212
1213 CallInst *CI = cast<CallInst>(OutlinedFn.user_back());
1214 assert(CI && "Expected call instruction to outlined function");
1215 CI->getParent()->setName("omp_parallel");
1216
1217 Builder.SetInsertPoint(CI);
1218 Type *PtrTy = OMPIRBuilder->VoidPtr;
1219 Value *NullPtrValue = Constant::getNullValue(PtrTy);
1220
1221 // Add alloca for kernel args
1222 OpenMPIRBuilder ::InsertPointTy CurrentIP = Builder.saveIP();
1223 Builder.SetInsertPoint(OuterAllocaBB, OuterAllocaBB->getFirstInsertionPt());
1224 AllocaInst *ArgsAlloca =
1225 Builder.CreateAlloca(ArrayType::get(PtrTy, NumCapturedVars));
1226 Value *Args = ArgsAlloca;
1227 // Add address space cast if array for storing arguments is not allocated
1228 // in address space 0
1229 if (ArgsAlloca->getAddressSpace())
1230 Args = Builder.CreatePointerCast(ArgsAlloca, PtrTy);
1231 Builder.restoreIP(CurrentIP);
1232
1233 // Store captured vars which are used by kmpc_parallel_51
1234 for (unsigned Idx = 0; Idx < NumCapturedVars; Idx++) {
1235 Value *V = *(CI->arg_begin() + 2 + Idx);
1236 Value *StoreAddress = Builder.CreateConstInBoundsGEP2_64(
1237 ArrayType::get(PtrTy, NumCapturedVars), Args, 0, Idx);
1238 Builder.CreateStore(V, StoreAddress);
1239 }
1240
1241 Value *Cond =
1242 IfCondition ? Builder.CreateSExtOrTrunc(IfCondition, OMPIRBuilder->Int32)
1243 : Builder.getInt32(1);
1244
1245 // Build kmpc_parallel_51 call
1246 Value *Parallel51CallArgs[] = {
1247 /* identifier*/ Ident,
1248 /* global thread num*/ ThreadID,
1249 /* if expression */ Cond,
1250 /* number of threads */ NumThreads ? NumThreads : Builder.getInt32(-1),
1251 /* Proc bind */ Builder.getInt32(-1),
1252 /* outlined function */
1253 Builder.CreateBitCast(&OutlinedFn, OMPIRBuilder->ParallelTaskPtr),
1254 /* wrapper function */ NullPtrValue,
1255 /* arguments of the outlined funciton*/ Args,
1256 /* number of arguments */ Builder.getInt64(NumCapturedVars)};
1257
1258 FunctionCallee RTLFn =
1259 OMPIRBuilder->getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_parallel_51);
1260
1261 Builder.CreateCall(RTLFn, Parallel51CallArgs);
1262
1263 LLVM_DEBUG(dbgs() << "With kmpc_parallel_51 placed: "
1264 << *Builder.GetInsertBlock()->getParent() << "\n");
1265
1266 // Initialize the local TID stack location with the argument value.
1267 Builder.SetInsertPoint(PrivTID);
1268 Function::arg_iterator OutlinedAI = OutlinedFn.arg_begin();
1269 Builder.CreateStore(Builder.CreateLoad(OMPIRBuilder->Int32, OutlinedAI),
1270 PrivTIDAddr);
1271
1272 // Remove redundant call to the outlined function.
1273 CI->eraseFromParent();
1274
1275 for (Instruction *I : ToBeDeleted) {
1276 I->eraseFromParent();
1277 }
1278}
1279
1280// Callback used to create OpenMP runtime calls to support
1281// omp parallel clause for the host.
1282// We need to use this callback to replace call to the OutlinedFn in OuterFn
1283// by the call to the OpenMP host runtime function ( __kmpc_fork_call[_if])
1284static void
1286 Function *OuterFn, Value *Ident, Value *IfCondition,
1287 Instruction *PrivTID, AllocaInst *PrivTIDAddr,
1288 const SmallVector<Instruction *, 4> &ToBeDeleted) {
1289 IRBuilder<> &Builder = OMPIRBuilder->Builder;
1290 FunctionCallee RTLFn;
1291 if (IfCondition) {
1292 RTLFn =
1293 OMPIRBuilder->getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_fork_call_if);
1294 } else {
1295 RTLFn =
1296 OMPIRBuilder->getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_fork_call);
1297 }
1298 if (auto *F = dyn_cast<Function>(RTLFn.getCallee())) {
1299 if (!F->hasMetadata(LLVMContext::MD_callback)) {
1300 LLVMContext &Ctx = F->getContext();
1301 MDBuilder MDB(Ctx);
1302 // Annotate the callback behavior of the __kmpc_fork_call:
1303 // - The callback callee is argument number 2 (microtask).
1304 // - The first two arguments of the callback callee are unknown (-1).
1305 // - All variadic arguments to the __kmpc_fork_call are passed to the
1306 // callback callee.
1307 F->addMetadata(LLVMContext::MD_callback,
1309 2, {-1, -1},
1310 /* VarArgsArePassed */ true)}));
1311 }
1312 }
1313 // Add some known attributes.
1314 OutlinedFn.addParamAttr(0, Attribute::NoAlias);
1315 OutlinedFn.addParamAttr(1, Attribute::NoAlias);
1316 OutlinedFn.addFnAttr(Attribute::NoUnwind);
1317
1318 assert(OutlinedFn.arg_size() >= 2 &&
1319 "Expected at least tid and bounded tid as arguments");
1320 unsigned NumCapturedVars = OutlinedFn.arg_size() - /* tid & bounded tid */ 2;
1321
1322 CallInst *CI = cast<CallInst>(OutlinedFn.user_back());
1323 CI->getParent()->setName("omp_parallel");
1324 Builder.SetInsertPoint(CI);
1325
1326 // Build call __kmpc_fork_call[_if](Ident, n, microtask, var1, .., varn);
1327 Value *ForkCallArgs[] = {
1328 Ident, Builder.getInt32(NumCapturedVars),
1329 Builder.CreateBitCast(&OutlinedFn, OMPIRBuilder->ParallelTaskPtr)};
1330
1331 SmallVector<Value *, 16> RealArgs;
1332 RealArgs.append(std::begin(ForkCallArgs), std::end(ForkCallArgs));
1333 if (IfCondition) {
1334 Value *Cond = Builder.CreateSExtOrTrunc(IfCondition, OMPIRBuilder->Int32);
1335 RealArgs.push_back(Cond);
1336 }
1337 RealArgs.append(CI->arg_begin() + /* tid & bound tid */ 2, CI->arg_end());
1338
1339 // __kmpc_fork_call_if always expects a void ptr as the last argument
1340 // If there are no arguments, pass a null pointer.
1341 auto PtrTy = OMPIRBuilder->VoidPtr;
1342 if (IfCondition && NumCapturedVars == 0) {
1343 Value *NullPtrValue = Constant::getNullValue(PtrTy);
1344 RealArgs.push_back(NullPtrValue);
1345 }
1346 if (IfCondition && RealArgs.back()->getType() != PtrTy)
1347 RealArgs.back() = Builder.CreateBitCast(RealArgs.back(), PtrTy);
1348
1349 Builder.CreateCall(RTLFn, RealArgs);
1350
1351 LLVM_DEBUG(dbgs() << "With fork_call placed: "
1352 << *Builder.GetInsertBlock()->getParent() << "\n");
1353
1354 // Initialize the local TID stack location with the argument value.
1355 Builder.SetInsertPoint(PrivTID);
1356 Function::arg_iterator OutlinedAI = OutlinedFn.arg_begin();
1357 Builder.CreateStore(Builder.CreateLoad(OMPIRBuilder->Int32, OutlinedAI),
1358 PrivTIDAddr);
1359
1360 // Remove redundant call to the outlined function.
1361 CI->eraseFromParent();
1362
1363 for (Instruction *I : ToBeDeleted) {
1364 I->eraseFromParent();
1365 }
1366}
1367
1369 const LocationDescription &Loc, InsertPointTy OuterAllocaIP,
1370 BodyGenCallbackTy BodyGenCB, PrivatizeCallbackTy PrivCB,
1371 FinalizeCallbackTy FiniCB, Value *IfCondition, Value *NumThreads,
1372 omp::ProcBindKind ProcBind, bool IsCancellable) {
1373 assert(!isConflictIP(Loc.IP, OuterAllocaIP) && "IPs must not be ambiguous");
1374
1375 if (!updateToLocation(Loc))
1376 return Loc.IP;
1377
1378 uint32_t SrcLocStrSize;
1379 Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
1380 Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
1381 Value *ThreadID = getOrCreateThreadID(Ident);
1382 // If we generate code for the target device, we need to allocate
1383 // struct for aggregate params in the device default alloca address space.
1384 // OpenMP runtime requires that the params of the extracted functions are
1385 // passed as zero address space pointers. This flag ensures that extracted
1386 // function arguments are declared in zero address space
1387 bool ArgsInZeroAddressSpace = Config.isTargetDevice();
1388
1389 // Build call __kmpc_push_num_threads(&Ident, global_tid, num_threads)
1390 // only if we compile for host side.
1391 if (NumThreads && !Config.isTargetDevice()) {
1392 Value *Args[] = {
1393 Ident, ThreadID,
1394 Builder.CreateIntCast(NumThreads, Int32, /*isSigned*/ false)};
1396 getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_push_num_threads), Args);
1397 }
1398
1399 if (ProcBind != OMP_PROC_BIND_default) {
1400 // Build call __kmpc_push_proc_bind(&Ident, global_tid, proc_bind)
1401 Value *Args[] = {
1402 Ident, ThreadID,
1403 ConstantInt::get(Int32, unsigned(ProcBind), /*isSigned=*/true)};
1405 getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_push_proc_bind), Args);
1406 }
1407
1408 BasicBlock *InsertBB = Builder.GetInsertBlock();
1409 Function *OuterFn = InsertBB->getParent();
1410
1411 // Save the outer alloca block because the insertion iterator may get
1412 // invalidated and we still need this later.
1413 BasicBlock *OuterAllocaBlock = OuterAllocaIP.getBlock();
1414
1415 // Vector to remember instructions we used only during the modeling but which
1416 // we want to delete at the end.
1418
1419 // Change the location to the outer alloca insertion point to create and
1420 // initialize the allocas we pass into the parallel region.
1421 InsertPointTy NewOuter(OuterAllocaBlock, OuterAllocaBlock->begin());
1422 Builder.restoreIP(NewOuter);
1423 AllocaInst *TIDAddrAlloca = Builder.CreateAlloca(Int32, nullptr, "tid.addr");
1424 AllocaInst *ZeroAddrAlloca =
1425 Builder.CreateAlloca(Int32, nullptr, "zero.addr");
1426 Instruction *TIDAddr = TIDAddrAlloca;
1427 Instruction *ZeroAddr = ZeroAddrAlloca;
1428 if (ArgsInZeroAddressSpace && M.getDataLayout().getAllocaAddrSpace() != 0) {
1429 // Add additional casts to enforce pointers in zero address space
1430 TIDAddr = new AddrSpaceCastInst(
1431 TIDAddrAlloca, PointerType ::get(M.getContext(), 0), "tid.addr.ascast");
1432 TIDAddr->insertAfter(TIDAddrAlloca);
1433 ToBeDeleted.push_back(TIDAddr);
1434 ZeroAddr = new AddrSpaceCastInst(ZeroAddrAlloca,
1435 PointerType ::get(M.getContext(), 0),
1436 "zero.addr.ascast");
1437 ZeroAddr->insertAfter(ZeroAddrAlloca);
1438 ToBeDeleted.push_back(ZeroAddr);
1439 }
1440
1441 // We only need TIDAddr and ZeroAddr for modeling purposes to get the
1442 // associated arguments in the outlined function, so we delete them later.
1443 ToBeDeleted.push_back(TIDAddrAlloca);
1444 ToBeDeleted.push_back(ZeroAddrAlloca);
1445
1446 // Create an artificial insertion point that will also ensure the blocks we
1447 // are about to split are not degenerated.
1448 auto *UI = new UnreachableInst(Builder.getContext(), InsertBB);
1449
1450 BasicBlock *EntryBB = UI->getParent();
1451 BasicBlock *PRegEntryBB = EntryBB->splitBasicBlock(UI, "omp.par.entry");
1452 BasicBlock *PRegBodyBB = PRegEntryBB->splitBasicBlock(UI, "omp.par.region");
1453 BasicBlock *PRegPreFiniBB =
1454 PRegBodyBB->splitBasicBlock(UI, "omp.par.pre_finalize");
1455 BasicBlock *PRegExitBB = PRegPreFiniBB->splitBasicBlock(UI, "omp.par.exit");
1456
1457 auto FiniCBWrapper = [&](InsertPointTy IP) {
1458 // Hide "open-ended" blocks from the given FiniCB by setting the right jump
1459 // target to the region exit block.
1460 if (IP.getBlock()->end() == IP.getPoint()) {
1462 Builder.restoreIP(IP);
1463 Instruction *I = Builder.CreateBr(PRegExitBB);
1464 IP = InsertPointTy(I->getParent(), I->getIterator());
1465 }
1466 assert(IP.getBlock()->getTerminator()->getNumSuccessors() == 1 &&
1467 IP.getBlock()->getTerminator()->getSuccessor(0) == PRegExitBB &&
1468 "Unexpected insertion point for finalization call!");
1469 return FiniCB(IP);
1470 };
1471
1472 FinalizationStack.push_back({FiniCBWrapper, OMPD_parallel, IsCancellable});
1473
1474 // Generate the privatization allocas in the block that will become the entry
1475 // of the outlined function.
1476 Builder.SetInsertPoint(PRegEntryBB->getTerminator());
1477 InsertPointTy InnerAllocaIP = Builder.saveIP();
1478
1479 AllocaInst *PrivTIDAddr =
1480 Builder.CreateAlloca(Int32, nullptr, "tid.addr.local");
1481 Instruction *PrivTID = Builder.CreateLoad(Int32, PrivTIDAddr, "tid");
1482
1483 // Add some fake uses for OpenMP provided arguments.
1484 ToBeDeleted.push_back(Builder.CreateLoad(Int32, TIDAddr, "tid.addr.use"));
1485 Instruction *ZeroAddrUse =
1486 Builder.CreateLoad(Int32, ZeroAddr, "zero.addr.use");
1487 ToBeDeleted.push_back(ZeroAddrUse);
1488
1489 // EntryBB
1490 // |
1491 // V
1492 // PRegionEntryBB <- Privatization allocas are placed here.
1493 // |
1494 // V
1495 // PRegionBodyBB <- BodeGen is invoked here.
1496 // |
1497 // V
1498 // PRegPreFiniBB <- The block we will start finalization from.
1499 // |
1500 // V
1501 // PRegionExitBB <- A common exit to simplify block collection.
1502 //
1503
1504 LLVM_DEBUG(dbgs() << "Before body codegen: " << *OuterFn << "\n");
1505
1506 // Let the caller create the body.
1507 assert(BodyGenCB && "Expected body generation callback!");
1508 InsertPointTy CodeGenIP(PRegBodyBB, PRegBodyBB->begin());
1509 if (Error Err = BodyGenCB(InnerAllocaIP, CodeGenIP))
1510 return Err;
1511
1512 LLVM_DEBUG(dbgs() << "After body codegen: " << *OuterFn << "\n");
1513
1514 OutlineInfo OI;
1515 if (Config.isTargetDevice()) {
1516 // Generate OpenMP target specific runtime call
1517 OI.PostOutlineCB = [=, ToBeDeletedVec =
1518 std::move(ToBeDeleted)](Function &OutlinedFn) {
1519 targetParallelCallback(this, OutlinedFn, OuterFn, OuterAllocaBlock, Ident,
1520 IfCondition, NumThreads, PrivTID, PrivTIDAddr,
1521 ThreadID, ToBeDeletedVec);
1522 };
1523 } else {
1524 // Generate OpenMP host runtime call
1525 OI.PostOutlineCB = [=, ToBeDeletedVec =
1526 std::move(ToBeDeleted)](Function &OutlinedFn) {
1527 hostParallelCallback(this, OutlinedFn, OuterFn, Ident, IfCondition,
1528 PrivTID, PrivTIDAddr, ToBeDeletedVec);
1529 };
1530 }
1531
1532 OI.OuterAllocaBB = OuterAllocaBlock;
1533 OI.EntryBB = PRegEntryBB;
1534 OI.ExitBB = PRegExitBB;
1535
1536 SmallPtrSet<BasicBlock *, 32> ParallelRegionBlockSet;
1538 OI.collectBlocks(ParallelRegionBlockSet, Blocks);
1539
1540 // Ensure a single exit node for the outlined region by creating one.
1541 // We might have multiple incoming edges to the exit now due to finalizations,
1542 // e.g., cancel calls that cause the control flow to leave the region.
1543 BasicBlock *PRegOutlinedExitBB = PRegExitBB;
1544 PRegExitBB = SplitBlock(PRegExitBB, &*PRegExitBB->getFirstInsertionPt());
1545 PRegOutlinedExitBB->setName("omp.par.outlined.exit");
1546 Blocks.push_back(PRegOutlinedExitBB);
1547
1548 CodeExtractorAnalysisCache CEAC(*OuterFn);
1549 CodeExtractor Extractor(Blocks, /* DominatorTree */ nullptr,
1550 /* AggregateArgs */ false,
1551 /* BlockFrequencyInfo */ nullptr,
1552 /* BranchProbabilityInfo */ nullptr,
1553 /* AssumptionCache */ nullptr,
1554 /* AllowVarArgs */ true,
1555 /* AllowAlloca */ true,
1556 /* AllocationBlock */ OuterAllocaBlock,
1557 /* Suffix */ ".omp_par", ArgsInZeroAddressSpace);
1558
1559 // Find inputs to, outputs from the code region.
1560 BasicBlock *CommonExit = nullptr;
1561 SetVector<Value *> Inputs, Outputs, SinkingCands, HoistingCands;
1562 Extractor.findAllocas(CEAC, SinkingCands, HoistingCands, CommonExit);
1563
1564 Extractor.findInputsOutputs(Inputs, Outputs, SinkingCands,
1565 /*CollectGlobalInputs=*/true);
1566
1567 Inputs.remove_if([&](Value *I) {
1568 if (auto *GV = dyn_cast_if_present<GlobalVariable>(I))
1569 return GV->getValueType() == OpenMPIRBuilder::Ident;
1570
1571 return false;
1572 });
1573
1574 LLVM_DEBUG(dbgs() << "Before privatization: " << *OuterFn << "\n");
1575
1576 FunctionCallee TIDRTLFn =
1577 getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_global_thread_num);
1578
1579 auto PrivHelper = [&](Value &V) -> Error {
1580 if (&V == TIDAddr || &V == ZeroAddr) {
1581 OI.ExcludeArgsFromAggregate.push_back(&V);
1582 return Error::success();
1583 }
1584
1586 for (Use &U : V.uses())
1587 if (auto *UserI = dyn_cast<Instruction>(U.getUser()))
1588 if (ParallelRegionBlockSet.count(UserI->getParent()))
1589 Uses.insert(&U);
1590
1591 // __kmpc_fork_call expects extra arguments as pointers. If the input
1592 // already has a pointer type, everything is fine. Otherwise, store the
1593 // value onto stack and load it back inside the to-be-outlined region. This
1594 // will ensure only the pointer will be passed to the function.
1595 // FIXME: if there are more than 15 trailing arguments, they must be
1596 // additionally packed in a struct.
1597 Value *Inner = &V;
1598 if (!V.getType()->isPointerTy()) {
1600 LLVM_DEBUG(llvm::dbgs() << "Forwarding input as pointer: " << V << "\n");
1601
1602 Builder.restoreIP(OuterAllocaIP);
1603 Value *Ptr =
1604 Builder.CreateAlloca(V.getType(), nullptr, V.getName() + ".reloaded");
1605
1606 // Store to stack at end of the block that currently branches to the entry
1607 // block of the to-be-outlined region.
1608 Builder.SetInsertPoint(InsertBB,
1609 InsertBB->getTerminator()->getIterator());
1610 Builder.CreateStore(&V, Ptr);
1611
1612 // Load back next to allocations in the to-be-outlined region.
1613 Builder.restoreIP(InnerAllocaIP);
1614 Inner = Builder.CreateLoad(V.getType(), Ptr);
1615 }
1616
1617 Value *ReplacementValue = nullptr;
1618 CallInst *CI = dyn_cast<CallInst>(&V);
1619 if (CI && CI->getCalledFunction() == TIDRTLFn.getCallee()) {
1620 ReplacementValue = PrivTID;
1621 } else {
1622 InsertPointOrErrorTy AfterIP =
1623 PrivCB(InnerAllocaIP, Builder.saveIP(), V, *Inner, ReplacementValue);
1624 if (!AfterIP)
1625 return AfterIP.takeError();
1626 Builder.restoreIP(*AfterIP);
1627 InnerAllocaIP = {
1628 InnerAllocaIP.getBlock(),
1629 InnerAllocaIP.getBlock()->getTerminator()->getIterator()};
1630
1631 assert(ReplacementValue &&
1632 "Expected copy/create callback to set replacement value!");
1633 if (ReplacementValue == &V)
1634 return Error::success();
1635 }
1636
1637 for (Use *UPtr : Uses)
1638 UPtr->set(ReplacementValue);
1639
1640 return Error::success();
1641 };
1642
1643 // Reset the inner alloca insertion as it will be used for loading the values
1644 // wrapped into pointers before passing them into the to-be-outlined region.
1645 // Configure it to insert immediately after the fake use of zero address so
1646 // that they are available in the generated body and so that the
1647 // OpenMP-related values (thread ID and zero address pointers) remain leading
1648 // in the argument list.
1649 InnerAllocaIP = IRBuilder<>::InsertPoint(
1650 ZeroAddrUse->getParent(), ZeroAddrUse->getNextNode()->getIterator());
1651
1652 // Reset the outer alloca insertion point to the entry of the relevant block
1653 // in case it was invalidated.
1654 OuterAllocaIP = IRBuilder<>::InsertPoint(
1655 OuterAllocaBlock, OuterAllocaBlock->getFirstInsertionPt());
1656
1657 for (Value *Input : Inputs) {
1658 LLVM_DEBUG(dbgs() << "Captured input: " << *Input << "\n");
1659 if (Error Err = PrivHelper(*Input))
1660 return Err;
1661 }
1662 LLVM_DEBUG({
1663 for (Value *Output : Outputs)
1664 LLVM_DEBUG(dbgs() << "Captured output: " << *Output << "\n");
1665 });
1666 assert(Outputs.empty() &&
1667 "OpenMP outlining should not produce live-out values!");
1668
1669 LLVM_DEBUG(dbgs() << "After privatization: " << *OuterFn << "\n");
1670 LLVM_DEBUG({
1671 for (auto *BB : Blocks)
1672 dbgs() << " PBR: " << BB->getName() << "\n";
1673 });
1674
1675 // Adjust the finalization stack, verify the adjustment, and call the
1676 // finalize function a last time to finalize values between the pre-fini
1677 // block and the exit block if we left the parallel "the normal way".
1678 auto FiniInfo = FinalizationStack.pop_back_val();
1679 (void)FiniInfo;
1680 assert(FiniInfo.DK == OMPD_parallel &&
1681 "Unexpected finalization stack state!");
1682
1683 Instruction *PRegPreFiniTI = PRegPreFiniBB->getTerminator();
1684
1685 InsertPointTy PreFiniIP(PRegPreFiniBB, PRegPreFiniTI->getIterator());
1686 if (Error Err = FiniCB(PreFiniIP))
1687 return Err;
1688
1689 // Register the outlined info.
1690 addOutlineInfo(std::move(OI));
1691
1692 InsertPointTy AfterIP(UI->getParent(), UI->getParent()->end());
1693 UI->eraseFromParent();
1694
1695 return AfterIP;
1696}
1697
1699 // Build call void __kmpc_flush(ident_t *loc)
1700 uint32_t SrcLocStrSize;
1701 Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
1702 Value *Args[] = {getOrCreateIdent(SrcLocStr, SrcLocStrSize)};
1703
1704 Builder.CreateCall(getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_flush), Args);
1705}
1706
1708 if (!updateToLocation(Loc))
1709 return;
1710 emitFlush(Loc);
1711}
1712
1714 // Build call kmp_int32 __kmpc_omp_taskwait(ident_t *loc, kmp_int32
1715 // global_tid);
1716 uint32_t SrcLocStrSize;
1717 Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
1718 Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
1719 Value *Args[] = {Ident, getOrCreateThreadID(Ident)};
1720
1721 // Ignore return result until untied tasks are supported.
1722 Builder.CreateCall(getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_omp_taskwait),
1723 Args);
1724}
1725
1727 if (!updateToLocation(Loc))
1728 return;
1729 emitTaskwaitImpl(Loc);
1730}
1731
1733 // Build call __kmpc_omp_taskyield(loc, thread_id, 0);
1734 uint32_t SrcLocStrSize;
1735 Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
1736 Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
1738 Value *Args[] = {Ident, getOrCreateThreadID(Ident), I32Null};
1739
1740 Builder.CreateCall(getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_omp_taskyield),
1741 Args);
1742}
1743
1745 if (!updateToLocation(Loc))
1746 return;
1747 emitTaskyieldImpl(Loc);
1748}
1749
1750// Processes the dependencies in Dependencies and does the following
1751// - Allocates space on the stack of an array of DependInfo objects
1752// - Populates each DependInfo object with relevant information of
1753// the corresponding dependence.
1754// - All code is inserted in the entry block of the current function.
1756 OpenMPIRBuilder &OMPBuilder,
1758 // Early return if we have no dependencies to process
1759 if (Dependencies.empty())
1760 return nullptr;
1761
1762 // Given a vector of DependData objects, in this function we create an
1763 // array on the stack that holds kmp_dep_info objects corresponding
1764 // to each dependency. This is then passed to the OpenMP runtime.
1765 // For example, if there are 'n' dependencies then the following psedo
1766 // code is generated. Assume the first dependence is on a variable 'a'
1767 //
1768 // \code{c}
1769 // DepArray = alloc(n x sizeof(kmp_depend_info);
1770 // idx = 0;
1771 // DepArray[idx].base_addr = ptrtoint(&a);
1772 // DepArray[idx].len = 8;
1773 // DepArray[idx].flags = Dep.DepKind; /*(See OMPContants.h for DepKind)*/
1774 // ++idx;
1775 // DepArray[idx].base_addr = ...;
1776 // \endcode
1777
1778 IRBuilderBase &Builder = OMPBuilder.Builder;
1779 Type *DependInfo = OMPBuilder.DependInfo;
1780 Module &M = OMPBuilder.M;
1781
1782 Value *DepArray = nullptr;
1783 OpenMPIRBuilder::InsertPointTy OldIP = Builder.saveIP();
1784 Builder.SetInsertPoint(
1786
1787 Type *DepArrayTy = ArrayType::get(DependInfo, Dependencies.size());
1788 DepArray = Builder.CreateAlloca(DepArrayTy, nullptr, ".dep.arr.addr");
1789
1790 for (const auto &[DepIdx, Dep] : enumerate(Dependencies)) {
1791 Value *Base =
1792 Builder.CreateConstInBoundsGEP2_64(DepArrayTy, DepArray, 0, DepIdx);
1793 // Store the pointer to the variable
1794 Value *Addr = Builder.CreateStructGEP(
1795 DependInfo, Base,
1796 static_cast<unsigned int>(RTLDependInfoFields::BaseAddr));
1797 Value *DepValPtr = Builder.CreatePtrToInt(Dep.DepVal, Builder.getInt64Ty());
1798 Builder.CreateStore(DepValPtr, Addr);
1799 // Store the size of the variable
1800 Value *Size = Builder.CreateStructGEP(
1801 DependInfo, Base, static_cast<unsigned int>(RTLDependInfoFields::Len));
1802 Builder.CreateStore(
1803 Builder.getInt64(M.getDataLayout().getTypeStoreSize(Dep.DepValueType)),
1804 Size);
1805 // Store the dependency kind
1806 Value *Flags = Builder.CreateStructGEP(
1807 DependInfo, Base,
1808 static_cast<unsigned int>(RTLDependInfoFields::Flags));
1809 Builder.CreateStore(
1810 ConstantInt::get(Builder.getInt8Ty(),
1811 static_cast<unsigned int>(Dep.DepKind)),
1812 Flags);
1813 }
1814 Builder.restoreIP(OldIP);
1815 return DepArray;
1816}
1817
1819 const LocationDescription &Loc, InsertPointTy AllocaIP,
1820 BodyGenCallbackTy BodyGenCB, bool Tied, Value *Final, Value *IfCondition,
1821 SmallVector<DependData> Dependencies, bool Mergeable, Value *EventHandle) {
1822
1823 if (!updateToLocation(Loc))
1824 return InsertPointTy();
1825
1826 uint32_t SrcLocStrSize;
1827 Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
1828 Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
1829 // The current basic block is split into four basic blocks. After outlining,
1830 // they will be mapped as follows:
1831 // ```
1832 // def current_fn() {
1833 // current_basic_block:
1834 // br label %task.exit
1835 // task.exit:
1836 // ; instructions after task
1837 // }
1838 // def outlined_fn() {
1839 // task.alloca:
1840 // br label %task.body
1841 // task.body:
1842 // ret void
1843 // }
1844 // ```
1845 BasicBlock *TaskExitBB = splitBB(Builder, /*CreateBranch=*/true, "task.exit");
1846 BasicBlock *TaskBodyBB = splitBB(Builder, /*CreateBranch=*/true, "task.body");
1847 BasicBlock *TaskAllocaBB =
1848 splitBB(Builder, /*CreateBranch=*/true, "task.alloca");
1849
1850 InsertPointTy TaskAllocaIP =
1851 InsertPointTy(TaskAllocaBB, TaskAllocaBB->begin());
1852 InsertPointTy TaskBodyIP = InsertPointTy(TaskBodyBB, TaskBodyBB->begin());
1853 if (Error Err = BodyGenCB(TaskAllocaIP, TaskBodyIP))
1854 return Err;
1855
1856 OutlineInfo OI;
1857 OI.EntryBB = TaskAllocaBB;
1858 OI.OuterAllocaBB = AllocaIP.getBlock();
1859 OI.ExitBB = TaskExitBB;
1860
1861 // Add the thread ID argument.
1864 Builder, AllocaIP, ToBeDeleted, TaskAllocaIP, "global.tid", false));
1865
1866 OI.PostOutlineCB = [this, Ident, Tied, Final, IfCondition, Dependencies,
1867 Mergeable, EventHandle, TaskAllocaBB,
1868 ToBeDeleted](Function &OutlinedFn) mutable {
1869 // Replace the Stale CI by appropriate RTL function call.
1870 assert(OutlinedFn.getNumUses() == 1 &&
1871 "there must be a single user for the outlined function");
1872 CallInst *StaleCI = cast<CallInst>(OutlinedFn.user_back());
1873
1874 // HasShareds is true if any variables are captured in the outlined region,
1875 // false otherwise.
1876 bool HasShareds = StaleCI->arg_size() > 1;
1877 Builder.SetInsertPoint(StaleCI);
1878
1879 // Gather the arguments for emitting the runtime call for
1880 // @__kmpc_omp_task_alloc
1881 Function *TaskAllocFn =
1882 getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_omp_task_alloc);
1883
1884 // Arguments - `loc_ref` (Ident) and `gtid` (ThreadID)
1885 // call.
1886 Value *ThreadID = getOrCreateThreadID(Ident);
1887
1888 // Argument - `flags`
1889 // Task is tied iff (Flags & 1) == 1.
1890 // Task is untied iff (Flags & 1) == 0.
1891 // Task is final iff (Flags & 2) == 2.
1892 // Task is not final iff (Flags & 2) == 0.
1893 // Task is mergeable iff (Flags & 4) == 4.
1894 // Task is not mergeable iff (Flags & 4) == 0.
1895 // TODO: Handle the other flags.
1896 Value *Flags = Builder.getInt32(Tied);
1897 if (Final) {
1898 Value *FinalFlag =
1900 Flags = Builder.CreateOr(FinalFlag, Flags);
1901 }
1902
1903 if (Mergeable)
1905
1906 // Argument - `sizeof_kmp_task_t` (TaskSize)
1907 // Tasksize refers to the size in bytes of kmp_task_t data structure
1908 // including private vars accessed in task.
1909 // TODO: add kmp_task_t_with_privates (privates)
1910 Value *TaskSize = Builder.getInt64(
1912
1913 // Argument - `sizeof_shareds` (SharedsSize)
1914 // SharedsSize refers to the shareds array size in the kmp_task_t data
1915 // structure.
1916 Value *SharedsSize = Builder.getInt64(0);
1917 if (HasShareds) {
1918 AllocaInst *ArgStructAlloca =
1919 dyn_cast<AllocaInst>(StaleCI->getArgOperand(1));
1920 assert(ArgStructAlloca &&
1921 "Unable to find the alloca instruction corresponding to arguments "
1922 "for extracted function");
1923 StructType *ArgStructType =
1924 dyn_cast<StructType>(ArgStructAlloca->getAllocatedType());
1925 assert(ArgStructType && "Unable to find struct type corresponding to "
1926 "arguments for extracted function");
1927 SharedsSize =
1929 }
1930 // Emit the @__kmpc_omp_task_alloc runtime call
1931 // The runtime call returns a pointer to an area where the task captured
1932 // variables must be copied before the task is run (TaskData)
1933 CallInst *TaskData = Builder.CreateCall(
1934 TaskAllocFn, {/*loc_ref=*/Ident, /*gtid=*/ThreadID, /*flags=*/Flags,
1935 /*sizeof_task=*/TaskSize, /*sizeof_shared=*/SharedsSize,
1936 /*task_func=*/&OutlinedFn});
1937
1938 // Emit detach clause initialization.
1939 // evt = (typeof(evt))__kmpc_task_allow_completion_event(loc, tid,
1940 // task_descriptor);
1941 if (EventHandle) {
1943 OMPRTL___kmpc_task_allow_completion_event);
1944 llvm::Value *EventVal =
1945 Builder.CreateCall(TaskDetachFn, {Ident, ThreadID, TaskData});
1946 llvm::Value *EventHandleAddr =
1948 Builder.getPtrTy(0));
1949 EventVal = Builder.CreatePtrToInt(EventVal, Builder.getInt64Ty());
1950 Builder.CreateStore(EventVal, EventHandleAddr);
1951 }
1952 // Copy the arguments for outlined function
1953 if (HasShareds) {
1954 Value *Shareds = StaleCI->getArgOperand(1);
1955 Align Alignment = TaskData->getPointerAlignment(M.getDataLayout());
1956 Value *TaskShareds = Builder.CreateLoad(VoidPtr, TaskData);
1957 Builder.CreateMemCpy(TaskShareds, Alignment, Shareds, Alignment,
1958 SharedsSize);
1959 }
1960
1961 Value *DepArray = nullptr;
1962 if (Dependencies.size()) {
1963 InsertPointTy OldIP = Builder.saveIP();
1965 &OldIP.getBlock()->getParent()->getEntryBlock().back());
1966
1967 Type *DepArrayTy = ArrayType::get(DependInfo, Dependencies.size());
1968 DepArray = Builder.CreateAlloca(DepArrayTy, nullptr, ".dep.arr.addr");
1969
1970 unsigned P = 0;
1971 for (const DependData &Dep : Dependencies) {
1972 Value *Base =
1973 Builder.CreateConstInBoundsGEP2_64(DepArrayTy, DepArray, 0, P);
1974 // Store the pointer to the variable
1976 DependInfo, Base,
1977 static_cast<unsigned int>(RTLDependInfoFields::BaseAddr));
1978 Value *DepValPtr =
1980 Builder.CreateStore(DepValPtr, Addr);
1981 // Store the size of the variable
1983 DependInfo, Base,
1984 static_cast<unsigned int>(RTLDependInfoFields::Len));
1986 Dep.DepValueType)),
1987 Size);
1988 // Store the dependency kind
1990 DependInfo, Base,
1991 static_cast<unsigned int>(RTLDependInfoFields::Flags));
1993 ConstantInt::get(Builder.getInt8Ty(),
1994 static_cast<unsigned int>(Dep.DepKind)),
1995 Flags);
1996 ++P;
1997 }
1998
1999 Builder.restoreIP(OldIP);
2000 }
2001
2002 // In the presence of the `if` clause, the following IR is generated:
2003 // ...
2004 // %data = call @__kmpc_omp_task_alloc(...)
2005 // br i1 %if_condition, label %then, label %else
2006 // then:
2007 // call @__kmpc_omp_task(...)
2008 // br label %exit
2009 // else:
2010 // ;; Wait for resolution of dependencies, if any, before
2011 // ;; beginning the task
2012 // call @__kmpc_omp_wait_deps(...)
2013 // call @__kmpc_omp_task_begin_if0(...)
2014 // call @outlined_fn(...)
2015 // call @__kmpc_omp_task_complete_if0(...)
2016 // br label %exit
2017 // exit:
2018 // ...
2019 if (IfCondition) {
2020 // `SplitBlockAndInsertIfThenElse` requires the block to have a
2021 // terminator.
2022 splitBB(Builder, /*CreateBranch=*/true, "if.end");
2023 Instruction *IfTerminator =
2024 Builder.GetInsertPoint()->getParent()->getTerminator();
2025 Instruction *ThenTI = IfTerminator, *ElseTI = nullptr;
2026 Builder.SetInsertPoint(IfTerminator);
2027 SplitBlockAndInsertIfThenElse(IfCondition, IfTerminator, &ThenTI,
2028 &ElseTI);
2029 Builder.SetInsertPoint(ElseTI);
2030
2031 if (Dependencies.size()) {
2032 Function *TaskWaitFn =
2033 getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_omp_wait_deps);
2035 TaskWaitFn,
2036 {Ident, ThreadID, Builder.getInt32(Dependencies.size()), DepArray,
2037 ConstantInt::get(Builder.getInt32Ty(), 0),
2039 }
2040 Function *TaskBeginFn =
2041 getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_omp_task_begin_if0);
2042 Function *TaskCompleteFn =
2043 getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_omp_task_complete_if0);
2044 Builder.CreateCall(TaskBeginFn, {Ident, ThreadID, TaskData});
2045 CallInst *CI = nullptr;
2046 if (HasShareds)
2047 CI = Builder.CreateCall(&OutlinedFn, {ThreadID, TaskData});
2048 else
2049 CI = Builder.CreateCall(&OutlinedFn, {ThreadID});
2050 CI->setDebugLoc(StaleCI->getDebugLoc());
2051 Builder.CreateCall(TaskCompleteFn, {Ident, ThreadID, TaskData});
2052 Builder.SetInsertPoint(ThenTI);
2053 }
2054
2055 if (Dependencies.size()) {
2056 Function *TaskFn =
2057 getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_omp_task_with_deps);
2059 TaskFn,
2060 {Ident, ThreadID, TaskData, Builder.getInt32(Dependencies.size()),
2061 DepArray, ConstantInt::get(Builder.getInt32Ty(), 0),
2063
2064 } else {
2065 // Emit the @__kmpc_omp_task runtime call to spawn the task
2066 Function *TaskFn = getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_omp_task);
2067 Builder.CreateCall(TaskFn, {Ident, ThreadID, TaskData});
2068 }
2069
2070 StaleCI->eraseFromParent();
2071
2072 Builder.SetInsertPoint(TaskAllocaBB, TaskAllocaBB->begin());
2073 if (HasShareds) {
2074 LoadInst *Shareds = Builder.CreateLoad(VoidPtr, OutlinedFn.getArg(1));
2075 OutlinedFn.getArg(1)->replaceUsesWithIf(
2076 Shareds, [Shareds](Use &U) { return U.getUser() != Shareds; });
2077 }
2078
2079 for (Instruction *I : llvm::reverse(ToBeDeleted))
2080 I->eraseFromParent();
2081 };
2082
2083 addOutlineInfo(std::move(OI));
2084 Builder.SetInsertPoint(TaskExitBB, TaskExitBB->begin());
2085
2086 return Builder.saveIP();
2087}
2088
2091 InsertPointTy AllocaIP,
2092 BodyGenCallbackTy BodyGenCB) {
2093 if (!updateToLocation(Loc))
2094 return InsertPointTy();
2095
2096 uint32_t SrcLocStrSize;
2097 Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
2098 Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
2099 Value *ThreadID = getOrCreateThreadID(Ident);
2100
2101 // Emit the @__kmpc_taskgroup runtime call to start the taskgroup
2102 Function *TaskgroupFn =
2103 getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_taskgroup);
2104 Builder.CreateCall(TaskgroupFn, {Ident, ThreadID});
2105
2106 BasicBlock *TaskgroupExitBB = splitBB(Builder, true, "taskgroup.exit");
2107 if (Error Err = BodyGenCB(AllocaIP, Builder.saveIP()))
2108 return Err;
2109
2110 Builder.SetInsertPoint(TaskgroupExitBB);
2111 // Emit the @__kmpc_end_taskgroup runtime call to end the taskgroup
2112 Function *EndTaskgroupFn =
2113 getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_end_taskgroup);
2114 Builder.CreateCall(EndTaskgroupFn, {Ident, ThreadID});
2115
2116 return Builder.saveIP();
2117}
2118
2120 const LocationDescription &Loc, InsertPointTy AllocaIP,
2122 FinalizeCallbackTy FiniCB, bool IsCancellable, bool IsNowait) {
2123 assert(!isConflictIP(AllocaIP, Loc.IP) && "Dedicated IP allocas required");
2124
2125 if (!updateToLocation(Loc))
2126 return Loc.IP;
2127
2128 auto FiniCBWrapper = [&](InsertPointTy IP) {
2129 if (IP.getBlock()->end() != IP.getPoint())
2130 return FiniCB(IP);
2131 // This must be done otherwise any nested constructs using FinalizeOMPRegion
2132 // will fail because that function requires the Finalization Basic Block to
2133 // have a terminator, which is already removed by EmitOMPRegionBody.
2134 // IP is currently at cancelation block.
2135 // We need to backtrack to the condition block to fetch
2136 // the exit block and create a branch from cancelation
2137 // to exit block.
2139 Builder.restoreIP(IP);
2140 auto *CaseBB = IP.getBlock()->getSinglePredecessor();
2141 auto *CondBB = CaseBB->getSinglePredecessor()->getSinglePredecessor();
2142 auto *ExitBB = CondBB->getTerminator()->getSuccessor(1);
2143 Instruction *I = Builder.CreateBr(ExitBB);
2144 IP = InsertPointTy(I->getParent(), I->getIterator());
2145 return FiniCB(IP);
2146 };
2147
2148 FinalizationStack.push_back({FiniCBWrapper, OMPD_sections, IsCancellable});
2149
2150 // Each section is emitted as a switch case
2151 // Each finalization callback is handled from clang.EmitOMPSectionDirective()
2152 // -> OMP.createSection() which generates the IR for each section
2153 // Iterate through all sections and emit a switch construct:
2154 // switch (IV) {
2155 // case 0:
2156 // <SectionStmt[0]>;
2157 // break;
2158 // ...
2159 // case <NumSection> - 1:
2160 // <SectionStmt[<NumSection> - 1]>;
2161 // break;
2162 // }
2163 // ...
2164 // section_loop.after:
2165 // <FiniCB>;
2166 auto LoopBodyGenCB = [&](InsertPointTy CodeGenIP, Value *IndVar) -> Error {
2167 Builder.restoreIP(CodeGenIP);
2169 splitBBWithSuffix(Builder, /*CreateBranch=*/false, ".sections.after");
2170 Function *CurFn = Continue->getParent();
2171 SwitchInst *SwitchStmt = Builder.CreateSwitch(IndVar, Continue);
2172
2173 unsigned CaseNumber = 0;
2174 for (auto SectionCB : SectionCBs) {
2176 M.getContext(), "omp_section_loop.body.case", CurFn, Continue);
2177 SwitchStmt->addCase(Builder.getInt32(CaseNumber), CaseBB);
2178 Builder.SetInsertPoint(CaseBB);
2179 BranchInst *CaseEndBr = Builder.CreateBr(Continue);
2180 if (Error Err = SectionCB(InsertPointTy(), {CaseEndBr->getParent(),
2181 CaseEndBr->getIterator()}))
2182 return Err;
2183 CaseNumber++;
2184 }
2185 // remove the existing terminator from body BB since there can be no
2186 // terminators after switch/case
2187 return Error::success();
2188 };
2189 // Loop body ends here
2190 // LowerBound, UpperBound, and STride for createCanonicalLoop
2191 Type *I32Ty = Type::getInt32Ty(M.getContext());
2192 Value *LB = ConstantInt::get(I32Ty, 0);
2193 Value *UB = ConstantInt::get(I32Ty, SectionCBs.size());
2194 Value *ST = ConstantInt::get(I32Ty, 1);
2196 Loc, LoopBodyGenCB, LB, UB, ST, true, false, AllocaIP, "section_loop");
2197 if (!LoopInfo)
2198 return LoopInfo.takeError();
2199
2200 InsertPointOrErrorTy WsloopIP =
2201 applyStaticWorkshareLoop(Loc.DL, *LoopInfo, AllocaIP, !IsNowait);
2202 if (!WsloopIP)
2203 return WsloopIP.takeError();
2204 InsertPointTy AfterIP = *WsloopIP;
2205
2206 // Apply the finalization callback in LoopAfterBB
2207 auto FiniInfo = FinalizationStack.pop_back_val();
2208 assert(FiniInfo.DK == OMPD_sections &&
2209 "Unexpected finalization stack state!");
2210 if (FinalizeCallbackTy &CB = FiniInfo.FiniCB) {
2211 Builder.restoreIP(AfterIP);
2212 BasicBlock *FiniBB =
2213 splitBBWithSuffix(Builder, /*CreateBranch=*/true, "sections.fini");
2214 if (Error Err = CB(Builder.saveIP()))
2215 return Err;
2216 AfterIP = {FiniBB, FiniBB->begin()};
2217 }
2218
2219 return AfterIP;
2220}
2221
2224 BodyGenCallbackTy BodyGenCB,
2225 FinalizeCallbackTy FiniCB) {
2226 if (!updateToLocation(Loc))
2227 return Loc.IP;
2228
2229 auto FiniCBWrapper = [&](InsertPointTy IP) {
2230 if (IP.getBlock()->end() != IP.getPoint())
2231 return FiniCB(IP);
2232 // This must be done otherwise any nested constructs using FinalizeOMPRegion
2233 // will fail because that function requires the Finalization Basic Block to
2234 // have a terminator, which is already removed by EmitOMPRegionBody.
2235 // IP is currently at cancelation block.
2236 // We need to backtrack to the condition block to fetch
2237 // the exit block and create a branch from cancelation
2238 // to exit block.
2240 Builder.restoreIP(IP);
2241 auto *CaseBB = Loc.IP.getBlock();
2242 auto *CondBB = CaseBB->getSinglePredecessor()->getSinglePredecessor();
2243 auto *ExitBB = CondBB->getTerminator()->getSuccessor(1);
2244 Instruction *I = Builder.CreateBr(ExitBB);
2245 IP = InsertPointTy(I->getParent(), I->getIterator());
2246 return FiniCB(IP);
2247 };
2248
2249 Directive OMPD = Directive::OMPD_sections;
2250 // Since we are using Finalization Callback here, HasFinalize
2251 // and IsCancellable have to be true
2252 return EmitOMPInlinedRegion(OMPD, nullptr, nullptr, BodyGenCB, FiniCBWrapper,
2253 /*Conditional*/ false, /*hasFinalize*/ true,
2254 /*IsCancellable*/ true);
2255}
2256
2259 IT++;
2260 return OpenMPIRBuilder::InsertPointTy(I->getParent(), IT);
2261}
2262
2263void OpenMPIRBuilder::emitUsed(StringRef Name,
2264 std::vector<WeakTrackingVH> &List) {
2265 if (List.empty())
2266 return;
2267
2268 // Convert List to what ConstantArray needs.
2270 UsedArray.resize(List.size());
2271 for (unsigned I = 0, E = List.size(); I != E; ++I)
2273 cast<Constant>(&*List[I]), Builder.getPtrTy());
2274
2275 if (UsedArray.empty())
2276 return;
2277 ArrayType *ATy = ArrayType::get(Builder.getPtrTy(), UsedArray.size());
2278
2279 auto *GV = new GlobalVariable(M, ATy, false, GlobalValue::AppendingLinkage,
2280 ConstantArray::get(ATy, UsedArray), Name);
2281
2282 GV->setSection("llvm.metadata");
2283}
2284
2285Value *OpenMPIRBuilder::getGPUThreadID() {
2286 return Builder.CreateCall(
2288 OMPRTL___kmpc_get_hardware_thread_id_in_block),
2289 {});
2290}
2291
2292Value *OpenMPIRBuilder::getGPUWarpSize() {
2293 return Builder.CreateCall(
2294 getOrCreateRuntimeFunction(M, OMPRTL___kmpc_get_warp_size), {});
2295}
2296
2297Value *OpenMPIRBuilder::getNVPTXWarpID() {
2298 unsigned LaneIDBits = Log2_32(Config.getGridValue().GV_Warp_Size);
2299 return Builder.CreateAShr(getGPUThreadID(), LaneIDBits, "nvptx_warp_id");
2300}
2301
2302Value *OpenMPIRBuilder::getNVPTXLaneID() {
2303 unsigned LaneIDBits = Log2_32(Config.getGridValue().GV_Warp_Size);
2304 assert(LaneIDBits < 32 && "Invalid LaneIDBits size in NVPTX device.");
2305 unsigned LaneIDMask = ~0u >> (32u - LaneIDBits);
2306 return Builder.CreateAnd(getGPUThreadID(), Builder.getInt32(LaneIDMask),
2307 "nvptx_lane_id");
2308}
2309
2310Value *OpenMPIRBuilder::castValueToType(InsertPointTy AllocaIP, Value *From,
2311 Type *ToType) {
2312 Type *FromType = From->getType();
2313 uint64_t FromSize = M.getDataLayout().getTypeStoreSize(FromType);
2314 uint64_t ToSize = M.getDataLayout().getTypeStoreSize(ToType);
2315 assert(FromSize > 0 && "From size must be greater than zero");
2316 assert(ToSize > 0 && "To size must be greater than zero");
2317 if (FromType == ToType)
2318 return From;
2319 if (FromSize == ToSize)
2320 return Builder.CreateBitCast(From, ToType);
2321 if (ToType->isIntegerTy() && FromType->isIntegerTy())
2322 return Builder.CreateIntCast(From, ToType, /*isSigned*/ true);
2323 InsertPointTy SaveIP = Builder.saveIP();
2324 Builder.restoreIP(AllocaIP);
2325 Value *CastItem = Builder.CreateAlloca(ToType);
2326 Builder.restoreIP(SaveIP);
2327
2329 CastItem, Builder.getPtrTy(0));
2330 Builder.CreateStore(From, ValCastItem);
2331 return Builder.CreateLoad(ToType, CastItem);
2332}
2333
2334Value *OpenMPIRBuilder::createRuntimeShuffleFunction(InsertPointTy AllocaIP,
2335 Value *Element,
2336 Type *ElementType,
2337 Value *Offset) {
2338 uint64_t Size = M.getDataLayout().getTypeStoreSize(ElementType);
2339 assert(Size <= 8 && "Unsupported bitwidth in shuffle instruction");
2340
2341 // Cast all types to 32- or 64-bit values before calling shuffle routines.
2342 Type *CastTy = Builder.getIntNTy(Size <= 4 ? 32 : 64);
2343 Value *ElemCast = castValueToType(AllocaIP, Element, CastTy);
2344 Value *WarpSize =
2345 Builder.CreateIntCast(getGPUWarpSize(), Builder.getInt16Ty(), true);
2347 Size <= 4 ? RuntimeFunction::OMPRTL___kmpc_shuffle_int32
2348 : RuntimeFunction::OMPRTL___kmpc_shuffle_int64);
2349 Value *WarpSizeCast =
2350 Builder.CreateIntCast(WarpSize, Builder.getInt16Ty(), /*isSigned=*/true);
2351 Value *ShuffleCall =
2352 Builder.CreateCall(ShuffleFunc, {ElemCast, Offset, WarpSizeCast});
2353 return castValueToType(AllocaIP, ShuffleCall, CastTy);
2354}
2355
2356void OpenMPIRBuilder::shuffleAndStore(InsertPointTy AllocaIP, Value *SrcAddr,
2357 Value *DstAddr, Type *ElemType,
2358 Value *Offset, Type *ReductionArrayTy) {
2360 // Create the loop over the big sized data.
2361 // ptr = (void*)Elem;
2362 // ptrEnd = (void*) Elem + 1;
2363 // Step = 8;
2364 // while (ptr + Step < ptrEnd)
2365 // shuffle((int64_t)*ptr);
2366 // Step = 4;
2367 // while (ptr + Step < ptrEnd)
2368 // shuffle((int32_t)*ptr);
2369 // ...
2370 Type *IndexTy = Builder.getIndexTy(
2372 Value *ElemPtr = DstAddr;
2373 Value *Ptr = SrcAddr;
2374 for (unsigned IntSize = 8; IntSize >= 1; IntSize /= 2) {
2375 if (Size < IntSize)
2376 continue;
2377 Type *IntType = Builder.getIntNTy(IntSize * 8);
2379 Ptr, Builder.getPtrTy(0), Ptr->getName() + ".ascast");
2380 Value *SrcAddrGEP =
2381 Builder.CreateGEP(ElemType, SrcAddr, {ConstantInt::get(IndexTy, 1)});
2383 ElemPtr, Builder.getPtrTy(0), ElemPtr->getName() + ".ascast");
2384
2385 Function *CurFunc = Builder.GetInsertBlock()->getParent();
2386 if ((Size / IntSize) > 1) {
2388 SrcAddrGEP, Builder.getPtrTy());
2389 BasicBlock *PreCondBB =
2390 BasicBlock::Create(M.getContext(), ".shuffle.pre_cond");
2391 BasicBlock *ThenBB = BasicBlock::Create(M.getContext(), ".shuffle.then");
2392 BasicBlock *ExitBB = BasicBlock::Create(M.getContext(), ".shuffle.exit");
2393 BasicBlock *CurrentBB = Builder.GetInsertBlock();
2394 emitBlock(PreCondBB, CurFunc);
2395 PHINode *PhiSrc =
2396 Builder.CreatePHI(Ptr->getType(), /*NumReservedValues=*/2);
2397 PhiSrc->addIncoming(Ptr, CurrentBB);
2398 PHINode *PhiDest =
2399 Builder.CreatePHI(ElemPtr->getType(), /*NumReservedValues=*/2);
2400 PhiDest->addIncoming(ElemPtr, CurrentBB);
2401 Ptr = PhiSrc;
2402 ElemPtr = PhiDest;
2403 Value *PtrDiff = Builder.CreatePtrDiff(
2404 Builder.getInt8Ty(), PtrEnd,
2407 Builder.CreateICmpSGT(PtrDiff, Builder.getInt64(IntSize - 1)), ThenBB,
2408 ExitBB);
2409 emitBlock(ThenBB, CurFunc);
2410 Value *Res = createRuntimeShuffleFunction(
2411 AllocaIP,
2413 IntType, Ptr, M.getDataLayout().getPrefTypeAlign(ElemType)),
2414 IntType, Offset);
2415 Builder.CreateAlignedStore(Res, ElemPtr,
2416 M.getDataLayout().getPrefTypeAlign(ElemType));
2417 Value *LocalPtr =
2418 Builder.CreateGEP(IntType, Ptr, {ConstantInt::get(IndexTy, 1)});
2419 Value *LocalElemPtr =
2420 Builder.CreateGEP(IntType, ElemPtr, {ConstantInt::get(IndexTy, 1)});
2421 PhiSrc->addIncoming(LocalPtr, ThenBB);
2422 PhiDest->addIncoming(LocalElemPtr, ThenBB);
2423 emitBranch(PreCondBB);
2424 emitBlock(ExitBB, CurFunc);
2425 } else {
2426 Value *Res = createRuntimeShuffleFunction(
2427 AllocaIP, Builder.CreateLoad(IntType, Ptr), IntType, Offset);
2428 if (ElemType->isIntegerTy() && ElemType->getScalarSizeInBits() <
2429 Res->getType()->getScalarSizeInBits())
2430 Res = Builder.CreateTrunc(Res, ElemType);
2431 Builder.CreateStore(Res, ElemPtr);
2432 Ptr = Builder.CreateGEP(IntType, Ptr, {ConstantInt::get(IndexTy, 1)});
2433 ElemPtr =
2434 Builder.CreateGEP(IntType, ElemPtr, {ConstantInt::get(IndexTy, 1)});
2435 }
2436 Size = Size % IntSize;
2437 }
2438}
2439
2440void OpenMPIRBuilder::emitReductionListCopy(
2441 InsertPointTy AllocaIP, CopyAction Action, Type *ReductionArrayTy,
2442 ArrayRef<ReductionInfo> ReductionInfos, Value *SrcBase, Value *DestBase,
2443 CopyOptionsTy CopyOptions) {
2444 Type *IndexTy = Builder.getIndexTy(
2446 Value *RemoteLaneOffset = CopyOptions.RemoteLaneOffset;
2447
2448 // Iterates, element-by-element, through the source Reduce list and
2449 // make a copy.
2450 for (auto En : enumerate(ReductionInfos)) {
2451 const ReductionInfo &RI = En.value();
2452 Value *SrcElementAddr = nullptr;
2453 Value *DestElementAddr = nullptr;
2454 Value *DestElementPtrAddr = nullptr;
2455 // Should we shuffle in an element from a remote lane?
2456 bool ShuffleInElement = false;
2457 // Set to true to update the pointer in the dest Reduce list to a
2458 // newly created element.
2459 bool UpdateDestListPtr = false;
2460
2461 // Step 1.1: Get the address for the src element in the Reduce list.
2462 Value *SrcElementPtrAddr = Builder.CreateInBoundsGEP(
2463 ReductionArrayTy, SrcBase,
2464 {ConstantInt::get(IndexTy, 0), ConstantInt::get(IndexTy, En.index())});
2465 SrcElementAddr = Builder.CreateLoad(Builder.getPtrTy(), SrcElementPtrAddr);
2466
2467 // Step 1.2: Create a temporary to store the element in the destination
2468 // Reduce list.
2469 DestElementPtrAddr = Builder.CreateInBoundsGEP(
2470 ReductionArrayTy, DestBase,
2471 {ConstantInt::get(IndexTy, 0), ConstantInt::get(IndexTy, En.index())});
2472 switch (Action) {
2474 InsertPointTy CurIP = Builder.saveIP();
2475 Builder.restoreIP(AllocaIP);
2476 AllocaInst *DestAlloca = Builder.CreateAlloca(RI.ElementType, nullptr,
2477 ".omp.reduction.element");
2478 DestAlloca->setAlignment(
2479 M.getDataLayout().getPrefTypeAlign(RI.ElementType));
2480 DestElementAddr = DestAlloca;
2481 DestElementAddr =
2482 Builder.CreateAddrSpaceCast(DestElementAddr, Builder.getPtrTy(),
2483 DestElementAddr->getName() + ".ascast");
2484 Builder.restoreIP(CurIP);
2485 ShuffleInElement = true;
2486 UpdateDestListPtr = true;
2487 break;
2488 }
2490 DestElementAddr =
2491 Builder.CreateLoad(Builder.getPtrTy(), DestElementPtrAddr);
2492 break;
2493 }
2494 }
2495
2496 // Now that all active lanes have read the element in the
2497 // Reduce list, shuffle over the value from the remote lane.
2498 if (ShuffleInElement) {
2499 shuffleAndStore(AllocaIP, SrcElementAddr, DestElementAddr, RI.ElementType,
2500 RemoteLaneOffset, ReductionArrayTy);
2501 } else {
2502 switch (RI.EvaluationKind) {
2503 case EvalKind::Scalar: {
2504 Value *Elem = Builder.CreateLoad(RI.ElementType, SrcElementAddr);
2505 // Store the source element value to the dest element address.
2506 Builder.CreateStore(Elem, DestElementAddr);
2507 break;
2508 }
2509 case EvalKind::Complex: {
2511 RI.ElementType, SrcElementAddr, 0, 0, ".realp");
2512 Value *SrcReal = Builder.CreateLoad(
2513 RI.ElementType->getStructElementType(0), SrcRealPtr, ".real");
2515 RI.ElementType, SrcElementAddr, 0, 1, ".imagp");
2516 Value *SrcImg = Builder.CreateLoad(
2517 RI.ElementType->getStructElementType(1), SrcImgPtr, ".imag");
2518
2520 RI.ElementType, DestElementAddr, 0, 0, ".realp");
2522 RI.ElementType, DestElementAddr, 0, 1, ".imagp");
2523 Builder.CreateStore(SrcReal, DestRealPtr);
2524 Builder.CreateStore(SrcImg, DestImgPtr);
2525 break;
2526 }
2527 case EvalKind::Aggregate: {
2528 Value *SizeVal = Builder.getInt64(
2529 M.getDataLayout().getTypeStoreSize(RI.ElementType));
2531 DestElementAddr, M.getDataLayout().getPrefTypeAlign(RI.ElementType),
2532 SrcElementAddr, M.getDataLayout().getPrefTypeAlign(RI.ElementType),
2533 SizeVal, false);
2534 break;
2535 }
2536 };
2537 }
2538
2539 // Step 3.1: Modify reference in dest Reduce list as needed.
2540 // Modifying the reference in Reduce list to point to the newly
2541 // created element. The element is live in the current function
2542 // scope and that of functions it invokes (i.e., reduce_function).
2543 // RemoteReduceData[i] = (void*)&RemoteElem
2544 if (UpdateDestListPtr) {
2546 DestElementAddr, Builder.getPtrTy(),
2547 DestElementAddr->getName() + ".ascast");
2548 Builder.CreateStore(CastDestAddr, DestElementPtrAddr);
2549 }
2550 }
2551}
2552
2553Expected<Function *> OpenMPIRBuilder::emitInterWarpCopyFunction(
2554 const LocationDescription &Loc, ArrayRef<ReductionInfo> ReductionInfos,
2555 AttributeList FuncAttrs) {
2556 InsertPointTy SavedIP = Builder.saveIP();
2557 LLVMContext &Ctx = M.getContext();
2559 Builder.getVoidTy(), {Builder.getPtrTy(), Builder.getInt32Ty()},
2560 /* IsVarArg */ false);
2561 Function *WcFunc =
2563 "_omp_reduction_inter_warp_copy_func", &M);
2564 WcFunc->setAttributes(FuncAttrs);
2565 WcFunc->addParamAttr(0, Attribute::NoUndef);
2566 WcFunc->addParamAttr(1, Attribute::NoUndef);
2567 BasicBlock *EntryBB = BasicBlock::Create(M.getContext(), "entry", WcFunc);
2568 Builder.SetInsertPoint(EntryBB);
2569
2570 // ReduceList: thread local Reduce list.
2571 // At the stage of the computation when this function is called, partially
2572 // aggregated values reside in the first lane of every active warp.
2573 Argument *ReduceListArg = WcFunc->getArg(0);
2574 // NumWarps: number of warps active in the parallel region. This could
2575 // be smaller than 32 (max warps in a CTA) for partial block reduction.
2576 Argument *NumWarpsArg = WcFunc->getArg(1);
2577
2578 // This array is used as a medium to transfer, one reduce element at a time,
2579 // the data from the first lane of every warp to lanes in the first warp
2580 // in order to perform the final step of a reduction in a parallel region
2581 // (reduction across warps). The array is placed in NVPTX __shared__ memory
2582 // for reduced latency, as well as to have a distinct copy for concurrently
2583 // executing target regions. The array is declared with common linkage so
2584 // as to be shared across compilation units.
2585 StringRef TransferMediumName =
2586 "__openmp_nvptx_data_transfer_temporary_storage";
2587 GlobalVariable *TransferMedium = M.getGlobalVariable(TransferMediumName);
2588 unsigned WarpSize = Config.getGridValue().GV_Warp_Size;
2589 ArrayType *ArrayTy = ArrayType::get(Builder.getInt32Ty(), WarpSize);
2590 if (!TransferMedium) {
2591 TransferMedium = new GlobalVariable(
2592 M, ArrayTy, /*isConstant=*/false, GlobalVariable::WeakAnyLinkage,
2593 UndefValue::get(ArrayTy), TransferMediumName,
2594 /*InsertBefore=*/nullptr, GlobalVariable::NotThreadLocal,
2595 /*AddressSpace=*/3);
2596 }
2597
2598 // Get the CUDA thread id of the current OpenMP thread on the GPU.
2599 Value *GPUThreadID = getGPUThreadID();
2600 // nvptx_lane_id = nvptx_id % warpsize
2601 Value *LaneID = getNVPTXLaneID();
2602 // nvptx_warp_id = nvptx_id / warpsize
2603 Value *WarpID = getNVPTXWarpID();
2604
2605 InsertPointTy AllocaIP =
2608 Type *Arg0Type = ReduceListArg->getType();
2609 Type *Arg1Type = NumWarpsArg->getType();
2610 Builder.restoreIP(AllocaIP);
2611 AllocaInst *ReduceListAlloca = Builder.CreateAlloca(
2612 Arg0Type, nullptr, ReduceListArg->getName() + ".addr");
2613 AllocaInst *NumWarpsAlloca =
2614 Builder.CreateAlloca(Arg1Type, nullptr, NumWarpsArg->getName() + ".addr");
2616 ReduceListAlloca, Arg0Type, ReduceListAlloca->getName() + ".ascast");
2618 NumWarpsAlloca, Builder.getPtrTy(0),
2619 NumWarpsAlloca->getName() + ".ascast");
2620 Builder.CreateStore(ReduceListArg, ReduceListAddrCast);
2621 Builder.CreateStore(NumWarpsArg, NumWarpsAddrCast);
2622 AllocaIP = getInsertPointAfterInstr(NumWarpsAlloca);
2623 InsertPointTy CodeGenIP =
2625 Builder.restoreIP(CodeGenIP);
2626
2627 Value *ReduceList =
2628 Builder.CreateLoad(Builder.getPtrTy(), ReduceListAddrCast);
2629
2630 for (auto En : enumerate(ReductionInfos)) {
2631 //
2632 // Warp master copies reduce element to transfer medium in __shared__
2633 // memory.
2634 //
2635 const ReductionInfo &RI = En.value();
2636 unsigned RealTySize = M.getDataLayout().getTypeAllocSize(RI.ElementType);
2637 for (unsigned TySize = 4; TySize > 0 && RealTySize > 0; TySize /= 2) {
2638 Type *CType = Builder.getIntNTy(TySize * 8);
2639
2640 unsigned NumIters = RealTySize / TySize;
2641 if (NumIters == 0)
2642 continue;
2643 Value *Cnt = nullptr;
2644 Value *CntAddr = nullptr;
2645 BasicBlock *PrecondBB = nullptr;
2646 BasicBlock *ExitBB = nullptr;
2647 if (NumIters > 1) {
2648 CodeGenIP = Builder.saveIP();
2649 Builder.restoreIP(AllocaIP);
2650 CntAddr =
2651 Builder.CreateAlloca(Builder.getInt32Ty(), nullptr, ".cnt.addr");
2652
2653 CntAddr = Builder.CreateAddrSpaceCast(CntAddr, Builder.getPtrTy(),
2654 CntAddr->getName() + ".ascast");
2655 Builder.restoreIP(CodeGenIP);
2657 CntAddr,
2658 /*Volatile=*/false);
2659 PrecondBB = BasicBlock::Create(Ctx, "precond");
2660 ExitBB = BasicBlock::Create(Ctx, "exit");
2661 BasicBlock *BodyBB = BasicBlock::Create(Ctx, "body");
2662 emitBlock(PrecondBB, Builder.GetInsertBlock()->getParent());
2663 Cnt = Builder.CreateLoad(Builder.getInt32Ty(), CntAddr,
2664 /*Volatile=*/false);
2666 Cnt, ConstantInt::get(Builder.getInt32Ty(), NumIters));
2667 Builder.CreateCondBr(Cmp, BodyBB, ExitBB);
2669 }
2670
2671 // kmpc_barrier.
2672 InsertPointOrErrorTy BarrierIP1 =
2673 createBarrier(LocationDescription(Builder.saveIP(), Loc.DL),
2674 omp::Directive::OMPD_unknown,
2675 /* ForceSimpleCall */ false,
2676 /* CheckCancelFlag */ true);
2677 if (!BarrierIP1)
2678 return BarrierIP1.takeError();
2679 BasicBlock *ThenBB = BasicBlock::Create(Ctx, "then");
2680 BasicBlock *ElseBB = BasicBlock::Create(Ctx, "else");
2681 BasicBlock *MergeBB = BasicBlock::Create(Ctx, "ifcont");
2682
2683 // if (lane_id == 0)
2684 Value *IsWarpMaster = Builder.CreateIsNull(LaneID, "warp_master");
2685 Builder.CreateCondBr(IsWarpMaster, ThenBB, ElseBB);
2687
2688 // Reduce element = LocalReduceList[i]
2689 auto *RedListArrayTy =
2690 ArrayType::get(Builder.getPtrTy(), ReductionInfos.size());
2691 Type *IndexTy = Builder.getIndexTy(
2693 Value *ElemPtrPtr =
2694 Builder.CreateInBoundsGEP(RedListArrayTy, ReduceList,
2695 {ConstantInt::get(IndexTy, 0),
2696 ConstantInt::get(IndexTy, En.index())});
2697 // elemptr = ((CopyType*)(elemptrptr)) + I
2698 Value *ElemPtr = Builder.CreateLoad(Builder.getPtrTy(), ElemPtrPtr);
2699 if (NumIters > 1)
2700 ElemPtr = Builder.CreateGEP(Builder.getInt32Ty(), ElemPtr, Cnt);
2701
2702 // Get pointer to location in transfer medium.
2703 // MediumPtr = &medium[warp_id]
2704 Value *MediumPtr = Builder.CreateInBoundsGEP(
2705 ArrayTy, TransferMedium, {Builder.getInt64(0), WarpID});
2706 // elem = *elemptr
2707 //*MediumPtr = elem
2708 Value *Elem = Builder.CreateLoad(CType, ElemPtr);
2709 // Store the source element value to the dest element address.
2710 Builder.CreateStore(Elem, MediumPtr,
2711 /*IsVolatile*/ true);
2712 Builder.CreateBr(MergeBB);
2713
2714 // else
2716 Builder.CreateBr(MergeBB);
2717
2718 // endif
2720 InsertPointOrErrorTy BarrierIP2 =
2721 createBarrier(LocationDescription(Builder.saveIP(), Loc.DL),
2722 omp::Directive::OMPD_unknown,
2723 /* ForceSimpleCall */ false,
2724 /* CheckCancelFlag */ true);
2725 if (!BarrierIP2)
2726 return BarrierIP2.takeError();
2727
2728 // Warp 0 copies reduce element from transfer medium
2729 BasicBlock *W0ThenBB = BasicBlock::Create(Ctx, "then");
2730 BasicBlock *W0ElseBB = BasicBlock::Create(Ctx, "else");
2731 BasicBlock *W0MergeBB = BasicBlock::Create(Ctx, "ifcont");
2732
2733 Value *NumWarpsVal =
2734 Builder.CreateLoad(Builder.getInt32Ty(), NumWarpsAddrCast);
2735 // Up to 32 threads in warp 0 are active.
2736 Value *IsActiveThread =
2737 Builder.CreateICmpULT(GPUThreadID, NumWarpsVal, "is_active_thread");
2738 Builder.CreateCondBr(IsActiveThread, W0ThenBB, W0ElseBB);
2739
2740 emitBlock(W0ThenBB, Builder.GetInsertBlock()->getParent());
2741
2742 // SecMediumPtr = &medium[tid]
2743 // SrcMediumVal = *SrcMediumPtr
2744 Value *SrcMediumPtrVal = Builder.CreateInBoundsGEP(
2745 ArrayTy, TransferMedium, {Builder.getInt64(0), GPUThreadID});
2746 // TargetElemPtr = (CopyType*)(SrcDataAddr[i]) + I
2747 Value *TargetElemPtrPtr =
2748 Builder.CreateInBoundsGEP(RedListArrayTy, ReduceList,
2749 {ConstantInt::get(IndexTy, 0),
2750 ConstantInt::get(IndexTy, En.index())});
2751 Value *TargetElemPtrVal =
2752 Builder.CreateLoad(Builder.getPtrTy(), TargetElemPtrPtr);
2753 Value *TargetElemPtr = TargetElemPtrVal;
2754 if (NumIters > 1)
2755 TargetElemPtr =
2756 Builder.CreateGEP(Builder.getInt32Ty(), TargetElemPtr, Cnt);
2757
2758 // *TargetElemPtr = SrcMediumVal;
2759 Value *SrcMediumValue =
2760 Builder.CreateLoad(CType, SrcMediumPtrVal, /*IsVolatile*/ true);
2761 Builder.CreateStore(SrcMediumValue, TargetElemPtr);
2762 Builder.CreateBr(W0MergeBB);
2763
2764 emitBlock(W0ElseBB, Builder.GetInsertBlock()->getParent());
2765 Builder.CreateBr(W0MergeBB);
2766
2767 emitBlock(W0MergeBB, Builder.GetInsertBlock()->getParent());
2768
2769 if (NumIters > 1) {
2770 Cnt = Builder.CreateNSWAdd(
2771 Cnt, ConstantInt::get(Builder.getInt32Ty(), /*V=*/1));
2772 Builder.CreateStore(Cnt, CntAddr, /*Volatile=*/false);
2773
2774 auto *CurFn = Builder.GetInsertBlock()->getParent();
2775 emitBranch(PrecondBB);
2776 emitBlock(ExitBB, CurFn);
2777 }
2778 RealTySize %= TySize;
2779 }
2780 }
2781
2783 Builder.restoreIP(SavedIP);
2784
2785 return WcFunc;
2786}
2787
2788Function *OpenMPIRBuilder::emitShuffleAndReduceFunction(
2789 ArrayRef<ReductionInfo> ReductionInfos, Function *ReduceFn,
2790 AttributeList FuncAttrs) {
2791 LLVMContext &Ctx = M.getContext();
2792 FunctionType *FuncTy =
2794 {Builder.getPtrTy(), Builder.getInt16Ty(),
2795 Builder.getInt16Ty(), Builder.getInt16Ty()},
2796 /* IsVarArg */ false);
2797 Function *SarFunc =
2799 "_omp_reduction_shuffle_and_reduce_func", &M);
2800 SarFunc->setAttributes(FuncAttrs);
2801 SarFunc->addParamAttr(0, Attribute::NoUndef);
2802 SarFunc->addParamAttr(1, Attribute::NoUndef);
2803 SarFunc->addParamAttr(2, Attribute::NoUndef);
2804 SarFunc->addParamAttr(3, Attribute::NoUndef);
2805 SarFunc->addParamAttr(1, Attribute::SExt);
2806 SarFunc->addParamAttr(2, Attribute::SExt);
2807 SarFunc->addParamAttr(3, Attribute::SExt);
2808 BasicBlock *EntryBB = BasicBlock::Create(M.getContext(), "entry", SarFunc);
2809 Builder.SetInsertPoint(EntryBB);
2810
2811 // Thread local Reduce list used to host the values of data to be reduced.
2812 Argument *ReduceListArg = SarFunc->getArg(0);
2813 // Current lane id; could be logical.
2814 Argument *LaneIDArg = SarFunc->getArg(1);
2815 // Offset of the remote source lane relative to the current lane.
2816 Argument *RemoteLaneOffsetArg = SarFunc->getArg(2);
2817 // Algorithm version. This is expected to be known at compile time.
2818 Argument *AlgoVerArg = SarFunc->getArg(3);
2819
2820 Type *ReduceListArgType = ReduceListArg->getType();
2821 Type *LaneIDArgType = LaneIDArg->getType();
2822 Type *LaneIDArgPtrType = Builder.getPtrTy(0);
2823 Value *ReduceListAlloca = Builder.CreateAlloca(
2824 ReduceListArgType, nullptr, ReduceListArg->getName() + ".addr");
2825 Value *LaneIdAlloca = Builder.CreateAlloca(LaneIDArgType, nullptr,
2826 LaneIDArg->getName() + ".addr");
2827 Value *RemoteLaneOffsetAlloca = Builder.CreateAlloca(
2828 LaneIDArgType, nullptr, RemoteLaneOffsetArg->getName() + ".addr");
2829 Value *AlgoVerAlloca = Builder.CreateAlloca(LaneIDArgType, nullptr,
2830 AlgoVerArg->getName() + ".addr");
2831 ArrayType *RedListArrayTy =
2832 ArrayType::get(Builder.getPtrTy(), ReductionInfos.size());
2833
2834 // Create a local thread-private variable to host the Reduce list
2835 // from a remote lane.
2836 Instruction *RemoteReductionListAlloca = Builder.CreateAlloca(
2837 RedListArrayTy, nullptr, ".omp.reduction.remote_reduce_list");
2838
2840 ReduceListAlloca, ReduceListArgType,
2841 ReduceListAlloca->getName() + ".ascast");
2843 LaneIdAlloca, LaneIDArgPtrType, LaneIdAlloca->getName() + ".ascast");
2844 Value *RemoteLaneOffsetAddrCast = Builder.CreatePointerBitCastOrAddrSpaceCast(
2845 RemoteLaneOffsetAlloca, LaneIDArgPtrType,
2846 RemoteLaneOffsetAlloca->getName() + ".ascast");
2848 AlgoVerAlloca, LaneIDArgPtrType, AlgoVerAlloca->getName() + ".ascast");
2850 RemoteReductionListAlloca, Builder.getPtrTy(),
2851 RemoteReductionListAlloca->getName() + ".ascast");
2852
2853 Builder.CreateStore(ReduceListArg, ReduceListAddrCast);
2854 Builder.CreateStore(LaneIDArg, LaneIdAddrCast);
2855 Builder.CreateStore(RemoteLaneOffsetArg, RemoteLaneOffsetAddrCast);
2856 Builder.CreateStore(AlgoVerArg, AlgoVerAddrCast);
2857
2858 Value *ReduceList = Builder.CreateLoad(ReduceListArgType, ReduceListAddrCast);
2859 Value *LaneId = Builder.CreateLoad(LaneIDArgType, LaneIdAddrCast);
2860 Value *RemoteLaneOffset =
2861 Builder.CreateLoad(LaneIDArgType, RemoteLaneOffsetAddrCast);
2862 Value *AlgoVer = Builder.CreateLoad(LaneIDArgType, AlgoVerAddrCast);
2863
2864 InsertPointTy AllocaIP = getInsertPointAfterInstr(RemoteReductionListAlloca);
2865
2866 // This loop iterates through the list of reduce elements and copies,
2867 // element by element, from a remote lane in the warp to RemoteReduceList,
2868 // hosted on the thread's stack.
2869 emitReductionListCopy(
2870 AllocaIP, CopyAction::RemoteLaneToThread, RedListArrayTy, ReductionInfos,
2871 ReduceList, RemoteListAddrCast, {RemoteLaneOffset, nullptr, nullptr});
2872
2873 // The actions to be performed on the Remote Reduce list is dependent
2874 // on the algorithm version.
2875 //
2876 // if (AlgoVer==0) || (AlgoVer==1 && (LaneId < Offset)) || (AlgoVer==2 &&
2877 // LaneId % 2 == 0 && Offset > 0):
2878 // do the reduction value aggregation
2879 //
2880 // The thread local variable Reduce list is mutated in place to host the
2881 // reduced data, which is the aggregated value produced from local and
2882 // remote lanes.
2883 //
2884 // Note that AlgoVer is expected to be a constant integer known at compile
2885 // time.
2886 // When AlgoVer==0, the first conjunction evaluates to true, making
2887 // the entire predicate true during compile time.
2888 // When AlgoVer==1, the second conjunction has only the second part to be
2889 // evaluated during runtime. Other conjunctions evaluates to false
2890 // during compile time.
2891 // When AlgoVer==2, the third conjunction has only the second part to be
2892 // evaluated during runtime. Other conjunctions evaluates to false
2893 // during compile time.
2894 Value *CondAlgo0 = Builder.CreateIsNull(AlgoVer);
2895 Value *Algo1 = Builder.CreateICmpEQ(AlgoVer, Builder.getInt16(1));
2896 Value *LaneComp = Builder.CreateICmpULT(LaneId, RemoteLaneOffset);
2897 Value *CondAlgo1 = Builder.CreateAnd(Algo1, LaneComp);
2898 Value *Algo2 = Builder.CreateICmpEQ(AlgoVer, Builder.getInt16(2));
2899 Value *LaneIdAnd1 = Builder.CreateAnd(LaneId, Builder.getInt16(1));
2900 Value *LaneIdComp = Builder.CreateIsNull(LaneIdAnd1);
2901 Value *Algo2AndLaneIdComp = Builder.CreateAnd(Algo2, LaneIdComp);
2902 Value *RemoteOffsetComp =
2903 Builder.CreateICmpSGT(RemoteLaneOffset, Builder.getInt16(0));
2904 Value *CondAlgo2 = Builder.CreateAnd(Algo2AndLaneIdComp, RemoteOffsetComp);
2905 Value *CA0OrCA1 = Builder.CreateOr(CondAlgo0, CondAlgo1);
2906 Value *CondReduce = Builder.CreateOr(CA0OrCA1, CondAlgo2);
2907
2908 BasicBlock *ThenBB = BasicBlock::Create(Ctx, "then");
2909 BasicBlock *ElseBB = BasicBlock::Create(Ctx, "else");
2910 BasicBlock *MergeBB = BasicBlock::Create(Ctx, "ifcont");
2911
2912 Builder.CreateCondBr(CondReduce, ThenBB, ElseBB);
2915 ReduceList, Builder.getPtrTy());
2916 Value *RemoteReduceListPtr = Builder.CreatePointerBitCastOrAddrSpaceCast(
2917 RemoteListAddrCast, Builder.getPtrTy());
2918 Builder.CreateCall(ReduceFn, {LocalReduceListPtr, RemoteReduceListPtr})
2919 ->addFnAttr(Attribute::NoUnwind);
2920 Builder.CreateBr(MergeBB);
2921
2923 Builder.CreateBr(MergeBB);
2924
2926
2927 // if (AlgoVer==1 && (LaneId >= Offset)) copy Remote Reduce list to local
2928 // Reduce list.
2929 Algo1 = Builder.CreateICmpEQ(AlgoVer, Builder.getInt16(1));
2930 Value *LaneIdGtOffset = Builder.CreateICmpUGE(LaneId, RemoteLaneOffset);
2931 Value *CondCopy = Builder.CreateAnd(Algo1, LaneIdGtOffset);
2932
2933 BasicBlock *CpyThenBB = BasicBlock::Create(Ctx, "then");
2934 BasicBlock *CpyElseBB = BasicBlock::Create(Ctx, "else");
2935 BasicBlock *CpyMergeBB = BasicBlock::Create(Ctx, "ifcont");
2936 Builder.CreateCondBr(CondCopy, CpyThenBB, CpyElseBB);
2937
2938 emitBlock(CpyThenBB, Builder.GetInsertBlock()->getParent());
2939 emitReductionListCopy(AllocaIP, CopyAction::ThreadCopy, RedListArrayTy,
2940 ReductionInfos, RemoteListAddrCast, ReduceList);
2941 Builder.CreateBr(CpyMergeBB);
2942
2943 emitBlock(CpyElseBB, Builder.GetInsertBlock()->getParent());
2944 Builder.CreateBr(CpyMergeBB);
2945
2946 emitBlock(CpyMergeBB, Builder.GetInsertBlock()->getParent());
2947
2949
2950 return SarFunc;
2951}
2952
2953Function *OpenMPIRBuilder::emitListToGlobalCopyFunction(
2954 ArrayRef<ReductionInfo> ReductionInfos, Type *ReductionsBufferTy,
2955 AttributeList FuncAttrs) {
2957 LLVMContext &Ctx = M.getContext();
2960 {Builder.getPtrTy(), Builder.getInt32Ty(), Builder.getPtrTy()},
2961 /* IsVarArg */ false);
2962 Function *LtGCFunc =
2964 "_omp_reduction_list_to_global_copy_func", &M);
2965 LtGCFunc->setAttributes(FuncAttrs);
2966 LtGCFunc->addParamAttr(0, Attribute::NoUndef);
2967 LtGCFunc->addParamAttr(1, Attribute::NoUndef);
2968 LtGCFunc->addParamAttr(2, Attribute::NoUndef);
2969
2970 BasicBlock *EntryBlock = BasicBlock::Create(Ctx, "entry", LtGCFunc);
2971 Builder.SetInsertPoint(EntryBlock);
2972
2973 // Buffer: global reduction buffer.
2974 Argument *BufferArg = LtGCFunc->getArg(0);
2975 // Idx: index of the buffer.
2976 Argument *IdxArg = LtGCFunc->getArg(1);
2977 // ReduceList: thread local Reduce list.
2978 Argument *ReduceListArg = LtGCFunc->getArg(2);
2979
2980 Value *BufferArgAlloca = Builder.CreateAlloca(Builder.getPtrTy(), nullptr,
2981 BufferArg->getName() + ".addr");
2982 Value *IdxArgAlloca = Builder.CreateAlloca(Builder.getInt32Ty(), nullptr,
2983 IdxArg->getName() + ".addr");
2984 Value *ReduceListArgAlloca = Builder.CreateAlloca(
2985 Builder.getPtrTy(), nullptr, ReduceListArg->getName() + ".addr");
2987 BufferArgAlloca, Builder.getPtrTy(),
2988 BufferArgAlloca->getName() + ".ascast");
2990 IdxArgAlloca, Builder.getPtrTy(), IdxArgAlloca->getName() + ".ascast");
2991 Value *ReduceListArgAddrCast = Builder.CreatePointerBitCastOrAddrSpaceCast(
2992 ReduceListArgAlloca, Builder.getPtrTy(),
2993 ReduceListArgAlloca->getName() + ".ascast");
2994
2995 Builder.CreateStore(BufferArg, BufferArgAddrCast);
2996 Builder.CreateStore(IdxArg, IdxArgAddrCast);
2997 Builder.CreateStore(ReduceListArg, ReduceListArgAddrCast);
2998
2999 Value *LocalReduceList =
3000 Builder.CreateLoad(Builder.getPtrTy(), ReduceListArgAddrCast);
3001 Value *BufferArgVal =
3002 Builder.CreateLoad(Builder.getPtrTy(), BufferArgAddrCast);
3003 Value *Idxs[] = {Builder.CreateLoad(Builder.getInt32Ty(), IdxArgAddrCast)};
3004 Type *IndexTy = Builder.getIndexTy(
3006 for (auto En : enumerate(ReductionInfos)) {
3007 const ReductionInfo &RI = En.value();
3008 auto *RedListArrayTy =
3009 ArrayType::get(Builder.getPtrTy(), ReductionInfos.size());
3010 // Reduce element = LocalReduceList[i]
3011 Value *ElemPtrPtr = Builder.CreateInBoundsGEP(
3012 RedListArrayTy, LocalReduceList,
3013 {ConstantInt::get(IndexTy, 0), ConstantInt::get(IndexTy, En.index())});
3014 // elemptr = ((CopyType*)(elemptrptr)) + I
3015 Value *ElemPtr = Builder.CreateLoad(Builder.getPtrTy(), ElemPtrPtr);
3016
3017 // Global = Buffer.VD[Idx];
3018 Value *BufferVD =
3019 Builder.CreateInBoundsGEP(ReductionsBufferTy, BufferArgVal, Idxs);
3021 ReductionsBufferTy, BufferVD, 0, En.index());
3022
3023 switch (RI.EvaluationKind) {
3024 case EvalKind::Scalar: {
3025 Value *TargetElement = Builder.CreateLoad(RI.ElementType, ElemPtr);
3026 Builder.CreateStore(TargetElement, GlobVal);
3027 break;
3028 }
3029 case EvalKind::Complex: {
3031 RI.ElementType, ElemPtr, 0, 0, ".realp");
3032 Value *SrcReal = Builder.CreateLoad(
3033 RI.ElementType->getStructElementType(0), SrcRealPtr, ".real");
3035 RI.ElementType, ElemPtr, 0, 1, ".imagp");
3036 Value *SrcImg = Builder.CreateLoad(
3037 RI.ElementType->getStructElementType(1), SrcImgPtr, ".imag");
3038
3040 RI.ElementType, GlobVal, 0, 0, ".realp");
3042 RI.ElementType, GlobVal, 0, 1, ".imagp");
3043 Builder.CreateStore(SrcReal, DestRealPtr);
3044 Builder.CreateStore(SrcImg, DestImgPtr);
3045 break;
3046 }
3047 case EvalKind::Aggregate: {
3048 Value *SizeVal =
3049 Builder.getInt64(M.getDataLayout().getTypeStoreSize(RI.ElementType));
3051 GlobVal, M.getDataLayout().getPrefTypeAlign(RI.ElementType), ElemPtr,
3052 M.getDataLayout().getPrefTypeAlign(RI.ElementType), SizeVal, false);
3053 break;
3054 }
3055 }
3056 }
3057
3059 Builder.restoreIP(OldIP);
3060 return LtGCFunc;
3061}
3062
3063Function *OpenMPIRBuilder::emitListToGlobalReduceFunction(
3064 ArrayRef<ReductionInfo> ReductionInfos, Function *ReduceFn,
3065 Type *ReductionsBufferTy, AttributeList FuncAttrs) {
3067 LLVMContext &Ctx = M.getContext();
3070 {Builder.getPtrTy(), Builder.getInt32Ty(), Builder.getPtrTy()},
3071 /* IsVarArg */ false);
3072 Function *LtGRFunc =
3074 "_omp_reduction_list_to_global_reduce_func", &M);
3075 LtGRFunc->setAttributes(FuncAttrs);
3076 LtGRFunc->addParamAttr(0, Attribute::NoUndef);
3077 LtGRFunc->addParamAttr(1, Attribute::NoUndef);
3078 LtGRFunc->addParamAttr(2, Attribute::NoUndef);
3079
3080 BasicBlock *EntryBlock = BasicBlock::Create(Ctx, "entry", LtGRFunc);
3081 Builder.SetInsertPoint(EntryBlock);
3082
3083 // Buffer: global reduction buffer.
3084 Argument *BufferArg = LtGRFunc->getArg(0);
3085 // Idx: index of the buffer.
3086 Argument *IdxArg = LtGRFunc->getArg(1);
3087 // ReduceList: thread local Reduce list.
3088 Argument *ReduceListArg = LtGRFunc->getArg(2);
3089
3090 Value *BufferArgAlloca = Builder.CreateAlloca(Builder.getPtrTy(), nullptr,
3091 BufferArg->getName() + ".addr");
3092 Value *IdxArgAlloca = Builder.CreateAlloca(Builder.getInt32Ty(), nullptr,
3093 IdxArg->getName() + ".addr");
3094 Value *ReduceListArgAlloca = Builder.CreateAlloca(
3095 Builder.getPtrTy(), nullptr, ReduceListArg->getName() + ".addr");
3096 auto *RedListArrayTy =
3097 ArrayType::get(Builder.getPtrTy(), ReductionInfos.size());
3098
3099 // 1. Build a list of reduction variables.
3100 // void *RedList[<n>] = {<ReductionVars>[0], ..., <ReductionVars>[<n>-1]};
3101 Value *LocalReduceList =
3102 Builder.CreateAlloca(RedListArrayTy, nullptr, ".omp.reduction.red_list");
3103
3105 BufferArgAlloca, Builder.getPtrTy(),
3106 BufferArgAlloca->getName() + ".ascast");
3108 IdxArgAlloca, Builder.getPtrTy(), IdxArgAlloca->getName() + ".ascast");
3109 Value *ReduceListArgAddrCast = Builder.CreatePointerBitCastOrAddrSpaceCast(
3110 ReduceListArgAlloca, Builder.getPtrTy(),
3111 ReduceListArgAlloca->getName() + ".ascast");
3112 Value *LocalReduceListAddrCast = Builder.CreatePointerBitCastOrAddrSpaceCast(
3113 LocalReduceList, Builder.getPtrTy(),
3114 LocalReduceList->getName() + ".ascast");
3115
3116 Builder.CreateStore(BufferArg, BufferArgAddrCast);
3117 Builder.CreateStore(IdxArg, IdxArgAddrCast);
3118 Builder.CreateStore(ReduceListArg, ReduceListArgAddrCast);
3119
3120 Value *BufferVal = Builder.CreateLoad(Builder.getPtrTy(), BufferArgAddrCast);
3121 Value *Idxs[] = {Builder.CreateLoad(Builder.getInt32Ty(), IdxArgAddrCast)};
3122 Type *IndexTy = Builder.getIndexTy(
3124 for (auto En : enumerate(ReductionInfos)) {
3125 Value *TargetElementPtrPtr = Builder.CreateInBoundsGEP(
3126 RedListArrayTy, LocalReduceListAddrCast,
3127 {ConstantInt::get(IndexTy, 0), ConstantInt::get(IndexTy, En.index())});
3128 Value *BufferVD =
3129 Builder.CreateInBoundsGEP(ReductionsBufferTy, BufferVal, Idxs);
3130 // Global = Buffer.VD[Idx];
3132 ReductionsBufferTy, BufferVD, 0, En.index());
3133 Builder.CreateStore(GlobValPtr, TargetElementPtrPtr);
3134 }
3135
3136 // Call reduce_function(GlobalReduceList, ReduceList)
3137 Value *ReduceList =
3138 Builder.CreateLoad(Builder.getPtrTy(), ReduceListArgAddrCast);
3139 Builder.CreateCall(ReduceFn, {LocalReduceListAddrCast, ReduceList})
3140 ->addFnAttr(Attribute::NoUnwind);
3142 Builder.restoreIP(OldIP);
3143 return LtGRFunc;
3144}
3145
3146Function *OpenMPIRBuilder::emitGlobalToListCopyFunction(
3147 ArrayRef<ReductionInfo> ReductionInfos, Type *ReductionsBufferTy,
3148 AttributeList FuncAttrs) {
3150 LLVMContext &Ctx = M.getContext();
3153 {Builder.getPtrTy(), Builder.getInt32Ty(), Builder.getPtrTy()},
3154 /* IsVarArg */ false);
3155 Function *LtGCFunc =
3157 "_omp_reduction_global_to_list_copy_func", &M);
3158 LtGCFunc->setAttributes(FuncAttrs);
3159 LtGCFunc->addParamAttr(0, Attribute::NoUndef);
3160 LtGCFunc->addParamAttr(1, Attribute::NoUndef);
3161 LtGCFunc->addParamAttr(2, Attribute::NoUndef);
3162
3163 BasicBlock *EntryBlock = BasicBlock::Create(Ctx, "entry", LtGCFunc);
3164 Builder.SetInsertPoint(EntryBlock);
3165
3166 // Buffer: global reduction buffer.
3167 Argument *BufferArg = LtGCFunc->getArg(0);
3168 // Idx: index of the buffer.
3169 Argument *IdxArg = LtGCFunc->getArg(1);
3170 // ReduceList: thread local Reduce list.
3171 Argument *ReduceListArg = LtGCFunc->getArg(2);
3172
3173 Value *BufferArgAlloca = Builder.CreateAlloca(Builder.getPtrTy(), nullptr,
3174 BufferArg->getName() + ".addr");
3175 Value *IdxArgAlloca = Builder.CreateAlloca(Builder.getInt32Ty(), nullptr,
3176 IdxArg->getName() + ".addr");
3177 Value *ReduceListArgAlloca = Builder.CreateAlloca(
3178 Builder.getPtrTy(), nullptr, ReduceListArg->getName() + ".addr");
3180 BufferArgAlloca, Builder.getPtrTy(),
3181 BufferArgAlloca->getName() + ".ascast");
3183 IdxArgAlloca, Builder.getPtrTy(), IdxArgAlloca->getName() + ".ascast");
3184 Value *ReduceListArgAddrCast = Builder.CreatePointerBitCastOrAddrSpaceCast(
3185 ReduceListArgAlloca, Builder.getPtrTy(),
3186 ReduceListArgAlloca->getName() + ".ascast");
3187 Builder.CreateStore(BufferArg, BufferArgAddrCast);
3188 Builder.CreateStore(IdxArg, IdxArgAddrCast);
3189 Builder.CreateStore(ReduceListArg, ReduceListArgAddrCast);
3190
3191 Value *LocalReduceList =
3192 Builder.CreateLoad(Builder.getPtrTy(), ReduceListArgAddrCast);
3193 Value *BufferVal = Builder.CreateLoad(Builder.getPtrTy(), BufferArgAddrCast);
3194 Value *Idxs[] = {Builder.CreateLoad(Builder.getInt32Ty(), IdxArgAddrCast)};
3195 Type *IndexTy = Builder.getIndexTy(
3197 for (auto En : enumerate(ReductionInfos)) {
3198 const OpenMPIRBuilder::ReductionInfo &RI = En.value();
3199 auto *RedListArrayTy =
3200 ArrayType::get(Builder.getPtrTy(), ReductionInfos.size());
3201 // Reduce element = LocalReduceList[i]
3202 Value *ElemPtrPtr = Builder.CreateInBoundsGEP(
3203 RedListArrayTy, LocalReduceList,
3204 {ConstantInt::get(IndexTy, 0), ConstantInt::get(IndexTy, En.index())});
3205 // elemptr = ((CopyType*)(elemptrptr)) + I
3206 Value *ElemPtr = Builder.CreateLoad(Builder.getPtrTy(), ElemPtrPtr);
3207 // Global = Buffer.VD[Idx];
3208 Value *BufferVD =
3209 Builder.CreateInBoundsGEP(ReductionsBufferTy, BufferVal, Idxs);
3211 ReductionsBufferTy, BufferVD, 0, En.index());
3212
3213 switch (RI.EvaluationKind) {
3214 case EvalKind::Scalar: {
3215 Value *TargetElement = Builder.CreateLoad(RI.ElementType, GlobValPtr);
3216 Builder.CreateStore(TargetElement, ElemPtr);
3217 break;
3218 }
3219 case EvalKind::Complex: {
3221 RI.ElementType, GlobValPtr, 0, 0, ".realp");
3222 Value *SrcReal = Builder.CreateLoad(
3223 RI.ElementType->getStructElementType(0), SrcRealPtr, ".real");
3225 RI.ElementType, GlobValPtr, 0, 1, ".imagp");
3226 Value *SrcImg = Builder.CreateLoad(
3227 RI.ElementType->getStructElementType(1), SrcImgPtr, ".imag");
3228
3230 RI.ElementType, ElemPtr, 0, 0, ".realp");
3232 RI.ElementType, ElemPtr, 0, 1, ".imagp");
3233 Builder.CreateStore(SrcReal, DestRealPtr);
3234 Builder.CreateStore(SrcImg, DestImgPtr);
3235 break;
3236 }
3237 case EvalKind::Aggregate: {
3238 Value *SizeVal =
3242 GlobValPtr, M.getDataLayout().getPrefTypeAlign(RI.ElementType),
3243 SizeVal, false);
3244 break;
3245 }
3246 }
3247 }
3248
3250 Builder.restoreIP(OldIP);
3251 return LtGCFunc;
3252}
3253
3254Function *OpenMPIRBuilder::emitGlobalToListReduceFunction(
3255 ArrayRef<ReductionInfo> ReductionInfos, Function *ReduceFn,
3256 Type *ReductionsBufferTy, AttributeList FuncAttrs) {
3258 LLVMContext &Ctx = M.getContext();
3259 auto *FuncTy = FunctionType::get(
3261 {Builder.getPtrTy(), Builder.getInt32Ty(), Builder.getPtrTy()},
3262 /* IsVarArg */ false);
3263 Function *LtGRFunc =
3265 "_omp_reduction_global_to_list_reduce_func", &M);
3266 LtGRFunc->setAttributes(FuncAttrs);
3267 LtGRFunc->addParamAttr(0, Attribute::NoUndef);
3268 LtGRFunc->addParamAttr(1, Attribute::NoUndef);
3269 LtGRFunc->addParamAttr(2, Attribute::NoUndef);
3270
3271 BasicBlock *EntryBlock = BasicBlock::Create(Ctx, "entry", LtGRFunc);
3272 Builder.SetInsertPoint(EntryBlock);
3273
3274 // Buffer: global reduction buffer.
3275 Argument *BufferArg = LtGRFunc->getArg(0);
3276 // Idx: index of the buffer.
3277 Argument *IdxArg = LtGRFunc->getArg(1);
3278 // ReduceList: thread local Reduce list.
3279 Argument *ReduceListArg = LtGRFunc->getArg(2);
3280
3281 Value *BufferArgAlloca = Builder.CreateAlloca(Builder.getPtrTy(), nullptr,
3282 BufferArg->getName() + ".addr");
3283 Value *IdxArgAlloca = Builder.CreateAlloca(Builder.getInt32Ty(), nullptr,
3284 IdxArg->getName() + ".addr");
3285 Value *ReduceListArgAlloca = Builder.CreateAlloca(
3286 Builder.getPtrTy(), nullptr, ReduceListArg->getName() + ".addr");
3287 ArrayType *RedListArrayTy =
3288 ArrayType::get(Builder.getPtrTy(), ReductionInfos.size());
3289
3290 // 1. Build a list of reduction variables.
3291 // void *RedList[<n>] = {<ReductionVars>[0], ..., <ReductionVars>[<n>-1]};
3292 Value *LocalReduceList =
3293 Builder.CreateAlloca(RedListArrayTy, nullptr, ".omp.reduction.red_list");
3294
3296 BufferArgAlloca, Builder.getPtrTy(),
3297 BufferArgAlloca->getName() + ".ascast");
3299 IdxArgAlloca, Builder.getPtrTy(), IdxArgAlloca->getName() + ".ascast");
3300 Value *ReduceListArgAddrCast = Builder.CreatePointerBitCastOrAddrSpaceCast(
3301 ReduceListArgAlloca, Builder.getPtrTy(),
3302 ReduceListArgAlloca->getName() + ".ascast");
3304 LocalReduceList, Builder.getPtrTy(),
3305 LocalReduceList->getName() + ".ascast");
3306
3307 Builder.CreateStore(BufferArg, BufferArgAddrCast);
3308 Builder.CreateStore(IdxArg, IdxArgAddrCast);
3309 Builder.CreateStore(ReduceListArg, ReduceListArgAddrCast);
3310
3311 Value *BufferVal = Builder.CreateLoad(Builder.getPtrTy(), BufferArgAddrCast);
3312 Value *Idxs[] = {Builder.CreateLoad(Builder.getInt32Ty(), IdxArgAddrCast)};
3313 Type *IndexTy = Builder.getIndexTy(
3315 for (auto En : enumerate(ReductionInfos)) {
3316 Value *TargetElementPtrPtr = Builder.CreateInBoundsGEP(
3317 RedListArrayTy, ReductionList,
3318 {ConstantInt::get(IndexTy, 0), ConstantInt::get(IndexTy, En.index())});
3319 // Global = Buffer.VD[Idx];
3320 Value *BufferVD =
3321 Builder.CreateInBoundsGEP(ReductionsBufferTy, BufferVal, Idxs);
3323 ReductionsBufferTy, BufferVD, 0, En.index());
3324 Builder.CreateStore(GlobValPtr, TargetElementPtrPtr);
3325 }
3326
3327 // Call reduce_function(ReduceList, GlobalReduceList)
3328 Value *ReduceList =
3329 Builder.CreateLoad(Builder.getPtrTy(), ReduceListArgAddrCast);
3330 Builder.CreateCall(ReduceFn, {ReduceList, ReductionList})
3331 ->addFnAttr(Attribute::NoUnwind);
3333 Builder.restoreIP(OldIP);
3334 return LtGRFunc;
3335}
3336
3337std::string OpenMPIRBuilder::getReductionFuncName(StringRef Name) const {
3338 std::string Suffix =
3339 createPlatformSpecificName({"omp", "reduction", "reduction_func"});
3340 return (Name + Suffix).str();
3341}
3342
3343Expected<Function *> OpenMPIRBuilder::createReductionFunction(
3344 StringRef ReducerName, ArrayRef<ReductionInfo> ReductionInfos,
3345 ReductionGenCBKind ReductionGenCBKind, AttributeList FuncAttrs) {
3346 auto *FuncTy = FunctionType::get(Builder.getVoidTy(),
3347 {Builder.getPtrTy(), Builder.getPtrTy()},
3348 /* IsVarArg */ false);
3349 std::string Name = getReductionFuncName(ReducerName);
3350 Function *ReductionFunc =
3352 ReductionFunc->setAttributes(FuncAttrs);
3353 ReductionFunc->addParamAttr(0, Attribute::NoUndef);
3354 ReductionFunc->addParamAttr(1, Attribute::NoUndef);
3355 BasicBlock *EntryBB =
3356 BasicBlock::Create(M.getContext(), "entry", ReductionFunc);
3357 Builder.SetInsertPoint(EntryBB);
3358
3359 // Need to alloca memory here and deal with the pointers before getting
3360 // LHS/RHS pointers out
3361 Value *LHSArrayPtr = nullptr;
3362 Value *RHSArrayPtr = nullptr;
3363 Argument *Arg0 = ReductionFunc->getArg(0);
3364 Argument *Arg1 = ReductionFunc->getArg(1);
3365 Type *Arg0Type = Arg0->getType();
3366 Type *Arg1Type = Arg1->getType();
3367
3368 Value *LHSAlloca =
3369 Builder.CreateAlloca(Arg0Type, nullptr, Arg0->getName() + ".addr");
3370 Value *RHSAlloca =
3371 Builder.CreateAlloca(Arg1Type, nullptr, Arg1->getName() + ".addr");
3373 LHSAlloca, Arg0Type, LHSAlloca->getName() + ".ascast");
3375 RHSAlloca, Arg1Type, RHSAlloca->getName() + ".ascast");
3376 Builder.CreateStore(Arg0, LHSAddrCast);
3377 Builder.CreateStore(Arg1, RHSAddrCast);
3378 LHSArrayPtr = Builder.CreateLoad(Arg0Type, LHSAddrCast);
3379 RHSArrayPtr = Builder.CreateLoad(Arg1Type, RHSAddrCast);
3380
3381 Type *RedArrayTy = ArrayType::get(Builder.getPtrTy(), ReductionInfos.size());
3382 Type *IndexTy = Builder.getIndexTy(
3384 SmallVector<Value *> LHSPtrs, RHSPtrs;
3385 for (auto En : enumerate(ReductionInfos)) {
3386 const ReductionInfo &RI = En.value();
3387 Value *RHSI8PtrPtr = Builder.CreateInBoundsGEP(
3388 RedArrayTy, RHSArrayPtr,
3389 {ConstantInt::get(IndexTy, 0), ConstantInt::get(IndexTy, En.index())});
3390 Value *RHSI8Ptr = Builder.CreateLoad(Builder.getPtrTy(), RHSI8PtrPtr);
3392 RHSI8Ptr, RI.PrivateVariable->getType(),
3393 RHSI8Ptr->getName() + ".ascast");
3394
3395 Value *LHSI8PtrPtr = Builder.CreateInBoundsGEP(
3396 RedArrayTy, LHSArrayPtr,
3397 {ConstantInt::get(IndexTy, 0), ConstantInt::get(IndexTy, En.index())});
3398 Value *LHSI8Ptr = Builder.CreateLoad(Builder.getPtrTy(), LHSI8PtrPtr);
3400 LHSI8Ptr, RI.Variable->getType(), LHSI8Ptr->getName() + ".ascast");
3401
3403 LHSPtrs.emplace_back(LHSPtr);
3404 RHSPtrs.emplace_back(RHSPtr);
3405 } else {
3406 Value *LHS = Builder.CreateLoad(RI.ElementType, LHSPtr);
3407 Value *RHS = Builder.CreateLoad(RI.ElementType, RHSPtr);
3408 Value *Reduced;
3409 InsertPointOrErrorTy AfterIP =
3410 RI.ReductionGen(Builder.saveIP(), LHS, RHS, Reduced);
3411 if (!AfterIP)
3412 return AfterIP.takeError();
3413 if (!Builder.GetInsertBlock())
3414 return ReductionFunc;
3415 Builder.CreateStore(Reduced, LHSPtr);
3416 }
3417 }
3418
3420 for (auto En : enumerate(ReductionInfos)) {
3421 unsigned Index = En.index();
3422 const ReductionInfo &RI = En.value();
3423 Value *LHSFixupPtr, *RHSFixupPtr;
3424 Builder.restoreIP(RI.ReductionGenClang(
3425 Builder.saveIP(), Index, &LHSFixupPtr, &RHSFixupPtr, ReductionFunc));
3426
3427 // Fix the CallBack code genereated to use the correct Values for the LHS
3428 // and RHS
3429 LHSFixupPtr->replaceUsesWithIf(
3430 LHSPtrs[Index], [ReductionFunc](const Use &U) {
3431 return cast<Instruction>(U.getUser())->getParent()->getParent() ==
3432 ReductionFunc;
3433 });
3434 RHSFixupPtr->replaceUsesWithIf(
3435 RHSPtrs[Index], [ReductionFunc](const Use &U) {
3436 return cast<Instruction>(U.getUser())->getParent()->getParent() ==
3437 ReductionFunc;
3438 });
3439 }
3440
3442 return ReductionFunc;
3443}
3444
3445static void
3447 bool IsGPU) {
3448 for (const OpenMPIRBuilder::ReductionInfo &RI : ReductionInfos) {
3449 (void)RI;
3450 assert(RI.Variable && "expected non-null variable");
3451 assert(RI.PrivateVariable && "expected non-null private variable");
3452 assert((RI.ReductionGen || RI.ReductionGenClang) &&
3453 "expected non-null reduction generator callback");
3454 if (!IsGPU) {
3455 assert(
3456 RI.Variable->getType() == RI.PrivateVariable->getType() &&
3457 "expected variables and their private equivalents to have the same "
3458 "type");
3459 }
3460 assert(RI.Variable->getType()->isPointerTy() &&
3461 "expected variables to be pointers");
3462 }
3463}
3464
3466 const LocationDescription &Loc, InsertPointTy AllocaIP,
3467 InsertPointTy CodeGenIP, ArrayRef<ReductionInfo> ReductionInfos,
3468 bool IsNoWait, bool IsTeamsReduction, bool HasDistribute,
3469 ReductionGenCBKind ReductionGenCBKind, std::optional<omp::GV> GridValue,
3470 unsigned ReductionBufNum, Value *SrcLocInfo) {
3471 if (!updateToLocation(Loc))
3472 return InsertPointTy();
3473 Builder.restoreIP(CodeGenIP);
3474 checkReductionInfos(ReductionInfos, /*IsGPU*/ true);
3475 LLVMContext &Ctx = M.getContext();
3476
3477 // Source location for the ident struct
3478 if (!SrcLocInfo) {
3479 uint32_t SrcLocStrSize;
3480 Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
3481 SrcLocInfo = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
3482 }
3483
3484 if (ReductionInfos.size() == 0)
3485 return Builder.saveIP();
3486
3487 Function *CurFunc = Builder.GetInsertBlock()->getParent();
3488 AttributeList FuncAttrs;
3489 AttrBuilder AttrBldr(Ctx);
3490 for (auto Attr : CurFunc->getAttributes().getFnAttrs())
3491 AttrBldr.addAttribute(Attr);
3492 AttrBldr.removeAttribute(Attribute::OptimizeNone);
3493 FuncAttrs = FuncAttrs.addFnAttributes(Ctx, AttrBldr);
3494
3495 CodeGenIP = Builder.saveIP();
3496 Expected<Function *> ReductionResult =
3497 createReductionFunction(Builder.GetInsertBlock()->getParent()->getName(),
3498 ReductionInfos, ReductionGenCBKind, FuncAttrs);
3499 if (!ReductionResult)
3500 return ReductionResult.takeError();
3501 Function *ReductionFunc = *ReductionResult;
3502 Builder.restoreIP(CodeGenIP);
3503
3504 // Set the grid value in the config needed for lowering later on
3505 if (GridValue.has_value())
3506 Config.setGridValue(GridValue.value());
3507 else
3508 Config.setGridValue(getGridValue(T, ReductionFunc));
3509
3510 // Build res = __kmpc_reduce{_nowait}(<gtid>, <n>, sizeof(RedList),
3511 // RedList, shuffle_reduce_func, interwarp_copy_func);
3512 // or
3513 // Build res = __kmpc_reduce_teams_nowait_simple(<loc>, <gtid>, <lck>);
3514 Value *Res;
3515
3516 // 1. Build a list of reduction variables.
3517 // void *RedList[<n>] = {<ReductionVars>[0], ..., <ReductionVars>[<n>-1]};
3518 auto Size = ReductionInfos.size();
3519 Type *PtrTy = PointerType::getUnqual(Ctx);
3520 Type *RedArrayTy = ArrayType::get(PtrTy, Size);
3521 CodeGenIP = Builder.saveIP();
3522 Builder.restoreIP(AllocaIP);
3523 Value *ReductionListAlloca =
3524 Builder.CreateAlloca(RedArrayTy, nullptr, ".omp.reduction.red_list");
3526 ReductionListAlloca, PtrTy, ReductionListAlloca->getName() + ".ascast");
3527 Builder.restoreIP(CodeGenIP);
3528 Type *IndexTy = Builder.getIndexTy(
3530 for (auto En : enumerate(ReductionInfos)) {
3531 const ReductionInfo &RI = En.value();
3532 Value *ElemPtr = Builder.CreateInBoundsGEP(
3533 RedArrayTy, ReductionList,
3534 {ConstantInt::get(IndexTy, 0), ConstantInt::get(IndexTy, En.index())});
3535 Value *CastElem =
3537 Builder.CreateStore(CastElem, ElemPtr);
3538 }
3539 CodeGenIP = Builder.saveIP();
3540 Function *SarFunc =
3541 emitShuffleAndReduceFunction(ReductionInfos, ReductionFunc, FuncAttrs);
3542 Expected<Function *> CopyResult =
3543 emitInterWarpCopyFunction(Loc, ReductionInfos, FuncAttrs);
3544 if (!CopyResult)
3545 return CopyResult.takeError();
3546 Function *WcFunc = *CopyResult;
3547 Builder.restoreIP(CodeGenIP);
3548
3549 Value *RL = Builder.CreatePointerBitCastOrAddrSpaceCast(ReductionList, PtrTy);
3550
3551 unsigned MaxDataSize = 0;
3552 SmallVector<Type *> ReductionTypeArgs;
3553 for (auto En : enumerate(ReductionInfos)) {
3554 auto Size = M.getDataLayout().getTypeStoreSize(En.value().ElementType);
3555 if (Size > MaxDataSize)
3556 MaxDataSize = Size;
3557 ReductionTypeArgs.emplace_back(En.value().ElementType);
3558 }
3559 Value *ReductionDataSize =
3560 Builder.getInt64(MaxDataSize * ReductionInfos.size());
3561 if (!IsTeamsReduction) {
3562 Value *SarFuncCast =
3564 Value *WcFuncCast =
3566 Value *Args[] = {SrcLocInfo, ReductionDataSize, RL, SarFuncCast,
3567 WcFuncCast};
3569 RuntimeFunction::OMPRTL___kmpc_nvptx_parallel_reduce_nowait_v2);
3570 Res = Builder.CreateCall(Pv2Ptr, Args);
3571 } else {
3572 CodeGenIP = Builder.saveIP();
3573 StructType *ReductionsBufferTy = StructType::create(
3574 Ctx, ReductionTypeArgs, "struct._globalized_locals_ty");
3575 Function *RedFixedBuferFn = getOrCreateRuntimeFunctionPtr(
3576 RuntimeFunction::OMPRTL___kmpc_reduction_get_fixed_buffer);
3577 Function *LtGCFunc = emitListToGlobalCopyFunction(
3578 ReductionInfos, ReductionsBufferTy, FuncAttrs);
3579 Function *LtGRFunc = emitListToGlobalReduceFunction(
3580 ReductionInfos, ReductionFunc, ReductionsBufferTy, FuncAttrs);
3581 Function *GtLCFunc = emitGlobalToListCopyFunction(
3582 ReductionInfos, ReductionsBufferTy, FuncAttrs);
3583 Function *GtLRFunc = emitGlobalToListReduceFunction(
3584 ReductionInfos, ReductionFunc, ReductionsBufferTy, FuncAttrs);
3585 Builder.restoreIP(CodeGenIP);
3586
3587 Value *KernelTeamsReductionPtr = Builder.CreateCall(
3588 RedFixedBuferFn, {}, "_openmp_teams_reductions_buffer_$_$ptr");
3589
3590 Value *Args3[] = {SrcLocInfo,
3591 KernelTeamsReductionPtr,
3592 Builder.getInt32(ReductionBufNum),
3593 ReductionDataSize,
3594 RL,
3595 SarFunc,
3596 WcFunc,
3597 LtGCFunc,
3598 LtGRFunc,
3599 GtLCFunc,
3600 GtLRFunc};
3601
3602 Function *TeamsReduceFn = getOrCreateRuntimeFunctionPtr(
3603 RuntimeFunction::OMPRTL___kmpc_nvptx_teams_reduce_nowait_v2);
3604 Res = Builder.CreateCall(TeamsReduceFn, Args3);
3605 }
3606
3607 // 5. Build if (res == 1)
3608 BasicBlock *ExitBB = BasicBlock::Create(Ctx, ".omp.reduction.done");
3609 BasicBlock *ThenBB = BasicBlock::Create(Ctx, ".omp.reduction.then");
3611 Builder.CreateCondBr(Cond, ThenBB, ExitBB);
3612
3613 // 6. Build then branch: where we have reduced values in the master
3614 // thread in each team.
3615 // __kmpc_end_reduce{_nowait}(<gtid>);
3616 // break;
3617 emitBlock(ThenBB, CurFunc);
3618
3619 // Add emission of __kmpc_end_reduce{_nowait}(<gtid>);
3620 for (auto En : enumerate(ReductionInfos)) {
3621 const ReductionInfo &RI = En.value();
3622 Value *LHS = RI.Variable;
3623 Value *RHS =
3625
3627 Value *LHSPtr, *RHSPtr;
3629 &LHSPtr, &RHSPtr, CurFunc));
3630
3631 // Fix the CallBack code genereated to use the correct Values for the LHS
3632 // and RHS
3633 LHSPtr->replaceUsesWithIf(LHS, [ReductionFunc](const Use &U) {
3634 return cast<Instruction>(U.getUser())->getParent()->getParent() ==
3635 ReductionFunc;
3636 });
3637 RHSPtr->replaceUsesWithIf(RHS, [ReductionFunc](const Use &U) {
3638 return cast<Instruction>(U.getUser())->getParent()->getParent() ==
3639 ReductionFunc;
3640 });
3641 } else {
3642 assert(false && "Unhandled ReductionGenCBKind");
3643 }
3644 }
3645 emitBlock(ExitBB, CurFunc);
3646
3648
3649 return Builder.saveIP();
3650}
3651
3653 Type *VoidTy = Type::getVoidTy(M.getContext());
3654 Type *Int8PtrTy = PointerType::getUnqual(M.getContext());
3655 auto *FuncTy =
3656 FunctionType::get(VoidTy, {Int8PtrTy, Int8PtrTy}, /* IsVarArg */ false);
3658 ".omp.reduction.func", &M);
3659}
3660
3663 InsertPointTy AllocaIP,
3664 ArrayRef<ReductionInfo> ReductionInfos,
3665 ArrayRef<bool> IsByRef, bool IsNoWait) {
3666 assert(ReductionInfos.size() == IsByRef.size());
3667 for (const ReductionInfo &RI : ReductionInfos) {
3668 (void)RI;
3669 assert(RI.Variable && "expected non-null variable");
3670 assert(RI.PrivateVariable && "expected non-null private variable");
3671 assert(RI.ReductionGen && "expected non-null reduction generator callback");
3672 assert(RI.Variable->getType() == RI.PrivateVariable->getType() &&
3673 "expected variables and their private equivalents to have the same "
3674 "type");
3675 assert(RI.Variable->getType()->isPointerTy() &&
3676 "expected variables to be pointers");
3677 }
3678
3679 if (!updateToLocation(Loc))
3680 return InsertPointTy();
3681
3682 BasicBlock *InsertBlock = Loc.IP.getBlock();
3683 BasicBlock *ContinuationBlock =
3684 InsertBlock->splitBasicBlock(Loc.IP.getPoint(), "reduce.finalize");
3685 InsertBlock->getTerminator()->eraseFromParent();
3686
3687 // Create and populate array of type-erased pointers to private reduction
3688 // values.
3689 unsigned NumReductions = ReductionInfos.size();
3690 Type *RedArrayTy = ArrayType::get(Builder.getPtrTy(), NumReductions);
3692 Value *RedArray = Builder.CreateAlloca(RedArrayTy, nullptr, "red.array");
3693
3694 Builder.SetInsertPoint(InsertBlock, InsertBlock->end());
3695
3696 for (auto En : enumerate(ReductionInfos)) {
3697 unsigned Index = En.index();
3698 const ReductionInfo &RI = En.value();
3699 Value *RedArrayElemPtr = Builder.CreateConstInBoundsGEP2_64(
3700 RedArrayTy, RedArray, 0, Index, "red.array.elem." + Twine(Index));
3701 Builder.CreateStore(RI.PrivateVariable, RedArrayElemPtr);
3702 }
3703
3704 // Emit a call to the runtime function that orchestrates the reduction.
3705 // Declare the reduction function in the process.
3707 Module *Module = Func->getParent();
3708 uint32_t SrcLocStrSize;
3709 Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
3710 bool CanGenerateAtomic = all_of(ReductionInfos, [](const ReductionInfo &RI) {
3711 return RI.AtomicReductionGen;
3712 });
3713 Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize,
3714 CanGenerateAtomic
3715 ? IdentFlag::OMP_IDENT_FLAG_ATOMIC_REDUCE
3716 : IdentFlag(0));
3717 Value *ThreadId = getOrCreateThreadID(Ident);
3718 Constant *NumVariables = Builder.getInt32(NumReductions);
3719 const DataLayout &DL = Module->getDataLayout();
3720 unsigned RedArrayByteSize = DL.getTypeStoreSize(RedArrayTy);
3721 Constant *RedArraySize = Builder.getInt64(RedArrayByteSize);
3722 Function *ReductionFunc = getFreshReductionFunc(*Module);
3723 Value *Lock = getOMPCriticalRegionLock(".reduction");
3725 IsNoWait ? RuntimeFunction::OMPRTL___kmpc_reduce_nowait
3726 : RuntimeFunction::OMPRTL___kmpc_reduce);
3727 CallInst *ReduceCall =
3728 Builder.CreateCall(ReduceFunc,
3729 {Ident, ThreadId, NumVariables, RedArraySize, RedArray,
3730 ReductionFunc, Lock},
3731 "reduce");
3732
3733 // Create final reduction entry blocks for the atomic and non-atomic case.
3734 // Emit IR that dispatches control flow to one of the blocks based on the
3735 // reduction supporting the atomic mode.
3736 BasicBlock *NonAtomicRedBlock =
3737 BasicBlock::Create(Module->getContext(), "reduce.switch.nonatomic", Func);
3738 BasicBlock *AtomicRedBlock =
3739 BasicBlock::Create(Module->getContext(), "reduce.switch.atomic", Func);
3740 SwitchInst *Switch =
3741 Builder.CreateSwitch(ReduceCall, ContinuationBlock, /* NumCases */ 2);
3742 Switch->addCase(Builder.getInt32(1), NonAtomicRedBlock);
3743 Switch->addCase(Builder.getInt32(2), AtomicRedBlock);
3744
3745 // Populate the non-atomic reduction using the elementwise reduction function.
3746 // This loads the elements from the global and private variables and reduces
3747 // them before storing back the result to the global variable.
3748 Builder.SetInsertPoint(NonAtomicRedBlock);
3749 for (auto En : enumerate(ReductionInfos)) {
3750 const ReductionInfo &RI = En.value();
3752 // We have one less load for by-ref case because that load is now inside of
3753 // the reduction region
3754 Value *RedValue = RI.Variable;
3755 if (!IsByRef[En.index()]) {
3756 RedValue = Builder.CreateLoad(ValueType, RI.Variable,
3757 "red.value." + Twine(En.index()));
3758 }
3759 Value *PrivateRedValue =
3761 "red.private.value." + Twine(En.index()));
3762 Value *Reduced;
3763 InsertPointOrErrorTy AfterIP =
3764 RI.ReductionGen(Builder.saveIP(), RedValue, PrivateRedValue, Reduced);
3765 if (!AfterIP)
3766 return AfterIP.takeError();
3767 Builder.restoreIP(*AfterIP);
3768
3769 if (!Builder.GetInsertBlock())
3770 return InsertPointTy();
3771 // for by-ref case, the load is inside of the reduction region
3772 if (!IsByRef[En.index()])
3773 Builder.CreateStore(Reduced, RI.Variable);
3774 }
3775 Function *EndReduceFunc = getOrCreateRuntimeFunctionPtr(
3776 IsNoWait ? RuntimeFunction::OMPRTL___kmpc_end_reduce_nowait
3777 : RuntimeFunction::OMPRTL___kmpc_end_reduce);
3778 Builder.CreateCall(EndReduceFunc, {Ident, ThreadId, Lock});
3779 Builder.CreateBr(ContinuationBlock);
3780
3781 // Populate the atomic reduction using the atomic elementwise reduction
3782 // function. There are no loads/stores here because they will be happening
3783 // inside the atomic elementwise reduction.
3784 Builder.SetInsertPoint(AtomicRedBlock);
3785 if (CanGenerateAtomic && llvm::none_of(IsByRef, [](bool P) { return P; })) {
3786 for (const ReductionInfo &RI : ReductionInfos) {
3789 if (!AfterIP)
3790 return AfterIP.takeError();
3791 Builder.restoreIP(*AfterIP);
3792 if (!Builder.GetInsertBlock())
3793 return InsertPointTy();
3794 }
3795 Builder.CreateBr(ContinuationBlock);
3796 } else {
3798 }
3799
3800 // Populate the outlined reduction function using the elementwise reduction
3801 // function. Partial values are extracted from the type-erased array of
3802 // pointers to private variables.
3803 BasicBlock *ReductionFuncBlock =
3804 BasicBlock::Create(Module->getContext(), "", ReductionFunc);
3805 Builder.SetInsertPoint(ReductionFuncBlock);
3806 Value *LHSArrayPtr = ReductionFunc->getArg(0);
3807 Value *RHSArrayPtr = ReductionFunc->getArg(1);
3808
3809 for (auto En : enumerate(ReductionInfos)) {
3810 const ReductionInfo &RI = En.value();
3812 RedArrayTy, LHSArrayPtr, 0, En.index());
3813 Value *LHSI8Ptr = Builder.CreateLoad(Builder.getPtrTy(), LHSI8PtrPtr);
3814 Value *LHSPtr = Builder.CreateBitCast(LHSI8Ptr, RI.Variable->getType());
3815 Value *LHS = Builder.CreateLoad(RI.ElementType, LHSPtr);
3817 RedArrayTy, RHSArrayPtr, 0, En.index());
3818 Value *RHSI8Ptr = Builder.CreateLoad(Builder.getPtrTy(), RHSI8PtrPtr);
3819 Value *RHSPtr =
3821 Value *RHS = Builder.CreateLoad(RI.ElementType, RHSPtr);
3822 Value *Reduced;
3823 InsertPointOrErrorTy AfterIP =
3824 RI.ReductionGen(Builder.saveIP(), LHS, RHS, Reduced);
3825 if (!AfterIP)
3826 return AfterIP.takeError();
3827 Builder.restoreIP(*AfterIP);
3828 if (!Builder.GetInsertBlock())
3829 return InsertPointTy();
3830 // store is inside of the reduction region when using by-ref
3831 if (!IsByRef[En.index()])
3832 Builder.CreateStore(Reduced, LHSPtr);
3833 }
3835
3836 Builder.SetInsertPoint(ContinuationBlock);
3837 return Builder.saveIP();
3838}
3839
3842 BodyGenCallbackTy BodyGenCB,
3843 FinalizeCallbackTy FiniCB) {
3844 if (!updateToLocation(Loc))
3845 return Loc.IP;
3846
3847 Directive OMPD = Directive::OMPD_master;
3848 uint32_t SrcLocStrSize;
3849 Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
3850 Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
3851 Value *ThreadId = getOrCreateThreadID(Ident);
3852 Value *Args[] = {Ident, ThreadId};
3853
3854 Function *EntryRTLFn = getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_master);
3855 Instruction *EntryCall = Builder.CreateCall(EntryRTLFn, Args);
3856
3857 Function *ExitRTLFn = getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_end_master);
3858 Instruction *ExitCall = Builder.CreateCall(ExitRTLFn, Args);
3859
3860 return EmitOMPInlinedRegion(OMPD, EntryCall, ExitCall, BodyGenCB, FiniCB,
3861 /*Conditional*/ true, /*hasFinalize*/ true);
3862}
3863
3866 BodyGenCallbackTy BodyGenCB,
3867 FinalizeCallbackTy FiniCB, Value *Filter) {
3868 if (!updateToLocation(Loc))
3869 return Loc.IP;
3870
3871 Directive OMPD = Directive::OMPD_masked;
3872 uint32_t SrcLocStrSize;
3873 Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
3874 Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
3875 Value *ThreadId = getOrCreateThreadID(Ident);
3876 Value *Args[] = {Ident, ThreadId, Filter};
3877 Value *ArgsEnd[] = {Ident, ThreadId};
3878
3879 Function *EntryRTLFn = getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_masked);
3880 Instruction *EntryCall = Builder.CreateCall(EntryRTLFn, Args);
3881
3882 Function *ExitRTLFn = getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_end_masked);
3883 Instruction *ExitCall = Builder.CreateCall(ExitRTLFn, ArgsEnd);
3884
3885 return EmitOMPInlinedRegion(OMPD, EntryCall, ExitCall, BodyGenCB, FiniCB,
3886 /*Conditional*/ true, /*hasFinalize*/ true);
3887}
3888
3890 DebugLoc DL, Value *TripCount, Function *F, BasicBlock *PreInsertBefore,
3891 BasicBlock *PostInsertBefore, const Twine &Name) {
3892 Module *M = F->getParent();
3893 LLVMContext &Ctx = M->getContext();
3894 Type *IndVarTy = TripCount->getType();
3895
3896 // Create the basic block structure.
3897 BasicBlock *Preheader =
3898 BasicBlock::Create(Ctx, "omp_" + Name + ".preheader", F, PreInsertBefore);
3899 BasicBlock *Header =
3900 BasicBlock::Create(Ctx, "omp_" + Name + ".header", F, PreInsertBefore);
3901 BasicBlock *Cond =
3902 BasicBlock::Create(Ctx, "omp_" + Name + ".cond", F, PreInsertBefore);
3903 BasicBlock *Body =
3904 BasicBlock::Create(Ctx, "omp_" + Name + ".body", F, PreInsertBefore);
3905 BasicBlock *Latch =
3906 BasicBlock::Create(Ctx, "omp_" + Name + ".inc", F, PostInsertBefore);
3907 BasicBlock *Exit =
3908 BasicBlock::Create(Ctx, "omp_" + Name + ".exit", F, PostInsertBefore);
3909 BasicBlock *After =
3910 BasicBlock::Create(Ctx, "omp_" + Name + ".after", F, PostInsertBefore);
3911
3912 // Use specified DebugLoc for new instructions.
3914
3915 Builder.SetInsertPoint(Preheader);
3916 Builder.CreateBr(Header);
3917
3918 Builder.SetInsertPoint(Header);
3919 PHINode *IndVarPHI = Builder.CreatePHI(IndVarTy, 2, "omp_" + Name + ".iv");
3920 IndVarPHI->addIncoming(ConstantInt::get(IndVarTy, 0), Preheader);
3922
3924 Value *Cmp =
3925 Builder.CreateICmpULT(IndVarPHI, TripCount, "omp_" + Name + ".cmp");
3926 Builder.CreateCondBr(Cmp, Body, Exit);
3927
3928 Builder.SetInsertPoint(Body);
3929 Builder.CreateBr(Latch);
3930
3931 Builder.SetInsertPoint(Latch);
3932 Value *Next = Builder.CreateAdd(IndVarPHI, ConstantInt::get(IndVarTy, 1),
3933 "omp_" + Name + ".next", /*HasNUW=*/true);
3934 Builder.CreateBr(Header);
3935 IndVarPHI->addIncoming(Next, Latch);
3936
3937 Builder.SetInsertPoint(Exit);
3939
3940 // Remember and return the canonical control flow.
3941 LoopInfos.emplace_front();
3942 CanonicalLoopInfo *CL = &LoopInfos.front();
3943
3944 CL->Header = Header;
3945 CL->Cond = Cond;
3946 CL->Latch = Latch;
3947 CL->Exit = Exit;
3948
3949#ifndef NDEBUG
3950 CL->assertOK();
3951#endif
3952 return CL;
3953}
3954
3957 LoopBodyGenCallbackTy BodyGenCB,
3958 Value *TripCount, const Twine &Name) {
3959 BasicBlock *BB = Loc.IP.getBlock();
3960 BasicBlock *NextBB = BB->getNextNode();
3961
3962 CanonicalLoopInfo *CL = createLoopSkeleton(Loc.DL, TripCount, BB->getParent(),
3963 NextBB, NextBB, Name);
3964 BasicBlock *After = CL->getAfter();
3965
3966 // If location is not set, don't connect the loop.
3967 if (updateToLocation(Loc)) {
3968 // Split the loop at the insertion point: Branch to the preheader and move
3969 // every following instruction to after the loop (the After BB). Also, the
3970 // new successor is the loop's after block.
3971 spliceBB(Builder, After, /*CreateBranch=*/false);
3973 }
3974
3975 // Emit the body content. We do it after connecting the loop to the CFG to
3976 // avoid that the callback encounters degenerate BBs.
3977 if (Error Err = BodyGenCB(CL->getBodyIP(), CL->getIndVar()))
3978 return Err;
3979
3980#ifndef NDEBUG
3981 CL->assertOK();
3982#endif
3983 return CL;
3984}
3985
3987 const LocationDescription &Loc, LoopBodyGenCallbackTy BodyGenCB,
3988 Value *Start, Value *Stop, Value *Step, bool IsSigned, bool InclusiveStop,
3989 InsertPointTy ComputeIP, const Twine &Name) {
3990
3991 // Consider the following difficulties (assuming 8-bit signed integers):
3992 // * Adding \p Step to the loop counter which passes \p Stop may overflow:
3993 // DO I = 1, 100, 50
3994 /// * A \p Step of INT_MIN cannot not be normalized to a positive direction:
3995 // DO I = 100, 0, -128
3996
3997 // Start, Stop and Step must be of the same integer type.
3998 auto *IndVarTy = cast<IntegerType>(Start->getType());
3999 assert(IndVarTy == Stop->getType() && "Stop type mismatch");
4000 assert(IndVarTy == Step->getType() && "Step type mismatch");
4001
4002 LocationDescription ComputeLoc =
4003 ComputeIP.isSet() ? LocationDescription(ComputeIP, Loc.DL) : Loc;
4004 updateToLocation(ComputeLoc);
4005
4006 ConstantInt *Zero = ConstantInt::get(IndVarTy, 0);
4007 ConstantInt *One = ConstantInt::get(IndVarTy, 1);
4008
4009 // Like Step, but always positive.
4010 Value *Incr = Step;
4011
4012 // Distance between Start and Stop; always positive.
4013 Value *Span;
4014
4015 // Condition whether there are no iterations are executed at all, e.g. because
4016 // UB < LB.
4017 Value *ZeroCmp;
4018
4019 if (IsSigned) {
4020 // Ensure that increment is positive. If not, negate and invert LB and UB.
4021 Value *IsNeg = Builder.CreateICmpSLT(Step, Zero);
4022 Incr = Builder.CreateSelect(IsNeg, Builder.CreateNeg(Step), Step);
4023 Value *LB = Builder.CreateSelect(IsNeg, Stop, Start);
4024 Value *UB = Builder.CreateSelect(IsNeg, Start, Stop);
4025 Span = Builder.CreateSub(UB, LB, "", false, true);
4026 ZeroCmp = Builder.CreateICmp(
4027 InclusiveStop ? CmpInst::ICMP_SLT : CmpInst::ICMP_SLE, UB, LB);
4028 } else {
4029 Span = Builder.CreateSub(Stop, Start, "", true);
4030 ZeroCmp = Builder.CreateICmp(
4031 InclusiveStop ? CmpInst::ICMP_ULT : CmpInst::ICMP_ULE, Stop, Start);
4032 }
4033
4034 Value *CountIfLooping;
4035 if (InclusiveStop) {
4036 CountIfLooping = Builder.CreateAdd(Builder.CreateUDiv(Span, Incr), One);
4037 } else {
4038 // Avoid incrementing past stop since it could overflow.
4039 Value *CountIfTwo = Builder.CreateAdd(
4040 Builder.CreateUDiv(Builder.CreateSub(Span, One), Incr), One);
4041 Value *OneCmp = Builder.CreateICmp(CmpInst::ICMP_ULE, Span, Incr);
4042 CountIfLooping = Builder.CreateSelect(OneCmp, One, CountIfTwo);
4043 }
4044 Value *TripCount = Builder.CreateSelect(ZeroCmp, Zero, CountIfLooping,
4045 "omp_" + Name + ".tripcount");
4046
4047 auto BodyGen = [=](InsertPointTy CodeGenIP, Value *IV) {
4048 Builder.restoreIP(CodeGenIP);
4049 Value *Span = Builder.CreateMul(IV, Step);
4050 Value *IndVar = Builder.CreateAdd(Span, Start);
4051 return BodyGenCB(Builder.saveIP(), IndVar);
4052 };
4053 LocationDescription LoopLoc = ComputeIP.isSet() ? Loc.IP : Builder.saveIP();
4054 return createCanonicalLoop(LoopLoc, BodyGen, TripCount, Name);
4055}
4056
4057// Returns an LLVM function to call for initializing loop bounds using OpenMP
4058// static scheduling depending on `type`. Only i32 and i64 are supported by the
4059// runtime. Always interpret integers as unsigned similarly to
4060// CanonicalLoopInfo.
4062 OpenMPIRBuilder &OMPBuilder) {
4063 unsigned Bitwidth = Ty->getIntegerBitWidth();
4064 if (Bitwidth == 32)
4065 return OMPBuilder.getOrCreateRuntimeFunction(
4066 M, omp::RuntimeFunction::OMPRTL___kmpc_for_static_init_4u);
4067 if (Bitwidth == 64)
4068 return OMPBuilder.getOrCreateRuntimeFunction(
4069 M, omp::RuntimeFunction::OMPRTL___kmpc_for_static_init_8u);
4070 llvm_unreachable("unknown OpenMP loop iterator bitwidth");
4071}
4072
4074OpenMPIRBuilder::applyStaticWorkshareLoop(DebugLoc DL, CanonicalLoopInfo *CLI,
4075 InsertPointTy AllocaIP,
4076 bool NeedsBarrier) {
4077 assert(CLI->isValid() && "Requires a valid canonical loop");
4078 assert(!isConflictIP(AllocaIP, CLI->getPreheaderIP()) &&
4079 "Require dedicated allocate IP");
4080
4081 // Set up the source location value for OpenMP runtime.
4084
4085 uint32_t SrcLocStrSize;
4086 Constant *SrcLocStr = getOrCreateSrcLocStr(DL, SrcLocStrSize);
4087 Value *SrcLoc = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
4088
4089 // Declare useful OpenMP runtime functions.
4090 Value *IV = CLI->getIndVar();
4091 Type *IVTy = IV->getType();
4092 FunctionCallee StaticInit = getKmpcForStaticInitForType(IVTy, M, *this);
4093 FunctionCallee StaticFini =
4094 getOrCreateRuntimeFunction(M, omp::OMPRTL___kmpc_for_static_fini);
4095
4096 // Allocate space for computed loop bounds as expected by the "init" function.
4097 Builder.SetInsertPoint(AllocaIP.getBlock()->getFirstNonPHIOrDbgOrAlloca());
4098
4099 Type *I32Type = Type::getInt32Ty(M.getContext());
4100 Value *PLastIter = Builder.CreateAlloca(I32Type, nullptr, "p.lastiter");
4101 Value *PLowerBound = Builder.CreateAlloca(IVTy, nullptr, "p.lowerbound");
4102 Value *PUpperBound = Builder.CreateAlloca(IVTy, nullptr, "p.upperbound");
4103 Value *PStride = Builder.CreateAlloca(IVTy, nullptr, "p.stride");
4104
4105 // At the end of the preheader, prepare for calling the "init" function by
4106 // storing the current loop bounds into the allocated space. A canonical loop
4107 // always iterates from 0 to trip-count with step 1. Note that "init" expects
4108 // and produces an inclusive upper bound.
4110 Constant *Zero = ConstantInt::get(IVTy, 0);
4111 Constant *One = ConstantInt::get(IVTy, 1);
4112 Builder.CreateStore(Zero, PLowerBound);
4113 Value *UpperBound = Builder.CreateSub(CLI->getTripCount(), One);
4114 Builder.CreateStore(UpperBound, PUpperBound);
4115 Builder.CreateStore(One, PStride);
4116
4117 Value *ThreadNum = getOrCreateThreadID(SrcLoc);
4118
4119 Constant *SchedulingType = ConstantInt::get(
4120 I32Type, static_cast<int>(OMPScheduleType::UnorderedStatic));
4121
4122 // Call the "init" function and update the trip count of the loop with the
4123 // value it produced.
4124 Builder.CreateCall(StaticInit,
4125 {SrcLoc, ThreadNum, SchedulingType, PLastIter, PLowerBound,
4126 PUpperBound, PStride, One, Zero});
4127 Value *LowerBound = Builder.CreateLoad(IVTy, PLowerBound);
4128 Value *InclusiveUpperBound = Builder.CreateLoad(IVTy, PUpperBound);
4129 Value *TripCountMinusOne = Builder.CreateSub(InclusiveUpperBound, LowerBound);
4130 Value *TripCount = Builder.CreateAdd(TripCountMinusOne, One);
4131 CLI->setTripCount(TripCount);
4132
4133 // Update all uses of the induction variable except the one in the condition
4134 // block that compares it with the actual upper bound, and the increment in
4135 // the latch block.
4136
4137 CLI->mapIndVar([&](Instruction *OldIV) -> Value * {
4139 CLI->getBody()->getFirstInsertionPt());
4141 return Builder.CreateAdd(OldIV, LowerBound);
4142 });
4143
4144 // In the "exit" block, call the "fini" function.
4146 CLI->getExit()->getTerminator()->getIterator());
4147 Builder.CreateCall(StaticFini, {SrcLoc, ThreadNum});
4148
4149 // Add the barrier if requested.
4150 if (NeedsBarrier) {
4151 InsertPointOrErrorTy BarrierIP =
4152 createBarrier(LocationDescription(Builder.saveIP(), DL),
4153 omp::Directive::OMPD_for, /* ForceSimpleCall */ false,
4154 /* CheckCancelFlag */ false);
4155 if (!BarrierIP)
4156 return BarrierIP.takeError();
4157 }
4158
4159 InsertPointTy AfterIP = CLI->getAfterIP();
4160 CLI->invalidate();
4161
4162 return AfterIP;
4163}
4164
4166OpenMPIRBuilder::applyStaticChunkedWorkshareLoop(DebugLoc DL,
4167 CanonicalLoopInfo *CLI,
4168 InsertPointTy AllocaIP,
4169 bool NeedsBarrier,
4170 Value *ChunkSize) {
4171 assert(CLI->isValid() && "Requires a valid canonical loop");
4172 assert(ChunkSize && "Chunk size is required");
4173
4174 LLVMContext &Ctx = CLI->getFunction()->getContext();
4175 Value *IV = CLI->getIndVar();
4176 Value *OrigTripCount = CLI->getTripCount();
4177 Type *IVTy = IV->getType();
4178 assert(IVTy->getIntegerBitWidth() <= 64 &&
4179 "Max supported tripcount bitwidth is 64 bits");
4180 Type *InternalIVTy = IVTy->getIntegerBitWidth() <= 32 ? Type::getInt32Ty(Ctx)
4181 : Type::getInt64Ty(Ctx);
4182 Type *I32Type = Type::getInt32Ty(M.getContext());
4183 Constant *Zero = ConstantInt::get(InternalIVTy, 0);
4184 Constant *One = ConstantInt::get(InternalIVTy, 1);
4185
4186 // Declare useful OpenMP runtime functions.
4187 FunctionCallee StaticInit =
4188 getKmpcForStaticInitForType(InternalIVTy, M, *this);
4189 FunctionCallee StaticFini =
4190 getOrCreateRuntimeFunction(M, omp::OMPRTL___kmpc_for_static_fini);
4191
4192 // Allocate space for computed loop bounds as expected by the "init" function.
4193 Builder.restoreIP(AllocaIP);
4195 Value *PLastIter = Builder.CreateAlloca(I32Type, nullptr, "p.lastiter");
4196 Value *PLowerBound =
4197 Builder.CreateAlloca(InternalIVTy, nullptr, "p.lowerbound");
4198 Value *PUpperBound =
4199 Builder.CreateAlloca(InternalIVTy, nullptr, "p.upperbound");
4200 Value *PStride = Builder.CreateAlloca(InternalIVTy, nullptr, "p.stride");
4201
4202 // Set up the source location value for the OpenMP runtime.
4205
4206 // TODO: Detect overflow in ubsan or max-out with current tripcount.
4207 Value *CastedChunkSize =
4208 Builder.CreateZExtOrTrunc(ChunkSize, InternalIVTy, "chunksize");
4209 Value *CastedTripCount =
4210 Builder.CreateZExt(OrigTripCount, InternalIVTy, "tripcount");
4211
4212 Constant *SchedulingType = ConstantInt::get(
4213 I32Type, static_cast<int>(OMPScheduleType::UnorderedStaticChunked));
4214 Builder.CreateStore(Zero, PLowerBound);
4215 Value *OrigUpperBound = Builder.CreateSub(CastedTripCount, One);
4216 Builder.CreateStore(OrigUpperBound, PUpperBound);
4217 Builder.CreateStore(One, PStride);
4218
4219 // Call the "init" function and update the trip count of the loop with the
4220 // value it produced.
4221 uint32_t SrcLocStrSize;
4222 Constant *SrcLocStr = getOrCreateSrcLocStr(DL, SrcLocStrSize);
4223 Value *SrcLoc = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
4224 Value *ThreadNum = getOrCreateThreadID(SrcLoc);
4225 Builder.CreateCall(StaticInit,
4226 {/*loc=*/SrcLoc, /*global_tid=*/ThreadNum,
4227 /*schedtype=*/SchedulingType, /*plastiter=*/PLastIter,
4228 /*plower=*/PLowerBound, /*pupper=*/PUpperBound,
4229 /*pstride=*/PStride, /*incr=*/One,
4230 /*chunk=*/CastedChunkSize});
4231
4232 // Load values written by the "init" function.
4233 Value *FirstChunkStart =
4234 Builder.CreateLoad(InternalIVTy, PLowerBound, "omp_firstchunk.lb");
4235 Value *FirstChunkStop =
4236 Builder.CreateLoad(InternalIVTy, PUpperBound, "omp_firstchunk.ub");
4237 Value *FirstChunkEnd = Builder.CreateAdd(FirstChunkStop, One);
4238 Value *ChunkRange =
4239 Builder.CreateSub(FirstChunkEnd, FirstChunkStart, "omp_chunk.range");
4240 Value *NextChunkStride =
4241 Builder.CreateLoad(InternalIVTy, PStride, "omp_dispatch.stride");
4242
4243 // Create outer "dispatch" loop for enumerating the chunks.
4244 BasicBlock *DispatchEnter = splitBB(Builder, true);
4245 Value *DispatchCounter;
4247 {Builder.saveIP(), DL},
4248 [&](InsertPointTy BodyIP, Value *Counter) {
4249 DispatchCounter = Counter;
4250 return Error::success();
4251 },
4252 FirstChunkStart, CastedTripCount, NextChunkStride,
4253 /*IsSigned=*/false, /*InclusiveStop=*/false, /*ComputeIP=*/{},
4254 "dispatch");
4255 if (!LoopResult) {
4256 // It is safe to assume this didn't return an error because the callback
4257 // passed into createCanonicalLoop is the only possible error source, and it
4258 // always returns success. Need to still cast the result into bool to avoid
4259 // runtime errors.
4260 llvm_unreachable("unexpected error creating canonical loop");
4261 }
4262 CanonicalLoopInfo *DispatchCLI = *LoopResult;
4263
4264 // Remember the BasicBlocks of the dispatch loop we need, then invalidate to
4265 // not have to preserve the canonical invariant.
4266 BasicBlock *DispatchBody = DispatchCLI->getBody();
4267 BasicBlock *DispatchLatch = DispatchCLI->getLatch();
4268 BasicBlock *DispatchExit = DispatchCLI->getExit();
4269 BasicBlock *DispatchAfter = DispatchCLI->getAfter();
4270 DispatchCLI->invalidate();
4271
4272 // Rewire the original loop to become the chunk loop inside the dispatch loop.
4273 redirectTo(DispatchAfter, CLI->getAfter(), DL);
4274 redirectTo(CLI->getExit(), DispatchLatch, DL);
4275 redirectTo(DispatchBody, DispatchEnter, DL);
4276
4277 // Prepare the prolog of the chunk loop.
4280
4281 // Compute the number of iterations of the chunk loop.
4283 Value *ChunkEnd = Builder.CreateAdd(DispatchCounter, ChunkRange);
4284 Value *IsLastChunk =
4285 Builder.CreateICmpUGE(ChunkEnd, CastedTripCount, "omp_chunk.is_last");
4286 Value *CountUntilOrigTripCount =
4287 Builder.CreateSub(CastedTripCount, DispatchCounter);
4288 Value *ChunkTripCount = Builder.CreateSelect(
4289 IsLastChunk, CountUntilOrigTripCount, ChunkRange, "omp_chunk.tripcount");
4290 Value *BackcastedChunkTC =
4291 Builder.CreateTrunc(ChunkTripCount, IVTy, "omp_chunk.tripcount.trunc");
4292 CLI->setTripCount(BackcastedChunkTC);
4293
4294 // Update all uses of the induction variable except the one in the condition
4295 // block that compares it with the actual upper bound, and the increment in
4296 // the latch block.
4297 Value *BackcastedDispatchCounter =
4298 Builder.CreateTrunc(DispatchCounter, IVTy, "omp_dispatch.iv.trunc");
4299 CLI->mapIndVar([&](Instruction *) -> Value * {
4300 Builder.restoreIP(CLI->getBodyIP());
4301 return Builder.CreateAdd(IV, BackcastedDispatchCounter);
4302 });
4303
4304 // In the "exit" block, call the "fini" function.
4305 Builder.SetInsertPoint(DispatchExit, DispatchExit->getFirstInsertionPt());
4306 Builder.CreateCall(StaticFini, {SrcLoc, ThreadNum});
4307
4308 // Add the barrier if requested.
4309 if (NeedsBarrier) {
4310 InsertPointOrErrorTy AfterIP =
4311 createBarrier(LocationDescription(Builder.saveIP(), DL), OMPD_for,
4312 /*ForceSimpleCall=*/false, /*CheckCancelFlag=*/false);
4313 if (!AfterIP)
4314 return AfterIP.takeError();
4315 }
4316
4317#ifndef NDEBUG
4318 // Even though we currently do not support applying additional methods to it,
4319 // the chunk loop should remain a canonical loop.
4320 CLI->assertOK();
4321#endif
4322
4323 return InsertPointTy(DispatchAfter, DispatchAfter->getFirstInsertionPt());
4324}
4325
4326// Returns an LLVM function to call for executing an OpenMP static worksharing
4327// for loop depending on `type`. Only i32 and i64 are supported by the runtime.
4328// Always interpret integers as unsigned similarly to CanonicalLoopInfo.
4329static FunctionCallee
4331 WorksharingLoopType LoopType) {
4332 unsigned Bitwidth = Ty->getIntegerBitWidth();
4333 Module &M = OMPBuilder->M;
4334 switch (LoopType) {
4335 case WorksharingLoopType::ForStaticLoop:
4336 if (Bitwidth == 32)
4337 return OMPBuilder->getOrCreateRuntimeFunction(
4338 M, omp::RuntimeFunction::OMPRTL___kmpc_for_static_loop_4u);
4339 if (Bitwidth == 64)
4340 return OMPBuilder->getOrCreateRuntimeFunction(
4341 M, omp::RuntimeFunction::OMPRTL___kmpc_for_static_loop_8u);
4342 break;
4343 case WorksharingLoopType::DistributeStaticLoop:
4344 if (Bitwidth == 32)
4345 return OMPBuilder->getOrCreateRuntimeFunction(
4346 M, omp::RuntimeFunction::OMPRTL___kmpc_distribute_static_loop_4u);
4347 if (Bitwidth == 64)
4348 return OMPBuilder->getOrCreateRuntimeFunction(
4349 M, omp::RuntimeFunction::OMPRTL___kmpc_distribute_static_loop_8u);
4350 break;
4351 case WorksharingLoopType::DistributeForStaticLoop:
4352 if (Bitwidth == 32)
4353 return OMPBuilder->getOrCreateRuntimeFunction(
4354 M, omp::RuntimeFunction::OMPRTL___kmpc_distribute_for_static_loop_4u);
4355 if (Bitwidth == 64)
4356 return OMPBuilder->getOrCreateRuntimeFunction(
4357 M, omp::RuntimeFunction::OMPRTL___kmpc_distribute_for_static_loop_8u);
4358 break;
4359 }
4360 if (Bitwidth != 32 && Bitwidth != 64) {
4361 llvm_unreachable("Unknown OpenMP loop iterator bitwidth");
4362 }
4363 llvm_unreachable("Unknown type of OpenMP worksharing loop");
4364}
4365
4366// Inserts a call to proper OpenMP Device RTL function which handles
4367// loop worksharing.
4369 OpenMPIRBuilder *OMPBuilder, WorksharingLoopType LoopType,
4370 BasicBlock *InsertBlock, Value *Ident, Value *LoopBodyArg,
4371 Type *ParallelTaskPtr, Value *TripCount, Function &LoopBodyFn) {
4372 Type *TripCountTy = TripCount->getType();
4373 Module &M = OMPBuilder->M;
4374 IRBuilder<> &Builder = OMPBuilder->Builder;
4375 FunctionCallee RTLFn =
4376 getKmpcForStaticLoopForType(TripCountTy, OMPBuilder, LoopType);
4377 SmallVector<Value *, 8> RealArgs;
4378 RealArgs.push_back(Ident);
4379 RealArgs.push_back(Builder.CreateBitCast(&LoopBodyFn, ParallelTaskPtr));
4380 RealArgs.push_back(LoopBodyArg);
4381 RealArgs.push_back(TripCount);
4382 if (LoopType == WorksharingLoopType::DistributeStaticLoop) {
4383 RealArgs.push_back(ConstantInt::get(TripCountTy, 0));
4384 Builder.CreateCall(RTLFn, RealArgs);
4385 return;
4386 }
4387 FunctionCallee RTLNumThreads = OMPBuilder->getOrCreateRuntimeFunction(
4388 M, omp::RuntimeFunction::OMPRTL_omp_get_num_threads);
4389 Builder.restoreIP({InsertBlock, std::prev(InsertBlock->end())});
4390 Value *NumThreads = Builder.CreateCall(RTLNumThreads, {});
4391
4392 RealArgs.push_back(
4393 Builder.CreateZExtOrTrunc(NumThreads, TripCountTy, "num.threads.cast"));
4394 RealArgs.push_back(ConstantInt::get(TripCountTy, 0));
4395 if (LoopType == WorksharingLoopType::DistributeForStaticLoop) {
4396 RealArgs.push_back(ConstantInt::get(TripCountTy, 0));
4397 }
4398
4399 Builder.CreateCall(RTLFn, RealArgs);
4400}
4401
4402static void
4404 CanonicalLoopInfo *CLI, Value *Ident,
4405 Function &OutlinedFn, Type *ParallelTaskPtr,
4406 const SmallVector<Instruction *, 4> &ToBeDeleted,
4407 WorksharingLoopType LoopType) {
4408 IRBuilder<> &Builder = OMPIRBuilder->Builder;
4409 BasicBlock *Preheader = CLI->getPreheader();
4410 Value *TripCount = CLI->getTripCount();
4411
4412 // After loop body outling, the loop body contains only set up
4413 // of loop body argument structure and the call to the outlined
4414 // loop body function. Firstly, we need to move setup of loop body args
4415 // into loop preheader.
4416 Preheader->splice(std::prev(Preheader->end()), CLI->getBody(),
4417 CLI->getBody()->begin(), std::prev(CLI->getBody()->end()));
4418
4419 // The next step is to remove the whole loop. We do not it need anymore.
4420 // That's why make an unconditional branch from loop preheader to loop
4421 // exit block
4422 Builder.restoreIP({Preheader, Preheader->end()});
4423 Builder.SetCurrentDebugLocation(Preheader->getTerminator()->getDebugLoc());
4424 Preheader->getTerminator()->eraseFromParent();
4425 Builder.CreateBr(CLI->getExit());
4426
4427 // Delete dead loop blocks
4428 OpenMPIRBuilder::OutlineInfo CleanUpInfo;
4429 SmallPtrSet<BasicBlock *, 32> RegionBlockSet;
4430 SmallVector<BasicBlock *, 32> BlocksToBeRemoved;
4431 CleanUpInfo.EntryBB = CLI->getHeader();
4432 CleanUpInfo.ExitBB = CLI->getExit();
4433 CleanUpInfo.collectBlocks(RegionBlockSet, BlocksToBeRemoved);
4434 DeleteDeadBlocks(BlocksToBeRemoved);
4435
4436 // Find the instruction which corresponds to loop body argument structure
4437 // and remove the call to loop body function instruction.
4438 Value *LoopBodyArg;
4439 User *OutlinedFnUser = OutlinedFn.getUniqueUndroppableUser();
4440 assert(OutlinedFnUser &&
4441 "Expected unique undroppable user of outlined function");
4442 CallInst *OutlinedFnCallInstruction = dyn_cast<CallInst>(OutlinedFnUser);
4443 assert(OutlinedFnCallInstruction && "Expected outlined function call");
4444 assert((OutlinedFnCallInstruction->getParent() == Preheader) &&
4445 "Expected outlined function call to be located in loop preheader");
4446 // Check in case no argument structure has been passed.
4447 if (OutlinedFnCallInstruction->arg_size() > 1)
4448 LoopBodyArg = OutlinedFnCallInstruction->getArgOperand(1);
4449 else
4450 LoopBodyArg = Constant::getNullValue(Builder.getPtrTy());
4451 OutlinedFnCallInstruction->eraseFromParent();
4452
4453 createTargetLoopWorkshareCall(OMPIRBuilder, LoopType, Preheader, Ident,
4454 LoopBodyArg, ParallelTaskPtr, TripCount,
4455 OutlinedFn);
4456
4457 for (auto &ToBeDeletedItem : ToBeDeleted)
4458 ToBeDeletedItem->eraseFromParent();
4459 CLI->invalidate();
4460}
4461
4463OpenMPIRBuilder::applyWorkshareLoopTarget(DebugLoc DL, CanonicalLoopInfo *CLI,
4464 InsertPointTy AllocaIP,
4465 WorksharingLoopType LoopType) {
4466 uint32_t SrcLocStrSize;
4467 Constant *SrcLocStr = getOrCreateSrcLocStr(DL, SrcLocStrSize);
4468 Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
4469
4470 OutlineInfo OI;
4471 OI.OuterAllocaBB = CLI->getPreheader();
4472 Function *OuterFn = CLI->getPreheader()->getParent();
4473
4474 // Instructions which need to be deleted at the end of code generation
4476
4477 OI.OuterAllocaBB = AllocaIP.getBlock();
4478
4479 // Mark the body loop as region which needs to be extracted
4480 OI.EntryBB = CLI->getBody();
4481 OI.ExitBB = CLI->getLatch()->splitBasicBlock(CLI->getLatch()->begin(),
4482 "omp.prelatch", true);
4483
4484 // Prepare loop body for extraction
4485 Builder.restoreIP({CLI->getPreheader(), CLI->getPreheader()->begin()});
4486
4487 // Insert new loop counter variable which will be used only in loop
4488 // body.
4489 AllocaInst *NewLoopCnt = Builder.CreateAlloca(CLI->getIndVarType(), 0, "");
4490 Instruction *NewLoopCntLoad =
4491 Builder.CreateLoad(CLI->getIndVarType(), NewLoopCnt);
4492 // New loop counter instructions are redundant in the loop preheader when
4493 // code generation for workshare loop is finshed. That's why mark them as
4494 // ready for deletion.
4495 ToBeDeleted.push_back(NewLoopCntLoad);
4496 ToBeDeleted.push_back(NewLoopCnt);
4497
4498 // Analyse loop body region. Find all input variables which are used inside
4499 // loop body region.
4500 SmallPtrSet<BasicBlock *, 32> ParallelRegionBlockSet;
4502 OI.collectBlocks(ParallelRegionBlockSet, Blocks);
4503 SmallVector<BasicBlock *, 32> BlocksT(ParallelRegionBlockSet.begin(),
4504 ParallelRegionBlockSet.end());
4505
4506 CodeExtractorAnalysisCache CEAC(*OuterFn);
4507 CodeExtractor Extractor(Blocks,
4508 /* DominatorTree */ nullptr,
4509 /* AggregateArgs */ true,
4510 /* BlockFrequencyInfo */ nullptr,
4511 /* BranchProbabilityInfo */ nullptr,
4512 /* AssumptionCache */ nullptr,
4513 /* AllowVarArgs */ true,
4514 /* AllowAlloca */ true,
4515 /* AllocationBlock */ CLI->getPreheader(),
4516 /* Suffix */ ".omp_wsloop",
4517 /* AggrArgsIn0AddrSpace */ true);
4518
4519 BasicBlock *CommonExit = nullptr;
4520 SetVector<Value *> Inputs, Outputs, SinkingCands, HoistingCands;
4521
4522 // Find allocas outside the loop body region which are used inside loop
4523 // body
4524 Extractor.findAllocas(CEAC, SinkingCands, HoistingCands, CommonExit);
4525
4526 // We need to model loop body region as the function f(cnt, loop_arg).
4527 // That's why we replace loop induction variable by the new counter
4528 // which will be one of loop body function argument
4530 CLI->getIndVar()->user_end());
4531 for (auto Use : Users) {
4532 if (Instruction *Inst = dyn_cast<Instruction>(Use)) {
4533 if (ParallelRegionBlockSet.count(Inst->getParent())) {
4534 Inst->replaceUsesOfWith(CLI->getIndVar(), NewLoopCntLoad);
4535 }
4536 }
4537 }
4538 // Make sure that loop counter variable is not merged into loop body
4539 // function argument structure and it is passed as separate variable
4540 OI.ExcludeArgsFromAggregate.push_back(NewLoopCntLoad);
4541
4542 // PostOutline CB is invoked when loop body function is outlined and
4543 // loop body is replaced by call to outlined function. We need to add
4544 // call to OpenMP device rtl inside loop preheader. OpenMP device rtl
4545 // function will handle loop control logic.
4546 //
4547 OI.PostOutlineCB = [=, ToBeDeletedVec =
4548 std::move(ToBeDeleted)](Function &OutlinedFn) {
4549 workshareLoopTargetCallback(this, CLI, Ident, OutlinedFn, ParallelTaskPtr,
4550 ToBeDeletedVec, LoopType);
4551 };
4552 addOutlineInfo(std::move(OI));
4553 return CLI->getAfterIP();
4554}
4555
4558 bool NeedsBarrier, omp::ScheduleKind SchedKind, Value *ChunkSize,
4559 bool HasSimdModifier, bool HasMonotonicModifier,
4560 bool HasNonmonotonicModifier, bool HasOrderedClause,
4561 WorksharingLoopType LoopType) {
4562 if (Config.isTargetDevice())
4563 return applyWorkshareLoopTarget(DL, CLI, AllocaIP, LoopType);
4564 OMPScheduleType EffectiveScheduleType = computeOpenMPScheduleType(
4565 SchedKind, ChunkSize, HasSimdModifier, HasMonotonicModifier,
4566 HasNonmonotonicModifier, HasOrderedClause);
4567
4568 bool IsOrdered = (EffectiveScheduleType & OMPScheduleType::ModifierOrdered) ==
4569 OMPScheduleType::ModifierOrdered;
4570 switch (EffectiveScheduleType & ~OMPScheduleType::ModifierMask) {
4571 case OMPScheduleType::BaseStatic:
4572 assert(!ChunkSize && "No chunk size with static-chunked schedule");
4573 if (IsOrdered)
4574 return applyDynamicWorkshareLoop(DL, CLI, AllocaIP, EffectiveScheduleType,
4575 NeedsBarrier, ChunkSize);
4576 // FIXME: Monotonicity ignored?
4577 return applyStaticWorkshareLoop(DL, CLI, AllocaIP, NeedsBarrier);
4578
4579 case OMPScheduleType::BaseStaticChunked:
4580 if (IsOrdered)
4581 return applyDynamicWorkshareLoop(DL, CLI, AllocaIP, EffectiveScheduleType,
4582 NeedsBarrier, ChunkSize);
4583 // FIXME: Monotonicity ignored?
4584 return applyStaticChunkedWorkshareLoop(DL, CLI, AllocaIP, NeedsBarrier,
4585 ChunkSize);
4586
4587 case OMPScheduleType::BaseRuntime:
4588 case OMPScheduleType::BaseAuto:
4589 case OMPScheduleType::BaseGreedy:
4590 case OMPScheduleType::BaseBalanced:
4591 case OMPScheduleType::BaseSteal:
4592 case OMPScheduleType::BaseGuidedSimd:
4593 case OMPScheduleType::BaseRuntimeSimd:
4594 assert(!ChunkSize &&
4595 "schedule type does not support user-defined chunk sizes");
4596 [[fallthrough]];
4597 case OMPScheduleType::BaseDynamicChunked:
4598 case OMPScheduleType::BaseGuidedChunked:
4599 case OMPScheduleType::BaseGuidedIterativeChunked:
4600 case OMPScheduleType::BaseGuidedAnalyticalChunked:
4601 case OMPScheduleType::BaseStaticBalancedChunked:
4602 return applyDynamicWorkshareLoop(DL, CLI, AllocaIP, EffectiveScheduleType,
4603 NeedsBarrier, ChunkSize);
4604
4605 default:
4606 llvm_unreachable("Unknown/unimplemented schedule kind");
4607 }
4608}
4609
4610/// Returns an LLVM function to call for initializing loop bounds using OpenMP
4611/// dynamic scheduling depending on `type`. Only i32 and i64 are supported by
4612/// the runtime. Always interpret integers as unsigned similarly to
4613/// CanonicalLoopInfo.
4614static FunctionCallee
4616 unsigned Bitwidth = Ty->getIntegerBitWidth();
4617 if (Bitwidth == 32)
4618 return OMPBuilder.getOrCreateRuntimeFunction(
4619 M, omp::RuntimeFunction::OMPRTL___kmpc_dispatch_init_4u);
4620 if (Bitwidth == 64)
4621 return OMPBuilder.getOrCreateRuntimeFunction(
4622 M, omp::RuntimeFunction::OMPRTL___kmpc_dispatch_init_8u);
4623 llvm_unreachable("unknown OpenMP loop iterator bitwidth");
4624}
4625
4626/// Returns an LLVM function to call for updating the next loop using OpenMP
4627/// dynamic scheduling depending on `type`. Only i32 and i64 are supported by
4628/// the runtime. Always interpret integers as unsigned similarly to
4629/// CanonicalLoopInfo.
4630static FunctionCallee
4632 unsigned Bitwidth = Ty->getIntegerBitWidth();
4633 if (Bitwidth == 32)
4634 return OMPBuilder.getOrCreateRuntimeFunction(
4635 M, omp::RuntimeFunction::OMPRTL___kmpc_dispatch_next_4u);
4636 if (Bitwidth == 64)
4637 return OMPBuilder.getOrCreateRuntimeFunction(
4638 M, omp::RuntimeFunction::OMPRTL___kmpc_dispatch_next_8u);
4639 llvm_unreachable("unknown OpenMP loop iterator bitwidth");
4640}
4641
4642/// Returns an LLVM function to call for finalizing the dynamic loop using
4643/// depending on `type`. Only i32 and i64 are supported by the runtime. Always
4644/// interpret integers as unsigned similarly to CanonicalLoopInfo.
4645static FunctionCallee
4647 unsigned Bitwidth = Ty->getIntegerBitWidth();
4648 if (Bitwidth == 32)
4649 return OMPBuilder.getOrCreateRuntimeFunction(
4650 M, omp::RuntimeFunction::OMPRTL___kmpc_dispatch_fini_4u);
4651 if (Bitwidth == 64)
4652 return OMPBuilder.getOrCreateRuntimeFunction(
4653 M, omp::RuntimeFunction::OMPRTL___kmpc_dispatch_fini_8u);
4654 llvm_unreachable("unknown OpenMP loop iterator bitwidth");
4655}
4656
4658OpenMPIRBuilder::applyDynamicWorkshareLoop(DebugLoc DL, CanonicalLoopInfo *CLI,
4659 InsertPointTy AllocaIP,
4660 OMPScheduleType SchedType,
4661 bool NeedsBarrier, Value *Chunk) {
4662 assert(CLI->isValid() && "Requires a valid canonical loop");
4663 assert(!isConflictIP(AllocaIP, CLI->getPreheaderIP()) &&
4664 "Require dedicated allocate IP");
4666 "Require valid schedule type");
4667
4668 bool Ordered = (SchedType & OMPScheduleType::ModifierOrdered) ==
4669 OMPScheduleType::ModifierOrdered;
4670
4671 // Set up the source location value for OpenMP runtime.
4673
4674 uint32_t SrcLocStrSize;
4675 Constant *SrcLocStr = getOrCreateSrcLocStr(DL, SrcLocStrSize);
4676 Value *SrcLoc = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
4677
4678 // Declare useful OpenMP runtime functions.
4679 Value *IV = CLI->getIndVar();
4680 Type *IVTy = IV->getType();
4681 FunctionCallee DynamicInit = getKmpcForDynamicInitForType(IVTy, M, *this);
4682 FunctionCallee DynamicNext = getKmpcForDynamicNextForType(IVTy, M, *this);
4683
4684 // Allocate space for computed loop bounds as expected by the "init" function.
4685 Builder.SetInsertPoint(AllocaIP.getBlock()->getFirstNonPHIOrDbgOrAlloca());
4686 Type *I32Type = Type::getInt32Ty(M.getContext());
4687 Value *PLastIter = Builder.CreateAlloca(I32Type, nullptr, "p.lastiter");
4688 Value *PLowerBound = Builder.CreateAlloca(IVTy, nullptr, "p.lowerbound");
4689 Value *PUpperBound = Builder.CreateAlloca(IVTy, nullptr, "p.upperbound");
4690 Value *PStride = Builder.CreateAlloca(IVTy, nullptr, "p.stride");
4691
4692 // At the end of the preheader, prepare for calling the "init" function by
4693 // storing the current loop bounds into the allocated space. A canonical loop
4694 // always iterates from 0 to trip-count with step 1. Note that "init" expects
4695 // and produces an inclusive upper bound.
4696 BasicBlock *PreHeader = CLI->getPreheader();
4697 Builder.SetInsertPoint(PreHeader->getTerminator());
4698 Constant *One = ConstantInt::get(IVTy, 1);
4699 Builder.CreateStore(One, PLowerBound);
4700 Value *UpperBound = CLI->getTripCount();
4701 Builder.CreateStore(UpperBound, PUpperBound);
4702 Builder.CreateStore(One, PStride);
4703
4704 BasicBlock *Header = CLI->getHeader();
4705 BasicBlock *Exit = CLI->getExit();
4706 BasicBlock *Cond = CLI->getCond();
4707 BasicBlock *Latch = CLI->getLatch();
4708 InsertPointTy AfterIP = CLI->getAfterIP();
4709
4710 // The CLI will be "broken" in the code below, as the loop is no longer
4711 // a valid canonical loop.
4712
4713 if (!Chunk)
4714 Chunk = One;
4715
4716 Value *ThreadNum = getOrCreateThreadID(SrcLoc);
4717
4718 Constant *SchedulingType =
4719 ConstantInt::get(I32Type, static_cast<int>(SchedType));
4720
4721 // Call the "init" function.
4722 Builder.CreateCall(DynamicInit,
4723 {SrcLoc, ThreadNum, SchedulingType, /* LowerBound */ One,
4724 UpperBound, /* step */ One, Chunk});
4725
4726 // An outer loop around the existing one.
4727 BasicBlock *OuterCond = BasicBlock::Create(
4728 PreHeader->getContext(), Twine(PreHeader->getName()) + ".outer.cond",
4729 PreHeader->getParent());
4730 // This needs to be 32-bit always, so can't use the IVTy Zero above.
4731 Builder.SetInsertPoint(OuterCond, OuterCond->getFirstInsertionPt());
4732 Value *Res =
4733 Builder.CreateCall(DynamicNext, {SrcLoc, ThreadNum, PLastIter,
4734 PLowerBound, PUpperBound, PStride});
4735 Constant *Zero32 = ConstantInt::get(I32Type, 0);
4736 Value *MoreWork = Builder.CreateCmp(CmpInst::ICMP_NE, Res, Zero32);
4737 Value *LowerBound =
4738 Builder.CreateSub(Builder.CreateLoad(IVTy, PLowerBound), One, "lb");
4739 Builder.CreateCondBr(MoreWork, Header, Exit);
4740
4741 // Change PHI-node in loop header to use outer cond rather than preheader,
4742 // and set IV to the LowerBound.
4743 Instruction *Phi = &Header->front();
4744 auto *PI = cast<PHINode>(Phi);
4745 PI->setIncomingBlock(0, OuterCond);
4746 PI->setIncomingValue(0, LowerBound);
4747
4748 // Then set the pre-header to jump to the OuterCond
4749 Instruction *Term = PreHeader->getTerminator();
4750 auto *Br = cast<BranchInst>(Term);
4751 Br->setSuccessor(0, OuterCond);
4752
4753 // Modify the inner condition:
4754 // * Use the UpperBound returned from the DynamicNext call.
4755 // * jump to the loop outer loop when done with one of the inner loops.
4756 Builder.SetInsertPoint(Cond, Cond->getFirstInsertionPt());
4757 UpperBound = Builder.CreateLoad(IVTy, PUpperBound, "ub");
4759 auto *CI = cast<CmpInst>(Comp);
4760 CI->setOperand(1, UpperBound);
4761 // Redirect the inner exit to branch to outer condition.
4762 Instruction *Branch = &Cond->back();
4763 auto *BI = cast<BranchInst>(Branch);
4764 assert(BI->getSuccessor(1) == Exit);
4765 BI->setSuccessor(1, OuterCond);
4766
4767 // Call the "fini" function if "ordered" is present in wsloop directive.
4768 if (Ordered) {
4769 Builder.SetInsertPoint(&Latch->back());
4770 FunctionCallee DynamicFini = getKmpcForDynamicFiniForType(IVTy, M, *this);
4771 Builder.CreateCall(DynamicFini, {SrcLoc, ThreadNum});
4772 }
4773
4774 // Add the barrier if requested.
4775 if (NeedsBarrier) {
4776 Builder.SetInsertPoint(&Exit->back());
4777 InsertPointOrErrorTy BarrierIP =
4778 createBarrier(LocationDescription(Builder.saveIP(), DL),
4779 omp::Directive::OMPD_for, /* ForceSimpleCall */ false,
4780 /* CheckCancelFlag */ false);
4781 if (!BarrierIP)
4782 return BarrierIP.takeError();
4783 }
4784
4785 CLI->invalidate();
4786 return AfterIP;
4787}
4788
4789/// Redirect all edges that branch to \p OldTarget to \p NewTarget. That is,
4790/// after this \p OldTarget will be orphaned.
4792 BasicBlock *NewTarget, DebugLoc DL) {
4793 for (BasicBlock *Pred : make_early_inc_range(predecessors(OldTarget)))
4794 redirectTo(Pred, NewTarget, DL);
4795}
4796
4797/// Determine which blocks in \p BBs are reachable from outside and remove the
4798/// ones that are not reachable from the function.
4800 SmallPtrSet<BasicBlock *, 6> BBsToErase{BBs.begin(), BBs.end()};
4801 auto HasRemainingUses = [&BBsToErase](BasicBlock *BB) {
4802 for (Use &U : BB->uses()) {
4803 auto *UseInst = dyn_cast<Instruction>(U.getUser());
4804 if (!UseInst)
4805 continue;
4806 if (BBsToErase.count(UseInst->getParent()))
4807 continue;
4808 return true;
4809 }
4810 return false;
4811 };
4812
4813 while (BBsToErase.remove_if(HasRemainingUses)) {
4814 // Try again if anything was removed.
4815 }
4816
4817 SmallVector<BasicBlock *, 7> BBVec(BBsToErase.begin(), BBsToErase.end());
4818 DeleteDeadBlocks(BBVec);
4819}
4820
4823 InsertPointTy ComputeIP) {
4824 assert(Loops.size() >= 1 && "At least one loop required");
4825 size_t NumLoops = Loops.size();
4826
4827 // Nothing to do if there is already just one loop.
4828 if (NumLoops == 1)
4829 return Loops.front();
4830
4831 CanonicalLoopInfo *Outermost = Loops.front();
4832 CanonicalLoopInfo *Innermost = Loops.back();
4833 BasicBlock *OrigPreheader = Outermost->getPreheader();
4834 BasicBlock *OrigAfter = Outermost->getAfter();
4835 Function *F = OrigPreheader->getParent();
4836
4837 // Loop control blocks that may become orphaned later.
4838 SmallVector<BasicBlock *, 12> OldControlBBs;
4839 OldControlBBs.reserve(6 * Loops.size());
4841 Loop->collectControlBlocks(OldControlBBs);
4842
4843 // Setup the IRBuilder for inserting the trip count computation.
4845 if (ComputeIP.isSet())
4846 Builder.restoreIP(ComputeIP);
4847 else
4848 Builder.restoreIP(Outermost->getPreheaderIP());
4849
4850 // Derive the collapsed' loop trip count.
4851 // TODO: Find common/largest indvar type.
4852 Value *CollapsedTripCount = nullptr;
4853 for (CanonicalLoopInfo *L : Loops) {
4854 assert(L->isValid() &&
4855 "All loops to collapse must be valid canonical loops");
4856 Value *OrigTripCount = L->getTripCount();
4857 if (!CollapsedTripCount) {
4858 CollapsedTripCount = OrigTripCount;
4859 continue;
4860 }
4861
4862 // TODO: Enable UndefinedSanitizer to diagnose an overflow here.
4863 CollapsedTripCount = Builder.CreateMul(CollapsedTripCount, OrigTripCount,
4864 {}, /*HasNUW=*/true);
4865 }
4866
4867 // Create the collapsed loop control flow.
4868 CanonicalLoopInfo *Result =
4869 createLoopSkeleton(DL, CollapsedTripCount, F,
4870 OrigPreheader->getNextNode(), OrigAfter, "collapsed");
4871
4872 // Build the collapsed loop body code.
4873 // Start with deriving the input loop induction variables from the collapsed
4874 // one, using a divmod scheme. To preserve the original loops' order, the
4875 // innermost loop use the least significant bits.
4876 Builder.restoreIP(Result->getBodyIP());
4877
4878 Value *Leftover = Result->getIndVar();
4879 SmallVector<Value *> NewIndVars;
4880 NewIndVars.resize(NumLoops);
4881 for (int i = NumLoops - 1; i >= 1; --i) {
4882 Value *OrigTripCount = Loops[i]->getTripCount();
4883
4884 Value *NewIndVar = Builder.CreateURem(Leftover, OrigTripCount);
4885 NewIndVars[i] = NewIndVar;
4886
4887 Leftover = Builder.CreateUDiv(Leftover, OrigTripCount);
4888 }
4889 // Outermost loop gets all the remaining bits.
4890 NewIndVars[0] = Leftover;
4891
4892 // Construct the loop body control flow.
4893 // We progressively construct the branch structure following in direction of
4894 // the control flow, from the leading in-between code, the loop nest body, the
4895 // trailing in-between code, and rejoining the collapsed loop's latch.
4896 // ContinueBlock and ContinuePred keep track of the source(s) of next edge. If
4897 // the ContinueBlock is set, continue with that block. If ContinuePred, use
4898 // its predecessors as sources.
4899 BasicBlock *ContinueBlock = Result->getBody();
4900 BasicBlock *ContinuePred = nullptr;
4901 auto ContinueWith = [&ContinueBlock, &ContinuePred, DL](BasicBlock *Dest,
4902 BasicBlock *NextSrc) {
4903 if (ContinueBlock)
4904 redirectTo(ContinueBlock, Dest, DL);
4905 else
4906 redirectAllPredecessorsTo(ContinuePred, Dest, DL);
4907
4908 ContinueBlock = nullptr;
4909 ContinuePred = NextSrc;
4910 };
4911
4912 // The code before the nested loop of each level.
4913 // Because we are sinking it into the nest, it will be executed more often
4914 // that the original loop. More sophisticated schemes could keep track of what
4915 // the in-between code is and instantiate it only once per thread.
4916 for (size_t i = 0; i < NumLoops - 1; ++i)
4917 ContinueWith(Loops[i]->getBody(), Loops[i + 1]->getHeader());
4918
4919 // Connect the loop nest body.
4920 ContinueWith(Innermost->getBody(), Innermost->getLatch());
4921
4922 // The code after the nested loop at each level.
4923 for (size_t i = NumLoops - 1; i > 0; --i)
4924 ContinueWith(Loops[i]->getAfter(), Loops[i - 1]->getLatch());
4925
4926 // Connect the finished loop to the collapsed loop latch.
4927 ContinueWith(Result->getLatch(), nullptr);
4928
4929 // Replace the input loops with the new collapsed loop.
4930 redirectTo(Outermost->getPreheader(), Result->getPreheader(), DL);
4931 redirectTo(Result->getAfter(), Outermost->getAfter(), DL);
4932
4933 // Replace the input loop indvars with the derived ones.
4934 for (size_t i = 0; i < NumLoops; ++i)
4935 Loops[i]->getIndVar()->replaceAllUsesWith(NewIndVars[i]);
4936
4937 // Remove unused parts of the input loops.
4938 removeUnusedBlocksFromParent(OldControlBBs);
4939
4940 for (CanonicalLoopInfo *L : Loops)
4941 L->invalidate();
4942
4943#ifndef NDEBUG
4944 Result->assertOK();
4945#endif
4946 return Result;
4947}
4948
4949std::vector<CanonicalLoopInfo *>
4951 ArrayRef<Value *> TileSizes) {
4952 assert(TileSizes.size() == Loops.size() &&
4953 "Must pass as many tile sizes as there are loops");
4954 int NumLoops = Loops.size();
4955 assert(NumLoops >= 1 && "At least one loop to tile required");
4956
4957 CanonicalLoopInfo *OutermostLoop = Loops.front();
4958 CanonicalLoopInfo *InnermostLoop = Loops.back();
4959 Function *F = OutermostLoop->getBody()->getParent();
4960 BasicBlock *InnerEnter = InnermostLoop->getBody();
4961 BasicBlock *InnerLatch = InnermostLoop->getLatch();
4962
4963 // Loop control blocks that may become orphaned later.
4964 SmallVector<BasicBlock *, 12> OldControlBBs;
4965 OldControlBBs.reserve(6 * Loops.size());
4967 Loop->collectControlBlocks(OldControlBBs);
4968
4969 // Collect original trip counts and induction variable to be accessible by
4970 // index. Also, the structure of the original loops is not preserved during
4971 // the construction of the tiled loops, so do it before we scavenge the BBs of
4972 // any original CanonicalLoopInfo.
4973 SmallVector<Value *, 4> OrigTripCounts, OrigIndVars;
4974 for (CanonicalLoopInfo *L : Loops) {
4975 assert(L->isValid() && "All input loops must be valid canonical loops");
4976 OrigTripCounts.push_back(L->getTripCount());
4977 OrigIndVars.push_back(L->getIndVar());
4978 }
4979
4980 // Collect the code between loop headers. These may contain SSA definitions
4981 // that are used in the loop nest body. To be usable with in the innermost
4982 // body, these BasicBlocks will be sunk into the loop nest body. That is,
4983 // these instructions may be executed more often than before the tiling.
4984 // TODO: It would be sufficient to only sink them into body of the
4985 // corresponding tile loop.
4987 for (int i = 0; i < NumLoops - 1; ++i) {
4988 CanonicalLoopInfo *Surrounding = Loops[i];
4989 CanonicalLoopInfo *Nested = Loops[i + 1];
4990
4991 BasicBlock *EnterBB = Surrounding->getBody();
4992 BasicBlock *ExitBB = Nested->getHeader();
4993 InbetweenCode.emplace_back(EnterBB, ExitBB);
4994 }
4995
4996 // Compute the trip counts of the floor loops.
4998 Builder.restoreIP(OutermostLoop->getPreheaderIP());
4999 SmallVector<Value *, 4> FloorCount, FloorRems;
5000 for (int i = 0; i < NumLoops; ++i) {
5001 Value *TileSize = TileSizes[i];
5002 Value *OrigTripCount = OrigTripCounts[i];
5003 Type *IVType = OrigTripCount->getType();
5004
5005 Value *FloorTripCount = Builder.CreateUDiv(OrigTripCount, TileSize);
5006 Value *FloorTripRem = Builder.CreateURem(OrigTripCount, TileSize);
5007
5008 // 0 if tripcount divides the tilesize, 1 otherwise.
5009 // 1 means we need an additional iteration for a partial tile.
5010 //
5011 // Unfortunately we cannot just use the roundup-formula
5012 // (tripcount + tilesize - 1)/tilesize
5013 // because the summation might overflow. We do not want introduce undefined
5014 // behavior when the untiled loop nest did not.
5015 Value *FloorTripOverflow =
5016 Builder.CreateICmpNE(FloorTripRem, ConstantInt::get(IVType, 0));
5017
5018 FloorTripOverflow = Builder.CreateZExt(FloorTripOverflow, IVType);
5019 FloorTripCount =
5020 Builder.CreateAdd(FloorTripCount, FloorTripOverflow,
5021 "omp_floor" + Twine(i) + ".tripcount", true);
5022
5023 // Remember some values for later use.
5024 FloorCount.push_back(FloorTripCount);
5025 FloorRems.push_back(FloorTripRem);
5026 }
5027
5028 // Generate the new loop nest, from the outermost to the innermost.
5029 std::vector<CanonicalLoopInfo *> Result;
5030 Result.reserve(NumLoops * 2);
5031
5032 // The basic block of the surrounding loop that enters the nest generated
5033 // loop.
5034 BasicBlock *Enter = OutermostLoop->getPreheader();
5035
5036 // The basic block of the surrounding loop where the inner code should
5037 // continue.
5038 BasicBlock *Continue = OutermostLoop->getAfter();
5039
5040 // Where the next loop basic block should be inserted.
5041 BasicBlock *OutroInsertBefore = InnermostLoop->getExit();
5042
5043 auto EmbeddNewLoop =
5044 [this, DL, F, InnerEnter, &Enter, &Continue, &OutroInsertBefore](
5045 Value *TripCount, const Twine &Name) -> CanonicalLoopInfo * {
5046 CanonicalLoopInfo *EmbeddedLoop = createLoopSkeleton(
5047 DL, TripCount, F, InnerEnter, OutroInsertBefore, Name);
5048 redirectTo(Enter, EmbeddedLoop->getPreheader(), DL);
5049 redirectTo(EmbeddedLoop->getAfter(), Continue, DL);
5050
5051 // Setup the position where the next embedded loop connects to this loop.
5052 Enter = EmbeddedLoop->getBody();
5053 Continue = EmbeddedLoop->getLatch();
5054 OutroInsertBefore = EmbeddedLoop->getLatch();
5055 return EmbeddedLoop;
5056 };
5057
5058 auto EmbeddNewLoops = [&Result, &EmbeddNewLoop](ArrayRef<Value *> TripCounts,
5059 const Twine &NameBase) {
5060 for (auto P : enumerate(TripCounts)) {
5061 CanonicalLoopInfo *EmbeddedLoop =
5062 EmbeddNewLoop(P.value(), NameBase + Twine(P.index()));
5063 Result.push_back(EmbeddedLoop);
5064 }
5065 };
5066
5067 EmbeddNewLoops(FloorCount, "floor");
5068
5069 // Within the innermost floor loop, emit the code that computes the tile
5070 // sizes.
5072 SmallVector<Value *, 4> TileCounts;
5073 for (int i = 0; i < NumLoops; ++i) {
5074 CanonicalLoopInfo *FloorLoop = Result[i];
5075 Value *TileSize = TileSizes[i];
5076
5077 Value *FloorIsEpilogue =
5078 Builder.CreateICmpEQ(FloorLoop->getIndVar(), FloorCount[i]);
5079 Value *TileTripCount =
5080 Builder.CreateSelect(FloorIsEpilogue, FloorRems[i], TileSize);
5081
5082 TileCounts.push_back(TileTripCount);
5083 }
5084
5085 // Create the tile loops.
5086 EmbeddNewLoops(TileCounts, "tile");
5087
5088 // Insert the inbetween code into the body.
5089 BasicBlock *BodyEnter = Enter;
5090 BasicBlock *BodyEntered = nullptr;
5091 for (std::pair<BasicBlock *, BasicBlock *> P : InbetweenCode) {
5092 BasicBlock *EnterBB = P.first;
5093 BasicBlock *ExitBB = P.second;
5094
5095 if (BodyEnter)
5096 redirectTo(BodyEnter, EnterBB, DL);
5097 else
5098 redirectAllPredecessorsTo(BodyEntered, EnterBB, DL);
5099
5100 BodyEnter = nullptr;
5101 BodyEntered = ExitBB;
5102 }
5103
5104 // Append the original loop nest body into the generated loop nest body.
5105 if (BodyEnter)
5106 redirectTo(BodyEnter, InnerEnter, DL);
5107 else
5108 redirectAllPredecessorsTo(BodyEntered, InnerEnter, DL);
5110
5111 // Replace the original induction variable with an induction variable computed
5112 // from the tile and floor induction variables.
5113 Builder.restoreIP(Result.back()->getBodyIP());
5114 for (int i = 0; i < NumLoops; ++i) {
5115 CanonicalLoopInfo *FloorLoop = Result[i];
5116 CanonicalLoopInfo *TileLoop = Result[NumLoops + i];
5117 Value *OrigIndVar = OrigIndVars[i];
5118 Value *Size = TileSizes[i];
5119
5120 Value *Scale =
5121 Builder.CreateMul(Size, FloorLoop->getIndVar(), {}, /*HasNUW=*/true);
5122 Value *Shift =
5123 Builder.CreateAdd(Scale, TileLoop->getIndVar(), {}, /*HasNUW=*/true);
5124 OrigIndVar->replaceAllUsesWith(Shift);
5125 }
5126
5127 // Remove unused parts of the original loops.
5128 removeUnusedBlocksFromParent(OldControlBBs);
5129
5130 for (CanonicalLoopInfo *L : Loops)
5131 L->invalidate();
5132
5133#ifndef NDEBUG
5134 for (CanonicalLoopInfo *GenL : Result)
5135 GenL->assertOK();
5136#endif
5137 return Result;
5138}
5139
5140/// Attach metadata \p Properties to the basic block described by \p BB. If the
5141/// basic block already has metadata, the basic block properties are appended.
5143 ArrayRef<Metadata *> Properties) {
5144 // Nothing to do if no property to attach.
5145 if (Properties.empty())
5146 return;
5147
5148 LLVMContext &Ctx = BB->getContext();
5149 SmallVector<Metadata *> NewProperties;
5150 NewProperties.push_back(nullptr);
5151
5152 // If the basic block already has metadata, prepend it to the new metadata.
5153 MDNode *Existing = BB->getTerminator()->getMetadata(LLVMContext::MD_loop);
5154 if (Existing)
5155 append_range(NewProperties, drop_begin(Existing->operands(), 1));
5156
5157 append_range(NewProperties, Properties);
5158 MDNode *BasicBlockID = MDNode::getDistinct(Ctx, NewProperties);
5159 BasicBlockID->replaceOperandWith(0, BasicBlockID);
5160
5161 BB->getTerminator()->setMetadata(LLVMContext::MD_loop, BasicBlockID);
5162}
5163
5164/// Attach loop metadata \p Properties to the loop described by \p Loop. If the
5165/// loop already has metadata, the loop properties are appended.
5167 ArrayRef<Metadata *> Properties) {
5168 assert(Loop->isValid() && "Expecting a valid CanonicalLoopInfo");
5169
5170 // Attach metadata to the loop's latch
5171 BasicBlock *Latch = Loop->getLatch();
5172 assert(Latch && "A valid CanonicalLoopInfo must have a unique latch");
5173 addBasicBlockMetadata(Latch, Properties);
5174}
5175
5176/// Attach llvm.access.group metadata to the memref instructions of \p Block
5177static void addSimdMetadata(BasicBlock *Block, MDNode *AccessGroup,
5178 LoopInfo &LI) {
5179 for (Instruction &I : *Block) {
5180 if (I.mayReadOrWriteMemory()) {
5181 // TODO: This instruction may already have access group from
5182 // other pragmas e.g. #pragma clang loop vectorize. Append
5183 // so that the existing metadata is not overwritten.
5184 I.setMetadata(LLVMContext::MD_access_group, AccessGroup);
5185 }
5186 }
5187}
5188
5192 Loop, {MDNode::get(Ctx, MDString::get(Ctx, "llvm.loop.unroll.enable")),
5193 MDNode::get(Ctx, MDString::get(Ctx, "llvm.loop.unroll.full"))});
5194}
5195
5199 Loop, {
5200 MDNode::get(Ctx, MDString::get(Ctx, "llvm.loop.unroll.enable")),
5201 });
5202}
5203
5204void OpenMPIRBuilder::createIfVersion(CanonicalLoopInfo *CanonicalLoop,
5205 Value *IfCond, ValueToValueMapTy &VMap,
5206 const Twine &NamePrefix) {
5207 Function *F = CanonicalLoop->getFunction();
5208
5209 // Define where if branch should be inserted
5210 Instruction *SplitBefore;
5211 if (Instruction::classof(IfCond)) {
5212 SplitBefore = dyn_cast<Instruction>(IfCond);
5213 } else {
5214 SplitBefore = CanonicalLoop->getPreheader()->getTerminator();
5215 }
5216
5217 // TODO: We should not rely on pass manager. Currently we use pass manager
5218 // only for getting llvm::Loop which corresponds to given CanonicalLoopInfo
5219 // object. We should have a method which returns all blocks between
5220 // CanonicalLoopInfo::getHeader() and CanonicalLoopInfo::getAfter()
5222 FAM.registerPass([]() { return DominatorTreeAnalysis(); });
5223 FAM.registerPass([]() { return LoopAnalysis(); });
5224 FAM.registerPass([]() { return PassInstrumentationAnalysis(); });
5225
5226 // Get the loop which needs to be cloned
5227 LoopAnalysis LIA;
5228 LoopInfo &&LI = LIA.run(*F, FAM);
5229 Loop *L = LI.getLoopFor(CanonicalLoop->getHeader());
5230
5231 // Create additional blocks for the if statement
5232 BasicBlock *Head = SplitBefore->getParent();
5233 Instruction *HeadOldTerm = Head->getTerminator();
5234 llvm::LLVMContext &C = Head->getContext();
5236 C, NamePrefix + ".if.then", Head->getParent(), Head->getNextNode());
5238 C, NamePrefix + ".if.else", Head->getParent(), CanonicalLoop->getExit());
5239
5240 // Create if condition branch.
5241 Builder.SetInsertPoint(HeadOldTerm);
5242 Instruction *BrInstr =
5243 Builder.CreateCondBr(IfCond, ThenBlock, /*ifFalse*/ ElseBlock);
5244 InsertPointTy IP{BrInstr->getParent(), ++BrInstr->getIterator()};
5245 // Then block contains branch to omp loop which needs to be vectorized
5246 spliceBB(IP, ThenBlock, false);
5247 ThenBlock->replaceSuccessorsPhiUsesWith(Head, ThenBlock);
5248
5249 Builder.SetInsertPoint(ElseBlock);
5250
5251 // Clone loop for the else branch
5253
5254 VMap[CanonicalLoop->getPreheader()] = ElseBlock;
5255 for (BasicBlock *Block : L->getBlocks()) {
5256 BasicBlock *NewBB = CloneBasicBlock(Block, VMap, "", F);
5257 NewBB->moveBefore(CanonicalLoop->getExit());
5258 VMap[Block] = NewBB;
5259 NewBlocks.push_back(NewBB);
5260 }
5261 remapInstructionsInBlocks(NewBlocks, VMap);
5262 Builder.CreateBr(NewBlocks.front());
5263}
5264
5265unsigned
5267 const StringMap<bool> &Features) {
5268 if (TargetTriple.isX86()) {
5269 if (Features.lookup("avx512f"))
5270 return 512;
5271 else if (Features.lookup("avx"))
5272 return 256;
5273 return 128;
5274 }
5275 if (TargetTriple.isPPC())
5276 return 128;
5277 if (TargetTriple.isWasm())
5278 return 128;
5279 return 0;
5280}
5281
5283 MapVector<Value *, Value *> AlignedVars,
5284 Value *IfCond, OrderKind Order,
5285 ConstantInt *Simdlen, ConstantInt *Safelen) {
5287
5288 Function *F = CanonicalLoop->getFunction();
5289
5290 // TODO: We should not rely on pass manager. Currently we use pass manager
5291 // only for getting llvm::Loop which corresponds to given CanonicalLoopInfo
5292 // object. We should have a method which returns all blocks between
5293 // CanonicalLoopInfo::getHeader() and CanonicalLoopInfo::getAfter()
5295 FAM.registerPass([]() { return DominatorTreeAnalysis(); });
5296 FAM.registerPass([]() { return LoopAnalysis(); });
5297 FAM.registerPass([]() { return PassInstrumentationAnalysis(); });
5298
5299 LoopAnalysis LIA;
5300 LoopInfo &&LI = LIA.run(*F, FAM);
5301
5302 Loop *L = LI.getLoopFor(CanonicalLoop->getHeader());
5303 if (AlignedVars.size()) {
5305 for (auto &AlignedItem : AlignedVars) {
5306 Value *AlignedPtr = AlignedItem.first;
5307 Value *Alignment = AlignedItem.second;
5308 Instruction *loadInst = dyn_cast<Instruction>(AlignedPtr);
5309 Builder.SetInsertPoint(loadInst->getNextNode());
5310 Builder.CreateAlignmentAssumption(F->getDataLayout(),
5311 AlignedPtr, Alignment);
5312 }
5313 Builder.restoreIP(IP);
5314 }
5315
5316 if (IfCond) {
5317 ValueToValueMapTy VMap;
5318 createIfVersion(CanonicalLoop, IfCond, VMap, "simd");
5319 // Add metadata to the cloned loop which disables vectorization
5320 Value *MappedLatch = VMap.lookup(CanonicalLoop->getLatch());
5321 assert(MappedLatch &&
5322 "Cannot find value which corresponds to original loop latch");
5323 assert(isa<BasicBlock>(MappedLatch) &&
5324 "Cannot cast mapped latch block value to BasicBlock");
5325 BasicBlock *NewLatchBlock = dyn_cast<BasicBlock>(MappedLatch);
5326 ConstantAsMetadata *BoolConst =
5329 NewLatchBlock,
5330 {MDNode::get(Ctx, {MDString::get(Ctx, "llvm.loop.vectorize.enable"),
5331 BoolConst})});
5332 }
5333
5334 SmallSet<BasicBlock *, 8> Reachable;
5335
5336 // Get the basic blocks from the loop in which memref instructions
5337 // can be found.
5338 // TODO: Generalize getting all blocks inside a CanonicalizeLoopInfo,
5339 // preferably without running any passes.
5340 for (BasicBlock *Block : L->getBlocks()) {
5341 if (Block == CanonicalLoop->getCond() ||
5342 Block == CanonicalLoop->getHeader())
5343 continue;
5344 Reachable.insert(Block);
5345 }
5346
5347 SmallVector<Metadata *> LoopMDList;
5348
5349 // In presence of finite 'safelen', it may be unsafe to mark all
5350 // the memory instructions parallel, because loop-carried
5351 // dependences of 'safelen' iterations are possible.
5352 // If clause order(concurrent) is specified then the memory instructions
5353 // are marked parallel even if 'safelen' is finite.
5354 if ((Safelen == nullptr) || (Order == OrderKind::OMP_ORDER_concurrent)) {
5355 // Add access group metadata to memory-access instructions.
5356 MDNode *AccessGroup = MDNode::getDistinct(Ctx, {});
5357 for (BasicBlock *BB : Reachable)
5358 addSimdMetadata(BB, AccessGroup, LI);
5359 // TODO: If the loop has existing parallel access metadata, have
5360 // to combine two lists.
5361 LoopMDList.push_back(MDNode::get(
5362 Ctx, {MDString::get(Ctx, "llvm.loop.parallel_accesses"), AccessGroup}));
5363 }
5364
5365 // Use the above access group metadata to create loop level
5366 // metadata, which should be distinct for each loop.
5367 ConstantAsMetadata *BoolConst =
5369 LoopMDList.push_back(MDNode::get(
5370 Ctx, {MDString::get(Ctx, "llvm.loop.vectorize.enable"), BoolConst}));
5371
5372 if (Simdlen || Safelen) {
5373 // If both simdlen and safelen clauses are specified, the value of the
5374 // simdlen parameter must be less than or equal to the value of the safelen
5375 // parameter. Therefore, use safelen only in the absence of simdlen.
5376 ConstantInt *VectorizeWidth = Simdlen == nullptr ? Safelen : Simdlen;
5377 LoopMDList.push_back(
5378 MDNode::get(Ctx, {MDString::get(Ctx, "llvm.loop.vectorize.width"),
5379 ConstantAsMetadata::get(VectorizeWidth)}));
5380 }
5381
5382 addLoopMetadata(CanonicalLoop, LoopMDList);
5383}
5384
5385/// Create the TargetMachine object to query the backend for optimization
5386/// preferences.
5387///
5388/// Ideally, this would be passed from the front-end to the OpenMPBuilder, but
5389/// e.g. Clang does not pass it to its CodeGen layer and creates it only when
5390/// needed for the LLVM pass pipline. We use some default options to avoid
5391/// having to pass too many settings from the frontend that probably do not
5392/// matter.
5393///
5394/// Currently, TargetMachine is only used sometimes by the unrollLoopPartial
5395/// method. If we are going to use TargetMachine for more purposes, especially
5396/// those that are sensitive to TargetOptions, RelocModel and CodeModel, it
5397/// might become be worth requiring front-ends to pass on their TargetMachine,
5398/// or at least cache it between methods. Note that while fontends such as Clang
5399/// have just a single main TargetMachine per translation unit, "target-cpu" and
5400/// "target-features" that determine the TargetMachine are per-function and can
5401/// be overrided using __attribute__((target("OPTIONS"))).
5402static std::unique_ptr<TargetMachine>
5404 Module *M = F->getParent();
5405
5406 StringRef CPU = F->getFnAttribute("target-cpu").getValueAsString();
5407 StringRef Features = F->getFnAttribute("target-features").getValueAsString();
5408 const std::string &Triple = M->getTargetTriple();
5409
5410 std::string Error;
5412 if (!TheTarget)
5413 return {};
5414
5416 return std::unique_ptr<TargetMachine>(TheTarget->createTargetMachine(
5417 Triple, CPU, Features, Options, /*RelocModel=*/std::nullopt,
5418 /*CodeModel=*/std::nullopt, OptLevel));
5419}
5420
5421/// Heuristically determine the best-performant unroll factor for \p CLI. This
5422/// depends on the target processor. We are re-using the same heuristics as the
5423/// LoopUnrollPass.
5425 Function *F = CLI->getFunction();
5426
5427 // Assume the user requests the most aggressive unrolling, even if the rest of
5428 // the code is optimized using a lower setting.
5430 std::unique_ptr<TargetMachine> TM = createTargetMachine(F, OptLevel);
5431
5433 FAM.registerPass([]() { return TargetLibraryAnalysis(); });
5434 FAM.registerPass([]() { return AssumptionAnalysis(); });
5435 FAM.registerPass([]() { return DominatorTreeAnalysis(); });
5436 FAM.registerPass([]() { return LoopAnalysis(); });
5437 FAM.registerPass([]() { return ScalarEvolutionAnalysis(); });
5438 FAM.registerPass([]() { return PassInstrumentationAnalysis(); });
5439 TargetIRAnalysis TIRA;
5440 if (TM)
5441 TIRA = TargetIRAnalysis(
5442 [&](const Function &F) { return TM->getTargetTransformInfo(F); });
5443 FAM.registerPass([&]() { return TIRA; });
5444
5445 TargetIRAnalysis::Result &&TTI = TIRA.run(*F, FAM);
5447 ScalarEvolution &&SE = SEA.run(*F, FAM);
5449 DominatorTree &&DT = DTA.run(*F, FAM);
5450 LoopAnalysis LIA;
5451 LoopInfo &&LI = LIA.run(*F, FAM);
5453 AssumptionCache &&AC = ACT.run(*F, FAM);
5455
5456 Loop *L = LI.getLoopFor(CLI->getHeader());
5457 assert(L && "Expecting CanonicalLoopInfo to be recognized as a loop");
5458
5461 /*BlockFrequencyInfo=*/nullptr,
5462 /*ProfileSummaryInfo=*/nullptr, ORE, static_cast<int>(OptLevel),
5463 /*UserThreshold=*/std::nullopt,
5464 /*UserCount=*/std::nullopt,
5465 /*UserAllowPartial=*/true,
5466 /*UserAllowRuntime=*/true,
5467 /*UserUpperBound=*/std::nullopt,
5468 /*UserFullUnrollMaxCount=*/std::nullopt);
5469
5470 UP.Force = true;
5471
5472 // Account for additional optimizations taking place before the LoopUnrollPass
5473 // would unroll the loop.
5476
5477 // Use normal unroll factors even if the rest of the code is optimized for
5478 // size.
5481
5482 LLVM_DEBUG(dbgs() << "Unroll heuristic thresholds:\n"
5483 << " Threshold=" << UP.Threshold << "\n"
5484 << " PartialThreshold=" << UP.PartialThreshold << "\n"
5485 << " OptSizeThreshold=" << UP.OptSizeThreshold << "\n"
5486 << " PartialOptSizeThreshold="
5487 << UP.PartialOptSizeThreshold << "\n");
5488
5489 // Disable peeling.
5492 /*UserAllowPeeling=*/false,
5493 /*UserAllowProfileBasedPeeling=*/false,
5494 /*UnrollingSpecficValues=*/false);
5495
5497 CodeMetrics::collectEphemeralValues(L, &AC, EphValues);
5498
5499 // Assume that reads and writes to stack variables can be eliminated by
5500 // Mem2Reg, SROA or LICM. That is, don't count them towards the loop body's
5501 // size.
5502 for (BasicBlock *BB : L->blocks()) {
5503 for (Instruction &I : *BB) {
5504 Value *Ptr;
5505 if (auto *Load = dyn_cast<LoadInst>(&I)) {
5506 Ptr = Load->getPointerOperand();
5507 } else if (auto *Store = dyn_cast<StoreInst>(&I)) {
5508 Ptr = Store->getPointerOperand();
5509 } else
5510 continue;
5511
5512 Ptr = Ptr->stripPointerCasts();
5513
5514 if (auto *Alloca = dyn_cast<AllocaInst>(Ptr)) {
5515 if (Alloca->getParent() == &F->getEntryBlock())
5516 EphValues.insert(&I);
5517 }
5518 }
5519 }
5520
5521 UnrollCostEstimator UCE(L, TTI, EphValues, UP.BEInsns);
5522
5523 // Loop is not unrollable if the loop contains certain instructions.
5524 if (!UCE.canUnroll()) {
5525 LLVM_DEBUG(dbgs() << "Loop not considered unrollable\n");
5526 return 1;
5527 }
5528
5529 LLVM_DEBUG(dbgs() << "Estimated loop size is " << UCE.getRolledLoopSize()
5530 << "\n");
5531
5532 // TODO: Determine trip count of \p CLI if constant, computeUnrollCount might
5533 // be able to use it.
5534 int TripCount = 0;
5535 int MaxTripCount = 0;
5536 bool MaxOrZero = false;
5537 unsigned TripMultiple = 0;
5538
5539 bool UseUpperBound = false;
5540 computeUnrollCount(L, TTI, DT, &LI, &AC, SE, EphValues, &ORE, TripCount,
5541 MaxTripCount, MaxOrZero, TripMultiple, UCE, UP, PP,
5542 UseUpperBound);
5543 unsigned Factor = UP.Count;
5544 LLVM_DEBUG(dbgs() << "Suggesting unroll factor of " << Factor << "\n");
5545
5546 // This function returns 1 to signal to not unroll a loop.
5547 if (Factor == 0)
5548 return 1;
5549 return Factor;
5550}
5551
5553 int32_t Factor,
5554 CanonicalLoopInfo **UnrolledCLI) {
5555 assert(Factor >= 0 && "Unroll factor must not be negative");
5556
5557 Function *F = Loop->getFunction();
5558 LLVMContext &Ctx = F->getContext();
5559
5560 // If the unrolled loop is not used for another loop-associated directive, it
5561 // is sufficient to add metadata for the LoopUnrollPass.
5562 if (!UnrolledCLI) {
5563 SmallVector<Metadata *, 2> LoopMetadata;
5564 LoopMetadata.push_back(
5565 MDNode::get(Ctx, MDString::get(Ctx, "llvm.loop.unroll.enable")));
5566
5567 if (Factor >= 1) {
5569 ConstantInt::get(Type::getInt32Ty(Ctx), APInt(32, Factor)));
5570 LoopMetadata.push_back(MDNode::get(
5571 Ctx, {MDString::get(Ctx, "llvm.loop.unroll.count"), FactorConst}));
5572 }
5573
5574 addLoopMetadata(Loop, LoopMetadata);
5575 return;
5576 }
5577
5578 // Heuristically determine the unroll factor.
5579 if (Factor == 0)
5581
5582 // No change required with unroll factor 1.
5583 if (Factor == 1) {
5584 *UnrolledCLI = Loop;
5585 return;
5586 }
5587
5588 assert(Factor >= 2 &&
5589 "unrolling only makes sense with a factor of 2 or larger");
5590
5591 Type *IndVarTy = Loop->getIndVarType();
5592
5593 // Apply partial unrolling by tiling the loop by the unroll-factor, then fully
5594 // unroll the inner loop.
5595 Value *FactorVal =
5596 ConstantInt::get(IndVarTy, APInt(IndVarTy->getIntegerBitWidth(), Factor,
5597 /*isSigned=*/false));
5598 std::vector<CanonicalLoopInfo *> LoopNest =
5599 tileLoops(DL, {Loop}, {FactorVal});
5600 assert(LoopNest.size() == 2 && "Expect 2 loops after tiling");
5601 *UnrolledCLI = LoopNest[0];
5602 CanonicalLoopInfo *InnerLoop = LoopNest[1];
5603
5604 // LoopUnrollPass can only fully unroll loops with constant trip count.
5605 // Unroll by the unroll factor with a fallback epilog for the remainder
5606 // iterations if necessary.
5608 ConstantInt::get(Type::getInt32Ty(Ctx), APInt(32, Factor)));
5610 InnerLoop,
5611 {MDNode::get(Ctx, MDString::get(Ctx, "llvm.loop.unroll.enable")),
5613 Ctx, {MDString::get(Ctx, "llvm.loop.unroll.count"), FactorConst})});
5614
5615#ifndef NDEBUG
5616 (*UnrolledCLI)->assertOK();
5617#endif
5618}
5619
5622 llvm::Value *BufSize, llvm::Value *CpyBuf,
5623 llvm::Value *CpyFn, llvm::Value *DidIt) {
5624 if (!updateToLocation(Loc))
5625 return Loc.IP;
5626
5627 uint32_t SrcLocStrSize;
5628 Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
5629 Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
5630 Value *ThreadId = getOrCreateThreadID(Ident);
5631
5632 llvm::Value *DidItLD = Builder.CreateLoad(Builder.getInt32Ty(), DidIt);
5633
5634 Value *Args[] = {Ident, ThreadId, BufSize, CpyBuf, CpyFn, DidItLD};
5635
5636 Function *Fn = getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_copyprivate);
5637 Builder.CreateCall(Fn, Args);
5638
5639 return Builder.saveIP();
5640}
5641
5643 const LocationDescription &Loc, BodyGenCallbackTy BodyGenCB,
5644 FinalizeCallbackTy FiniCB, bool IsNowait, ArrayRef<llvm::Value *> CPVars,
5646
5647 if (!updateToLocation(Loc))
5648 return Loc.IP;
5649
5650 // If needed allocate and initialize `DidIt` with 0.
5651 // DidIt: flag variable: 1=single thread; 0=not single thread.
5652 llvm::Value *DidIt = nullptr;
5653 if (!CPVars.empty()) {
5656 }
5657
5658 Directive OMPD = Directive::OMPD_single;
5659 uint32_t SrcLocStrSize;
5660 Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
5661 Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
5662 Value *ThreadId = getOrCreateThreadID(Ident);
5663 Value *Args[] = {Ident, ThreadId};
5664
5665 Function *EntryRTLFn = getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_single);
5666 Instruction *EntryCall = Builder.CreateCall(EntryRTLFn, Args);
5667
5668 Function *ExitRTLFn = getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_end_single);
5669 Instruction *ExitCall = Builder.CreateCall(ExitRTLFn, Args);
5670
5671 auto FiniCBWrapper = [&](InsertPointTy IP) -> Error {
5672 if (Error Err = FiniCB(IP))
5673 return Err;
5674
5675 // The thread that executes the single region must set `DidIt` to 1.
5676 // This is used by __kmpc_copyprivate, to know if the caller is the
5677 // single thread or not.
5678 if (DidIt)
5680
5681 return Error::success();
5682 };
5683
5684 // generates the following:
5685 // if (__kmpc_single()) {
5686 // .... single region ...
5687 // __kmpc_end_single
5688 // }
5689 // __kmpc_copyprivate
5690 // __kmpc_barrier
5691
5692 InsertPointOrErrorTy AfterIP =
5693 EmitOMPInlinedRegion(OMPD, EntryCall, ExitCall, BodyGenCB, FiniCBWrapper,
5694 /*Conditional*/ true,
5695 /*hasFinalize*/ true);
5696 if (!AfterIP)
5697 return AfterIP.takeError();
5698
5699 if (DidIt) {
5700 for (size_t I = 0, E = CPVars.size(); I < E; ++I)
5701 // NOTE BufSize is currently unused, so just pass 0.
5703 /*BufSize=*/ConstantInt::get(Int64, 0), CPVars[I],
5704 CPFuncs[I], DidIt);
5705 // NOTE __kmpc_copyprivate already inserts a barrier
5706 } else if (!IsNowait) {
5707 InsertPointOrErrorTy AfterIP =
5709 omp::Directive::OMPD_unknown, /* ForceSimpleCall */ false,
5710 /* CheckCancelFlag */ false);
5711 if (!AfterIP)
5712 return AfterIP.takeError();
5713 }
5714 return Builder.saveIP();
5715}
5716
5718 const LocationDescription &Loc, BodyGenCallbackTy BodyGenCB,
5719 FinalizeCallbackTy FiniCB, StringRef CriticalName, Value *HintInst) {
5720
5721 if (!updateToLocation(Loc))
5722 return Loc.IP;
5723
5724 Directive OMPD = Directive::OMPD_critical;
5725 uint32_t SrcLocStrSize;
5726 Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
5727 Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
5728 Value *ThreadId = getOrCreateThreadID(Ident);
5729 Value *LockVar = getOMPCriticalRegionLock(CriticalName);
5730 Value *Args[] = {Ident, ThreadId, LockVar};
5731
5732 SmallVector<llvm::Value *, 4> EnterArgs(std::begin(Args), std::end(Args));
5733 Function *RTFn = nullptr;
5734 if (HintInst) {
5735 // Add Hint to entry Args and create call
5736 EnterArgs.push_back(HintInst);
5737 RTFn = getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_critical_with_hint);
5738 } else {
5739 RTFn = getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_critical);
5740 }
5741 Instruction *EntryCall = Builder.CreateCall(RTFn, EnterArgs);
5742
5743 Function *ExitRTLFn =
5744 getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_end_critical);
5745 Instruction *ExitCall = Builder.CreateCall(ExitRTLFn, Args);
5746
5747 return EmitOMPInlinedRegion(OMPD, EntryCall, ExitCall, BodyGenCB, FiniCB,
5748 /*Conditional*/ false, /*hasFinalize*/ true);
5749}
5750
5753 InsertPointTy AllocaIP, unsigned NumLoops,
5754 ArrayRef<llvm::Value *> StoreValues,
5755 const Twine &Name, bool IsDependSource) {
5756 assert(
5757 llvm::all_of(StoreValues,
5758 [](Value *SV) { return SV->getType()->isIntegerTy(64); }) &&
5759 "OpenMP runtime requires depend vec with i64 type");
5760
5761 if (!updateToLocation(Loc))
5762 return Loc.IP;
5763
5764 // Allocate space for vector and generate alloc instruction.
5765 auto *ArrI64Ty = ArrayType::get(Int64, NumLoops);
5766 Builder.restoreIP(AllocaIP);
5767 AllocaInst *ArgsBase = Builder.CreateAlloca(ArrI64Ty, nullptr, Name);
5768 ArgsBase->setAlignment(Align(8));
5769 Builder.restoreIP(Loc.IP);
5770
5771 // Store the index value with offset in depend vector.
5772 for (unsigned I = 0; I < NumLoops; ++I) {
5773 Value *DependAddrGEPIter = Builder.CreateInBoundsGEP(
5774 ArrI64Ty, ArgsBase, {Builder.getInt64(0), Builder.getInt64(I)});
5775 StoreInst *STInst = Builder.CreateStore(StoreValues[I], DependAddrGEPIter);
5776 STInst->setAlignment(Align(8));
5777 }
5778
5779 Value *DependBaseAddrGEP = Builder.CreateInBoundsGEP(
5780 ArrI64Ty, ArgsBase, {Builder.getInt64(0), Builder.getInt64(0)});
5781
5782 uint32_t SrcLocStrSize;
5783 Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
5784 Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
5785 Value *ThreadId = getOrCreateThreadID(Ident);
5786 Value *Args[] = {Ident, ThreadId, DependBaseAddrGEP};
5787
5788 Function *RTLFn = nullptr;
5789 if (IsDependSource)
5790 RTLFn = getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_doacross_post);
5791 else
5792 RTLFn = getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_doacross_wait);
5793 Builder.CreateCall(RTLFn, Args);
5794
5795 return Builder.saveIP();
5796}
5797
5799 const LocationDescription &Loc, BodyGenCallbackTy BodyGenCB,
5800 FinalizeCallbackTy FiniCB, bool IsThreads) {
5801 if (!updateToLocation(Loc))
5802 return Loc.IP;
5803
5804 Directive OMPD = Directive::OMPD_ordered;
5805 Instruction *EntryCall = nullptr;
5806 Instruction *ExitCall = nullptr;
5807
5808 if (IsThreads) {
5809 uint32_t SrcLocStrSize;
5810 Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
5811 Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
5812 Value *ThreadId = getOrCreateThreadID(Ident);
5813 Value *Args[] = {Ident, ThreadId};
5814
5815 Function *EntryRTLFn = getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_ordered);
5816 EntryCall = Builder.CreateCall(EntryRTLFn, Args);
5817
5818 Function *ExitRTLFn =
5819 getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_end_ordered);
5820 ExitCall = Builder.CreateCall(ExitRTLFn, Args);
5821 }
5822
5823 return EmitOMPInlinedRegion(OMPD, EntryCall, ExitCall, BodyGenCB, FiniCB,
5824 /*Conditional*/ false, /*hasFinalize*/ true);
5825}
5826
5827OpenMPIRBuilder::InsertPointOrErrorTy OpenMPIRBuilder::EmitOMPInlinedRegion(
5828 Directive OMPD, Instruction *EntryCall, Instruction *ExitCall,
5829 BodyGenCallbackTy BodyGenCB, FinalizeCallbackTy FiniCB, bool Conditional,
5830 bool HasFinalize, bool IsCancellable) {
5831
5832 if (HasFinalize)
5833 FinalizationStack.push_back({FiniCB, OMPD, IsCancellable});
5834
5835 // Create inlined region's entry and body blocks, in preparation
5836 // for conditional creation
5837 BasicBlock *EntryBB = Builder.GetInsertBlock();
5838 Instruction *SplitPos = EntryBB->getTerminator();
5839 if (!isa_and_nonnull<BranchInst>(SplitPos))
5840 SplitPos = new UnreachableInst(Builder.getContext(), EntryBB);
5841 BasicBlock *ExitBB = EntryBB->splitBasicBlock(SplitPos, "omp_region.end");
5842 BasicBlock *FiniBB =
5843 EntryBB->splitBasicBlock(EntryBB->getTerminator(), "omp_region.finalize");
5844
5846 emitCommonDirectiveEntry(OMPD, EntryCall, ExitBB, Conditional);
5847
5848 // generate body
5849 if (Error Err = BodyGenCB(/* AllocaIP */ InsertPointTy(),
5850 /* CodeGenIP */ Builder.saveIP()))
5851 return Err;
5852
5853 // emit exit call and do any needed finalization.
5854 auto FinIP = InsertPointTy(FiniBB, FiniBB->getFirstInsertionPt());
5855 assert(FiniBB->getTerminator()->getNumSuccessors() == 1 &&
5856 FiniBB->getTerminator()->getSuccessor(0) == ExitBB &&
5857 "Unexpected control flow graph state!!");
5858 InsertPointOrErrorTy AfterIP =
5859 emitCommonDirectiveExit(OMPD, FinIP, ExitCall, HasFinalize);
5860 if (!AfterIP)
5861 return AfterIP.takeError();
5862 assert(FiniBB->getUniquePredecessor()->getUniqueSuccessor() == FiniBB &&
5863 "Unexpected Control Flow State!");
5865
5866 // If we are skipping the region of a non conditional, remove the exit
5867 // block, and clear the builder's insertion point.
5868 assert(SplitPos->getParent() == ExitBB &&
5869 "Unexpected Insertion point location!");
5870 auto merged = MergeBlockIntoPredecessor(ExitBB);
5871 BasicBlock *ExitPredBB = SplitPos->getParent();
5872 auto InsertBB = merged ? ExitPredBB : ExitBB;
5873 if (!isa_and_nonnull<BranchInst>(SplitPos))
5874 SplitPos->eraseFromParent();
5875 Builder.SetInsertPoint(InsertBB);
5876
5877 return Builder.saveIP();
5878}
5879
5880OpenMPIRBuilder::InsertPointTy OpenMPIRBuilder::emitCommonDirectiveEntry(
5881 Directive OMPD, Value *EntryCall, BasicBlock *ExitBB, bool Conditional) {
5882 // if nothing to do, Return current insertion point.
5883 if (!Conditional || !EntryCall)
5884 return Builder.saveIP();
5885
5886 BasicBlock *EntryBB = Builder.GetInsertBlock();
5887 Value *CallBool = Builder.CreateIsNotNull(EntryCall);
5888 auto *ThenBB = BasicBlock::Create(M.getContext(), "omp_region.body");
5889 auto *UI = new UnreachableInst(Builder.getContext(), ThenBB);
5890
5891 // Emit thenBB and set the Builder's insertion point there for
5892 // body generation next. Place the block after the current block.
5893 Function *CurFn = EntryBB->getParent();
5894 CurFn->insert(std::next(EntryBB->getIterator()), ThenBB);
5895
5896 // Move Entry branch to end of ThenBB, and replace with conditional
5897 // branch (If-stmt)
5898 Instruction *EntryBBTI = EntryBB->getTerminator();
5899 Builder.CreateCondBr(CallBool, ThenBB, ExitBB);
5900 EntryBBTI->removeFromParent();
5902 Builder.Insert(EntryBBTI);
5903 UI->eraseFromParent();
5905
5906 // return an insertion point to ExitBB.
5907 return IRBuilder<>::InsertPoint(ExitBB, ExitBB->getFirstInsertionPt());
5908}
5909
5910OpenMPIRBuilder::InsertPointOrErrorTy OpenMPIRBuilder::emitCommonDirectiveExit(
5911 omp::Directive OMPD, InsertPointTy FinIP, Instruction *ExitCall,
5912 bool HasFinalize) {
5913
5914 Builder.restoreIP(FinIP);
5915
5916 // If there is finalization to do, emit it before the exit call
5917 if (HasFinalize) {
5918 assert(!FinalizationStack.empty() &&
5919 "Unexpected finalization stack state!");
5920
5921 FinalizationInfo Fi = FinalizationStack.pop_back_val();
5922 assert(Fi.DK == OMPD && "Unexpected Directive for Finalization call!");
5923
5924 if (Error Err = Fi.FiniCB(FinIP))
5925 return Err;
5926
5927 BasicBlock *FiniBB = FinIP.getBlock();
5928 Instruction *FiniBBTI = FiniBB->getTerminator();
5929
5930 // set Builder IP for call creation
5931 Builder.SetInsertPoint(FiniBBTI);
5932 }
5933
5934 if (!ExitCall)
5935 return Builder.saveIP();
5936
5937 // place the Exitcall as last instruction before Finalization block terminator
5938 ExitCall->removeFromParent();
5939 Builder.Insert(ExitCall);
5940
5941 return IRBuilder<>::InsertPoint(ExitCall->getParent(),
5942 ExitCall->getIterator());
5943}
5944
5946 InsertPointTy IP, Value *MasterAddr, Value *PrivateAddr,
5947 llvm::IntegerType *IntPtrTy, bool BranchtoEnd) {
5948 if (!IP.isSet())
5949 return IP;
5950
5952
5953 // creates the following CFG structure
5954 // OMP_Entry : (MasterAddr != PrivateAddr)?
5955 // F T
5956 // | \
5957 // | copin.not.master
5958 // | /
5959 // v /
5960 // copyin.not.master.end
5961 // |
5962 // v
5963 // OMP.Entry.Next
5964
5965 BasicBlock *OMP_Entry = IP.getBlock();
5966 Function *CurFn = OMP_Entry->getParent();
5967 BasicBlock *CopyBegin =
5968 BasicBlock::Create(M.getContext(), "copyin.not.master", CurFn);
5969 BasicBlock *CopyEnd = nullptr;
5970
5971 // If entry block is terminated, split to preserve the branch to following
5972 // basic block (i.e. OMP.Entry.Next), otherwise, leave everything as is.
5973 if (isa_and_nonnull<BranchInst>(OMP_Entry->getTerminator())) {
5974 CopyEnd = OMP_Entry->splitBasicBlock(OMP_Entry->getTerminator(),
5975 "copyin.not.master.end");
5976 OMP_Entry->getTerminator()->eraseFromParent();
5977 } else {
5978 CopyEnd =
5979 BasicBlock::Create(M.getContext(), "copyin.not.master.end", CurFn);
5980 }
5981
5982 Builder.SetInsertPoint(OMP_Entry);
5983 Value *MasterPtr = Builder.CreatePtrToInt(MasterAddr, IntPtrTy);
5984 Value *PrivatePtr = Builder.CreatePtrToInt(PrivateAddr, IntPtrTy);
5985 Value *cmp = Builder.CreateICmpNE(MasterPtr, PrivatePtr);
5986 Builder.CreateCondBr(cmp, CopyBegin, CopyEnd);
5987
5988 Builder.SetInsertPoint(CopyBegin);
5989 if (BranchtoEnd)
5991
5992 return Builder.saveIP();
5993}
5994
5996 Value *Size, Value *Allocator,
5997 std::string Name) {
5999 updateToLocation(Loc);
6000
6001 uint32_t SrcLocStrSize;
6002 Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
6003 Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
6004 Value *ThreadId = getOrCreateThreadID(Ident);
6005 Value *Args[] = {ThreadId, Size, Allocator};
6006
6007 Function *Fn = getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_alloc);
6008
6009 return Builder.CreateCall(Fn, Args, Name);
6010}
6011
6013 Value *Addr, Value *Allocator,
6014 std::string Name) {
6016 updateToLocation(Loc);
6017
6018 uint32_t SrcLocStrSize;
6019 Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
6020 Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
6021 Value *ThreadId = getOrCreateThreadID(Ident);
6022 Value *Args[] = {ThreadId, Addr, Allocator};
6023 Function *Fn = getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_free);
6024 return Builder.CreateCall(Fn, Args, Name);
6025}
6026
6028 const LocationDescription &Loc, Value *InteropVar,
6029 omp::OMPInteropType InteropType, Value *Device, Value *NumDependences,
6030 Value *DependenceAddress, bool HaveNowaitClause) {
6032 updateToLocation(Loc);
6033
6034 uint32_t SrcLocStrSize;
6035 Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
6036 Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
6037 Value *ThreadId = getOrCreateThreadID(Ident);
6038 if (Device == nullptr)
6040 Constant *InteropTypeVal = ConstantInt::get(Int32, (int)InteropType);
6041 if (NumDependences == nullptr) {
6042 NumDependences = ConstantInt::get(Int32, 0);
6043 PointerType *PointerTypeVar = PointerType::getUnqual(M.getContext());
6044 DependenceAddress = ConstantPointerNull::get(PointerTypeVar);
6045 }
6046 Value *HaveNowaitClauseVal = ConstantInt::get(Int32, HaveNowaitClause);
6047 Value *Args[] = {
6048 Ident, ThreadId, InteropVar, InteropTypeVal,
6049 Device, NumDependences, DependenceAddress, HaveNowaitClauseVal};
6050
6051 Function *Fn = getOrCreateRuntimeFunctionPtr(OMPRTL___tgt_interop_init);
6052
6053 return Builder.CreateCall(Fn, Args);
6054}
6055
6057 const LocationDescription &Loc, Value *InteropVar, Value *Device,
6058 Value *NumDependences, Value *DependenceAddress, bool HaveNowaitClause) {
6060 updateToLocation(Loc);
6061
6062 uint32_t SrcLocStrSize;
6063 Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
6064 Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
6065 Value *ThreadId = getOrCreateThreadID(Ident);
6066 if (Device == nullptr)
6068 if (NumDependences == nullptr) {
6069 NumDependences = ConstantInt::get(Int32, 0);
6070 PointerType *PointerTypeVar = PointerType::getUnqual(M.getContext());
6071 DependenceAddress = ConstantPointerNull::get(PointerTypeVar);
6072 }
6073 Value *HaveNowaitClauseVal = ConstantInt::get(Int32, HaveNowaitClause);
6074 Value *Args[] = {
6075 Ident, ThreadId, InteropVar, Device,
6076 NumDependences, DependenceAddress, HaveNowaitClauseVal};
6077
6078 Function *Fn = getOrCreateRuntimeFunctionPtr(OMPRTL___tgt_interop_destroy);
6079
6080 return Builder.CreateCall(Fn, Args);
6081}
6082
6084 Value *InteropVar, Value *Device,
6085 Value *NumDependences,
6086 Value *DependenceAddress,
6087 bool HaveNowaitClause) {
6089 updateToLocation(Loc);
6090 uint32_t SrcLocStrSize;
6091 Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
6092 Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
6093 Value *ThreadId = getOrCreateThreadID(Ident);
6094 if (Device == nullptr)
6096 if (NumDependences == nullptr) {
6097 NumDependences = ConstantInt::get(Int32, 0);
6098 PointerType *PointerTypeVar = PointerType::getUnqual(M.getContext());
6099 DependenceAddress = ConstantPointerNull::get(PointerTypeVar);
6100 }
6101 Value *HaveNowaitClauseVal = ConstantInt::get(Int32, HaveNowaitClause);
6102 Value *Args[] = {
6103 Ident, ThreadId, InteropVar, Device,
6104 NumDependences, DependenceAddress, HaveNowaitClauseVal};
6105
6106 Function *Fn = getOrCreateRuntimeFunctionPtr(OMPRTL___tgt_interop_use);
6107
6108 return Builder.CreateCall(Fn, Args);
6109}
6110
6112 const LocationDescription &Loc, llvm::Value *Pointer,
6115 updateToLocation(Loc);
6116
6117 uint32_t SrcLocStrSize;
6118 Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
6119 Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
6120 Value *ThreadId = getOrCreateThreadID(Ident);
6121 Constant *ThreadPrivateCache =
6122 getOrCreateInternalVariable(Int8PtrPtr, Name.str());
6123 llvm::Value *Args[] = {Ident, ThreadId, Pointer, Size, ThreadPrivateCache};
6124
6125 Function *Fn =
6126 getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_threadprivate_cached);
6127
6128 return Builder.CreateCall(Fn, Args);
6129}
6130
6133 int32_t MinThreadsVal, int32_t MaxThreadsVal,
6134 int32_t MinTeamsVal, int32_t MaxTeamsVal) {
6135 if (!updateToLocation(Loc))
6136 return Loc.IP;
6137
6138 uint32_t SrcLocStrSize;
6139 Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
6140 Constant *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
6141 Constant *IsSPMDVal = ConstantInt::getSigned(
6143 Constant *UseGenericStateMachineVal = ConstantInt::getSigned(Int8, !IsSPMD);
6144 Constant *MayUseNestedParallelismVal = ConstantInt::getSigned(Int8, true);
6145 Constant *DebugIndentionLevelVal = ConstantInt::getSigned(Int16, 0);
6146
6147 Function *DebugKernelWrapper = Builder.GetInsertBlock()->getParent();
6148 Function *Kernel = DebugKernelWrapper;
6149
6150 // We need to strip the debug prefix to get the correct kernel name.
6151 StringRef KernelName = Kernel->getName();
6152 const std::string DebugPrefix = "_debug__";
6153 if (KernelName.ends_with(DebugPrefix)) {
6154 KernelName = KernelName.drop_back(DebugPrefix.length());
6155 Kernel = M.getFunction(KernelName);
6156 assert(Kernel && "Expected the real kernel to exist");
6157 }
6158
6159 // Manifest the launch configuration in the metadata matching the kernel
6160 // environment.
6161 if (MinTeamsVal > 1 || MaxTeamsVal > 0)
6162 writeTeamsForKernel(T, *Kernel, MinTeamsVal, MaxTeamsVal);
6163
6164 // For max values, < 0 means unset, == 0 means set but unknown.
6165 if (MaxThreadsVal < 0)
6166 MaxThreadsVal = std::max(
6167 int32_t(getGridValue(T, Kernel).GV_Default_WG_Size), MinThreadsVal);
6168
6169 if (MaxThreadsVal > 0)
6170 writeThreadBoundsForKernel(T, *Kernel, MinThreadsVal, MaxThreadsVal);
6171
6172 Constant *MinThreads = ConstantInt::getSigned(Int32, MinThreadsVal);
6174 Constant *MinTeams = ConstantInt::getSigned(Int32, MinTeamsVal);
6175 Constant *MaxTeams = ConstantInt::getSigned(Int32, MaxTeamsVal);
6176 Constant *ReductionDataSize = ConstantInt::getSigned(Int32, 0);
6177 Constant *ReductionBufferLength = ConstantInt::getSigned(Int32, 0);
6178
6180 omp::RuntimeFunction::OMPRTL___kmpc_target_init);
6181 const DataLayout &DL = Fn->getDataLayout();
6182
6183 Twine DynamicEnvironmentName = KernelName + "_dynamic_environment";
6184 Constant *DynamicEnvironmentInitializer =
6185 ConstantStruct::get(DynamicEnvironment, {DebugIndentionLevelVal});
6186 GlobalVariable *DynamicEnvironmentGV = new GlobalVariable(
6187 M, DynamicEnvironment, /*IsConstant=*/false, GlobalValue::WeakODRLinkage,
6188 DynamicEnvironmentInitializer, DynamicEnvironmentName,
6189 /*InsertBefore=*/nullptr, GlobalValue::NotThreadLocal,
6190 DL.getDefaultGlobalsAddressSpace());
6191 DynamicEnvironmentGV->setVisibility(GlobalValue::ProtectedVisibility);
6192
6193 Constant *DynamicEnvironment =
6194 DynamicEnvironmentGV->getType() == DynamicEnvironmentPtr
6195 ? DynamicEnvironmentGV
6196 : ConstantExpr::getAddrSpaceCast(DynamicEnvironmentGV,
6197 DynamicEnvironmentPtr);
6198
6199 Constant *ConfigurationEnvironmentInitializer = ConstantStruct::get(
6200 ConfigurationEnvironment, {
6201 UseGenericStateMachineVal,
6202 MayUseNestedParallelismVal,
6203 IsSPMDVal,
6204 MinThreads,
6205 MaxThreads,
6206 MinTeams,
6207 MaxTeams,
6208 ReductionDataSize,
6209 ReductionBufferLength,
6210 });
6211 Constant *KernelEnvironmentInitializer = ConstantStruct::get(
6212 KernelEnvironment, {
6213 ConfigurationEnvironmentInitializer,
6214 Ident,
6215 DynamicEnvironment,
6216 });
6217 std::string KernelEnvironmentName =
6218 (KernelName + "_kernel_environment").str();
6219 GlobalVariable *KernelEnvironmentGV = new GlobalVariable(
6220 M, KernelEnvironment, /*IsConstant=*/true, GlobalValue::WeakODRLinkage,
6221 KernelEnvironmentInitializer, KernelEnvironmentName,
6222 /*InsertBefore=*/nullptr, GlobalValue::NotThreadLocal,
6223 DL.getDefaultGlobalsAddressSpace());
6224 KernelEnvironmentGV->setVisibility(GlobalValue::ProtectedVisibility);
6225
6226 Constant *KernelEnvironment =
6227 KernelEnvironmentGV->getType() == KernelEnvironmentPtr
6228 ? KernelEnvironmentGV
6229 : ConstantExpr::getAddrSpaceCast(KernelEnvironmentGV,
6230 KernelEnvironmentPtr);
6231 Value *KernelLaunchEnvironment = DebugKernelWrapper->getArg(0);
6232 CallInst *ThreadKind =
6233 Builder.CreateCall(Fn, {KernelEnvironment, KernelLaunchEnvironment});
6234
6235 Value *ExecUserCode = Builder.CreateICmpEQ(
6236 ThreadKind, Constant::getAllOnesValue(ThreadKind->getType()),
6237 "exec_user_code");
6238
6239 // ThreadKind = __kmpc_target_init(...)
6240 // if (ThreadKind == -1)
6241 // user_code
6242 // else
6243 // return;
6244
6245 auto *UI = Builder.CreateUnreachable();
6246 BasicBlock *CheckBB = UI->getParent();
6247 BasicBlock *UserCodeEntryBB = CheckBB->splitBasicBlock(UI, "user_code.entry");
6248
6249 BasicBlock *WorkerExitBB = BasicBlock::Create(
6250 CheckBB->getContext(), "worker.exit", CheckBB->getParent());
6251 Builder.SetInsertPoint(WorkerExitBB);
6253
6254 auto *CheckBBTI = CheckBB->getTerminator();
6255 Builder.SetInsertPoint(CheckBBTI);
6256 Builder.CreateCondBr(ExecUserCode, UI->getParent(), WorkerExitBB);
6257
6258 CheckBBTI->eraseFromParent();
6259 UI->eraseFromParent();
6260
6261 // Continue in the "user_code" block, see diagram above and in
6262 // openmp/libomptarget/deviceRTLs/common/include/target.h .
6263 return InsertPointTy(UserCodeEntryBB, UserCodeEntryBB->getFirstInsertionPt());
6264}
6265
6267 int32_t TeamsReductionDataSize,
6268 int32_t TeamsReductionBufferLength) {
6269 if (!updateToLocation(Loc))
6270 return;
6271
6273 omp::RuntimeFunction::OMPRTL___kmpc_target_deinit);
6274
6275 Builder.CreateCall(Fn, {});
6276
6277 if (!TeamsReductionBufferLength || !TeamsReductionDataSize)
6278 return;
6279
6281 // We need to strip the debug prefix to get the correct kernel name.
6282 StringRef KernelName = Kernel->getName();
6283 const std::string DebugPrefix = "_debug__";
6284 if (KernelName.ends_with(DebugPrefix))
6285 KernelName = KernelName.drop_back(DebugPrefix.length());
6286 auto *KernelEnvironmentGV =
6287 M.getNamedGlobal((KernelName + "_kernel_environment").str());
6288 assert(KernelEnvironmentGV && "Expected kernel environment global\n");
6289 auto *KernelEnvironmentInitializer = KernelEnvironmentGV->getInitializer();
6290 auto *NewInitializer = ConstantFoldInsertValueInstruction(
6291 KernelEnvironmentInitializer,
6292 ConstantInt::get(Int32, TeamsReductionDataSize), {0, 7});
6293 NewInitializer = ConstantFoldInsertValueInstruction(
6294 NewInitializer, ConstantInt::get(Int32, TeamsReductionBufferLength),
6295 {0, 8});
6296 KernelEnvironmentGV->setInitializer(NewInitializer);
6297}
6298
6300 Module &M = *Kernel.getParent();
6301 NamedMDNode *MD = M.getOrInsertNamedMetadata("nvvm.annotations");
6302 for (auto *Op : MD->operands()) {
6303 if (Op->getNumOperands() != 3)
6304 continue;
6305 auto *KernelOp = dyn_cast<ConstantAsMetadata>(Op->getOperand(0));
6306 if (!KernelOp || KernelOp->getValue() != &Kernel)
6307 continue;
6308 auto *Prop = dyn_cast<MDString>(Op->getOperand(1));
6309 if (!Prop || Prop->getString() != Name)
6310 continue;
6311 return Op;
6312 }
6313 return nullptr;
6314}
6315
6317 bool Min) {
6318 // Update the "maxntidx" metadata for NVIDIA, or add it.
6319 MDNode *ExistingOp = getNVPTXMDNode(Kernel, Name);
6320 if (ExistingOp) {
6321 auto *OldVal = cast<ConstantAsMetadata>(ExistingOp->getOperand(2));
6322 int32_t OldLimit = cast<ConstantInt>(OldVal->getValue())->getZExtValue();
6323 ExistingOp->replaceOperandWith(
6324 2, ConstantAsMetadata::get(ConstantInt::get(
6325 OldVal->getValue()->getType(),
6326 Min ? std::min(OldLimit, Value) : std::max(OldLimit, Value))));
6327 } else {
6328 LLVMContext &Ctx = Kernel.getContext();
6330 MDString::get(Ctx, Name),
6332 ConstantInt::get(Type::getInt32Ty(Ctx), Value))};
6333 // Append metadata to nvvm.annotations
6334 Module &M = *Kernel.getParent();
6335 NamedMDNode *MD = M.getOrInsertNamedMetadata("nvvm.annotations");
6336 MD->addOperand(MDNode::get(Ctx, MDVals));
6337 }
6338}
6339
6340std::pair<int32_t, int32_t>
6342 int32_t ThreadLimit =
6343 Kernel.getFnAttributeAsParsedInteger("omp_target_thread_limit");
6344
6345 if (T.isAMDGPU()) {
6346 const auto &Attr = Kernel.getFnAttribute("amdgpu-flat-work-group-size");
6347 if (!Attr.isValid() || !Attr.isStringAttribute())
6348 return {0, ThreadLimit};
6349 auto [LBStr, UBStr] = Attr.getValueAsString().split(',');
6350 int32_t LB, UB;
6351 if (!llvm::to_integer(UBStr, UB, 10))
6352 return {0, ThreadLimit};
6353 UB = ThreadLimit ? std::min(ThreadLimit, UB) : UB;
6354 if (!llvm::to_integer(LBStr, LB, 10))
6355 return {0, UB};
6356 return {LB, UB};
6357 }
6358
6359 if (MDNode *ExistingOp = getNVPTXMDNode(Kernel, "maxntidx")) {
6360 auto *OldVal = cast<ConstantAsMetadata>(ExistingOp->getOperand(2));
6361 int32_t UB = cast<ConstantInt>(OldVal->getValue())->getZExtValue();
6362 return {0, ThreadLimit ? std::min(ThreadLimit, UB) : UB};
6363 }
6364 return {0, ThreadLimit};
6365}
6366
6368 Function &Kernel, int32_t LB,
6369 int32_t UB) {
6370 Kernel.addFnAttr("omp_target_thread_limit", std::to_string(UB));
6371
6372 if (T.isAMDGPU()) {
6373 Kernel.addFnAttr("amdgpu-flat-work-group-size",
6374 llvm::utostr(LB) + "," + llvm::utostr(UB));
6375 return;
6376 }
6377
6378 updateNVPTXMetadata(Kernel, "maxntidx", UB, true);
6379}
6380
6381std::pair<int32_t, int32_t>
6383 // TODO: Read from backend annotations if available.
6384 return {0, Kernel.getFnAttributeAsParsedInteger("omp_target_num_teams")};
6385}
6386
6388 int32_t LB, int32_t UB) {
6389 if (T.isNVPTX())
6390 if (UB > 0)
6391 updateNVPTXMetadata(Kernel, "maxclusterrank", UB, true);
6392 if (T.isAMDGPU())
6393 Kernel.addFnAttr("amdgpu-max-num-workgroups", llvm::utostr(LB) + ",1,1");
6394
6395 Kernel.addFnAttr("omp_target_num_teams", std::to_string(LB));
6396}
6397
6398void OpenMPIRBuilder::setOutlinedTargetRegionFunctionAttributes(
6399 Function *OutlinedFn) {
6400 if (Config.isTargetDevice()) {
6402 // TODO: Determine if DSO local can be set to true.
6403 OutlinedFn->setDSOLocal(false);
6405 if (T.isAMDGCN())
6407 }
6408}
6409
6410Constant *OpenMPIRBuilder::createOutlinedFunctionID(Function *OutlinedFn,
6411 StringRef EntryFnIDName) {
6412 if (Config.isTargetDevice()) {
6413 assert(OutlinedFn && "The outlined function must exist if embedded");
6414 return OutlinedFn;
6415 }
6416
6417 return new GlobalVariable(
6418 M, Builder.getInt8Ty(), /*isConstant=*/true, GlobalValue::WeakAnyLinkage,
6419 Constant::getNullValue(Builder.getInt8Ty()), EntryFnIDName);
6420}
6421
6422Constant *OpenMPIRBuilder::createTargetRegionEntryAddr(Function *OutlinedFn,
6423 StringRef EntryFnName) {
6424 if (OutlinedFn)
6425 return OutlinedFn;
6426
6427 assert(!M.getGlobalVariable(EntryFnName, true) &&
6428 "Named kernel already exists?");
6429 return new GlobalVariable(
6430 M, Builder.getInt8Ty(), /*isConstant=*/true, GlobalValue::InternalLinkage,
6431 Constant::getNullValue(Builder.getInt8Ty()), EntryFnName);
6432}
6433
6435 TargetRegionEntryInfo &EntryInfo,
6436 FunctionGenCallback &GenerateFunctionCallback, bool IsOffloadEntry,
6437 Function *&OutlinedFn, Constant *&OutlinedFnID) {
6438
6439 SmallString<64> EntryFnName;
6440 OffloadInfoManager.getTargetRegionEntryFnName(EntryFnName, EntryInfo);
6441
6443 Expected<Function *> CBResult = GenerateFunctionCallback(EntryFnName);
6444 if (!CBResult)
6445 return CBResult.takeError();
6446 OutlinedFn = *CBResult;
6447 } else {
6448 OutlinedFn = nullptr;
6449 }
6450
6451 // If this target outline function is not an offload entry, we don't need to
6452 // register it. This may be in the case of a false if clause, or if there are
6453 // no OpenMP targets.
6454 if (!IsOffloadEntry)
6455 return Error::success();
6456
6457 std::string EntryFnIDName =
6459 ? std::string(EntryFnName)
6460 : createPlatformSpecificName({EntryFnName, "region_id"});
6461
6462 OutlinedFnID = registerTargetRegionFunction(EntryInfo, OutlinedFn,
6463 EntryFnName, EntryFnIDName);
6464 return Error::success();
6465}
6466
6468 TargetRegionEntryInfo &EntryInfo, Function *OutlinedFn,
6469 StringRef EntryFnName, StringRef EntryFnIDName) {
6470 if (OutlinedFn)
6471 setOutlinedTargetRegionFunctionAttributes(OutlinedFn);
6472 auto OutlinedFnID = createOutlinedFunctionID(OutlinedFn, EntryFnIDName);
6473 auto EntryAddr = createTargetRegionEntryAddr(OutlinedFn, EntryFnName);
6475 EntryInfo, EntryAddr, OutlinedFnID,
6477 return OutlinedFnID;
6478}
6479
6481 const LocationDescription &Loc, InsertPointTy AllocaIP,
6482 InsertPointTy CodeGenIP, Value *DeviceID, Value *IfCond,
6483 TargetDataInfo &Info, GenMapInfoCallbackTy GenMapInfoCB,
6484 omp::RuntimeFunction *MapperFunc,
6486 BodyGenTy BodyGenType)>
6487 BodyGenCB,
6488 function_ref<void(unsigned int, Value *)> DeviceAddrCB,
6489 function_ref<Value *(unsigned int)> CustomMapperCB, Value *SrcLocInfo) {
6490 if (!updateToLocation(Loc))
6491 return InsertPointTy();
6492
6493 Builder.restoreIP(CodeGenIP);
6494 // Disable TargetData CodeGen on Device pass.
6495 if (Config.IsTargetDevice.value_or(false)) {
6496 if (BodyGenCB) {
6497 InsertPointOrErrorTy AfterIP =
6498 BodyGenCB(Builder.saveIP(), BodyGenTy::NoPriv);
6499 if (!AfterIP)
6500 return AfterIP.takeError();
6501 Builder.restoreIP(*AfterIP);
6502 }
6503 return Builder.saveIP();
6504 }
6505
6506 bool IsStandAlone = !BodyGenCB;
6507 MapInfosTy *MapInfo;
6508 // Generate the code for the opening of the data environment. Capture all the
6509 // arguments of the runtime call by reference because they are used in the
6510 // closing of the region.
6511 auto BeginThenGen = [&](InsertPointTy AllocaIP,
6512 InsertPointTy CodeGenIP) -> Error {
6513 MapInfo = &GenMapInfoCB(Builder.saveIP());
6514 emitOffloadingArrays(AllocaIP, Builder.saveIP(), *MapInfo, Info,
6515 /*IsNonContiguous=*/true, DeviceAddrCB,
6516 CustomMapperCB);
6517
6518 TargetDataRTArgs RTArgs;
6520
6521 // Emit the number of elements in the offloading arrays.
6522 Value *PointerNum = Builder.getInt32(Info.NumberOfPtrs);
6523
6524 // Source location for the ident struct
6525 if (!SrcLocInfo) {
6526 uint32_t SrcLocStrSize;
6527 Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
6528 SrcLocInfo = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
6529 }
6530
6531 SmallVector<llvm::Value *, 13> OffloadingArgs = {
6532 SrcLocInfo, DeviceID,
6533 PointerNum, RTArgs.BasePointersArray,
6534 RTArgs.PointersArray, RTArgs.SizesArray,
6535 RTArgs.MapTypesArray, RTArgs.MapNamesArray,
6536 RTArgs.MappersArray};
6537
6538 if (IsStandAlone) {
6539 assert(MapperFunc && "MapperFunc missing for standalone target data");
6540
6541 auto TaskBodyCB = [&](Value *, Value *,
6543 if (Info.HasNoWait) {
6544 OffloadingArgs.append({llvm::Constant::getNullValue(Int32),
6548 }
6549
6551 OffloadingArgs);
6552
6553 if (Info.HasNoWait) {
6554 BasicBlock *OffloadContBlock =
6555 BasicBlock::Create(Builder.getContext(), "omp_offload.cont");
6557 emitBlock(OffloadContBlock, CurFn, /*IsFinished=*/true);
6559 }
6560 return Error::success();
6561 };
6562
6563 bool RequiresOuterTargetTask = Info.HasNoWait;
6564 if (!RequiresOuterTargetTask) {
6565 Error Err = TaskBodyCB(/*DeviceID=*/nullptr, /*RTLoc=*/nullptr,
6566 /*TargetTaskAllocaIP=*/{});
6567 assert(!Err && "TaskBodyCB expected to succeed");
6568 } else {
6569 InsertPointOrErrorTy AfterIP =
6570 emitTargetTask(TaskBodyCB, DeviceID, SrcLocInfo, AllocaIP,
6571 /*Dependencies=*/{}, Info.HasNoWait);
6572 assert(AfterIP && "TaskBodyCB expected to succeed");
6573 }
6574 } else {
6575 Function *BeginMapperFunc = getOrCreateRuntimeFunctionPtr(
6576 omp::OMPRTL___tgt_target_data_begin_mapper);
6577
6578 Builder.CreateCall(BeginMapperFunc, OffloadingArgs);
6579
6580 for (auto DeviceMap : Info.DevicePtrInfoMap) {
6581 if (isa<AllocaInst>(DeviceMap.second.second)) {
6582 auto *LI =
6583 Builder.CreateLoad(Builder.getPtrTy(), DeviceMap.second.first);
6584 Builder.CreateStore(LI, DeviceMap.second.second);
6585 }
6586 }
6587
6588 // If device pointer privatization is required, emit the body of the
6589 // region here. It will have to be duplicated: with and without
6590 // privatization.
6591 InsertPointOrErrorTy AfterIP =
6592 BodyGenCB(Builder.saveIP(), BodyGenTy::Priv);
6593 if (!AfterIP)
6594 return AfterIP.takeError();
6595 Builder.restoreIP(*AfterIP);
6596 }
6597 return Error::success();
6598 };
6599
6600 // If we need device pointer privatization, we need to emit the body of the
6601 // region with no privatization in the 'else' branch of the conditional.
6602 // Otherwise, we don't have to do anything.
6603 auto BeginElseGen = [&](InsertPointTy AllocaIP,
6604 InsertPointTy CodeGenIP) -> Error {
6605 InsertPointOrErrorTy AfterIP =
6606 BodyGenCB(Builder.saveIP(), BodyGenTy::DupNoPriv);
6607 if (!AfterIP)
6608 return AfterIP.takeError();
6609 Builder.restoreIP(*AfterIP);
6610 return Error::success();
6611 };
6612
6613 // Generate code for the closing of the data region.
6614 auto EndThenGen = [&](InsertPointTy AllocaIP, InsertPointTy CodeGenIP) {
6615 TargetDataRTArgs RTArgs;
6616 Info.EmitDebug = !MapInfo->Names.empty();
6617 emitOffloadingArraysArgument(Builder, RTArgs, Info, /*ForEndCall=*/true);
6618
6619 // Emit the number of elements in the offloading arrays.
6620 Value *PointerNum = Builder.getInt32(Info.NumberOfPtrs);
6621
6622 // Source location for the ident struct
6623 if (!SrcLocInfo) {
6624 uint32_t SrcLocStrSize;
6625 Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
6626 SrcLocInfo = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
6627 }
6628
6629 Value *OffloadingArgs[] = {SrcLocInfo, DeviceID,
6630 PointerNum, RTArgs.BasePointersArray,
6631 RTArgs.PointersArray, RTArgs.SizesArray,
6632 RTArgs.MapTypesArray, RTArgs.MapNamesArray,
6633 RTArgs.MappersArray};
6634 Function *EndMapperFunc =
6635 getOrCreateRuntimeFunctionPtr(omp::OMPRTL___tgt_target_data_end_mapper);
6636
6637 Builder.CreateCall(EndMapperFunc, OffloadingArgs);
6638 return Error::success();
6639 };
6640
6641 // We don't have to do anything to close the region if the if clause evaluates
6642 // to false.
6643 auto EndElseGen = [&](InsertPointTy AllocaIP, InsertPointTy CodeGenIP) {
6644 return Error::success();
6645 };
6646
6647 Error Err = [&]() -> Error {
6648 if (BodyGenCB) {
6649 Error Err = [&]() {
6650 if (IfCond)
6651 return emitIfClause(IfCond, BeginThenGen, BeginElseGen, AllocaIP);
6652 return BeginThenGen(AllocaIP, Builder.saveIP());
6653 }();
6654
6655 if (Err)
6656 return Err;
6657
6658 // If we don't require privatization of device pointers, we emit the body
6659 // in between the runtime calls. This avoids duplicating the body code.
6660 InsertPointOrErrorTy AfterIP =
6661 BodyGenCB(Builder.saveIP(), BodyGenTy::NoPriv);
6662 if (!AfterIP)
6663 return AfterIP.takeError();
6664 Builder.restoreIP(*AfterIP);
6665
6666 if (IfCond)
6667 return emitIfClause(IfCond, EndThenGen, EndElseGen, AllocaIP);
6668 return EndThenGen(AllocaIP, Builder.saveIP());
6669 }
6670 if (IfCond)
6671 return emitIfClause(IfCond, BeginThenGen, EndElseGen, AllocaIP);
6672 return BeginThenGen(AllocaIP, Builder.saveIP());
6673 }();
6674
6675 if (Err)
6676 return Err;
6677
6678 return Builder.saveIP();
6679}
6680
6683 bool IsGPUDistribute) {
6684 assert((IVSize == 32 || IVSize == 64) &&
6685 "IV size is not compatible with the omp runtime");
6687 if (IsGPUDistribute)
6688 Name = IVSize == 32
6689 ? (IVSigned ? omp::OMPRTL___kmpc_distribute_static_init_4
6690 : omp::OMPRTL___kmpc_distribute_static_init_4u)
6691 : (IVSigned ? omp::OMPRTL___kmpc_distribute_static_init_8
6692 : omp::OMPRTL___kmpc_distribute_static_init_8u);
6693 else
6694 Name = IVSize == 32 ? (IVSigned ? omp::OMPRTL___kmpc_for_static_init_4
6695 : omp::OMPRTL___kmpc_for_static_init_4u)
6696 : (IVSigned ? omp::OMPRTL___kmpc_for_static_init_8
6697 : omp::OMPRTL___kmpc_for_static_init_8u);
6698
6700}
6701
6703 bool IVSigned) {
6704 assert((IVSize == 32 || IVSize == 64) &&
6705 "IV size is not compatible with the omp runtime");
6706 RuntimeFunction Name = IVSize == 32
6707 ? (IVSigned ? omp::OMPRTL___kmpc_dispatch_init_4
6708 : omp::OMPRTL___kmpc_dispatch_init_4u)
6709 : (IVSigned ? omp::OMPRTL___kmpc_dispatch_init_8
6710 : omp::OMPRTL___kmpc_dispatch_init_8u);
6711
6713}
6714
6716 bool IVSigned) {
6717 assert((IVSize == 32 || IVSize == 64) &&
6718 "IV size is not compatible with the omp runtime");
6719 RuntimeFunction Name = IVSize == 32
6720 ? (IVSigned ? omp::OMPRTL___kmpc_dispatch_next_4
6721 : omp::OMPRTL___kmpc_dispatch_next_4u)
6722 : (IVSigned ? omp::OMPRTL___kmpc_dispatch_next_8
6723 : omp::OMPRTL___kmpc_dispatch_next_8u);
6724
6726}
6727
6729 bool IVSigned) {
6730 assert((IVSize == 32 || IVSize == 64) &&
6731 "IV size is not compatible with the omp runtime");
6732 RuntimeFunction Name = IVSize == 32
6733 ? (IVSigned ? omp::OMPRTL___kmpc_dispatch_fini_4
6734 : omp::OMPRTL___kmpc_dispatch_fini_4u)
6735 : (IVSigned ? omp::OMPRTL___kmpc_dispatch_fini_8
6736 : omp::OMPRTL___kmpc_dispatch_fini_8u);
6737
6739}
6740
6742 return getOrCreateRuntimeFunction(M, omp::OMPRTL___kmpc_dispatch_deinit);
6743}
6744
6746 OpenMPIRBuilder &OMPBuilder, IRBuilderBase &Builder, StringRef FuncName,
6750 SmallVector<Type *> ParameterTypes;
6751 if (OMPBuilder.Config.isTargetDevice()) {
6752 // Add the "implicit" runtime argument we use to provide launch specific
6753 // information for target devices.
6754 auto *Int8PtrTy = PointerType::getUnqual(Builder.getContext());
6755 ParameterTypes.push_back(Int8PtrTy);
6756
6757 // All parameters to target devices are passed as pointers
6758 // or i64. This assumes 64-bit address spaces/pointers.
6759 for (auto &Arg : Inputs)
6760 ParameterTypes.push_back(Arg->getType()->isPointerTy()
6761 ? Arg->getType()
6762 : Type::getInt64Ty(Builder.getContext()));
6763 } else {
6764 for (auto &Arg : Inputs)
6765 ParameterTypes.push_back(Arg->getType());
6766 }
6767
6768 auto BB = Builder.GetInsertBlock();
6769 auto M = BB->getModule();
6770 auto FuncType = FunctionType::get(Builder.getVoidTy(), ParameterTypes,
6771 /*isVarArg*/ false);
6772 auto Func =
6773 Function::Create(FuncType, GlobalValue::InternalLinkage, FuncName, M);
6774
6775 // Save insert point.
6776 IRBuilder<>::InsertPointGuard IPG(Builder);
6777 // If there's a DISubprogram associated with current function, then
6778 // generate one for the outlined function.
6779 if (Function *ParentFunc = BB->getParent()) {
6780 if (DISubprogram *SP = ParentFunc->getSubprogram()) {
6781 DICompileUnit *CU = SP->getUnit();
6782 DIBuilder DB(*M, true, CU);
6784 if (DL) {
6785 // TODO: We are using nullopt for arguments at the moment. This will
6786 // need to be updated when debug data is being generated for variables.
6787 DISubroutineType *Ty =
6788 DB.createSubroutineType(DB.getOrCreateTypeArray({}));
6789 DISubprogram::DISPFlags SPFlags = DISubprogram::SPFlagDefinition |
6790 DISubprogram::SPFlagOptimized |
6791 DISubprogram::SPFlagLocalToUnit;
6792
6793 DISubprogram *OutlinedSP = DB.createFunction(
6794 CU, FuncName, FuncName, SP->getFile(), DL.getLine(), Ty,
6795 DL.getLine(), DINode::DIFlags::FlagArtificial, SPFlags);
6796
6797 // Attach subprogram to the function.
6798 Func->setSubprogram(OutlinedSP);
6799 // Update the CurrentDebugLocation in the builder so that right scope
6800 // is used for things inside outlined function.
6802 DILocation::get(Func->getContext(), DL.getLine(), DL.getCol(),
6803 OutlinedSP, DL.getInlinedAt()));
6804 }
6805 }
6806 }
6807
6808 // Generate the region into the function.
6809 BasicBlock *EntryBB = BasicBlock::Create(Builder.getContext(), "entry", Func);
6810 Builder.SetInsertPoint(EntryBB);
6811
6812 // Insert target init call in the device compilation pass.
6813 if (OMPBuilder.Config.isTargetDevice())
6814 Builder.restoreIP(OMPBuilder.createTargetInit(Builder, /*IsSPMD*/ false));
6815
6816 BasicBlock *UserCodeEntryBB = Builder.GetInsertBlock();
6817
6818 // As we embed the user code in the middle of our target region after we
6819 // generate entry code, we must move what allocas we can into the entry
6820 // block to avoid possible breaking optimisations for device
6821 if (OMPBuilder.Config.isTargetDevice())
6823
6824 // Insert target deinit call in the device compilation pass.
6825 BasicBlock *OutlinedBodyBB =
6826 splitBB(Builder, /*CreateBranch=*/true, "outlined.body");
6828 Builder.saveIP(),
6829 OpenMPIRBuilder::InsertPointTy(OutlinedBodyBB, OutlinedBodyBB->begin()));
6830 if (!AfterIP)
6831 return AfterIP.takeError();
6832 Builder.restoreIP(*AfterIP);
6833 if (OMPBuilder.Config.isTargetDevice())
6834 OMPBuilder.createTargetDeinit(Builder);
6835
6836 // Insert return instruction.
6837 Builder.CreateRetVoid();
6838
6839 // New Alloca IP at entry point of created device function.
6840 Builder.SetInsertPoint(EntryBB->getFirstNonPHI());
6841 auto AllocaIP = Builder.saveIP();
6842
6843 Builder.SetInsertPoint(UserCodeEntryBB->getFirstNonPHIOrDbg());
6844
6845 // Skip the artificial dyn_ptr on the device.
6846 const auto &ArgRange =
6847 OMPBuilder.Config.isTargetDevice()
6848 ? make_range(Func->arg_begin() + 1, Func->arg_end())
6849 : Func->args();
6850
6851 auto ReplaceValue = [](Value *Input, Value *InputCopy, Function *Func) {
6852 // Things like GEP's can come in the form of Constants. Constants and
6853 // ConstantExpr's do not have access to the knowledge of what they're
6854 // contained in, so we must dig a little to find an instruction so we
6855 // can tell if they're used inside of the function we're outlining. We
6856 // also replace the original constant expression with a new instruction
6857 // equivalent; an instruction as it allows easy modification in the
6858 // following loop, as we can now know the constant (instruction) is
6859 // owned by our target function and replaceUsesOfWith can now be invoked
6860 // on it (cannot do this with constants it seems). A brand new one also
6861 // allows us to be cautious as it is perhaps possible the old expression
6862 // was used inside of the function but exists and is used externally
6863 // (unlikely by the nature of a Constant, but still).
6864 // NOTE: We cannot remove dead constants that have been rewritten to
6865 // instructions at this stage, we run the risk of breaking later lowering
6866 // by doing so as we could still be in the process of lowering the module
6867 // from MLIR to LLVM-IR and the MLIR lowering may still require the original
6868 // constants we have created rewritten versions of.
6869 if (auto *Const = dyn_cast<Constant>(Input))
6870 convertUsersOfConstantsToInstructions(Const, Func, false);
6871
6872 // Collect all the instructions
6873 for (User *User : make_early_inc_range(Input->users()))
6874 if (auto *Instr = dyn_cast<Instruction>(User))
6875 if (Instr->getFunction() == Func)
6876 Instr->replaceUsesOfWith(Input, InputCopy);
6877 };
6878
6879 SmallVector<std::pair<Value *, Value *>> DeferredReplacement;
6880
6881 // Rewrite uses of input valus to parameters.
6882 for (auto InArg : zip(Inputs, ArgRange)) {
6883 Value *Input = std::get<0>(InArg);
6884 Argument &Arg = std::get<1>(InArg);
6885 Value *InputCopy = nullptr;
6886
6888 ArgAccessorFuncCB(Arg, Input, InputCopy, AllocaIP, Builder.saveIP());
6889 if (!AfterIP)
6890 return AfterIP.takeError();
6891 Builder.restoreIP(*AfterIP);
6892
6893 // In certain cases a Global may be set up for replacement, however, this
6894 // Global may be used in multiple arguments to the kernel, just segmented
6895 // apart, for example, if we have a global array, that is sectioned into
6896 // multiple mappings (technically not legal in OpenMP, but there is a case
6897 // in Fortran for Common Blocks where this is neccesary), we will end up
6898 // with GEP's into this array inside the kernel, that refer to the Global
6899 // but are technically seperate arguments to the kernel for all intents and
6900 // purposes. If we have mapped a segment that requires a GEP into the 0-th
6901 // index, it will fold into an referal to the Global, if we then encounter
6902 // this folded GEP during replacement all of the references to the
6903 // Global in the kernel will be replaced with the argument we have generated
6904 // that corresponds to it, including any other GEP's that refer to the
6905 // Global that may be other arguments. This will invalidate all of the other
6906 // preceding mapped arguments that refer to the same global that may be
6907 // seperate segments. To prevent this, we defer global processing until all
6908 // other processing has been performed.
6909 if (llvm::isa<llvm::GlobalValue>(std::get<0>(InArg)) ||
6910 llvm::isa<llvm::GlobalObject>(std::get<0>(InArg)) ||
6911 llvm::isa<llvm::GlobalVariable>(std::get<0>(InArg))) {
6912 DeferredReplacement.push_back(std::make_pair(Input, InputCopy));
6913 continue;
6914 }
6915
6916 ReplaceValue(Input, InputCopy, Func);
6917 }
6918
6919 // Replace all of our deferred Input values, currently just Globals.
6920 for (auto Deferred : DeferredReplacement)
6921 ReplaceValue(std::get<0>(Deferred), std::get<1>(Deferred), Func);
6922
6923 return Func;
6924}
6925
6926/// Create an entry point for a target task with the following.
6927/// It'll have the following signature
6928/// void @.omp_target_task_proxy_func(i32 %thread.id, ptr %task)
6929/// This function is called from emitTargetTask once the
6930/// code to launch the target kernel has been outlined already.
6932 IRBuilderBase &Builder,
6933 CallInst *StaleCI) {
6934 Module &M = OMPBuilder.M;
6935 // KernelLaunchFunction is the target launch function, i.e.
6936 // the function that sets up kernel arguments and calls
6937 // __tgt_target_kernel to launch the kernel on the device.
6938 //
6939 Function *KernelLaunchFunction = StaleCI->getCalledFunction();
6940
6941 // StaleCI is the CallInst which is the call to the outlined
6942 // target kernel launch function. If there are values that the
6943 // outlined function uses then these are aggregated into a structure
6944 // which is passed as the second argument. If not, then there's
6945 // only one argument, the threadID. So, StaleCI can be
6946 //
6947 // %structArg = alloca { ptr, ptr }, align 8
6948 // %gep_ = getelementptr { ptr, ptr }, ptr %structArg, i32 0, i32 0
6949 // store ptr %20, ptr %gep_, align 8
6950 // %gep_8 = getelementptr { ptr, ptr }, ptr %structArg, i32 0, i32 1
6951 // store ptr %21, ptr %gep_8, align 8
6952 // call void @_QQmain..omp_par.1(i32 %global.tid.val6, ptr %structArg)
6953 //
6954 // OR
6955 //
6956 // call void @_QQmain..omp_par.1(i32 %global.tid.val6)
6958 StaleCI->getIterator());
6959 LLVMContext &Ctx = StaleCI->getParent()->getContext();
6960 Type *ThreadIDTy = Type::getInt32Ty(Ctx);
6961 Type *TaskPtrTy = OMPBuilder.TaskPtr;
6962 Type *TaskTy = OMPBuilder.Task;
6963 auto ProxyFnTy =
6964 FunctionType::get(Builder.getVoidTy(), {ThreadIDTy, TaskPtrTy},
6965 /* isVarArg */ false);
6966 auto ProxyFn = Function::Create(ProxyFnTy, GlobalValue::InternalLinkage,
6967 ".omp_target_task_proxy_func",
6968 Builder.GetInsertBlock()->getModule());
6969 ProxyFn->getArg(0)->setName("thread.id");
6970 ProxyFn->getArg(1)->setName("task");
6971
6972 BasicBlock *EntryBB =
6973 BasicBlock::Create(Builder.getContext(), "entry", ProxyFn);
6974 Builder.SetInsertPoint(EntryBB);
6975
6976 bool HasShareds = StaleCI->arg_size() > 1;
6977 // TODO: This is a temporary assert to prove to ourselves that
6978 // the outlined target launch function is always going to have
6979 // atmost two arguments if there is any data shared between
6980 // host and device.
6981 assert((!HasShareds || (StaleCI->arg_size() == 2)) &&
6982 "StaleCI with shareds should have exactly two arguments.");
6983 if (HasShareds) {
6984 auto *ArgStructAlloca = dyn_cast<AllocaInst>(StaleCI->getArgOperand(1));
6985 assert(ArgStructAlloca &&
6986 "Unable to find the alloca instruction corresponding to arguments "
6987 "for extracted function");
6988 auto *ArgStructType = cast<StructType>(ArgStructAlloca->getAllocatedType());
6989
6990 AllocaInst *NewArgStructAlloca =
6991 Builder.CreateAlloca(ArgStructType, nullptr, "structArg");
6992 Value *TaskT = ProxyFn->getArg(1);
6993 Value *ThreadId = ProxyFn->getArg(0);
6994 Value *SharedsSize =
6995 Builder.getInt64(M.getDataLayout().getTypeStoreSize(ArgStructType));
6996
6997 Value *Shareds = Builder.CreateStructGEP(TaskTy, TaskT, 0);
6998 LoadInst *LoadShared =
6999 Builder.CreateLoad(PointerType::getUnqual(Ctx), Shareds);
7000
7001 Builder.CreateMemCpy(
7002 NewArgStructAlloca, NewArgStructAlloca->getAlign(), LoadShared,
7003 LoadShared->getPointerAlignment(M.getDataLayout()), SharedsSize);
7004
7005 Builder.CreateCall(KernelLaunchFunction, {ThreadId, NewArgStructAlloca});
7006 }
7007 Builder.CreateRetVoid();
7008 return ProxyFn;
7009}
7010
7012 OpenMPIRBuilder &OMPBuilder, IRBuilderBase &Builder, bool IsOffloadEntry,
7013 TargetRegionEntryInfo &EntryInfo, Function *&OutlinedFn,
7014 Constant *&OutlinedFnID, SmallVectorImpl<Value *> &Inputs,
7017
7018 OpenMPIRBuilder::FunctionGenCallback &&GenerateOutlinedFunction =
7019 [&OMPBuilder, &Builder, &Inputs, &CBFunc,
7020 &ArgAccessorFuncCB](StringRef EntryFnName) {
7021 return createOutlinedFunction(OMPBuilder, Builder, EntryFnName, Inputs,
7022 CBFunc, ArgAccessorFuncCB);
7023 };
7024
7025 return OMPBuilder.emitTargetRegionFunction(
7026 EntryInfo, GenerateOutlinedFunction, IsOffloadEntry, OutlinedFn,
7027 OutlinedFnID);
7028}
7029
7031 TargetTaskBodyCallbackTy TaskBodyCB, Value *DeviceID, Value *RTLoc,
7034 bool HasNoWait) {
7035
7036 // The following explains the code-gen scenario for the `target` directive. A
7037 // similar scneario is followed for other device-related directives (e.g.
7038 // `target enter data`) but in similar fashion since we only need to emit task
7039 // that encapsulates the proper runtime call.
7040 //
7041 // When we arrive at this function, the target region itself has been
7042 // outlined into the function OutlinedFn.
7043 // So at ths point, for
7044 // --------------------------------------------------
7045 // void user_code_that_offloads(...) {
7046 // omp target depend(..) map(from:a) map(to:b, c)
7047 // a = b + c
7048 // }
7049 //
7050 // --------------------------------------------------
7051 //
7052 // we have
7053 //
7054 // --------------------------------------------------
7055 //
7056 // void user_code_that_offloads(...) {
7057 // %.offload_baseptrs = alloca [3 x ptr], align 8
7058 // %.offload_ptrs = alloca [3 x ptr], align 8
7059 // %.offload_mappers = alloca [3 x ptr], align 8
7060 // ;; target region has been outlined and now we need to
7061 // ;; offload to it via a target task.
7062 // }
7063 // void outlined_device_function(ptr a, ptr b, ptr c) {
7064 // *a = *b + *c
7065 // }
7066 //
7067 // We have to now do the following
7068 // (i) Make an offloading call to outlined_device_function using the OpenMP
7069 // RTL. See 'kernel_launch_function' in the pseudo code below. This is
7070 // emitted by emitKernelLaunch
7071 // (ii) Create a task entry point function that calls kernel_launch_function
7072 // and is the entry point for the target task. See
7073 // '@.omp_target_task_proxy_func in the pseudocode below.
7074 // (iii) Create a task with the task entry point created in (ii)
7075 //
7076 // That is we create the following
7077 //
7078 // void user_code_that_offloads(...) {
7079 // %.offload_baseptrs = alloca [3 x ptr], align 8
7080 // %.offload_ptrs = alloca [3 x ptr], align 8
7081 // %.offload_mappers = alloca [3 x ptr], align 8
7082 //
7083 // %structArg = alloca { ptr, ptr, ptr }, align 8
7084 // %strucArg[0] = %.offload_baseptrs
7085 // %strucArg[1] = %.offload_ptrs
7086 // %strucArg[2] = %.offload_mappers
7087 // proxy_target_task = @__kmpc_omp_task_alloc(...,
7088 // @.omp_target_task_proxy_func)
7089 // memcpy(proxy_target_task->shareds, %structArg, sizeof(structArg))
7090 // dependencies_array = ...
7091 // ;; if nowait not present
7092 // call @__kmpc_omp_wait_deps(..., dependencies_array)
7093 // call @__kmpc_omp_task_begin_if0(...)
7094 // call @ @.omp_target_task_proxy_func(i32 thread_id, ptr
7095 // %proxy_target_task) call @__kmpc_omp_task_complete_if0(...)
7096 // }
7097 //
7098 // define internal void @.omp_target_task_proxy_func(i32 %thread.id,
7099 // ptr %task) {
7100 // %structArg = alloca {ptr, ptr, ptr}
7101 // %shared_data = load (getelementptr %task, 0, 0)
7102 // mempcy(%structArg, %shared_data, sizeof(structArg))
7103 // kernel_launch_function(%thread.id, %structArg)
7104 // }
7105 //
7106 // We need the proxy function because the signature of the task entry point
7107 // expected by kmpc_omp_task is always the same and will be different from
7108 // that of the kernel_launch function.
7109 //
7110 // kernel_launch_function is generated by emitKernelLaunch and has the
7111 // always_inline attribute.
7112 // void kernel_launch_function(thread_id,
7113 // structArg) alwaysinline {
7114 // %kernel_args = alloca %struct.__tgt_kernel_arguments, align 8
7115 // offload_baseptrs = load(getelementptr structArg, 0, 0)
7116 // offload_ptrs = load(getelementptr structArg, 0, 1)
7117 // offload_mappers = load(getelementptr structArg, 0, 2)
7118 // ; setup kernel_args using offload_baseptrs, offload_ptrs and
7119 // ; offload_mappers
7120 // call i32 @__tgt_target_kernel(...,
7121 // outlined_device_function,
7122 // ptr %kernel_args)
7123 // }
7124 // void outlined_device_function(ptr a, ptr b, ptr c) {
7125 // *a = *b + *c
7126 // }
7127 //
7128 BasicBlock *TargetTaskBodyBB =
7129 splitBB(Builder, /*CreateBranch=*/true, "target.task.body");
7130 BasicBlock *TargetTaskAllocaBB =
7131 splitBB(Builder, /*CreateBranch=*/true, "target.task.alloca");
7132
7133 InsertPointTy TargetTaskAllocaIP(TargetTaskAllocaBB,
7134 TargetTaskAllocaBB->begin());
7135 InsertPointTy TargetTaskBodyIP(TargetTaskBodyBB, TargetTaskBodyBB->begin());
7136
7137 OutlineInfo OI;
7138 OI.EntryBB = TargetTaskAllocaBB;
7139 OI.OuterAllocaBB = AllocaIP.getBlock();
7140
7141 // Add the thread ID argument.
7144 Builder, AllocaIP, ToBeDeleted, TargetTaskAllocaIP, "global.tid", false));
7145
7146 Builder.restoreIP(TargetTaskBodyIP);
7147
7148 if (Error Err = TaskBodyCB(DeviceID, RTLoc, TargetTaskAllocaIP))
7149 return Err;
7150
7151 OI.ExitBB = Builder.saveIP().getBlock();
7152 OI.PostOutlineCB = [this, ToBeDeleted, Dependencies, HasNoWait,
7153 DeviceID](Function &OutlinedFn) mutable {
7154 assert(OutlinedFn.getNumUses() == 1 &&
7155 "there must be a single user for the outlined function");
7156
7157 CallInst *StaleCI = cast<CallInst>(OutlinedFn.user_back());
7158 bool HasShareds = StaleCI->arg_size() > 1;
7159
7160 Function *ProxyFn = emitTargetTaskProxyFunction(*this, Builder, StaleCI);
7161
7162 LLVM_DEBUG(dbgs() << "Proxy task entry function created: " << *ProxyFn
7163 << "\n");
7164
7165 Builder.SetInsertPoint(StaleCI);
7166
7167 // Gather the arguments for emitting the runtime call.
7168 uint32_t SrcLocStrSize;
7169 Constant *SrcLocStr =
7171 Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
7172
7173 // @__kmpc_omp_task_alloc or @__kmpc_omp_target_task_alloc
7174 //
7175 // If `HasNoWait == true`, we call @__kmpc_omp_target_task_alloc to provide
7176 // the DeviceID to the deferred task and also since
7177 // @__kmpc_omp_target_task_alloc creates an untied/async task.
7178 Function *TaskAllocFn =
7179 !HasNoWait ? getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_omp_task_alloc)
7181 OMPRTL___kmpc_omp_target_task_alloc);
7182
7183 // Arguments - `loc_ref` (Ident) and `gtid` (ThreadID)
7184 // call.
7185 Value *ThreadID = getOrCreateThreadID(Ident);
7186
7187 // Argument - `sizeof_kmp_task_t` (TaskSize)
7188 // Tasksize refers to the size in bytes of kmp_task_t data structure
7189 // including private vars accessed in task.
7190 // TODO: add kmp_task_t_with_privates (privates)
7191 Value *TaskSize =
7193
7194 // Argument - `sizeof_shareds` (SharedsSize)
7195 // SharedsSize refers to the shareds array size in the kmp_task_t data
7196 // structure.
7197 Value *SharedsSize = Builder.getInt64(0);
7198 if (HasShareds) {
7199 auto *ArgStructAlloca = dyn_cast<AllocaInst>(StaleCI->getArgOperand(1));
7200 assert(ArgStructAlloca &&
7201 "Unable to find the alloca instruction corresponding to arguments "
7202 "for extracted function");
7203 auto *ArgStructType =
7204 dyn_cast<StructType>(ArgStructAlloca->getAllocatedType());
7205 assert(ArgStructType && "Unable to find struct type corresponding to "
7206 "arguments for extracted function");
7207 SharedsSize =
7209 }
7210
7211 // Argument - `flags`
7212 // Task is tied iff (Flags & 1) == 1.
7213 // Task is untied iff (Flags & 1) == 0.
7214 // Task is final iff (Flags & 2) == 2.
7215 // Task is not final iff (Flags & 2) == 0.
7216 // A target task is not final and is untied.
7218
7219 // Emit the @__kmpc_omp_task_alloc runtime call
7220 // The runtime call returns a pointer to an area where the task captured
7221 // variables must be copied before the task is run (TaskData)
7222 CallInst *TaskData = nullptr;
7223
7224 SmallVector<llvm::Value *> TaskAllocArgs = {
7225 /*loc_ref=*/Ident, /*gtid=*/ThreadID,
7226 /*flags=*/Flags,
7227 /*sizeof_task=*/TaskSize, /*sizeof_shared=*/SharedsSize,
7228 /*task_func=*/ProxyFn};
7229
7230 if (HasNoWait)
7231 TaskAllocArgs.push_back(DeviceID);
7232
7233 TaskData = Builder.CreateCall(TaskAllocFn, TaskAllocArgs);
7234
7235 if (HasShareds) {
7236 Value *Shareds = StaleCI->getArgOperand(1);
7237 Align Alignment = TaskData->getPointerAlignment(M.getDataLayout());
7238 Value *TaskShareds = Builder.CreateLoad(VoidPtr, TaskData);
7239 Builder.CreateMemCpy(TaskShareds, Alignment, Shareds, Alignment,
7240 SharedsSize);
7241 }
7242
7243 Value *DepArray = emitTaskDependencies(*this, Dependencies);
7244
7245 // ---------------------------------------------------------------
7246 // V5.2 13.8 target construct
7247 // If the nowait clause is present, execution of the target task
7248 // may be deferred. If the nowait clause is not present, the target task is
7249 // an included task.
7250 // ---------------------------------------------------------------
7251 // The above means that the lack of a nowait on the target construct
7252 // translates to '#pragma omp task if(0)'
7253 if (!HasNoWait) {
7254 if (DepArray) {
7255 Function *TaskWaitFn =
7256 getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_omp_wait_deps);
7258 TaskWaitFn,
7259 {/*loc_ref=*/Ident, /*gtid=*/ThreadID,
7260 /*ndeps=*/Builder.getInt32(Dependencies.size()),
7261 /*dep_list=*/DepArray,
7262 /*ndeps_noalias=*/ConstantInt::get(Builder.getInt32Ty(), 0),
7263 /*noalias_dep_list=*/
7265 }
7266 // Included task.
7267 Function *TaskBeginFn =
7268 getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_omp_task_begin_if0);
7269 Function *TaskCompleteFn =
7270 getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_omp_task_complete_if0);
7271 Builder.CreateCall(TaskBeginFn, {Ident, ThreadID, TaskData});
7272 CallInst *CI = Builder.CreateCall(ProxyFn, {ThreadID, TaskData});
7273 CI->setDebugLoc(StaleCI->getDebugLoc());
7274 Builder.CreateCall(TaskCompleteFn, {Ident, ThreadID, TaskData});
7275 } else if (DepArray) {
7276 // HasNoWait - meaning the task may be deferred. Call
7277 // __kmpc_omp_task_with_deps if there are dependencies,
7278 // else call __kmpc_omp_task
7279 Function *TaskFn =
7280 getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_omp_task_with_deps);
7282 TaskFn,
7283 {Ident, ThreadID, TaskData, Builder.getInt32(Dependencies.size()),
7284 DepArray, ConstantInt::get(Builder.getInt32Ty(), 0),
7286 } else {
7287 // Emit the @__kmpc_omp_task runtime call to spawn the task
7288 Function *TaskFn = getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_omp_task);
7289 Builder.CreateCall(TaskFn, {Ident, ThreadID, TaskData});
7290 }
7291
7292 StaleCI->eraseFromParent();
7293 for (Instruction *I : llvm::reverse(ToBeDeleted))
7294 I->eraseFromParent();
7295 };
7296 addOutlineInfo(std::move(OI));
7297
7298 LLVM_DEBUG(dbgs() << "Insert block after emitKernelLaunch = \n"
7299 << *(Builder.GetInsertBlock()) << "\n");
7300 LLVM_DEBUG(dbgs() << "Module after emitKernelLaunch = \n"
7302 << "\n");
7303 return Builder.saveIP();
7304}
7305
7307 InsertPointTy AllocaIP, InsertPointTy CodeGenIP, TargetDataInfo &Info,
7308 TargetDataRTArgs &RTArgs, MapInfosTy &CombinedInfo, bool IsNonContiguous,
7309 bool ForEndCall, function_ref<void(unsigned int, Value *)> DeviceAddrCB,
7310 function_ref<Value *(unsigned int)> CustomMapperCB) {
7311 emitOffloadingArrays(AllocaIP, CodeGenIP, CombinedInfo, Info, IsNonContiguous,
7312 DeviceAddrCB, CustomMapperCB);
7313 emitOffloadingArraysArgument(Builder, RTArgs, Info, ForEndCall);
7314}
7315
7316static void
7318 OpenMPIRBuilder::InsertPointTy AllocaIP, Function *OutlinedFn,
7319 Constant *OutlinedFnID, ArrayRef<int32_t> NumTeams,
7323 bool HasNoWait = false) {
7324 // Generate a function call to the host fallback implementation of the target
7325 // region. This is called by the host when no offload entry was generated for
7326 // the target region and when the offloading call fails at runtime.
7327 auto &&EmitTargetCallFallbackCB = [&](OpenMPIRBuilder::InsertPointTy IP)
7329 Builder.restoreIP(IP);
7330 Builder.CreateCall(OutlinedFn, Args);
7331 return Builder.saveIP();
7332 };
7333
7334 bool HasDependencies = Dependencies.size() > 0;
7335 bool RequiresOuterTargetTask = HasNoWait || HasDependencies;
7336
7338
7339 auto TaskBodyCB =
7340 [&](Value *DeviceID, Value *RTLoc,
7341 IRBuilderBase::InsertPoint TargetTaskAllocaIP) -> Error {
7343 // emitKernelLaunch makes the necessary runtime call to offload the
7344 // kernel. We then outline all that code into a separate function
7345 // ('kernel_launch_function' in the pseudo code above). This function is
7346 // then called by the target task proxy function (see
7347 // '@.omp_target_task_proxy_func' in the pseudo code above)
7348 // "@.omp_target_task_proxy_func' is generated by
7349 // emitTargetTaskProxyFunction.
7350 if (OutlinedFnID)
7351 return OMPBuilder.emitKernelLaunch(Builder, OutlinedFnID,
7352 EmitTargetCallFallbackCB, KArgs,
7353 DeviceID, RTLoc, TargetTaskAllocaIP);
7354 // When OutlinedFnID is set to nullptr, then it's not an offloading call.
7355 // In this case, we execute the host implementation directly.
7356 return EmitTargetCallFallbackCB(OMPBuilder.Builder.saveIP());
7357 }();
7358
7359 if (!AfterIP)
7360 return AfterIP.takeError();
7361
7362 OMPBuilder.Builder.restoreIP(*AfterIP);
7363 return Error::success();
7364 };
7365
7366 // If we don't have an ID for the target region, it means an offload entry
7367 // wasn't created. In this case we just run the host fallback directly.
7368 if (!OutlinedFnID) {
7370 if (RequiresOuterTargetTask) {
7371 // Arguments that are intended to be directly forwarded to an
7372 // emitKernelLaunch call are pased as nullptr, since
7373 // OutlinedFnID=nullptr results in that call not being done.
7374 return OMPBuilder.emitTargetTask(TaskBodyCB, /*DeviceID=*/nullptr,
7375 /*RTLoc=*/nullptr, AllocaIP,
7376 Dependencies, HasNoWait);
7377 }
7378 return EmitTargetCallFallbackCB(Builder.saveIP());
7379 }();
7380
7381 // Assume no error was returned because EmitTargetCallFallbackCB doesn't
7382 // produce any. The 'if' check enables accessing the returned value.
7383 if (AfterIP)
7384 Builder.restoreIP(*AfterIP);
7385 return;
7386 }
7387
7389 /*RequiresDevicePointerInfo=*/false,
7390 /*SeparateBeginEndCalls=*/true);
7391
7392 OpenMPIRBuilder::MapInfosTy &MapInfo = GenMapInfoCB(Builder.saveIP());
7394 OMPBuilder.emitOffloadingArraysAndArgs(AllocaIP, Builder.saveIP(), Info,
7395 RTArgs, MapInfo,
7396 /*IsNonContiguous=*/true,
7397 /*ForEndCall=*/false);
7398
7399 SmallVector<Value *, 3> NumTeamsC;
7400 SmallVector<Value *, 3> NumThreadsC;
7401 for (auto V : NumTeams)
7402 NumTeamsC.push_back(llvm::ConstantInt::get(Builder.getInt32Ty(), V));
7403 for (auto V : NumThreads)
7404 NumThreadsC.push_back(llvm::ConstantInt::get(Builder.getInt32Ty(), V));
7405
7406 unsigned NumTargetItems = Info.NumberOfPtrs;
7407 // TODO: Use correct device ID
7408 Value *DeviceID = Builder.getInt64(OMP_DEVICEID_UNDEF);
7409 uint32_t SrcLocStrSize;
7410 Constant *SrcLocStr = OMPBuilder.getOrCreateDefaultSrcLocStr(SrcLocStrSize);
7411 Value *RTLoc = OMPBuilder.getOrCreateIdent(SrcLocStr, SrcLocStrSize,
7412 llvm::omp::IdentFlag(0), 0);
7413 // TODO: Use correct NumIterations
7414 Value *NumIterations = Builder.getInt64(0);
7415 // TODO: Use correct DynCGGroupMem
7416 Value *DynCGGroupMem = Builder.getInt32(0);
7417
7419 NumTargetItems, RTArgs, NumIterations, NumTeamsC, NumThreadsC,
7420 DynCGGroupMem, HasNoWait);
7421
7422 // The presence of certain clauses on the target directive require the
7423 // explicit generation of the target task.
7425 if (RequiresOuterTargetTask)
7426 return OMPBuilder.emitTargetTask(TaskBodyCB, DeviceID, RTLoc, AllocaIP,
7427 Dependencies, HasNoWait);
7428
7429 return OMPBuilder.emitKernelLaunch(Builder, OutlinedFnID,
7430 EmitTargetCallFallbackCB, KArgs,
7431 DeviceID, RTLoc, AllocaIP);
7432 }();
7433
7434 // Assume no error was returned because TaskBodyCB and
7435 // EmitTargetCallFallbackCB don't produce any. The 'if' check enables
7436 // accessing the returned value.
7437 if (AfterIP)
7438 Builder.restoreIP(*AfterIP);
7439}
7440
7442 const LocationDescription &Loc, bool IsOffloadEntry, InsertPointTy AllocaIP,
7443 InsertPointTy CodeGenIP, TargetRegionEntryInfo &EntryInfo,
7444 ArrayRef<int32_t> NumTeams, ArrayRef<int32_t> NumThreads,
7448 SmallVector<DependData> Dependencies, bool HasNowait) {
7449
7450 if (!updateToLocation(Loc))
7451 return InsertPointTy();
7452
7453 Builder.restoreIP(CodeGenIP);
7454
7455 Function *OutlinedFn;
7456 Constant *OutlinedFnID = nullptr;
7457 // The target region is outlined into its own function. The LLVM IR for
7458 // the target region itself is generated using the callbacks CBFunc
7459 // and ArgAccessorFuncCB
7461 *this, Builder, IsOffloadEntry, EntryInfo, OutlinedFn, OutlinedFnID,
7462 Args, CBFunc, ArgAccessorFuncCB))
7463 return Err;
7464
7465 // If we are not on the target device, then we need to generate code
7466 // to make a remote call (offload) to the previously outlined function
7467 // that represents the target region. Do that now.
7468 if (!Config.isTargetDevice())
7469 emitTargetCall(*this, Builder, AllocaIP, OutlinedFn, OutlinedFnID, NumTeams,
7470 NumThreads, Args, GenMapInfoCB, Dependencies, HasNowait);
7471 return Builder.saveIP();
7472}
7473
7474std::string OpenMPIRBuilder::getNameWithSeparators(ArrayRef<StringRef> Parts,
7475 StringRef FirstSeparator,
7476 StringRef Separator) {
7477 SmallString<128> Buffer;
7479 StringRef Sep = FirstSeparator;
7480 for (StringRef Part : Parts) {
7481 OS << Sep << Part;
7482 Sep = Separator;
7483 }
7484 return OS.str().str();
7485}
7486
7487std::string
7489 return OpenMPIRBuilder::getNameWithSeparators(Parts, Config.firstSeparator(),
7490 Config.separator());
7491}
7492
7495 unsigned AddressSpace) {
7496 auto &Elem = *InternalVars.try_emplace(Name, nullptr).first;
7497 if (Elem.second) {
7498 assert(Elem.second->getValueType() == Ty &&
7499 "OMP internal variable has different type than requested");
7500 } else {
7501 // TODO: investigate the appropriate linkage type used for the global
7502 // variable for possibly changing that to internal or private, or maybe
7503 // create different versions of the function for different OMP internal
7504 // variables.
7505 auto Linkage = this->M.getTargetTriple().rfind("wasm32") == 0
7508 auto *GV = new GlobalVariable(M, Ty, /*IsConstant=*/false, Linkage,
7509 Constant::getNullValue(Ty), Elem.first(),
7510 /*InsertBefore=*/nullptr,
7512 const DataLayout &DL = M.getDataLayout();
7513 const llvm::Align TypeAlign = DL.getABITypeAlign(Ty);
7514 const llvm::Align PtrAlign = DL.getPointerABIAlignment(AddressSpace);
7515 GV->setAlignment(std::max(TypeAlign, PtrAlign));
7516 Elem.second = GV;
7517 }
7518
7519 return Elem.second;
7520}
7521
7522Value *OpenMPIRBuilder::getOMPCriticalRegionLock(StringRef CriticalName) {
7523 std::string Prefix = Twine("gomp_critical_user_", CriticalName).str();
7524 std::string Name = getNameWithSeparators({Prefix, "var"}, ".", ".");
7525 return getOrCreateInternalVariable(KmpCriticalNameTy, Name);
7526}
7527
7530 Value *Null =
7531 Constant::getNullValue(PointerType::getUnqual(BasePtr->getContext()));
7532 Value *SizeGep =
7533 Builder.CreateGEP(BasePtr->getType(), Null, Builder.getInt32(1));
7534 Value *SizePtrToInt = Builder.CreatePtrToInt(SizeGep, Type::getInt64Ty(Ctx));
7535 return SizePtrToInt;
7536}
7537
7540 std::string VarName) {
7541 llvm::Constant *MaptypesArrayInit =
7543 auto *MaptypesArrayGlobal = new llvm::GlobalVariable(
7544 M, MaptypesArrayInit->getType(),
7545 /*isConstant=*/true, llvm::GlobalValue::PrivateLinkage, MaptypesArrayInit,
7546 VarName);
7547 MaptypesArrayGlobal->setUnnamedAddr(llvm::GlobalValue::UnnamedAddr::Global);
7548 return MaptypesArrayGlobal;
7549}
7550
7552 InsertPointTy AllocaIP,
7553 unsigned NumOperands,
7554 struct MapperAllocas &MapperAllocas) {
7555 if (!updateToLocation(Loc))
7556 return;
7557
7558 auto *ArrI8PtrTy = ArrayType::get(Int8Ptr, NumOperands);
7559 auto *ArrI64Ty = ArrayType::get(Int64, NumOperands);
7560 Builder.restoreIP(AllocaIP);
7561 AllocaInst *ArgsBase = Builder.CreateAlloca(
7562 ArrI8PtrTy, /* ArraySize = */ nullptr, ".offload_baseptrs");
7563 AllocaInst *Args = Builder.CreateAlloca(ArrI8PtrTy, /* ArraySize = */ nullptr,
7564 ".offload_ptrs");
7565 AllocaInst *ArgSizes = Builder.CreateAlloca(
7566 ArrI64Ty, /* ArraySize = */ nullptr, ".offload_sizes");
7567 Builder.restoreIP(Loc.IP);
7568 MapperAllocas.ArgsBase = ArgsBase;
7569 MapperAllocas.Args = Args;
7570 MapperAllocas.ArgSizes = ArgSizes;
7571}
7572
7574 Function *MapperFunc, Value *SrcLocInfo,
7575 Value *MaptypesArg, Value *MapnamesArg,
7577 int64_t DeviceID, unsigned NumOperands) {
7578 if (!updateToLocation(Loc))
7579 return;
7580
7581 auto *ArrI8PtrTy = ArrayType::get(Int8Ptr, NumOperands);
7582 auto *ArrI64Ty = ArrayType::get(Int64, NumOperands);
7583 Value *ArgsBaseGEP =
7585 {Builder.getInt32(0), Builder.getInt32(0)});
7586 Value *ArgsGEP =
7588 {Builder.getInt32(0), Builder.getInt32(0)});
7589 Value *ArgSizesGEP =
7591 {Builder.getInt32(0), Builder.getInt32(0)});
7592 Value *NullPtr =
7593 Constant::getNullValue(PointerType::getUnqual(Int8Ptr->getContext()));
7594 Builder.CreateCall(MapperFunc,
7595 {SrcLocInfo, Builder.getInt64(DeviceID),
7596 Builder.getInt32(NumOperands), ArgsBaseGEP, ArgsGEP,
7597 ArgSizesGEP, MaptypesArg, MapnamesArg, NullPtr});
7598}
7599
7601 TargetDataRTArgs &RTArgs,
7602 TargetDataInfo &Info,
7603 bool ForEndCall) {
7604 assert((!ForEndCall || Info.separateBeginEndCalls()) &&
7605 "expected region end call to runtime only when end call is separate");
7606 auto UnqualPtrTy = PointerType::getUnqual(M.getContext());
7607 auto VoidPtrTy = UnqualPtrTy;
7608 auto VoidPtrPtrTy = UnqualPtrTy;
7609 auto Int64Ty = Type::getInt64Ty(M.getContext());
7610 auto Int64PtrTy = UnqualPtrTy;
7611
7612 if (!Info.NumberOfPtrs) {
7613 RTArgs.BasePointersArray = ConstantPointerNull::get(VoidPtrPtrTy);
7614 RTArgs.PointersArray = ConstantPointerNull::get(VoidPtrPtrTy);
7615 RTArgs.SizesArray = ConstantPointerNull::get(Int64PtrTy);
7616 RTArgs.MapTypesArray = ConstantPointerNull::get(Int64PtrTy);
7617 RTArgs.MapNamesArray = ConstantPointerNull::get(VoidPtrPtrTy);
7618 RTArgs.MappersArray = ConstantPointerNull::get(VoidPtrPtrTy);
7619 return;
7620 }
7621
7623 ArrayType::get(VoidPtrTy, Info.NumberOfPtrs),
7624 Info.RTArgs.BasePointersArray,
7625 /*Idx0=*/0, /*Idx1=*/0);
7627 ArrayType::get(VoidPtrTy, Info.NumberOfPtrs), Info.RTArgs.PointersArray,
7628 /*Idx0=*/0,
7629 /*Idx1=*/0);
7631 ArrayType::get(Int64Ty, Info.NumberOfPtrs), Info.RTArgs.SizesArray,
7632 /*Idx0=*/0, /*Idx1=*/0);
7634 ArrayType::get(Int64Ty, Info.NumberOfPtrs),
7635 ForEndCall && Info.RTArgs.MapTypesArrayEnd ? Info.RTArgs.MapTypesArrayEnd
7636 : Info.RTArgs.MapTypesArray,
7637 /*Idx0=*/0,
7638 /*Idx1=*/0);
7639
7640 // Only emit the mapper information arrays if debug information is
7641 // requested.
7642 if (!Info.EmitDebug)
7643 RTArgs.MapNamesArray = ConstantPointerNull::get(VoidPtrPtrTy);
7644 else
7646 ArrayType::get(VoidPtrTy, Info.NumberOfPtrs), Info.RTArgs.MapNamesArray,
7647 /*Idx0=*/0,
7648 /*Idx1=*/0);
7649 // If there is no user-defined mapper, set the mapper array to nullptr to
7650 // avoid an unnecessary data privatization
7651 if (!Info.HasMapper)
7652 RTArgs.MappersArray = ConstantPointerNull::get(VoidPtrPtrTy);
7653 else
7654 RTArgs.MappersArray =
7655 Builder.CreatePointerCast(Info.RTArgs.MappersArray, VoidPtrPtrTy);
7656}
7657
7659 InsertPointTy CodeGenIP,
7660 MapInfosTy &CombinedInfo,
7661 TargetDataInfo &Info) {
7663 CombinedInfo.NonContigInfo;
7664
7665 // Build an array of struct descriptor_dim and then assign it to
7666 // offload_args.
7667 //
7668 // struct descriptor_dim {
7669 // uint64_t offset;
7670 // uint64_t count;
7671 // uint64_t stride
7672 // };
7673 Type *Int64Ty = Builder.getInt64Ty();
7675 M.getContext(), ArrayRef<Type *>({Int64Ty, Int64Ty, Int64Ty}),
7676 "struct.descriptor_dim");
7677
7678 enum { OffsetFD = 0, CountFD, StrideFD };
7679 // We need two index variable here since the size of "Dims" is the same as
7680 // the size of Components, however, the size of offset, count, and stride is
7681 // equal to the size of base declaration that is non-contiguous.
7682 for (unsigned I = 0, L = 0, E = NonContigInfo.Dims.size(); I < E; ++I) {
7683 // Skip emitting ir if dimension size is 1 since it cannot be
7684 // non-contiguous.
7685 if (NonContigInfo.Dims[I] == 1)
7686 continue;
7687 Builder.restoreIP(AllocaIP);
7688 ArrayType *ArrayTy = ArrayType::get(DimTy, NonContigInfo.Dims[I]);
7689 AllocaInst *DimsAddr =
7690 Builder.CreateAlloca(ArrayTy, /* ArraySize = */ nullptr, "dims");
7691 Builder.restoreIP(CodeGenIP);
7692 for (unsigned II = 0, EE = NonContigInfo.Dims[I]; II < EE; ++II) {
7693 unsigned RevIdx = EE - II - 1;
7694 Value *DimsLVal = Builder.CreateInBoundsGEP(
7695 DimsAddr->getAllocatedType(), DimsAddr,
7696 {Builder.getInt64(0), Builder.getInt64(II)});
7697 // Offset
7698 Value *OffsetLVal = Builder.CreateStructGEP(DimTy, DimsLVal, OffsetFD);
7700 NonContigInfo.Offsets[L][RevIdx], OffsetLVal,
7701 M.getDataLayout().getPrefTypeAlign(OffsetLVal->getType()));
7702 // Count
7703 Value *CountLVal = Builder.CreateStructGEP(DimTy, DimsLVal, CountFD);
7705 NonContigInfo.Counts[L][RevIdx], CountLVal,
7706 M.getDataLayout().getPrefTypeAlign(CountLVal->getType()));
7707 // Stride
7708 Value *StrideLVal = Builder.CreateStructGEP(DimTy, DimsLVal, StrideFD);
7710 NonContigInfo.Strides[L][RevIdx], StrideLVal,
7711 M.getDataLayout().getPrefTypeAlign(CountLVal->getType()));
7712 }
7713 // args[I] = &dims
7714 Builder.restoreIP(CodeGenIP);
7716 DimsAddr, Builder.getPtrTy());
7718 ArrayType::get(Builder.getPtrTy(), Info.NumberOfPtrs),
7719 Info.RTArgs.PointersArray, 0, I);
7722 ++L;
7723 }
7724}
7725
7726void OpenMPIRBuilder::emitUDMapperArrayInitOrDel(
7727 Function *MapperFn, Value *MapperHandle, Value *Base, Value *Begin,
7728 Value *Size, Value *MapType, Value *MapName, TypeSize ElementSize,
7729 BasicBlock *ExitBB, bool IsInit) {
7730 StringRef Prefix = IsInit ? ".init" : ".del";
7731
7732 // Evaluate if this is an array section.
7734 M.getContext(), createPlatformSpecificName({"omp.array", Prefix}));
7735 Value *IsArray =
7736 Builder.CreateICmpSGT(Size, Builder.getInt64(1), "omp.arrayinit.isarray");
7737 Value *DeleteBit = Builder.CreateAnd(
7738 MapType,
7740 static_cast<std::underlying_type_t<OpenMPOffloadMappingFlags>>(
7741 OpenMPOffloadMappingFlags::OMP_MAP_DELETE)));
7742 Value *DeleteCond;
7743 Value *Cond;
7744 if (IsInit) {
7745 // base != begin?
7746 Value *BaseIsBegin = Builder.CreateICmpNE(Base, Begin);
7747 // IsPtrAndObj?
7748 Value *PtrAndObjBit = Builder.CreateAnd(
7749 MapType,
7751 static_cast<std::underlying_type_t<OpenMPOffloadMappingFlags>>(
7752 OpenMPOffloadMappingFlags::OMP_MAP_PTR_AND_OBJ)));
7753 PtrAndObjBit = Builder.CreateIsNotNull(PtrAndObjBit);
7754 BaseIsBegin = Builder.CreateAnd(BaseIsBegin, PtrAndObjBit);
7755 Cond = Builder.CreateOr(IsArray, BaseIsBegin);
7756 DeleteCond = Builder.CreateIsNull(
7757 DeleteBit,
7758 createPlatformSpecificName({"omp.array", Prefix, ".delete"}));
7759 } else {
7760 Cond = IsArray;
7761 DeleteCond = Builder.CreateIsNotNull(
7762 DeleteBit,
7763 createPlatformSpecificName({"omp.array", Prefix, ".delete"}));
7764 }
7765 Cond = Builder.CreateAnd(Cond, DeleteCond);
7766 Builder.CreateCondBr(Cond, BodyBB, ExitBB);
7767
7768 emitBlock(BodyBB, MapperFn);
7769 // Get the array size by multiplying element size and element number (i.e., \p
7770 // Size).
7771 Value *ArraySize = Builder.CreateNUWMul(Size, Builder.getInt64(ElementSize));
7772 // Remove OMP_MAP_TO and OMP_MAP_FROM from the map type, so that it achieves
7773 // memory allocation/deletion purpose only.
7774 Value *MapTypeArg = Builder.CreateAnd(
7775 MapType,
7777 ~static_cast<std::underlying_type_t<OpenMPOffloadMappingFlags>>(
7778 OpenMPOffloadMappingFlags::OMP_MAP_TO |
7779 OpenMPOffloadMappingFlags::OMP_MAP_FROM)));
7780 MapTypeArg = Builder.CreateOr(
7781 MapTypeArg,
7783 static_cast<std::underlying_type_t<OpenMPOffloadMappingFlags>>(
7784 OpenMPOffloadMappingFlags::OMP_MAP_IMPLICIT)));
7785
7786 // Call the runtime API __tgt_push_mapper_component to fill up the runtime
7787 // data structure.
7788 Value *OffloadingArgs[] = {MapperHandle, Base, Begin,
7789 ArraySize, MapTypeArg, MapName};
7791 getOrCreateRuntimeFunction(M, OMPRTL___tgt_push_mapper_component),
7792 OffloadingArgs);
7793}
7794
7796 function_ref<MapInfosTy &(InsertPointTy CodeGenIP, llvm::Value *PtrPHI,
7797 llvm::Value *BeginArg)>
7798 GenMapInfoCB,
7799 Type *ElemTy, StringRef FuncName,
7800 function_ref<bool(unsigned int, Function **)> CustomMapperCB) {
7801 SmallVector<Type *> Params;
7802 Params.emplace_back(Builder.getPtrTy());
7803 Params.emplace_back(Builder.getPtrTy());
7804 Params.emplace_back(Builder.getPtrTy());
7807 Params.emplace_back(Builder.getPtrTy());
7808
7809 auto *FnTy =
7810 FunctionType::get(Builder.getVoidTy(), Params, /* IsVarArg */ false);
7811
7812 SmallString<64> TyStr;
7813 raw_svector_ostream Out(TyStr);
7814 Function *MapperFn =
7816 MapperFn->addFnAttr(Attribute::NoInline);
7817 MapperFn->addFnAttr(Attribute::NoUnwind);
7818 MapperFn->addParamAttr(0, Attribute::NoUndef);
7819 MapperFn->addParamAttr(1, Attribute::NoUndef);
7820 MapperFn->addParamAttr(2, Attribute::NoUndef);
7821 MapperFn->addParamAttr(3, Attribute::NoUndef);
7822 MapperFn->addParamAttr(4, Attribute::NoUndef);
7823 MapperFn->addParamAttr(5, Attribute::NoUndef);
7824
7825 // Start the mapper function code generation.
7826 BasicBlock *EntryBB = BasicBlock::Create(M.getContext(), "entry", MapperFn);
7827 auto SavedIP = Builder.saveIP();
7828 Builder.SetInsertPoint(EntryBB);
7829
7830 Value *MapperHandle = MapperFn->getArg(0);
7831 Value *BaseIn = MapperFn->getArg(1);
7832 Value *BeginIn = MapperFn->getArg(2);
7833 Value *Size = MapperFn->getArg(3);
7834 Value *MapType = MapperFn->getArg(4);
7835 Value *MapName = MapperFn->getArg(5);
7836
7837 // Compute the starting and end addresses of array elements.
7838 // Prepare common arguments for array initiation and deletion.
7839 // Convert the size in bytes into the number of array elements.
7840 TypeSize ElementSize = M.getDataLayout().getTypeStoreSize(ElemTy);
7842 Value *PtrBegin = Builder.CreateBitCast(BeginIn, Builder.getPtrTy());
7843 Value *PtrEnd = Builder.CreateGEP(ElemTy, PtrBegin, Size);
7844
7845 // Emit array initiation if this is an array section and \p MapType indicates
7846 // that memory allocation is required.
7847 BasicBlock *HeadBB = BasicBlock::Create(M.getContext(), "omp.arraymap.head");
7848 emitUDMapperArrayInitOrDel(MapperFn, MapperHandle, BaseIn, BeginIn, Size,
7849 MapType, MapName, ElementSize, HeadBB,
7850 /*IsInit=*/true);
7851
7852 // Emit a for loop to iterate through SizeArg of elements and map all of them.
7853
7854 // Emit the loop header block.
7855 emitBlock(HeadBB, MapperFn);
7856 BasicBlock *BodyBB = BasicBlock::Create(M.getContext(), "omp.arraymap.body");
7857 BasicBlock *DoneBB = BasicBlock::Create(M.getContext(), "omp.done");
7858 // Evaluate whether the initial condition is satisfied.
7859 Value *IsEmpty =
7860 Builder.CreateICmpEQ(PtrBegin, PtrEnd, "omp.arraymap.isempty");
7861 Builder.CreateCondBr(IsEmpty, DoneBB, BodyBB);
7862
7863 // Emit the loop body block.
7864 emitBlock(BodyBB, MapperFn);
7865 BasicBlock *LastBB = BodyBB;
7866 PHINode *PtrPHI =
7867 Builder.CreatePHI(PtrBegin->getType(), 2, "omp.arraymap.ptrcurrent");
7868 PtrPHI->addIncoming(PtrBegin, HeadBB);
7869
7870 // Get map clause information. Fill up the arrays with all mapped variables.
7871 MapInfosTy &Info = GenMapInfoCB(Builder.saveIP(), PtrPHI, BeginIn);
7872
7873 // Call the runtime API __tgt_mapper_num_components to get the number of
7874 // pre-existing components.
7875 Value *OffloadingArgs[] = {MapperHandle};
7876 Value *PreviousSize = Builder.CreateCall(
7877 getOrCreateRuntimeFunction(M, OMPRTL___tgt_mapper_num_components),
7878 OffloadingArgs);
7879 Value *ShiftedPreviousSize =
7881
7882 // Fill up the runtime mapper handle for all components.
7883 for (unsigned I = 0; I < Info.BasePointers.size(); ++I) {
7884 Value *CurBaseArg =
7885 Builder.CreateBitCast(Info.BasePointers[I], Builder.getPtrTy());
7886 Value *CurBeginArg =
7888 Value *CurSizeArg = Info.Sizes[I];
7889 Value *CurNameArg = Info.Names.size()
7890 ? Info.Names[I]
7892
7893 // Extract the MEMBER_OF field from the map type.
7894 Value *OriMapType = Builder.getInt64(
7895 static_cast<std::underlying_type_t<OpenMPOffloadMappingFlags>>(
7896 Info.Types[I]));
7897 Value *MemberMapType =
7898 Builder.CreateNUWAdd(OriMapType, ShiftedPreviousSize);
7899
7900 // Combine the map type inherited from user-defined mapper with that
7901 // specified in the program. According to the OMP_MAP_TO and OMP_MAP_FROM
7902 // bits of the \a MapType, which is the input argument of the mapper
7903 // function, the following code will set the OMP_MAP_TO and OMP_MAP_FROM
7904 // bits of MemberMapType.
7905 // [OpenMP 5.0], 1.2.6. map-type decay.
7906 // | alloc | to | from | tofrom | release | delete
7907 // ----------------------------------------------------------
7908 // alloc | alloc | alloc | alloc | alloc | release | delete
7909 // to | alloc | to | alloc | to | release | delete
7910 // from | alloc | alloc | from | from | release | delete
7911 // tofrom | alloc | to | from | tofrom | release | delete
7912 Value *LeftToFrom = Builder.CreateAnd(
7913 MapType,
7915 static_cast<std::underlying_type_t<OpenMPOffloadMappingFlags>>(
7916 OpenMPOffloadMappingFlags::OMP_MAP_TO |
7917 OpenMPOffloadMappingFlags::OMP_MAP_FROM)));
7918 BasicBlock *AllocBB = BasicBlock::Create(M.getContext(), "omp.type.alloc");
7919 BasicBlock *AllocElseBB =
7920 BasicBlock::Create(M.getContext(), "omp.type.alloc.else");
7921 BasicBlock *ToBB = BasicBlock::Create(M.getContext(), "omp.type.to");
7922 BasicBlock *ToElseBB =
7923 BasicBlock::Create(M.getContext(), "omp.type.to.else");
7924 BasicBlock *FromBB = BasicBlock::Create(M.getContext(), "omp.type.from");
7925 BasicBlock *EndBB = BasicBlock::Create(M.getContext(), "omp.type.end");
7926 Value *IsAlloc = Builder.CreateIsNull(LeftToFrom);
7927 Builder.CreateCondBr(IsAlloc, AllocBB, AllocElseBB);
7928 // In case of alloc, clear OMP_MAP_TO and OMP_MAP_FROM.
7929 emitBlock(AllocBB, MapperFn);
7930 Value *AllocMapType = Builder.CreateAnd(
7931 MemberMapType,
7933 ~static_cast<std::underlying_type_t<OpenMPOffloadMappingFlags>>(
7934 OpenMPOffloadMappingFlags::OMP_MAP_TO |
7935 OpenMPOffloadMappingFlags::OMP_MAP_FROM)));
7936 Builder.CreateBr(EndBB);
7937 emitBlock(AllocElseBB, MapperFn);
7938 Value *IsTo = Builder.CreateICmpEQ(
7939 LeftToFrom,
7941 static_cast<std::underlying_type_t<OpenMPOffloadMappingFlags>>(
7942 OpenMPOffloadMappingFlags::OMP_MAP_TO)));
7943 Builder.CreateCondBr(IsTo, ToBB, ToElseBB);
7944 // In case of to, clear OMP_MAP_FROM.
7945 emitBlock(ToBB, MapperFn);
7946 Value *ToMapType = Builder.CreateAnd(
7947 MemberMapType,
7949 ~static_cast<std::underlying_type_t<OpenMPOffloadMappingFlags>>(
7950 OpenMPOffloadMappingFlags::OMP_MAP_FROM)));
7951 Builder.CreateBr(EndBB);
7952 emitBlock(ToElseBB, MapperFn);
7953 Value *IsFrom = Builder.CreateICmpEQ(
7954 LeftToFrom,
7956 static_cast<std::underlying_type_t<OpenMPOffloadMappingFlags>>(
7957 OpenMPOffloadMappingFlags::OMP_MAP_FROM)));
7958 Builder.CreateCondBr(IsFrom, FromBB, EndBB);
7959 // In case of from, clear OMP_MAP_TO.
7960 emitBlock(FromBB, MapperFn);
7961 Value *FromMapType = Builder.CreateAnd(
7962 MemberMapType,
7964 ~static_cast<std::underlying_type_t<OpenMPOffloadMappingFlags>>(
7965 OpenMPOffloadMappingFlags::OMP_MAP_TO)));
7966 // In case of tofrom, do nothing.
7967 emitBlock(EndBB, MapperFn);
7968 LastBB = EndBB;
7969 PHINode *CurMapType =
7970 Builder.CreatePHI(Builder.getInt64Ty(), 4, "omp.maptype");
7971 CurMapType->addIncoming(AllocMapType, AllocBB);
7972 CurMapType->addIncoming(ToMapType, ToBB);
7973 CurMapType->addIncoming(FromMapType, FromBB);
7974 CurMapType->addIncoming(MemberMapType, ToElseBB);
7975
7976 Value *OffloadingArgs[] = {MapperHandle, CurBaseArg, CurBeginArg,
7977 CurSizeArg, CurMapType, CurNameArg};
7978 Function *ChildMapperFn = nullptr;
7979 if (CustomMapperCB && CustomMapperCB(I, &ChildMapperFn)) {
7980 // Call the corresponding mapper function.
7981 Builder.CreateCall(ChildMapperFn, OffloadingArgs)->setDoesNotThrow();
7982 } else {
7983 // Call the runtime API __tgt_push_mapper_component to fill up the runtime
7984 // data structure.
7986 getOrCreateRuntimeFunction(M, OMPRTL___tgt_push_mapper_component),
7987 OffloadingArgs);
7988 }
7989 }
7990
7991 // Update the pointer to point to the next element that needs to be mapped,
7992 // and check whether we have mapped all elements.
7993 Value *PtrNext = Builder.CreateConstGEP1_32(ElemTy, PtrPHI, /*Idx0=*/1,
7994 "omp.arraymap.next");
7995 PtrPHI->addIncoming(PtrNext, LastBB);
7996 Value *IsDone = Builder.CreateICmpEQ(PtrNext, PtrEnd, "omp.arraymap.isdone");
7997 BasicBlock *ExitBB = BasicBlock::Create(M.getContext(), "omp.arraymap.exit");
7998 Builder.CreateCondBr(IsDone, ExitBB, BodyBB);
7999
8000 emitBlock(ExitBB, MapperFn);
8001 // Emit array deletion if this is an array section and \p MapType indicates
8002 // that deletion is required.
8003 emitUDMapperArrayInitOrDel(MapperFn, MapperHandle, BaseIn, BeginIn, Size,
8004 MapType, MapName, ElementSize, DoneBB,
8005 /*IsInit=*/false);
8006
8007 // Emit the function exit block.
8008 emitBlock(DoneBB, MapperFn, /*IsFinished=*/true);
8009
8011 Builder.restoreIP(SavedIP);
8012 return MapperFn;
8013}
8014
8016 InsertPointTy AllocaIP, InsertPointTy CodeGenIP, MapInfosTy &CombinedInfo,
8017 TargetDataInfo &Info, bool IsNonContiguous,
8018 function_ref<void(unsigned int, Value *)> DeviceAddrCB,
8019 function_ref<Value *(unsigned int)> CustomMapperCB) {
8020
8021 // Reset the array information.
8022 Info.clearArrayInfo();
8023 Info.NumberOfPtrs = CombinedInfo.BasePointers.size();
8024
8025 if (Info.NumberOfPtrs == 0)
8026 return;
8027
8028 Builder.restoreIP(AllocaIP);
8029 // Detect if we have any capture size requiring runtime evaluation of the
8030 // size so that a constant array could be eventually used.
8031 ArrayType *PointerArrayType =
8032 ArrayType::get(Builder.getPtrTy(), Info.NumberOfPtrs);
8033
8034 Info.RTArgs.BasePointersArray = Builder.CreateAlloca(
8035 PointerArrayType, /* ArraySize = */ nullptr, ".offload_baseptrs");
8036
8037 Info.RTArgs.PointersArray = Builder.CreateAlloca(
8038 PointerArrayType, /* ArraySize = */ nullptr, ".offload_ptrs");
8039 AllocaInst *MappersArray = Builder.CreateAlloca(
8040 PointerArrayType, /* ArraySize = */ nullptr, ".offload_mappers");
8041 Info.RTArgs.MappersArray = MappersArray;
8042
8043 // If we don't have any VLA types or other types that require runtime
8044 // evaluation, we can use a constant array for the map sizes, otherwise we
8045 // need to fill up the arrays as we do for the pointers.
8046 Type *Int64Ty = Builder.getInt64Ty();
8047 SmallVector<Constant *> ConstSizes(CombinedInfo.Sizes.size(),
8048 ConstantInt::get(Int64Ty, 0));
8049 SmallBitVector RuntimeSizes(CombinedInfo.Sizes.size());
8050 for (unsigned I = 0, E = CombinedInfo.Sizes.size(); I < E; ++I) {
8051 if (auto *CI = dyn_cast<Constant>(CombinedInfo.Sizes[I])) {
8052 if (!isa<ConstantExpr>(CI) && !isa<GlobalValue>(CI)) {
8053 if (IsNonContiguous &&
8054 static_cast<std::underlying_type_t<OpenMPOffloadMappingFlags>>(
8055 CombinedInfo.Types[I] &
8056 OpenMPOffloadMappingFlags::OMP_MAP_NON_CONTIG))
8057 ConstSizes[I] =
8058 ConstantInt::get(Int64Ty, CombinedInfo.NonContigInfo.Dims[I]);
8059 else
8060 ConstSizes[I] = CI;
8061 continue;
8062 }
8063 }
8064 RuntimeSizes.set(I);
8065 }
8066
8067 if (RuntimeSizes.all()) {
8068 ArrayType *SizeArrayType = ArrayType::get(Int64Ty, Info.NumberOfPtrs);
8069 Info.RTArgs.SizesArray = Builder.CreateAlloca(
8070 SizeArrayType, /* ArraySize = */ nullptr, ".offload_sizes");
8071 Builder.restoreIP(CodeGenIP);
8072 } else {
8073 auto *SizesArrayInit = ConstantArray::get(
8074 ArrayType::get(Int64Ty, ConstSizes.size()), ConstSizes);
8075 std::string Name = createPlatformSpecificName({"offload_sizes"});
8076 auto *SizesArrayGbl =
8077 new GlobalVariable(M, SizesArrayInit->getType(), /*isConstant=*/true,
8078 GlobalValue::PrivateLinkage, SizesArrayInit, Name);
8079 SizesArrayGbl->setUnnamedAddr(GlobalValue::UnnamedAddr::Global);
8080
8081 if (!RuntimeSizes.any()) {
8082 Info.RTArgs.SizesArray = SizesArrayGbl;
8083 } else {
8084 unsigned IndexSize = M.getDataLayout().getIndexSizeInBits(0);
8085 Align OffloadSizeAlign = M.getDataLayout().getABIIntegerTypeAlignment(64);
8086 ArrayType *SizeArrayType = ArrayType::get(Int64Ty, Info.NumberOfPtrs);
8088 SizeArrayType, /* ArraySize = */ nullptr, ".offload_sizes");
8089 Buffer->setAlignment(OffloadSizeAlign);
8090 Builder.restoreIP(CodeGenIP);
8092 Buffer, M.getDataLayout().getPrefTypeAlign(Buffer->getType()),
8093 SizesArrayGbl, OffloadSizeAlign,
8095 IndexSize,
8096 Buffer->getAllocationSize(M.getDataLayout())->getFixedValue()));
8097
8098 Info.RTArgs.SizesArray = Buffer;
8099 }
8100 Builder.restoreIP(CodeGenIP);
8101 }
8102
8103 // The map types are always constant so we don't need to generate code to
8104 // fill arrays. Instead, we create an array constant.
8106 for (auto mapFlag : CombinedInfo.Types)
8107 Mapping.push_back(
8108 static_cast<std::underlying_type_t<OpenMPOffloadMappingFlags>>(
8109 mapFlag));
8110 std::string MaptypesName = createPlatformSpecificName({"offload_maptypes"});
8111 auto *MapTypesArrayGbl = createOffloadMaptypes(Mapping, MaptypesName);
8112 Info.RTArgs.MapTypesArray = MapTypesArrayGbl;
8113
8114 // The information types are only built if provided.
8115 if (!CombinedInfo.Names.empty()) {
8116 std::string MapnamesName = createPlatformSpecificName({"offload_mapnames"});
8117 auto *MapNamesArrayGbl =
8118 createOffloadMapnames(CombinedInfo.Names, MapnamesName);
8119 Info.RTArgs.MapNamesArray = MapNamesArrayGbl;
8120 Info.EmitDebug = true;
8121 } else {
8122 Info.RTArgs.MapNamesArray =
8124 Info.EmitDebug = false;
8125 }
8126
8127 // If there's a present map type modifier, it must not be applied to the end
8128 // of a region, so generate a separate map type array in that case.
8129 if (Info.separateBeginEndCalls()) {
8130 bool EndMapTypesDiffer = false;
8131 for (uint64_t &Type : Mapping) {
8132 if (Type & static_cast<std::underlying_type_t<OpenMPOffloadMappingFlags>>(
8133 OpenMPOffloadMappingFlags::OMP_MAP_PRESENT)) {
8134 Type &= ~static_cast<std::underlying_type_t<OpenMPOffloadMappingFlags>>(
8135 OpenMPOffloadMappingFlags::OMP_MAP_PRESENT);
8136 EndMapTypesDiffer = true;
8137 }
8138 }
8139 if (EndMapTypesDiffer) {
8140 MapTypesArrayGbl = createOffloadMaptypes(Mapping, MaptypesName);
8141 Info.RTArgs.MapTypesArrayEnd = MapTypesArrayGbl;
8142 }
8143 }
8144
8145 PointerType *PtrTy = Builder.getPtrTy();
8146 for (unsigned I = 0; I < Info.NumberOfPtrs; ++I) {
8147 Value *BPVal = CombinedInfo.BasePointers[I];
8149 ArrayType::get(PtrTy, Info.NumberOfPtrs), Info.RTArgs.BasePointersArray,
8150 0, I);
8151 Builder.CreateAlignedStore(BPVal, BP,
8153
8154 if (Info.requiresDevicePointerInfo()) {
8155 if (CombinedInfo.DevicePointers[I] == DeviceInfoTy::Pointer) {
8156 CodeGenIP = Builder.saveIP();
8157 Builder.restoreIP(AllocaIP);
8158 Info.DevicePtrInfoMap[BPVal] = {BP, Builder.CreateAlloca(PtrTy)};
8159 Builder.restoreIP(CodeGenIP);
8160 if (DeviceAddrCB)
8161 DeviceAddrCB(I, Info.DevicePtrInfoMap[BPVal].second);
8162 } else if (CombinedInfo.DevicePointers[I] == DeviceInfoTy::Address) {
8163 Info.DevicePtrInfoMap[BPVal] = {BP, BP};
8164 if (DeviceAddrCB)
8165 DeviceAddrCB(I, BP);
8166 }
8167 }
8168
8169 Value *PVal = CombinedInfo.Pointers[I];
8171 ArrayType::get(PtrTy, Info.NumberOfPtrs), Info.RTArgs.PointersArray, 0,
8172 I);
8173 // TODO: Check alignment correct.
8176
8177 if (RuntimeSizes.test(I)) {
8179 ArrayType::get(Int64Ty, Info.NumberOfPtrs), Info.RTArgs.SizesArray,
8180 /*Idx0=*/0,
8181 /*Idx1=*/I);
8183 Int64Ty,
8184 /*isSigned=*/true),
8185 S, M.getDataLayout().getPrefTypeAlign(PtrTy));
8186 }
8187 // Fill up the mapper array.
8188 unsigned IndexSize = M.getDataLayout().getIndexSizeInBits(0);
8189 Value *MFunc = ConstantPointerNull::get(PtrTy);
8190 if (CustomMapperCB)
8191 if (Value *CustomMFunc = CustomMapperCB(I))
8192 MFunc = Builder.CreatePointerCast(CustomMFunc, PtrTy);
8194 MappersArray->getAllocatedType(), MappersArray,
8195 {Builder.getIntN(IndexSize, 0), Builder.getIntN(IndexSize, I)});
8197 MFunc, MAddr, M.getDataLayout().getPrefTypeAlign(MAddr->getType()));
8198 }
8199
8200 if (!IsNonContiguous || CombinedInfo.NonContigInfo.Offsets.empty() ||
8201 Info.NumberOfPtrs == 0)
8202 return;
8203 emitNonContiguousDescriptor(AllocaIP, CodeGenIP, CombinedInfo, Info);
8204}
8205
8208
8209 if (!CurBB || CurBB->getTerminator()) {
8210 // If there is no insert point or the previous block is already
8211 // terminated, don't touch it.
8212 } else {
8213 // Otherwise, create a fall-through branch.
8215 }
8216
8218}
8219
8221 bool IsFinished) {
8223
8224 // Fall out of the current block (if necessary).
8225 emitBranch(BB);
8226
8227 if (IsFinished && BB->use_empty()) {
8228 BB->eraseFromParent();
8229 return;
8230 }
8231
8232 // Place the block after the current block, if possible, or else at
8233 // the end of the function.
8234 if (CurBB && CurBB->getParent())
8235 CurFn->insert(std::next(CurBB->getIterator()), BB);
8236 else
8237 CurFn->insert(CurFn->end(), BB);
8239}
8240
8242 BodyGenCallbackTy ElseGen,
8243 InsertPointTy AllocaIP) {
8244 // If the condition constant folds and can be elided, try to avoid emitting
8245 // the condition and the dead arm of the if/else.
8246 if (auto *CI = dyn_cast<ConstantInt>(Cond)) {
8247 auto CondConstant = CI->getSExtValue();
8248 if (CondConstant)
8249 return ThenGen(AllocaIP, Builder.saveIP());
8250
8251 return ElseGen(AllocaIP, Builder.saveIP());
8252 }
8253
8255
8256 // Otherwise, the condition did not fold, or we couldn't elide it. Just
8257 // emit the conditional branch.
8258 BasicBlock *ThenBlock = BasicBlock::Create(M.getContext(), "omp_if.then");
8259 BasicBlock *ElseBlock = BasicBlock::Create(M.getContext(), "omp_if.else");
8260 BasicBlock *ContBlock = BasicBlock::Create(M.getContext(), "omp_if.end");
8261 Builder.CreateCondBr(Cond, ThenBlock, ElseBlock);
8262 // Emit the 'then' code.
8263 emitBlock(ThenBlock, CurFn);
8264 if (Error Err = ThenGen(AllocaIP, Builder.saveIP()))
8265 return Err;
8266 emitBranch(ContBlock);
8267 // Emit the 'else' code if present.
8268 // There is no need to emit line number for unconditional branch.
8269 emitBlock(ElseBlock, CurFn);
8270 if (Error Err = ElseGen(AllocaIP, Builder.saveIP()))
8271 return Err;
8272 // There is no need to emit line number for unconditional branch.
8273 emitBranch(ContBlock);
8274 // Emit the continuation block for code after the if.
8275 emitBlock(ContBlock, CurFn, /*IsFinished=*/true);
8276 return Error::success();
8277}
8278
8279bool OpenMPIRBuilder::checkAndEmitFlushAfterAtomic(
8280 const LocationDescription &Loc, llvm::AtomicOrdering AO, AtomicKind AK) {
8283 "Unexpected Atomic Ordering.");
8284
8285 bool Flush = false;
8287
8288 switch (AK) {
8289 case Read:
8292 FlushAO = AtomicOrdering::Acquire;
8293 Flush = true;
8294 }
8295 break;
8296 case Write:
8297 case Compare:
8298 case Update:
8301 FlushAO = AtomicOrdering::Release;
8302 Flush = true;
8303 }
8304 break;
8305 case Capture:
8306 switch (AO) {
8308 FlushAO = AtomicOrdering::Acquire;
8309 Flush = true;
8310 break;
8312 FlushAO = AtomicOrdering::Release;
8313 Flush = true;
8314 break;
8318 Flush = true;
8319 break;
8320 default:
8321 // do nothing - leave silently.
8322 break;
8323 }
8324 }
8325
8326 if (Flush) {
8327 // Currently Flush RT call still doesn't take memory_ordering, so for when
8328 // that happens, this tries to do the resolution of which atomic ordering
8329 // to use with but issue the flush call
8330 // TODO: pass `FlushAO` after memory ordering support is added
8331 (void)FlushAO;
8332 emitFlush(Loc);
8333 }
8334
8335 // for AO == AtomicOrdering::Monotonic and all other case combinations
8336 // do nothing
8337 return Flush;
8338}
8339
8343 AtomicOrdering AO) {
8344 if (!updateToLocation(Loc))
8345 return Loc.IP;
8346
8347 assert(X.Var->getType()->isPointerTy() &&
8348 "OMP Atomic expects a pointer to target memory");
8349 Type *XElemTy = X.ElemTy;
8350 assert((XElemTy->isFloatingPointTy() || XElemTy->isIntegerTy() ||
8351 XElemTy->isPointerTy() || XElemTy->isStructTy()) &&
8352 "OMP atomic read expected a scalar type");
8353
8354 Value *XRead = nullptr;
8355
8356 if (XElemTy->isIntegerTy()) {
8357 LoadInst *XLD =
8358 Builder.CreateLoad(XElemTy, X.Var, X.IsVolatile, "omp.atomic.read");
8359 XLD->setAtomic(AO);
8360 XRead = cast<Value>(XLD);
8361 } else if (XElemTy->isStructTy()) {
8362 // FIXME: Add checks to ensure __atomic_load is emitted iff the
8363 // target does not support `atomicrmw` of the size of the struct
8364 LoadInst *OldVal = Builder.CreateLoad(XElemTy, X.Var, "omp.atomic.read");
8365 OldVal->setAtomic(AO);
8366 const DataLayout &LoadDL = OldVal->getModule()->getDataLayout();
8367 unsigned LoadSize =
8368 LoadDL.getTypeStoreSize(OldVal->getPointerOperand()->getType());
8369 OpenMPIRBuilder::AtomicInfo atomicInfo(
8370 &Builder, XElemTy, LoadSize * 8, LoadSize * 8, OldVal->getAlign(),
8371 OldVal->getAlign(), true /* UseLibcall */, X.Var);
8372 auto AtomicLoadRes = atomicInfo.EmitAtomicLoadLibcall(AO);
8373 XRead = AtomicLoadRes.first;
8374 OldVal->eraseFromParent();
8375 } else {
8376 // We need to perform atomic op as integer
8377 IntegerType *IntCastTy =
8379 LoadInst *XLoad =
8380 Builder.CreateLoad(IntCastTy, X.Var, X.IsVolatile, "omp.atomic.load");
8381 XLoad->setAtomic(AO);
8382 if (XElemTy->isFloatingPointTy()) {
8383 XRead = Builder.CreateBitCast(XLoad, XElemTy, "atomic.flt.cast");
8384 } else {
8385 XRead = Builder.CreateIntToPtr(XLoad, XElemTy, "atomic.ptr.cast");
8386 }
8387 }
8388 checkAndEmitFlushAfterAtomic(Loc, AO, AtomicKind::Read);
8389 Builder.CreateStore(XRead, V.Var, V.IsVolatile);
8390 return Builder.saveIP();
8391}
8392
8395 AtomicOpValue &X, Value *Expr,
8396 AtomicOrdering AO) {
8397 if (!updateToLocation(Loc))
8398 return Loc.IP;
8399
8400 assert(X.Var->getType()->isPointerTy() &&
8401 "OMP Atomic expects a pointer to target memory");
8402 Type *XElemTy = X.ElemTy;
8403 assert((XElemTy->isFloatingPointTy() || XElemTy->isIntegerTy() ||
8404 XElemTy->isPointerTy()) &&
8405 "OMP atomic write expected a scalar type");
8406
8407 if (XElemTy->isIntegerTy()) {
8408 StoreInst *XSt = Builder.CreateStore(Expr, X.Var, X.IsVolatile);
8409 XSt->setAtomic(AO);
8410 } else {
8411 // We need to bitcast and perform atomic op as integers
8412 IntegerType *IntCastTy =
8414 Value *ExprCast =
8415 Builder.CreateBitCast(Expr, IntCastTy, "atomic.src.int.cast");
8416 StoreInst *XSt = Builder.CreateStore(ExprCast, X.Var, X.IsVolatile);
8417 XSt->setAtomic(AO);
8418 }
8419
8420 checkAndEmitFlushAfterAtomic(Loc, AO, AtomicKind::Write);
8421 return Builder.saveIP();
8422}
8423
8425 const LocationDescription &Loc, InsertPointTy AllocaIP, AtomicOpValue &X,
8426 Value *Expr, AtomicOrdering AO, AtomicRMWInst::BinOp RMWOp,
8427 AtomicUpdateCallbackTy &UpdateOp, bool IsXBinopExpr) {
8428 assert(!isConflictIP(Loc.IP, AllocaIP) && "IPs must not be ambiguous");
8429 if (!updateToLocation(Loc))
8430 return Loc.IP;
8431
8432 LLVM_DEBUG({
8433 Type *XTy = X.Var->getType();
8434 assert(XTy->isPointerTy() &&
8435 "OMP Atomic expects a pointer to target memory");
8436 Type *XElemTy = X.ElemTy;
8437 assert((XElemTy->isFloatingPointTy() || XElemTy->isIntegerTy() ||
8438 XElemTy->isPointerTy()) &&
8439 "OMP atomic update expected a scalar type");
8440 assert((RMWOp != AtomicRMWInst::Max) && (RMWOp != AtomicRMWInst::Min) &&
8441 (RMWOp != AtomicRMWInst::UMax) && (RMWOp != AtomicRMWInst::UMin) &&
8442 "OpenMP atomic does not support LT or GT operations");
8443 });
8444
8446 emitAtomicUpdate(AllocaIP, X.Var, X.ElemTy, Expr, AO, RMWOp, UpdateOp,
8447 X.IsVolatile, IsXBinopExpr);
8448 if (!AtomicResult)
8449 return AtomicResult.takeError();
8450 checkAndEmitFlushAfterAtomic(Loc, AO, AtomicKind::Update);
8451 return Builder.saveIP();
8452}
8453
8454// FIXME: Duplicating AtomicExpand
8455Value *OpenMPIRBuilder::emitRMWOpAsInstruction(Value *Src1, Value *Src2,
8456 AtomicRMWInst::BinOp RMWOp) {
8457 switch (RMWOp) {
8458 case AtomicRMWInst::Add:
8459 return Builder.CreateAdd(Src1, Src2);
8460 case AtomicRMWInst::Sub:
8461 return Builder.CreateSub(Src1, Src2);
8462 case AtomicRMWInst::And:
8463 return Builder.CreateAnd(Src1, Src2);
8465 return Builder.CreateNeg(Builder.CreateAnd(Src1, Src2));
8466 case AtomicRMWInst::Or:
8467 return Builder.CreateOr(Src1, Src2);
8468 case AtomicRMWInst::Xor:
8469 return Builder.CreateXor(Src1, Src2);
8474 case AtomicRMWInst::Max:
8475 case AtomicRMWInst::Min:
8484 llvm_unreachable("Unsupported atomic update operation");
8485 }
8486 llvm_unreachable("Unsupported atomic update operation");
8487}
8488
8489Expected<std::pair<Value *, Value *>> OpenMPIRBuilder::emitAtomicUpdate(
8490 InsertPointTy AllocaIP, Value *X, Type *XElemTy, Value *Expr,
8492 AtomicUpdateCallbackTy &UpdateOp, bool VolatileX, bool IsXBinopExpr) {
8493 // TODO: handle the case where XElemTy is not byte-sized or not a power of 2
8494 // or a complex datatype.
8495 bool emitRMWOp = false;
8496 switch (RMWOp) {
8497 case AtomicRMWInst::Add:
8498 case AtomicRMWInst::And:
8500 case AtomicRMWInst::Or:
8501 case AtomicRMWInst::Xor:
8503 emitRMWOp = XElemTy;
8504 break;
8505 case AtomicRMWInst::Sub:
8506 emitRMWOp = (IsXBinopExpr && XElemTy);
8507 break;
8508 default:
8509 emitRMWOp = false;
8510 }
8511 emitRMWOp &= XElemTy->isIntegerTy();
8512
8513 std::pair<Value *, Value *> Res;
8514 if (emitRMWOp) {
8515 Res.first = Builder.CreateAtomicRMW(RMWOp, X, Expr, llvm::MaybeAlign(), AO);
8516 // not needed except in case of postfix captures. Generate anyway for
8517 // consistency with the else part. Will be removed with any DCE pass.
8518 // AtomicRMWInst::Xchg does not have a coressponding instruction.
8519 if (RMWOp == AtomicRMWInst::Xchg)
8520 Res.second = Res.first;
8521 else
8522 Res.second = emitRMWOpAsInstruction(Res.first, Expr, RMWOp);
8523 } else if (RMWOp == llvm::AtomicRMWInst::BinOp::BAD_BINOP &&
8524 XElemTy->isStructTy()) {
8525 LoadInst *OldVal =
8526 Builder.CreateLoad(XElemTy, X, X->getName() + ".atomic.load");
8527 OldVal->setAtomic(AO);
8528 const DataLayout &LoadDL = OldVal->getModule()->getDataLayout();
8529 unsigned LoadSize =
8530 LoadDL.getTypeStoreSize(OldVal->getPointerOperand()->getType());
8531
8532 OpenMPIRBuilder::AtomicInfo atomicInfo(
8533 &Builder, XElemTy, LoadSize * 8, LoadSize * 8, OldVal->getAlign(),
8534 OldVal->getAlign(), true /* UseLibcall */, X);
8535 auto AtomicLoadRes = atomicInfo.EmitAtomicLoadLibcall(AO);
8537 Instruction *CurBBTI = CurBB->getTerminator();
8538 CurBBTI = CurBBTI ? CurBBTI : Builder.CreateUnreachable();
8539 BasicBlock *ExitBB =
8540 CurBB->splitBasicBlock(CurBBTI, X->getName() + ".atomic.exit");
8541 BasicBlock *ContBB = CurBB->splitBasicBlock(CurBB->getTerminator(),
8542 X->getName() + ".atomic.cont");
8543 ContBB->getTerminator()->eraseFromParent();
8544 Builder.restoreIP(AllocaIP);
8545 AllocaInst *NewAtomicAddr = Builder.CreateAlloca(XElemTy);
8546 NewAtomicAddr->setName(X->getName() + "x.new.val");
8547 Builder.SetInsertPoint(ContBB);
8548 llvm::PHINode *PHI = Builder.CreatePHI(OldVal->getType(), 2);
8549 PHI->addIncoming(AtomicLoadRes.first, CurBB);
8550 Value *OldExprVal = PHI;
8551 Expected<Value *> CBResult = UpdateOp(OldExprVal, Builder);
8552 if (!CBResult)
8553 return CBResult.takeError();
8554 Value *Upd = *CBResult;
8555 Builder.CreateStore(Upd, NewAtomicAddr);
8558 auto Result = atomicInfo.EmitAtomicCompareExchangeLibcall(
8559 AtomicLoadRes.second, NewAtomicAddr, AO, Failure);
8560 LoadInst *PHILoad = Builder.CreateLoad(XElemTy, Result.first);
8561 PHI->addIncoming(PHILoad, Builder.GetInsertBlock());
8562 Builder.CreateCondBr(Result.second, ExitBB, ContBB);
8563 OldVal->eraseFromParent();
8564 Res.first = OldExprVal;
8565 Res.second = Upd;
8566
8567 if (UnreachableInst *ExitTI =
8568 dyn_cast<UnreachableInst>(ExitBB->getTerminator())) {
8569 CurBBTI->eraseFromParent();
8570 Builder.SetInsertPoint(ExitBB);
8571 } else {
8572 Builder.SetInsertPoint(ExitTI);
8573 }
8574 } else {
8575 IntegerType *IntCastTy =
8577 LoadInst *OldVal =
8578 Builder.CreateLoad(IntCastTy, X, X->getName() + ".atomic.load");
8579 OldVal->setAtomic(AO);
8580 // CurBB
8581 // | /---\
8582 // ContBB |
8583 // | \---/
8584 // ExitBB
8586 Instruction *CurBBTI = CurBB->getTerminator();
8587 CurBBTI = CurBBTI ? CurBBTI : Builder.CreateUnreachable();
8588 BasicBlock *ExitBB =
8589 CurBB->splitBasicBlock(CurBBTI, X->getName() + ".atomic.exit");
8590 BasicBlock *ContBB = CurBB->splitBasicBlock(CurBB->getTerminator(),
8591 X->getName() + ".atomic.cont");
8592 ContBB->getTerminator()->eraseFromParent();
8593 Builder.restoreIP(AllocaIP);
8594 AllocaInst *NewAtomicAddr = Builder.CreateAlloca(XElemTy);
8595 NewAtomicAddr->setName(X->getName() + "x.new.val");
8596 Builder.SetInsertPoint(ContBB);
8597 llvm::PHINode *PHI = Builder.CreatePHI(OldVal->getType(), 2);
8598 PHI->addIncoming(OldVal, CurBB);
8599 bool IsIntTy = XElemTy->isIntegerTy();
8600 Value *OldExprVal = PHI;
8601 if (!IsIntTy) {
8602 if (XElemTy->isFloatingPointTy()) {
8603 OldExprVal = Builder.CreateBitCast(PHI, XElemTy,
8604 X->getName() + ".atomic.fltCast");
8605 } else {
8606 OldExprVal = Builder.CreateIntToPtr(PHI, XElemTy,
8607 X->getName() + ".atomic.ptrCast");
8608 }
8609 }
8610
8611 Expected<Value *> CBResult = UpdateOp(OldExprVal, Builder);
8612 if (!CBResult)
8613 return CBResult.takeError();
8614 Value *Upd = *CBResult;
8615 Builder.CreateStore(Upd, NewAtomicAddr);
8616 LoadInst *DesiredVal = Builder.CreateLoad(IntCastTy, NewAtomicAddr);
8620 X, PHI, DesiredVal, llvm::MaybeAlign(), AO, Failure);
8621 Result->setVolatile(VolatileX);
8622 Value *PreviousVal = Builder.CreateExtractValue(Result, /*Idxs=*/0);
8623 Value *SuccessFailureVal = Builder.CreateExtractValue(Result, /*Idxs=*/1);
8624 PHI->addIncoming(PreviousVal, Builder.GetInsertBlock());
8625 Builder.CreateCondBr(SuccessFailureVal, ExitBB, ContBB);
8626
8627 Res.first = OldExprVal;
8628 Res.second = Upd;
8629
8630 // set Insertion point in exit block
8631 if (UnreachableInst *ExitTI =
8632 dyn_cast<UnreachableInst>(ExitBB->getTerminator())) {
8633 CurBBTI->eraseFromParent();
8634 Builder.SetInsertPoint(ExitBB);
8635 } else {
8636 Builder.SetInsertPoint(ExitTI);
8637 }
8638 }
8639
8640 return Res;
8641}
8642
8644 const LocationDescription &Loc, InsertPointTy AllocaIP, AtomicOpValue &X,
8645 AtomicOpValue &V, Value *Expr, AtomicOrdering AO,
8647 bool UpdateExpr, bool IsPostfixUpdate, bool IsXBinopExpr) {
8648 if (!updateToLocation(Loc))
8649 return Loc.IP;
8650
8651 LLVM_DEBUG({
8652 Type *XTy = X.Var->getType();
8653 assert(XTy->isPointerTy() &&
8654 "OMP Atomic expects a pointer to target memory");
8655 Type *XElemTy = X.ElemTy;
8656 assert((XElemTy->isFloatingPointTy() || XElemTy->isIntegerTy() ||
8657 XElemTy->isPointerTy()) &&
8658 "OMP atomic capture expected a scalar type");
8659 assert((RMWOp != AtomicRMWInst::Max) && (RMWOp != AtomicRMWInst::Min) &&
8660 "OpenMP atomic does not support LT or GT operations");
8661 });
8662
8663 // If UpdateExpr is 'x' updated with some `expr` not based on 'x',
8664 // 'x' is simply atomically rewritten with 'expr'.
8665 AtomicRMWInst::BinOp AtomicOp = (UpdateExpr ? RMWOp : AtomicRMWInst::Xchg);
8667 emitAtomicUpdate(AllocaIP, X.Var, X.ElemTy, Expr, AO, AtomicOp, UpdateOp,
8668 X.IsVolatile, IsXBinopExpr);
8669 if (!AtomicResult)
8670 return AtomicResult.takeError();
8671 Value *CapturedVal =
8672 (IsPostfixUpdate ? AtomicResult->first : AtomicResult->second);
8673 Builder.CreateStore(CapturedVal, V.Var, V.IsVolatile);
8674
8675 checkAndEmitFlushAfterAtomic(Loc, AO, AtomicKind::Capture);
8676 return Builder.saveIP();
8677}
8678
8682 omp::OMPAtomicCompareOp Op, bool IsXBinopExpr, bool IsPostfixUpdate,
8683 bool IsFailOnly) {
8684
8686 return createAtomicCompare(Loc, X, V, R, E, D, AO, Op, IsXBinopExpr,
8687 IsPostfixUpdate, IsFailOnly, Failure);
8688}
8689
8693 omp::OMPAtomicCompareOp Op, bool IsXBinopExpr, bool IsPostfixUpdate,
8694 bool IsFailOnly, AtomicOrdering Failure) {
8695
8696 if (!updateToLocation(Loc))
8697 return Loc.IP;
8698
8699 assert(X.Var->getType()->isPointerTy() &&
8700 "OMP atomic expects a pointer to target memory");
8701 // compare capture
8702 if (V.Var) {
8703 assert(V.Var->getType()->isPointerTy() && "v.var must be of pointer type");
8704 assert(V.ElemTy == X.ElemTy && "x and v must be of same type");
8705 }
8706
8707 bool IsInteger = E->getType()->isIntegerTy();
8708
8709 if (Op == OMPAtomicCompareOp::EQ) {
8710 AtomicCmpXchgInst *Result = nullptr;
8711 if (!IsInteger) {
8712 IntegerType *IntCastTy =
8713 IntegerType::get(M.getContext(), X.ElemTy->getScalarSizeInBits());
8714 Value *EBCast = Builder.CreateBitCast(E, IntCastTy);
8715 Value *DBCast = Builder.CreateBitCast(D, IntCastTy);
8716 Result = Builder.CreateAtomicCmpXchg(X.Var, EBCast, DBCast, MaybeAlign(),
8717 AO, Failure);
8718 } else {
8719 Result =
8720 Builder.CreateAtomicCmpXchg(X.Var, E, D, MaybeAlign(), AO, Failure);
8721 }
8722
8723 if (V.Var) {
8724 Value *OldValue = Builder.CreateExtractValue(Result, /*Idxs=*/0);
8725 if (!IsInteger)
8726 OldValue = Builder.CreateBitCast(OldValue, X.ElemTy);
8727 assert(OldValue->getType() == V.ElemTy &&
8728 "OldValue and V must be of same type");
8729 if (IsPostfixUpdate) {
8730 Builder.CreateStore(OldValue, V.Var, V.IsVolatile);
8731 } else {
8732 Value *SuccessOrFail = Builder.CreateExtractValue(Result, /*Idxs=*/1);
8733 if (IsFailOnly) {
8734 // CurBB----
8735 // | |
8736 // v |
8737 // ContBB |
8738 // | |
8739 // v |
8740 // ExitBB <-
8741 //
8742 // where ContBB only contains the store of old value to 'v'.
8744 Instruction *CurBBTI = CurBB->getTerminator();
8745 CurBBTI = CurBBTI ? CurBBTI : Builder.CreateUnreachable();
8746 BasicBlock *ExitBB = CurBB->splitBasicBlock(
8747 CurBBTI, X.Var->getName() + ".atomic.exit");
8748 BasicBlock *ContBB = CurBB->splitBasicBlock(
8749 CurBB->getTerminator(), X.Var->getName() + ".atomic.cont");
8750 ContBB->getTerminator()->eraseFromParent();
8751 CurBB->getTerminator()->eraseFromParent();
8752
8753 Builder.CreateCondBr(SuccessOrFail, ExitBB, ContBB);
8754
8755 Builder.SetInsertPoint(ContBB);
8756 Builder.CreateStore(OldValue, V.Var);
8757 Builder.CreateBr(ExitBB);
8758
8759 if (UnreachableInst *ExitTI =
8760 dyn_cast<UnreachableInst>(ExitBB->getTerminator())) {
8761 CurBBTI->eraseFromParent();
8762 Builder.SetInsertPoint(ExitBB);
8763 } else {
8764 Builder.SetInsertPoint(ExitTI);
8765 }
8766 } else {
8767 Value *CapturedValue =
8768 Builder.CreateSelect(SuccessOrFail, E, OldValue);
8769 Builder.CreateStore(CapturedValue, V.Var, V.IsVolatile);
8770 }
8771 }
8772 }
8773 // The comparison result has to be stored.
8774 if (R.Var) {
8775 assert(R.Var->getType()->isPointerTy() &&
8776 "r.var must be of pointer type");
8777 assert(R.ElemTy->isIntegerTy() && "r must be of integral type");
8778
8779 Value *SuccessFailureVal = Builder.CreateExtractValue(Result, /*Idxs=*/1);
8780 Value *ResultCast = R.IsSigned
8781 ? Builder.CreateSExt(SuccessFailureVal, R.ElemTy)
8782 : Builder.CreateZExt(SuccessFailureVal, R.ElemTy);
8783 Builder.CreateStore(ResultCast, R.Var, R.IsVolatile);
8784 }
8785 } else {
8786 assert((Op == OMPAtomicCompareOp::MAX || Op == OMPAtomicCompareOp::MIN) &&
8787 "Op should be either max or min at this point");
8788 assert(!IsFailOnly && "IsFailOnly is only valid when the comparison is ==");
8789
8790 // Reverse the ordop as the OpenMP forms are different from LLVM forms.
8791 // Let's take max as example.
8792 // OpenMP form:
8793 // x = x > expr ? expr : x;
8794 // LLVM form:
8795 // *ptr = *ptr > val ? *ptr : val;
8796 // We need to transform to LLVM form.
8797 // x = x <= expr ? x : expr;
8799 if (IsXBinopExpr) {
8800 if (IsInteger) {
8801 if (X.IsSigned)
8802 NewOp = Op == OMPAtomicCompareOp::MAX ? AtomicRMWInst::Min
8804 else
8805 NewOp = Op == OMPAtomicCompareOp::MAX ? AtomicRMWInst::UMin
8807 } else {
8808 NewOp = Op == OMPAtomicCompareOp::MAX ? AtomicRMWInst::FMin
8810 }
8811 } else {
8812 if (IsInteger) {
8813 if (X.IsSigned)
8814 NewOp = Op == OMPAtomicCompareOp::MAX ? AtomicRMWInst::Max
8816 else
8817 NewOp = Op == OMPAtomicCompareOp::MAX ? AtomicRMWInst::UMax
8819 } else {
8820 NewOp = Op == OMPAtomicCompareOp::MAX ? AtomicRMWInst::FMax
8822 }
8823 }
8824
8825 AtomicRMWInst *OldValue =
8826 Builder.CreateAtomicRMW(NewOp, X.Var, E, MaybeAlign(), AO);
8827 if (V.Var) {
8828 Value *CapturedValue = nullptr;
8829 if (IsPostfixUpdate) {
8830 CapturedValue = OldValue;
8831 } else {
8832 CmpInst::Predicate Pred;
8833 switch (NewOp) {
8834 case AtomicRMWInst::Max:
8835 Pred = CmpInst::ICMP_SGT;
8836 break;
8838 Pred = CmpInst::ICMP_UGT;
8839 break;
8841 Pred = CmpInst::FCMP_OGT;
8842 break;
8843 case AtomicRMWInst::Min:
8844 Pred = CmpInst::ICMP_SLT;
8845 break;
8847 Pred = CmpInst::ICMP_ULT;
8848 break;
8850 Pred = CmpInst::FCMP_OLT;
8851 break;
8852 default:
8853 llvm_unreachable("unexpected comparison op");
8854 }
8855 Value *NonAtomicCmp = Builder.CreateCmp(Pred, OldValue, E);
8856 CapturedValue = Builder.CreateSelect(NonAtomicCmp, E, OldValue);
8857 }
8858 Builder.CreateStore(CapturedValue, V.Var, V.IsVolatile);
8859 }
8860 }
8861
8862 checkAndEmitFlushAfterAtomic(Loc, AO, AtomicKind::Compare);
8863
8864 return Builder.saveIP();
8865}
8866
8869 BodyGenCallbackTy BodyGenCB, Value *NumTeamsLower,
8870 Value *NumTeamsUpper, Value *ThreadLimit,
8871 Value *IfExpr) {
8872 if (!updateToLocation(Loc))
8873 return InsertPointTy();
8874
8875 uint32_t SrcLocStrSize;
8876 Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
8877 Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
8878 Function *CurrentFunction = Builder.GetInsertBlock()->getParent();
8879
8880 // Outer allocation basicblock is the entry block of the current function.
8881 BasicBlock &OuterAllocaBB = CurrentFunction->getEntryBlock();
8882 if (&OuterAllocaBB == Builder.GetInsertBlock()) {
8883 BasicBlock *BodyBB = splitBB(Builder, /*CreateBranch=*/true, "teams.entry");
8884 Builder.SetInsertPoint(BodyBB, BodyBB->begin());
8885 }
8886
8887 // The current basic block is split into four basic blocks. After outlining,
8888 // they will be mapped as follows:
8889 // ```
8890 // def current_fn() {
8891 // current_basic_block:
8892 // br label %teams.exit
8893 // teams.exit:
8894 // ; instructions after teams
8895 // }
8896 //
8897 // def outlined_fn() {
8898 // teams.alloca:
8899 // br label %teams.body
8900 // teams.body:
8901 // ; instructions within teams body
8902 // }
8903 // ```
8904 BasicBlock *ExitBB = splitBB(Builder, /*CreateBranch=*/true, "teams.exit");
8905 BasicBlock *BodyBB = splitBB(Builder, /*CreateBranch=*/true, "teams.body");
8906 BasicBlock *AllocaBB =
8907 splitBB(Builder, /*CreateBranch=*/true, "teams.alloca");
8908
8909 bool SubClausesPresent =
8910 (NumTeamsLower || NumTeamsUpper || ThreadLimit || IfExpr);
8911 // Push num_teams
8912 if (!Config.isTargetDevice() && SubClausesPresent) {
8913 assert((NumTeamsLower == nullptr || NumTeamsUpper != nullptr) &&
8914 "if lowerbound is non-null, then upperbound must also be non-null "
8915 "for bounds on num_teams");
8916
8917 if (NumTeamsUpper == nullptr)
8918 NumTeamsUpper = Builder.getInt32(0);
8919
8920 if (NumTeamsLower == nullptr)
8921 NumTeamsLower = NumTeamsUpper;
8922
8923 if (IfExpr) {
8924 assert(IfExpr->getType()->isIntegerTy() &&
8925 "argument to if clause must be an integer value");
8926
8927 // upper = ifexpr ? upper : 1
8928 if (IfExpr->getType() != Int1)
8929 IfExpr = Builder.CreateICmpNE(IfExpr,
8930 ConstantInt::get(IfExpr->getType(), 0));
8931 NumTeamsUpper = Builder.CreateSelect(
8932 IfExpr, NumTeamsUpper, Builder.getInt32(1), "numTeamsUpper");
8933
8934 // lower = ifexpr ? lower : 1
8935 NumTeamsLower = Builder.CreateSelect(
8936 IfExpr, NumTeamsLower, Builder.getInt32(1), "numTeamsLower");
8937 }
8938
8939 if (ThreadLimit == nullptr)
8940 ThreadLimit = Builder.getInt32(0);
8941
8942 Value *ThreadNum = getOrCreateThreadID(Ident);
8944 getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_push_num_teams_51),
8945 {Ident, ThreadNum, NumTeamsLower, NumTeamsUpper, ThreadLimit});
8946 }
8947 // Generate the body of teams.
8948 InsertPointTy AllocaIP(AllocaBB, AllocaBB->begin());
8949 InsertPointTy CodeGenIP(BodyBB, BodyBB->begin());
8950 if (Error Err = BodyGenCB(AllocaIP, CodeGenIP))
8951 return Err;
8952
8953 OutlineInfo OI;
8954 OI.EntryBB = AllocaBB;
8955 OI.ExitBB = ExitBB;
8956 OI.OuterAllocaBB = &OuterAllocaBB;
8957
8958 // Insert fake values for global tid and bound tid.
8960 InsertPointTy OuterAllocaIP(&OuterAllocaBB, OuterAllocaBB.begin());
8962 Builder, OuterAllocaIP, ToBeDeleted, AllocaIP, "gid", true));
8964 Builder, OuterAllocaIP, ToBeDeleted, AllocaIP, "tid", true));
8965
8966 auto HostPostOutlineCB = [this, Ident,
8967 ToBeDeleted](Function &OutlinedFn) mutable {
8968 // The stale call instruction will be replaced with a new call instruction
8969 // for runtime call with the outlined function.
8970
8971 assert(OutlinedFn.getNumUses() == 1 &&
8972 "there must be a single user for the outlined function");
8973 CallInst *StaleCI = cast<CallInst>(OutlinedFn.user_back());
8974 ToBeDeleted.push_back(StaleCI);
8975
8976 assert((OutlinedFn.arg_size() == 2 || OutlinedFn.arg_size() == 3) &&
8977 "Outlined function must have two or three arguments only");
8978
8979 bool HasShared = OutlinedFn.arg_size() == 3;
8980
8981 OutlinedFn.getArg(0)->setName("global.tid.ptr");
8982 OutlinedFn.getArg(1)->setName("bound.tid.ptr");
8983 if (HasShared)
8984 OutlinedFn.getArg(2)->setName("data");
8985
8986 // Call to the runtime function for teams in the current function.
8987 assert(StaleCI && "Error while outlining - no CallInst user found for the "
8988 "outlined function.");
8989 Builder.SetInsertPoint(StaleCI);
8990 SmallVector<Value *> Args = {
8991 Ident, Builder.getInt32(StaleCI->arg_size() - 2), &OutlinedFn};
8992 if (HasShared)
8993 Args.push_back(StaleCI->getArgOperand(2));
8995 omp::RuntimeFunction::OMPRTL___kmpc_fork_teams),
8996 Args);
8997
8998 for (Instruction *I : llvm::reverse(ToBeDeleted))
8999 I->eraseFromParent();
9000 };
9001
9002 if (!Config.isTargetDevice())
9003 OI.PostOutlineCB = HostPostOutlineCB;
9004
9005 addOutlineInfo(std::move(OI));
9006
9007 Builder.SetInsertPoint(ExitBB, ExitBB->begin());
9008
9009 return Builder.saveIP();
9010}
9011
9014 std::string VarName) {
9015 llvm::Constant *MapNamesArrayInit = llvm::ConstantArray::get(
9017 Names.size()),
9018 Names);
9019 auto *MapNamesArrayGlobal = new llvm::GlobalVariable(
9020 M, MapNamesArrayInit->getType(),
9021 /*isConstant=*/true, llvm::GlobalValue::PrivateLinkage, MapNamesArrayInit,
9022 VarName);
9023 return MapNamesArrayGlobal;
9024}
9025
9026// Create all simple and struct types exposed by the runtime and remember
9027// the llvm::PointerTypes of them for easy access later.
9028void OpenMPIRBuilder::initializeTypes(Module &M) {
9029 LLVMContext &Ctx = M.getContext();
9030 StructType *T;
9031#define OMP_TYPE(VarName, InitValue) VarName = InitValue;
9032#define OMP_ARRAY_TYPE(VarName, ElemTy, ArraySize) \
9033 VarName##Ty = ArrayType::get(ElemTy, ArraySize); \
9034 VarName##PtrTy = PointerType::getUnqual(VarName##Ty);
9035#define OMP_FUNCTION_TYPE(VarName, IsVarArg, ReturnType, ...) \
9036 VarName = FunctionType::get(ReturnType, {__VA_ARGS__}, IsVarArg); \
9037 VarName##Ptr = PointerType::getUnqual(VarName);
9038#define OMP_STRUCT_TYPE(VarName, StructName, Packed, ...) \
9039 T = StructType::getTypeByName(Ctx, StructName); \
9040 if (!T) \
9041 T = StructType::create(Ctx, {__VA_ARGS__}, StructName, Packed); \
9042 VarName = T; \
9043 VarName##Ptr = PointerType::getUnqual(T);
9044#include "llvm/Frontend/OpenMP/OMPKinds.def"
9045}
9046
9049 SmallVectorImpl<BasicBlock *> &BlockVector) {
9051 BlockSet.insert(EntryBB);
9052 BlockSet.insert(ExitBB);
9053
9054 Worklist.push_back(EntryBB);
9055 while (!Worklist.empty()) {
9056 BasicBlock *BB = Worklist.pop_back_val();
9057 BlockVector.push_back(BB);
9058 for (BasicBlock *SuccBB : successors(BB))
9059 if (BlockSet.insert(SuccBB).second)
9060 Worklist.push_back(SuccBB);
9061 }
9062}
9063
9065 uint64_t Size, int32_t Flags,
9067 StringRef Name) {
9068 if (!Config.isGPU()) {
9070 M, ID, Name.empty() ? Addr->getName() : Name, Size, Flags, /*Data=*/0,
9071 "omp_offloading_entries");
9072 return;
9073 }
9074 // TODO: Add support for global variables on the device after declare target
9075 // support.
9076 Function *Fn = dyn_cast<Function>(Addr);
9077 if (!Fn)
9078 return;
9079
9080 Module &M = *(Fn->getParent());
9081 LLVMContext &Ctx = M.getContext();
9082
9083 // Get "nvvm.annotations" metadata node.
9084 NamedMDNode *MD = M.getOrInsertNamedMetadata("nvvm.annotations");
9085
9086 Metadata *MDVals[] = {
9087 ConstantAsMetadata::get(Fn), MDString::get(Ctx, "kernel"),
9088 ConstantAsMetadata::get(ConstantInt::get(Type::getInt32Ty(Ctx), 1))};
9089 // Append metadata to nvvm.annotations.
9090 MD->addOperand(MDNode::get(Ctx, MDVals));
9091
9092 // Add a function attribute for the kernel.
9093 Fn->addFnAttr(Attribute::get(Ctx, "kernel"));
9094 if (T.isAMDGCN())
9095 Fn->addFnAttr("uniform-work-group-size", "true");
9096 Fn->addFnAttr(Attribute::MustProgress);
9097}
9098
9099// We only generate metadata for function that contain target regions.
9102
9103 // If there are no entries, we don't need to do anything.
9105 return;
9106
9110 16>
9111 OrderedEntries(OffloadInfoManager.size());
9112
9113 // Auxiliary methods to create metadata values and strings.
9114 auto &&GetMDInt = [this](unsigned V) {
9115 return ConstantAsMetadata::get(ConstantInt::get(Builder.getInt32Ty(), V));
9116 };
9117
9118 auto &&GetMDString = [&C](StringRef V) { return MDString::get(C, V); };
9119
9120 // Create the offloading info metadata node.
9121 NamedMDNode *MD = M.getOrInsertNamedMetadata("omp_offload.info");
9122 auto &&TargetRegionMetadataEmitter =
9123 [&C, MD, &OrderedEntries, &GetMDInt, &GetMDString](
9124 const TargetRegionEntryInfo &EntryInfo,
9126 // Generate metadata for target regions. Each entry of this metadata
9127 // contains:
9128 // - Entry 0 -> Kind of this type of metadata (0).
9129 // - Entry 1 -> Device ID of the file where the entry was identified.
9130 // - Entry 2 -> File ID of the file where the entry was identified.
9131 // - Entry 3 -> Mangled name of the function where the entry was
9132 // identified.
9133 // - Entry 4 -> Line in the file where the entry was identified.
9134 // - Entry 5 -> Count of regions at this DeviceID/FilesID/Line.
9135 // - Entry 6 -> Order the entry was created.
9136 // The first element of the metadata node is the kind.
9137 Metadata *Ops[] = {
9138 GetMDInt(E.getKind()), GetMDInt(EntryInfo.DeviceID),
9139 GetMDInt(EntryInfo.FileID), GetMDString(EntryInfo.ParentName),
9140 GetMDInt(EntryInfo.Line), GetMDInt(EntryInfo.Count),
9141 GetMDInt(E.getOrder())};
9142
9143 // Save this entry in the right position of the ordered entries array.
9144 OrderedEntries[E.getOrder()] = std::make_pair(&E, EntryInfo);
9145
9146 // Add metadata to the named metadata node.
9147 MD->addOperand(MDNode::get(C, Ops));
9148 };
9149
9150 OffloadInfoManager.actOnTargetRegionEntriesInfo(TargetRegionMetadataEmitter);
9151
9152 // Create function that emits metadata for each device global variable entry;
9153 auto &&DeviceGlobalVarMetadataEmitter =
9154 [&C, &OrderedEntries, &GetMDInt, &GetMDString, MD](
9155 StringRef MangledName,
9157 // Generate metadata for global variables. Each entry of this metadata
9158 // contains:
9159 // - Entry 0 -> Kind of this type of metadata (1).
9160 // - Entry 1 -> Mangled name of the variable.
9161 // - Entry 2 -> Declare target kind.
9162 // - Entry 3 -> Order the entry was created.
9163 // The first element of the metadata node is the kind.
9164 Metadata *Ops[] = {GetMDInt(E.getKind()), GetMDString(MangledName),
9165 GetMDInt(E.getFlags()), GetMDInt(E.getOrder())};
9166
9167 // Save this entry in the right position of the ordered entries array.
9168 TargetRegionEntryInfo varInfo(MangledName, 0, 0, 0);
9169 OrderedEntries[E.getOrder()] = std::make_pair(&E, varInfo);
9170
9171 // Add metadata to the named metadata node.
9172 MD->addOperand(MDNode::get(C, Ops));
9173 };
9174
9176 DeviceGlobalVarMetadataEmitter);
9177
9178 for (const auto &E : OrderedEntries) {
9179 assert(E.first && "All ordered entries must exist!");
9180 if (const auto *CE =
9181 dyn_cast<OffloadEntriesInfoManager::OffloadEntryInfoTargetRegion>(
9182 E.first)) {
9183 if (!CE->getID() || !CE->getAddress()) {
9184 // Do not blame the entry if the parent funtion is not emitted.
9185 TargetRegionEntryInfo EntryInfo = E.second;
9186 StringRef FnName = EntryInfo.ParentName;
9187 if (!M.getNamedValue(FnName))
9188 continue;
9189 ErrorFn(EMIT_MD_TARGET_REGION_ERROR, EntryInfo);
9190 continue;
9191 }
9192 createOffloadEntry(CE->getID(), CE->getAddress(),
9193 /*Size=*/0, CE->getFlags(),
9195 } else if (const auto *CE = dyn_cast<
9197 E.first)) {
9200 CE->getFlags());
9201 switch (Flags) {
9205 continue;
9206 if (!CE->getAddress()) {
9207 ErrorFn(EMIT_MD_DECLARE_TARGET_ERROR, E.second);
9208 continue;
9209 }
9210 // The vaiable has no definition - no need to add the entry.
9211 if (CE->getVarSize() == 0)
9212 continue;
9213 break;
9215 assert(((Config.isTargetDevice() && !CE->getAddress()) ||
9216 (!Config.isTargetDevice() && CE->getAddress())) &&
9217 "Declaret target link address is set.");
9218 if (Config.isTargetDevice())
9219 continue;
9220 if (!CE->getAddress()) {
9222 continue;
9223 }
9224 break;
9225 default:
9226 break;
9227 }
9228
9229 // Hidden or internal symbols on the device are not externally visible.
9230 // We should not attempt to register them by creating an offloading
9231 // entry. Indirect variables are handled separately on the device.
9232 if (auto *GV = dyn_cast<GlobalValue>(CE->getAddress()))
9233 if ((GV->hasLocalLinkage() || GV->hasHiddenVisibility()) &&
9235 continue;
9236
9237 // Indirect globals need to use a special name that doesn't match the name
9238 // of the associated host global.
9240 createOffloadEntry(CE->getAddress(), CE->getAddress(), CE->getVarSize(),
9241 Flags, CE->getLinkage(), CE->getVarName());
9242 else
9243 createOffloadEntry(CE->getAddress(), CE->getAddress(), CE->getVarSize(),
9244 Flags, CE->getLinkage());
9245
9246 } else {
9247 llvm_unreachable("Unsupported entry kind.");
9248 }
9249 }
9250
9251 // Emit requires directive globals to a special entry so the runtime can
9252 // register them when the device image is loaded.
9253 // TODO: This reduces the offloading entries to a 32-bit integer. Offloading
9254 // entries should be redesigned to better suit this use-case.
9258 /*Name=*/"",
9260 Config.getRequiresFlags(), "omp_offloading_entries");
9261}
9262
9264 SmallVectorImpl<char> &Name, StringRef ParentName, unsigned DeviceID,
9265 unsigned FileID, unsigned Line, unsigned Count) {
9267 OS << KernelNamePrefix << llvm::format("%x", DeviceID)
9268 << llvm::format("_%x_", FileID) << ParentName << "_l" << Line;
9269 if (Count)
9270 OS << "_" << Count;
9271}
9272
9275 unsigned NewCount = getTargetRegionEntryInfoCount(EntryInfo);
9277 Name, EntryInfo.ParentName, EntryInfo.DeviceID, EntryInfo.FileID,
9278 EntryInfo.Line, NewCount);
9279}
9280
9283 StringRef ParentName) {
9285 auto FileIDInfo = CallBack();
9286 if (auto EC = sys::fs::getUniqueID(std::get<0>(FileIDInfo), ID)) {
9287 report_fatal_error(("Unable to get unique ID for file, during "
9288 "getTargetEntryUniqueInfo, error message: " +
9289 EC.message())
9290 .c_str());
9291 }
9292
9293 return TargetRegionEntryInfo(ParentName, ID.getDevice(), ID.getFile(),
9294 std::get<1>(FileIDInfo));
9295}
9296
9298 unsigned Offset = 0;
9299 for (uint64_t Remain =
9300 static_cast<std::underlying_type_t<omp::OpenMPOffloadMappingFlags>>(
9302 !(Remain & 1); Remain = Remain >> 1)
9303 Offset++;
9304 return Offset;
9305}
9306
9309 // Rotate by getFlagMemberOffset() bits.
9310 return static_cast<omp::OpenMPOffloadMappingFlags>(((uint64_t)Position + 1)
9311 << getFlagMemberOffset());
9312}
9313
9316 omp::OpenMPOffloadMappingFlags MemberOfFlag) {
9317 // If the entry is PTR_AND_OBJ but has not been marked with the special
9318 // placeholder value 0xFFFF in the MEMBER_OF field, then it should not be
9319 // marked as MEMBER_OF.
9320 if (static_cast<std::underlying_type_t<omp::OpenMPOffloadMappingFlags>>(
9322 static_cast<std::underlying_type_t<omp::OpenMPOffloadMappingFlags>>(
9325 return;
9326
9327 // Reset the placeholder value to prepare the flag for the assignment of the
9328 // proper MEMBER_OF value.
9329 Flags &= ~omp::OpenMPOffloadMappingFlags::OMP_MAP_MEMBER_OF;
9330 Flags |= MemberOfFlag;
9331}
9332
9336 bool IsDeclaration, bool IsExternallyVisible,
9337 TargetRegionEntryInfo EntryInfo, StringRef MangledName,
9338 std::vector<GlobalVariable *> &GeneratedRefs, bool OpenMPSIMD,
9339 std::vector<Triple> TargetTriple, Type *LlvmPtrTy,
9340 std::function<Constant *()> GlobalInitializer,
9341 std::function<GlobalValue::LinkageTypes()> VariableLinkage) {
9342 // TODO: convert this to utilise the IRBuilder Config rather than
9343 // a passed down argument.
9344 if (OpenMPSIMD)
9345 return nullptr;
9346
9349 CaptureClause ==
9352 SmallString<64> PtrName;
9353 {
9354 raw_svector_ostream OS(PtrName);
9355 OS << MangledName;
9356 if (!IsExternallyVisible)
9357 OS << format("_%x", EntryInfo.FileID);
9358 OS << "_decl_tgt_ref_ptr";
9359 }
9360
9361 Value *Ptr = M.getNamedValue(PtrName);
9362
9363 if (!Ptr) {
9364 GlobalValue *GlobalValue = M.getNamedValue(MangledName);
9365 Ptr = getOrCreateInternalVariable(LlvmPtrTy, PtrName);
9366
9367 auto *GV = cast<GlobalVariable>(Ptr);
9368 GV->setLinkage(GlobalValue::WeakAnyLinkage);
9369
9370 if (!Config.isTargetDevice()) {
9371 if (GlobalInitializer)
9372 GV->setInitializer(GlobalInitializer());
9373 else
9374 GV->setInitializer(GlobalValue);
9375 }
9376
9378 CaptureClause, DeviceClause, IsDeclaration, IsExternallyVisible,
9379 EntryInfo, MangledName, GeneratedRefs, OpenMPSIMD, TargetTriple,
9380 GlobalInitializer, VariableLinkage, LlvmPtrTy, cast<Constant>(Ptr));
9381 }
9382
9383 return cast<Constant>(Ptr);
9384 }
9385
9386 return nullptr;
9387}
9388
9392 bool IsDeclaration, bool IsExternallyVisible,
9393 TargetRegionEntryInfo EntryInfo, StringRef MangledName,
9394 std::vector<GlobalVariable *> &GeneratedRefs, bool OpenMPSIMD,
9395 std::vector<Triple> TargetTriple,
9396 std::function<Constant *()> GlobalInitializer,
9397 std::function<GlobalValue::LinkageTypes()> VariableLinkage, Type *LlvmPtrTy,
9398 Constant *Addr) {
9400 (TargetTriple.empty() && !Config.isTargetDevice()))
9401 return;
9402
9404 StringRef VarName;
9405 int64_t VarSize;
9407
9409 CaptureClause ==
9413 VarName = MangledName;
9414 GlobalValue *LlvmVal = M.getNamedValue(VarName);
9415
9416 if (!IsDeclaration)
9417 VarSize = divideCeil(
9419 else
9420 VarSize = 0;
9421 Linkage = (VariableLinkage) ? VariableLinkage() : LlvmVal->getLinkage();
9422
9423 // This is a workaround carried over from Clang which prevents undesired
9424 // optimisation of internal variables.
9425 if (Config.isTargetDevice() &&
9426 (!IsExternallyVisible || Linkage == GlobalValue::LinkOnceODRLinkage)) {
9427 // Do not create a "ref-variable" if the original is not also available
9428 // on the host.
9430 return;
9431
9432 std::string RefName = createPlatformSpecificName({VarName, "ref"});
9433
9434 if (!M.getNamedValue(RefName)) {
9435 Constant *AddrRef =
9436 getOrCreateInternalVariable(Addr->getType(), RefName);
9437 auto *GvAddrRef = cast<GlobalVariable>(AddrRef);
9438 GvAddrRef->setConstant(true);
9439 GvAddrRef->setLinkage(GlobalValue::InternalLinkage);
9440 GvAddrRef->setInitializer(Addr);
9441 GeneratedRefs.push_back(GvAddrRef);
9442 }
9443 }
9444 } else {
9447 else
9449
9450 if (Config.isTargetDevice()) {
9451 VarName = (Addr) ? Addr->getName() : "";
9452 Addr = nullptr;
9453 } else {
9455 CaptureClause, DeviceClause, IsDeclaration, IsExternallyVisible,
9456 EntryInfo, MangledName, GeneratedRefs, OpenMPSIMD, TargetTriple,
9457 LlvmPtrTy, GlobalInitializer, VariableLinkage);
9458 VarName = (Addr) ? Addr->getName() : "";
9459 }
9460 VarSize = M.getDataLayout().getPointerSize();
9462 }
9463
9465 Flags, Linkage);
9466}
9467
9468/// Loads all the offload entries information from the host IR
9469/// metadata.
9471 // If we are in target mode, load the metadata from the host IR. This code has
9472 // to match the metadata creation in createOffloadEntriesAndInfoMetadata().
9473
9475 if (!MD)
9476 return;
9477
9478 for (MDNode *MN : MD->operands()) {
9479 auto &&GetMDInt = [MN](unsigned Idx) {
9480 auto *V = cast<ConstantAsMetadata>(MN->getOperand(Idx));
9481 return cast<ConstantInt>(V->getValue())->getZExtValue();
9482 };
9483
9484 auto &&GetMDString = [MN](unsigned Idx) {
9485 auto *V = cast<MDString>(MN->getOperand(Idx));
9486 return V->getString();
9487 };
9488
9489 switch (GetMDInt(0)) {
9490 default:
9491 llvm_unreachable("Unexpected metadata!");
9492 break;
9495 TargetRegionEntryInfo EntryInfo(/*ParentName=*/GetMDString(3),
9496 /*DeviceID=*/GetMDInt(1),
9497 /*FileID=*/GetMDInt(2),
9498 /*Line=*/GetMDInt(4),
9499 /*Count=*/GetMDInt(5));
9501 /*Order=*/GetMDInt(6));
9502 break;
9503 }
9507 /*MangledName=*/GetMDString(1),
9509 /*Flags=*/GetMDInt(2)),
9510 /*Order=*/GetMDInt(3));
9511 break;
9512 }
9513 }
9514}
9515
9517 if (HostFilePath.empty())
9518 return;
9519
9520 auto Buf = MemoryBuffer::getFile(HostFilePath);
9521 if (std::error_code Err = Buf.getError()) {
9522 report_fatal_error(("error opening host file from host file path inside of "
9523 "OpenMPIRBuilder: " +
9524 Err.message())
9525 .c_str());
9526 }
9527
9528 LLVMContext Ctx;
9530 Ctx, parseBitcodeFile(Buf.get()->getMemBufferRef(), Ctx));
9531 if (std::error_code Err = M.getError()) {
9533 ("error parsing host file inside of OpenMPIRBuilder: " + Err.message())
9534 .c_str());
9535 }
9536
9537 loadOffloadInfoMetadata(*M.get());
9538}
9539
9540//===----------------------------------------------------------------------===//
9541// OffloadEntriesInfoManager
9542//===----------------------------------------------------------------------===//
9543
9545 return OffloadEntriesTargetRegion.empty() &&
9546 OffloadEntriesDeviceGlobalVar.empty();
9547}
9548
9549unsigned OffloadEntriesInfoManager::getTargetRegionEntryInfoCount(
9550 const TargetRegionEntryInfo &EntryInfo) const {
9551 auto It = OffloadEntriesTargetRegionCount.find(
9552 getTargetRegionEntryCountKey(EntryInfo));
9553 if (It == OffloadEntriesTargetRegionCount.end())
9554 return 0;
9555 return It->second;
9556}
9557
9558void OffloadEntriesInfoManager::incrementTargetRegionEntryInfoCount(
9559 const TargetRegionEntryInfo &EntryInfo) {
9560 OffloadEntriesTargetRegionCount[getTargetRegionEntryCountKey(EntryInfo)] =
9561 EntryInfo.Count + 1;
9562}
9563
9564/// Initialize target region entry.
9566 const TargetRegionEntryInfo &EntryInfo, unsigned Order) {
9567 OffloadEntriesTargetRegion[EntryInfo] =
9568 OffloadEntryInfoTargetRegion(Order, /*Addr=*/nullptr, /*ID=*/nullptr,
9569 OMPTargetRegionEntryTargetRegion);
9570 ++OffloadingEntriesNum;
9571}
9572
9576 assert(EntryInfo.Count == 0 && "expected default EntryInfo");
9577
9578 // Update the EntryInfo with the next available count for this location.
9579 EntryInfo.Count = getTargetRegionEntryInfoCount(EntryInfo);
9580
9581 // If we are emitting code for a target, the entry is already initialized,
9582 // only has to be registered.
9583 if (OMPBuilder->Config.isTargetDevice()) {
9584 // This could happen if the device compilation is invoked standalone.
9585 if (!hasTargetRegionEntryInfo(EntryInfo)) {
9586 return;
9587 }
9588 auto &Entry = OffloadEntriesTargetRegion[EntryInfo];
9589 Entry.setAddress(Addr);
9590 Entry.setID(ID);
9591 Entry.setFlags(Flags);
9592 } else {
9594 hasTargetRegionEntryInfo(EntryInfo, /*IgnoreAddressId*/ true))
9595 return;
9596 assert(!hasTargetRegionEntryInfo(EntryInfo) &&
9597 "Target region entry already registered!");
9598 OffloadEntryInfoTargetRegion Entry(OffloadingEntriesNum, Addr, ID, Flags);
9599 OffloadEntriesTargetRegion[EntryInfo] = Entry;
9600 ++OffloadingEntriesNum;
9601 }
9602 incrementTargetRegionEntryInfoCount(EntryInfo);
9603}
9604
9606 TargetRegionEntryInfo EntryInfo, bool IgnoreAddressId) const {
9607
9608 // Update the EntryInfo with the next available count for this location.
9609 EntryInfo.Count = getTargetRegionEntryInfoCount(EntryInfo);
9610
9611 auto It = OffloadEntriesTargetRegion.find(EntryInfo);
9612 if (It == OffloadEntriesTargetRegion.end()) {
9613 return false;
9614 }
9615 // Fail if this entry is already registered.
9616 if (!IgnoreAddressId && (It->second.getAddress() || It->second.getID()))
9617 return false;
9618 return true;
9619}
9620
9622 const OffloadTargetRegionEntryInfoActTy &Action) {
9623 // Scan all target region entries and perform the provided action.
9624 for (const auto &It : OffloadEntriesTargetRegion) {
9625 Action(It.first, It.second);
9626 }
9627}
9628
9630 StringRef Name, OMPTargetGlobalVarEntryKind Flags, unsigned Order) {
9631 OffloadEntriesDeviceGlobalVar.try_emplace(Name, Order, Flags);
9632 ++OffloadingEntriesNum;
9633}
9634
9636 StringRef VarName, Constant *Addr, int64_t VarSize,
9638 if (OMPBuilder->Config.isTargetDevice()) {
9639 // This could happen if the device compilation is invoked standalone.
9640 if (!hasDeviceGlobalVarEntryInfo(VarName))
9641 return;
9642 auto &Entry = OffloadEntriesDeviceGlobalVar[VarName];
9643 if (Entry.getAddress() && hasDeviceGlobalVarEntryInfo(VarName)) {
9644 if (Entry.getVarSize() == 0) {
9645 Entry.setVarSize(VarSize);
9646 Entry.setLinkage(Linkage);
9647 }
9648 return;
9649 }
9650 Entry.setVarSize(VarSize);
9651 Entry.setLinkage(Linkage);
9652 Entry.setAddress(Addr);
9653 } else {
9654 if (hasDeviceGlobalVarEntryInfo(VarName)) {
9655 auto &Entry = OffloadEntriesDeviceGlobalVar[VarName];
9656 assert(Entry.isValid() && Entry.getFlags() == Flags &&
9657 "Entry not initialized!");
9658 if (Entry.getVarSize() == 0) {
9659 Entry.setVarSize(VarSize);
9660 Entry.setLinkage(Linkage);
9661 }
9662 return;
9663 }
9665 OffloadEntriesDeviceGlobalVar.try_emplace(VarName, OffloadingEntriesNum,
9666 Addr, VarSize, Flags, Linkage,
9667 VarName.str());
9668 else
9669 OffloadEntriesDeviceGlobalVar.try_emplace(
9670 VarName, OffloadingEntriesNum, Addr, VarSize, Flags, Linkage, "");
9671 ++OffloadingEntriesNum;
9672 }
9673}
9674
9677 // Scan all target region entries and perform the provided action.
9678 for (const auto &E : OffloadEntriesDeviceGlobalVar)
9679 Action(E.getKey(), E.getValue());
9680}
9681
9682//===----------------------------------------------------------------------===//
9683// CanonicalLoopInfo
9684//===----------------------------------------------------------------------===//
9685
9686void CanonicalLoopInfo::collectControlBlocks(
9688 // We only count those BBs as control block for which we do not need to
9689 // reverse the CFG, i.e. not the loop body which can contain arbitrary control
9690 // flow. For consistency, this also means we do not add the Body block, which
9691 // is just the entry to the body code.
9692 BBs.reserve(BBs.size() + 6);
9693 BBs.append({getPreheader(), Header, Cond, Latch, Exit, getAfter()});
9694}
9695
9697 assert(isValid() && "Requires a valid canonical loop");
9698 for (BasicBlock *Pred : predecessors(Header)) {
9699 if (Pred != Latch)
9700 return Pred;
9701 }
9702 llvm_unreachable("Missing preheader");
9703}
9704
9705void CanonicalLoopInfo::setTripCount(Value *TripCount) {
9706 assert(isValid() && "Requires a valid canonical loop");
9707
9708 Instruction *CmpI = &getCond()->front();
9709 assert(isa<CmpInst>(CmpI) && "First inst must compare IV with TripCount");
9710 CmpI->setOperand(1, TripCount);
9711
9712#ifndef NDEBUG
9713 assertOK();
9714#endif
9715}
9716
9717void CanonicalLoopInfo::mapIndVar(
9718 llvm::function_ref<Value *(Instruction *)> Updater) {
9719 assert(isValid() && "Requires a valid canonical loop");
9720
9721 Instruction *OldIV = getIndVar();
9722
9723 // Record all uses excluding those introduced by the updater. Uses by the
9724 // CanonicalLoopInfo itself to keep track of the number of iterations are
9725 // excluded.
9726 SmallVector<Use *> ReplacableUses;
9727 for (Use &U : OldIV->uses()) {
9728 auto *User = dyn_cast<Instruction>(U.getUser());
9729 if (!User)
9730 continue;
9731 if (User->getParent() == getCond())
9732 continue;
9733 if (User->getParent() == getLatch())
9734 continue;
9735 ReplacableUses.push_back(&U);
9736 }
9737
9738 // Run the updater that may introduce new uses
9739 Value *NewIV = Updater(OldIV);
9740
9741 // Replace the old uses with the value returned by the updater.
9742 for (Use *U : ReplacableUses)
9743 U->set(NewIV);
9744
9745#ifndef NDEBUG
9746 assertOK();
9747#endif
9748}
9749
9751#ifndef NDEBUG
9752 // No constraints if this object currently does not describe a loop.
9753 if (!isValid())
9754 return;
9755
9756 BasicBlock *Preheader = getPreheader();
9757 BasicBlock *Body = getBody();
9758 BasicBlock *After = getAfter();
9759
9760 // Verify standard control-flow we use for OpenMP loops.
9761 assert(Preheader);
9762 assert(isa<BranchInst>(Preheader->getTerminator()) &&
9763 "Preheader must terminate with unconditional branch");
9764 assert(Preheader->getSingleSuccessor() == Header &&
9765 "Preheader must jump to header");
9766
9767 assert(Header);
9768 assert(isa<BranchInst>(Header->getTerminator()) &&
9769 "Header must terminate with unconditional branch");
9770 assert(Header->getSingleSuccessor() == Cond &&
9771 "Header must jump to exiting block");
9772
9773 assert(Cond);
9774 assert(Cond->getSinglePredecessor() == Header &&
9775 "Exiting block only reachable from header");
9776
9777 assert(isa<BranchInst>(Cond->getTerminator()) &&
9778 "Exiting block must terminate with conditional branch");
9779 assert(size(successors(Cond)) == 2 &&
9780 "Exiting block must have two successors");
9781 assert(cast<BranchInst>(Cond->getTerminator())->getSuccessor(0) == Body &&
9782 "Exiting block's first successor jump to the body");
9783 assert(cast<BranchInst>(Cond->getTerminator())->getSuccessor(1) == Exit &&
9784 "Exiting block's second successor must exit the loop");
9785
9786 assert(Body);
9787 assert(Body->getSinglePredecessor() == Cond &&
9788 "Body only reachable from exiting block");
9789 assert(!isa<PHINode>(Body->front()));
9790
9791 assert(Latch);
9792 assert(isa<BranchInst>(Latch->getTerminator()) &&
9793 "Latch must terminate with unconditional branch");
9794 assert(Latch->getSingleSuccessor() == Header && "Latch must jump to header");
9795 // TODO: To support simple redirecting of the end of the body code that has
9796 // multiple; introduce another auxiliary basic block like preheader and after.
9797 assert(Latch->getSinglePredecessor() != nullptr);
9798 assert(!isa<PHINode>(Latch->front()));
9799
9800 assert(Exit);
9801 assert(isa<BranchInst>(Exit->getTerminator()) &&
9802 "Exit block must terminate with unconditional branch");
9803 assert(Exit->getSingleSuccessor() == After &&
9804 "Exit block must jump to after block");
9805
9806 assert(After);
9807 assert(After->getSinglePredecessor() == Exit &&
9808 "After block only reachable from exit block");
9809 assert(After->empty() || !isa<PHINode>(After->front()));
9810
9811 Instruction *IndVar = getIndVar();
9812 assert(IndVar && "Canonical induction variable not found?");
9813 assert(isa<IntegerType>(IndVar->getType()) &&
9814 "Induction variable must be an integer");
9815 assert(cast<PHINode>(IndVar)->getParent() == Header &&
9816 "Induction variable must be a PHI in the loop header");
9817 assert(cast<PHINode>(IndVar)->getIncomingBlock(0) == Preheader);
9818 assert(
9819 cast<ConstantInt>(cast<PHINode>(IndVar)->getIncomingValue(0))->isZero());
9820 assert(cast<PHINode>(IndVar)->getIncomingBlock(1) == Latch);
9821
9822 auto *NextIndVar = cast<PHINode>(IndVar)->getIncomingValue(1);
9823 assert(cast<Instruction>(NextIndVar)->getParent() == Latch);
9824 assert(cast<BinaryOperator>(NextIndVar)->getOpcode() == BinaryOperator::Add);
9825 assert(cast<BinaryOperator>(NextIndVar)->getOperand(0) == IndVar);
9826 assert(cast<ConstantInt>(cast<BinaryOperator>(NextIndVar)->getOperand(1))
9827 ->isOne());
9828
9829 Value *TripCount = getTripCount();
9830 assert(TripCount && "Loop trip count not found?");
9831 assert(IndVar->getType() == TripCount->getType() &&
9832 "Trip count and induction variable must have the same type");
9833
9834 auto *CmpI = cast<CmpInst>(&Cond->front());
9835 assert(CmpI->getPredicate() == CmpInst::ICMP_ULT &&
9836 "Exit condition must be a signed less-than comparison");
9837 assert(CmpI->getOperand(0) == IndVar &&
9838 "Exit condition must compare the induction variable");
9839 assert(CmpI->getOperand(1) == TripCount &&
9840 "Exit condition must compare with the trip count");
9841#endif
9842}
9843
9845 Header = nullptr;
9846 Cond = nullptr;
9847 Latch = nullptr;
9848 Exit = nullptr;
9849}
Rewrite undef for PHI
MachineBasicBlock MachineBasicBlock::iterator DebugLoc DL
static cl::opt< ITMode > IT(cl::desc("IT block support"), cl::Hidden, cl::init(DefaultIT), cl::values(clEnumValN(DefaultIT, "arm-default-it", "Generate any type of IT block"), clEnumValN(RestrictedIT, "arm-restrict-it", "Disallow complex IT blocks")))
This file contains the simple types necessary to represent the attributes associated with functions a...
static const Function * getParent(const Value *V)
#define LLVM_ENABLE_BITMASK_ENUMS_IN_NAMESPACE()
LLVM_ENABLE_BITMASK_ENUMS_IN_NAMESPACE() pulls the operator overloads used by LLVM_MARK_AS_BITMASK_EN...
Definition: BitmaskEnum.h:83
#define LLVM_MARK_AS_BITMASK_ENUM(LargestValue)
LLVM_MARK_AS_BITMASK_ENUM lets you opt in an individual enum type so you can perform bitwise operatio...
Definition: BitmaskEnum.h:42
BlockVerifier::State From
static GCRegistry::Add< StatepointGC > D("statepoint-example", "an example strategy for statepoint")
Analysis containing CSE Info
Definition: CSEInfo.cpp:27
This file contains the declarations for the subclasses of Constant, which represent the different fla...
Returns the sub type a function will return at a given Idx Should correspond to the result type of an ExtractValue instruction executed with just that one unsigned Idx
#define LLVM_DEBUG(...)
Definition: Debug.h:106
uint64_t Addr
std::string Name
uint64_t Size
bool End
Definition: ELF_riscv.cpp:480
DenseMap< Block *, BlockRelaxAux > Blocks
Definition: ELF_riscv.cpp:507
static GCMetadataPrinterRegistry::Add< ErlangGCPrinter > X("erlang", "erlang-compatible garbage collector")
Hexagon Hardware Loops
This file provides various utilities for inspecting and working with the control flow graph in LLVM I...
This header defines various interfaces for pass management in LLVM.
iv Induction Variable Users
Definition: IVUsers.cpp:48
static LVOptions Options
Definition: LVOptions.cpp:25
static bool isZero(Value *V, const DataLayout &DL, DominatorTree *DT, AssumptionCache *AC)
Definition: Lint.cpp:533
static cl::opt< unsigned > TileSize("fuse-matrix-tile-size", cl::init(4), cl::Hidden, cl::desc("Tile size for matrix instruction fusion using square-shaped tiles."))
#define F(x, y, z)
Definition: MD5.cpp:55
#define I(x, y, z)
Definition: MD5.cpp:58
This file contains the declarations for metadata subclasses.
uint64_t IntrinsicInst * II
#define OMP_KERNEL_ARG_VERSION
Definition: OMPConstants.h:75
Provides definitions for Target specific Grid Values.
static OMPScheduleType getOpenMPBaseScheduleType(llvm::omp::ScheduleKind ClauseKind, bool HasChunks, bool HasSimdModifier)
Determine which scheduling algorithm to use, determined from schedule clause arguments.
static void redirectTo(BasicBlock *Source, BasicBlock *Target, DebugLoc DL)
Make Source branch to Target.
Value * createFakeIntVal(IRBuilderBase &Builder, OpenMPIRBuilder::InsertPointTy OuterAllocaIP, llvm::SmallVectorImpl< Instruction * > &ToBeDeleted, OpenMPIRBuilder::InsertPointTy InnerAllocaIP, const Twine &Name="", bool AsPtr=true)
static FunctionCallee getKmpcForDynamicFiniForType(Type *Ty, Module &M, OpenMPIRBuilder &OMPBuilder)
Returns an LLVM function to call for finalizing the dynamic loop using depending on type.
static Function * emitTargetTaskProxyFunction(OpenMPIRBuilder &OMPBuilder, IRBuilderBase &Builder, CallInst *StaleCI)
Create an entry point for a target task with the following.
static void updateNVPTXMetadata(Function &Kernel, StringRef Name, int32_t Value, bool Min)
static OMPScheduleType getOpenMPOrderingScheduleType(OMPScheduleType BaseScheduleType, bool HasOrderedClause)
Adds ordering modifier flags to schedule type.
static OMPScheduleType getOpenMPMonotonicityScheduleType(OMPScheduleType ScheduleType, bool HasSimdModifier, bool HasMonotonic, bool HasNonmonotonic, bool HasOrderedClause)
Adds monotonicity modifier flags to schedule type.
static void addSimdMetadata(BasicBlock *Block, MDNode *AccessGroup, LoopInfo &LI)
Attach llvm.access.group metadata to the memref instructions of Block.
static OMPScheduleType computeOpenMPScheduleType(ScheduleKind ClauseKind, bool HasChunks, bool HasSimdModifier, bool HasMonotonicModifier, bool HasNonmonotonicModifier, bool HasOrderedClause)
Determine the schedule type using schedule and ordering clause arguments.
static bool isValidWorkshareLoopScheduleType(OMPScheduleType SchedType)
static Function * getFreshReductionFunc(Module &M)
static void raiseUserConstantDataAllocasToEntryBlock(IRBuilderBase &Builder, Function *Function)
static MDNode * getNVPTXMDNode(Function &Kernel, StringRef Name)
static FunctionCallee getKmpcForDynamicNextForType(Type *Ty, Module &M, OpenMPIRBuilder &OMPBuilder)
Returns an LLVM function to call for updating the next loop using OpenMP dynamic scheduling depending...
static bool isConflictIP(IRBuilder<>::InsertPoint IP1, IRBuilder<>::InsertPoint IP2)
Return whether IP1 and IP2 are ambiguous, i.e.
static void checkReductionInfos(ArrayRef< OpenMPIRBuilder::ReductionInfo > ReductionInfos, bool IsGPU)
static FunctionCallee getKmpcForDynamicInitForType(Type *Ty, Module &M, OpenMPIRBuilder &OMPBuilder)
Returns an LLVM function to call for initializing loop bounds using OpenMP dynamic scheduling dependi...
static cl::opt< double > UnrollThresholdFactor("openmp-ir-builder-unroll-threshold-factor", cl::Hidden, cl::desc("Factor for the unroll threshold to account for code " "simplifications still taking place"), cl::init(1.5))
static int32_t computeHeuristicUnrollFactor(CanonicalLoopInfo *CLI)
Heuristically determine the best-performant unroll factor for CLI.
static Value * emitTaskDependencies(OpenMPIRBuilder &OMPBuilder, const SmallVectorImpl< OpenMPIRBuilder::DependData > &Dependencies)
static void workshareLoopTargetCallback(OpenMPIRBuilder *OMPIRBuilder, CanonicalLoopInfo *CLI, Value *Ident, Function &OutlinedFn, Type *ParallelTaskPtr, const SmallVector< Instruction *, 4 > &ToBeDeleted, WorksharingLoopType LoopType)
static OpenMPIRBuilder::InsertPointTy getInsertPointAfterInstr(Instruction *I)
static void redirectAllPredecessorsTo(BasicBlock *OldTarget, BasicBlock *NewTarget, DebugLoc DL)
Redirect all edges that branch to OldTarget to NewTarget.
static std::unique_ptr< TargetMachine > createTargetMachine(Function *F, CodeGenOptLevel OptLevel)
Create the TargetMachine object to query the backend for optimization preferences.
static FunctionCallee getKmpcForStaticInitForType(Type *Ty, Module &M, OpenMPIRBuilder &OMPBuilder)
static void addBasicBlockMetadata(BasicBlock *BB, ArrayRef< Metadata * > Properties)
Attach metadata Properties to the basic block described by BB.
static cl::opt< bool > OptimisticAttributes("openmp-ir-builder-optimistic-attributes", cl::Hidden, cl::desc("Use optimistic attributes describing " "'as-if' properties of runtime calls."), cl::init(false))
static FunctionCallee getKmpcForStaticLoopForType(Type *Ty, OpenMPIRBuilder *OMPBuilder, WorksharingLoopType LoopType)
static void createTargetLoopWorkshareCall(OpenMPIRBuilder *OMPBuilder, WorksharingLoopType LoopType, BasicBlock *InsertBlock, Value *Ident, Value *LoopBodyArg, Type *ParallelTaskPtr, Value *TripCount, Function &LoopBodyFn)
static const omp::GV & getGridValue(const Triple &T, Function *Kernel)
static void addLoopMetadata(CanonicalLoopInfo *Loop, ArrayRef< Metadata * > Properties)
Attach loop metadata Properties to the loop described by Loop.
static void removeUnusedBlocksFromParent(ArrayRef< BasicBlock * > BBs)
Determine which blocks in BBs are reachable from outside and remove the ones that are not reachable f...
static void emitTargetCall(OpenMPIRBuilder &OMPBuilder, IRBuilderBase &Builder, OpenMPIRBuilder::InsertPointTy AllocaIP, Function *OutlinedFn, Constant *OutlinedFnID, ArrayRef< int32_t > NumTeams, ArrayRef< int32_t > NumThreads, SmallVectorImpl< Value * > &Args, OpenMPIRBuilder::GenMapInfoCallbackTy GenMapInfoCB, SmallVector< llvm::OpenMPIRBuilder::DependData > Dependencies={}, bool HasNoWait=false)
static void targetParallelCallback(OpenMPIRBuilder *OMPIRBuilder, Function &OutlinedFn, Function *OuterFn, BasicBlock *OuterAllocaBB, Value *Ident, Value *IfCondition, Value *NumThreads, Instruction *PrivTID, AllocaInst *PrivTIDAddr, Value *ThreadID, const SmallVector< Instruction *, 4 > &ToBeDeleted)
static Error emitTargetOutlinedFunction(OpenMPIRBuilder &OMPBuilder, IRBuilderBase &Builder, bool IsOffloadEntry, TargetRegionEntryInfo &EntryInfo, Function *&OutlinedFn, Constant *&OutlinedFnID, SmallVectorImpl< Value * > &Inputs, OpenMPIRBuilder::TargetBodyGenCallbackTy &CBFunc, OpenMPIRBuilder::TargetGenArgAccessorsCallbackTy &ArgAccessorFuncCB)
static Expected< Function * > createOutlinedFunction(OpenMPIRBuilder &OMPBuilder, IRBuilderBase &Builder, StringRef FuncName, SmallVectorImpl< Value * > &Inputs, OpenMPIRBuilder::TargetBodyGenCallbackTy &CBFunc, OpenMPIRBuilder::TargetGenArgAccessorsCallbackTy &ArgAccessorFuncCB)
static void hostParallelCallback(OpenMPIRBuilder *OMPIRBuilder, Function &OutlinedFn, Function *OuterFn, Value *Ident, Value *IfCondition, Instruction *PrivTID, AllocaInst *PrivTIDAddr, const SmallVector< Instruction *, 4 > &ToBeDeleted)
#define P(N)
FunctionAnalysisManager FAM
This file defines the Pass Instrumentation classes that provide instrumentation points into the pass ...
const SmallVectorImpl< MachineOperand > & Cond
Basic Register Allocator
Remove Loads Into Fake Uses
static bool isValid(const char C)
Returns true if C is a valid mangled character: <0-9a-zA-Z_>.
assert(ImpDefSCC.getReg()==AMDGPU::SCC &&ImpDefSCC.isDef())
raw_pwrite_stream & OS
This file implements the SmallBitVector class.
This file defines the SmallSet class.
This file contains some functions that are useful when dealing with strings.
static std::optional< unsigned > getOpcode(ArrayRef< VPValue * > Values)
Returns the opcode of Values or ~0 if they do not all agree.
Definition: VPlanSLP.cpp:191
Value * RHS
Value * LHS
static cl::opt< unsigned > MaxThreads("xcore-max-threads", cl::Optional, cl::desc("Maximum number of threads (for emulation thread-local storage)"), cl::Hidden, cl::value_desc("number"), cl::init(8))
static const uint32_t IV[8]
Definition: blake3_impl.h:78
Class for arbitrary precision integers.
Definition: APInt.h:78
This class represents a conversion between pointers from one address space to another.
an instruction to allocate memory on the stack
Definition: Instructions.h:63
Align getAlign() const
Return the alignment of the memory that is being allocated by the instruction.
Definition: Instructions.h:124
PointerType * getType() const
Overload to return most specific pointer type.
Definition: Instructions.h:99
Type * getAllocatedType() const
Return the type that is being allocated by the instruction.
Definition: Instructions.h:117
unsigned getAddressSpace() const
Return the address space for the allocation.
Definition: Instructions.h:104
std::optional< TypeSize > getAllocationSize(const DataLayout &DL) const
Get allocation size in bytes.
void setAlignment(Align Align)
Definition: Instructions.h:128
const Value * getArraySize() const
Get the number of elements allocated.
Definition: Instructions.h:95
A container for analyses that lazily runs them and caches their results.
Definition: PassManager.h:253
bool registerPass(PassBuilderT &&PassBuilder)
Register an analysis pass with the manager.
Definition: PassManager.h:471
This class represents an incoming formal argument to a Function.
Definition: Argument.h:31
ArrayRef - Represent a constant reference to an array (0 or more elements consecutively in memory),...
Definition: ArrayRef.h:41
iterator end() const
Definition: ArrayRef.h:157
size_t size() const
size - Get the array size.
Definition: ArrayRef.h:168
iterator begin() const
Definition: ArrayRef.h:156
bool empty() const
empty - Check if the array is empty.
Definition: ArrayRef.h:163
Class to represent array types.
Definition: DerivedTypes.h:395
static ArrayType * get(Type *ElementType, uint64_t NumElements)
This static method is the primary way to construct an ArrayType.
A function analysis which provides an AssumptionCache.
AssumptionCache run(Function &F, FunctionAnalysisManager &)
A cache of @llvm.assume calls within a function.
An instruction that atomically checks whether a specified value is in a memory location,...
Definition: Instructions.h:501
static AtomicOrdering getStrongestFailureOrdering(AtomicOrdering SuccessOrdering)
Returns the strongest permitted ordering on failure, given the desired ordering on success.
Definition: Instructions.h:652
std::pair< LoadInst *, AllocaInst * > EmitAtomicLoadLibcall(AtomicOrdering AO)
Definition: Atomic.cpp:107
an instruction that atomically reads a memory location, combines it with another value,...
Definition: Instructions.h:704
BinOp
This enumeration lists the possible modifications atomicrmw can make.
Definition: Instructions.h:716
@ Add
*p = old + v
Definition: Instructions.h:720
@ FAdd
*p = old + v
Definition: Instructions.h:741
@ USubCond
Subtract only if no unsigned overflow.
Definition: Instructions.h:764
@ Min
*p = old <signed v ? old : v
Definition: Instructions.h:734
@ Or
*p = old | v
Definition: Instructions.h:728
@ Sub
*p = old - v
Definition: Instructions.h:722
@ And
*p = old & v
Definition: Instructions.h:724
@ Xor
*p = old ^ v
Definition: Instructions.h:730
@ USubSat
*p = usub.sat(old, v) usub.sat matches the behavior of llvm.usub.sat.
Definition: Instructions.h:768
@ FSub
*p = old - v
Definition: Instructions.h:744
@ UIncWrap
Increment one up to a maximum value.
Definition: Instructions.h:756
@ Max
*p = old >signed v ? old : v
Definition: Instructions.h:732
@ UMin
*p = old <unsigned v ? old : v
Definition: Instructions.h:738
@ FMin
*p = minnum(old, v) minnum matches the behavior of llvm.minnum.
Definition: Instructions.h:752
@ UMax
*p = old >unsigned v ? old : v
Definition: Instructions.h:736
@ FMax
*p = maxnum(old, v) maxnum matches the behavior of llvm.maxnum.
Definition: Instructions.h:748
@ UDecWrap
Decrement one until a minimum value or zero.
Definition: Instructions.h:760
@ Nand
*p = ~(old & v)
Definition: Instructions.h:726
AttrBuilder & addAttribute(Attribute::AttrKind Val)
Add an attribute to the builder.
AttrBuilder & removeAttribute(Attribute::AttrKind Val)
Remove an attribute from the builder.
AttributeSet getFnAttrs() const
The function attributes are returned.
AttributeList addFnAttributes(LLVMContext &C, const AttrBuilder &B) const
Add function attribute to the list.
Definition: Attributes.h:595
AttributeSet addAttributes(LLVMContext &C, AttributeSet AS) const
Add attributes to the attribute set.
Definition: Attributes.cpp:920
AttributeSet addAttribute(LLVMContext &C, Attribute::AttrKind Kind) const
Add an argument attribute.
Definition: Attributes.cpp:905
static Attribute get(LLVMContext &Context, AttrKind Kind, uint64_t Val=0)
Return a uniquified Attribute object.
Definition: Attributes.cpp:95
StringRef getValueAsString() const
Return the attribute's value as a string.
Definition: Attributes.cpp:392
LLVM Basic Block Representation.
Definition: BasicBlock.h:61
void replaceSuccessorsPhiUsesWith(BasicBlock *Old, BasicBlock *New)
Update all phi nodes in this basic block's successors to refer to basic block New instead of basic bl...
Definition: BasicBlock.cpp:662
iterator end()
Definition: BasicBlock.h:461
iterator begin()
Instruction iterator methods.
Definition: BasicBlock.h:448
const_iterator getFirstInsertionPt() const
Returns an iterator to the first instruction in this block that is suitable for inserting a non-PHI i...
Definition: BasicBlock.cpp:416
reverse_iterator rbegin()
Definition: BasicBlock.h:464
bool empty() const
Definition: BasicBlock.h:470
const Instruction * getFirstNonPHI() const
Returns a pointer to the first instruction in this block that is not a PHINode instruction.
Definition: BasicBlock.cpp:367
const Instruction & front() const
Definition: BasicBlock.h:471
static BasicBlock * Create(LLVMContext &Context, const Twine &Name="", Function *Parent=nullptr, BasicBlock *InsertBefore=nullptr)
Creates a new BasicBlock.
Definition: BasicBlock.h:212
BasicBlock * splitBasicBlock(iterator I, const Twine &BBName="", bool Before=false)
Split the basic block into two basic blocks at the specified instruction.
Definition: BasicBlock.cpp:577
const BasicBlock * getUniqueSuccessor() const
Return the successor of this block if it has a unique successor.
Definition: BasicBlock.cpp:497
const BasicBlock * getSinglePredecessor() const
Return the predecessor of this block if it has a single predecessor block.
Definition: BasicBlock.cpp:459
InstListType::reverse_iterator reverse_iterator
Definition: BasicBlock.h:179
const BasicBlock * getUniquePredecessor() const
Return the predecessor of this block if it has a unique predecessor block.
Definition: BasicBlock.cpp:467
const BasicBlock * getSingleSuccessor() const
Return the successor of this block if it has a single successor.
Definition: BasicBlock.cpp:489
const Function * getParent() const
Return the enclosing method, or null if none.
Definition: BasicBlock.h:219
SymbolTableList< BasicBlock >::iterator eraseFromParent()
Unlink 'this' from the containing function and delete it.
Definition: BasicBlock.cpp:279
reverse_iterator rend()
Definition: BasicBlock.h:466
const Instruction * getFirstNonPHIOrDbg(bool SkipPseudoOp=true) const
Returns a pointer to the first instruction in this block that is not a PHINode or a debug intrinsic,...
Definition: BasicBlock.cpp:386
InstListType::iterator iterator
Instruction iterators...
Definition: BasicBlock.h:177
LLVMContext & getContext() const
Get the context in which this basic block lives.
Definition: BasicBlock.cpp:168
void moveBefore(BasicBlock *MovePos)
Unlink this basic block from its current function and insert it into the function that MovePos lives ...
Definition: BasicBlock.h:376
const Instruction * getTerminator() const LLVM_READONLY
Returns the terminator instruction if the block is well formed or null if the block is not well forme...
Definition: BasicBlock.h:239
void splice(BasicBlock::iterator ToIt, BasicBlock *FromBB)
Transfer all instructions from FromBB to this basic block at ToIt.
Definition: BasicBlock.h:631
const Instruction & back() const
Definition: BasicBlock.h:473
const Module * getModule() const
Return the module owning the function this basic block belongs to, or nullptr if the function does no...
Definition: BasicBlock.cpp:292
void removePredecessor(BasicBlock *Pred, bool KeepOneInputPHIs=false)
Update PHI nodes in this BasicBlock before removal of predecessor Pred.
Definition: BasicBlock.cpp:516
Conditional or Unconditional Branch instruction.
static BranchInst * Create(BasicBlock *IfTrue, InsertPosition InsertBefore=nullptr)
void setDoesNotThrow()
Definition: InstrTypes.h:1924
Function * getCalledFunction() const
Returns the function called, or null if this is an indirect function invocation or the function signa...
Definition: InstrTypes.h:1349
User::op_iterator arg_begin()
Return the iterator pointing to the beginning of the argument list.
Definition: InstrTypes.h:1269
Value * getArgOperand(unsigned i) const
Definition: InstrTypes.h:1294
User::op_iterator arg_end()
Return the iterator pointing to the end of the argument list.
Definition: InstrTypes.h:1275
unsigned arg_size() const
Definition: InstrTypes.h:1292
This class represents a function call, abstracting a target machine's calling convention.
Class to represented the control flow structure of an OpenMP canonical loop.
Value * getTripCount() const
Returns the llvm::Value containing the number of loop iterations.
BasicBlock * getHeader() const
The header is the entry for each iteration.
void assertOK() const
Consistency self-check.
Type * getIndVarType() const
Return the type of the induction variable (and the trip count).
BasicBlock * getBody() const
The body block is the single entry for a loop iteration and not controlled by CanonicalLoopInfo.
bool isValid() const
Returns whether this object currently represents the IR of a loop.
OpenMPIRBuilder::InsertPointTy getAfterIP() const
Return the insertion point for user code after the loop.
OpenMPIRBuilder::InsertPointTy getBodyIP() const
Return the insertion point for user code in the body.
BasicBlock * getAfter() const
The after block is intended for clean-up code such as lifetime end markers.
Function * getFunction() const
void invalidate()
Invalidate this loop.
BasicBlock * getLatch() const
Reaching the latch indicates the end of the loop body code.
OpenMPIRBuilder::InsertPointTy getPreheaderIP() const
Return the insertion point for user code before the loop.
BasicBlock * getCond() const
The condition block computes whether there is another loop iteration.
BasicBlock * getExit() const
Reaching the exit indicates no more iterations are being executed.
BasicBlock * getPreheader() const
The preheader ensures that there is only a single edge entering the loop.
Instruction * getIndVar() const
Returns the instruction representing the current logical induction variable.
Predicate
This enumeration lists the possible predicates for CmpInst subclasses.
Definition: InstrTypes.h:673
@ ICMP_SLT
signed less than
Definition: InstrTypes.h:702
@ ICMP_SLE
signed less or equal
Definition: InstrTypes.h:703
@ FCMP_OLT
0 1 0 0 True if ordered and less than
Definition: InstrTypes.h:679
@ FCMP_OGT
0 0 1 0 True if ordered and greater than
Definition: InstrTypes.h:677
@ ICMP_UGT
unsigned greater than
Definition: InstrTypes.h:696
@ ICMP_SGT
signed greater than
Definition: InstrTypes.h:700
@ ICMP_ULT
unsigned less than
Definition: InstrTypes.h:698
@ ICMP_NE
not equal
Definition: InstrTypes.h:695
@ ICMP_ULE
unsigned less or equal
Definition: InstrTypes.h:699
A cache for the CodeExtractor analysis.
Definition: CodeExtractor.h:46
Utility class for extracting code into a new function.
Definition: CodeExtractor.h:85
void findAllocas(const CodeExtractorAnalysisCache &CEAC, ValueSet &SinkCands, ValueSet &HoistCands, BasicBlock *&ExitBlock) const
Find the set of allocas whose life ranges are contained within the outlined region.
Function * extractCodeRegion(const CodeExtractorAnalysisCache &CEAC)
Perform the extraction, returning the new function.
void findInputsOutputs(ValueSet &Inputs, ValueSet &Outputs, const ValueSet &Allocas, bool CollectGlobalInputs=false) const
Compute the set of input values and output values for the code.
bool isEligible() const
Test whether this code extractor is eligible.
void excludeArgFromAggregate(Value *Arg)
Exclude a value from aggregate argument passing when extracting a code region, passing it instead as ...
static Constant * get(ArrayType *T, ArrayRef< Constant * > V)
Definition: Constants.cpp:1312
static ConstantAsMetadata * get(Constant *C)
Definition: Metadata.h:528
static Constant * getString(LLVMContext &Context, StringRef Initializer, bool AddNull=true)
This method constructs a CDS and initializes it with a text string.
Definition: Constants.cpp:2990
static Constant * get(LLVMContext &Context, ArrayRef< ElementTy > Elts)
get() constructor - Return a constant with array type with an element count and element type matching...
Definition: Constants.h:709
static Constant * getPointerCast(Constant *C, Type *Ty)
Create a BitCast, AddrSpaceCast, or a PtrToInt cast constant expression.
Definition: Constants.cpp:2253
static Constant * getPointerBitCastOrAddrSpaceCast(Constant *C, Type *Ty)
Create a BitCast or AddrSpaceCast for a pointer type depending on the address space.
Definition: Constants.cpp:2268
static Constant * getAddrSpaceCast(Constant *C, Type *Ty, bool OnlyIfReduced=false)
Definition: Constants.cpp:2333
This is the shared class of boolean and integer constants.
Definition: Constants.h:83
static ConstantInt * getTrue(LLVMContext &Context)
Definition: Constants.cpp:866
static ConstantInt * getSigned(IntegerType *Ty, int64_t V)
Return a ConstantInt with the specified value for the specified type.
Definition: Constants.h:126
static ConstantInt * getFalse(LLVMContext &Context)
Definition: Constants.cpp:873
static ConstantPointerNull * get(PointerType *T)
Static factory methods - Return objects of the specified value.
Definition: Constants.cpp:1826
static Constant * get(StructType *T, ArrayRef< Constant * > V)
Definition: Constants.cpp:1378
This is an important base class in LLVM.
Definition: Constant.h:42
static Constant * getAllOnesValue(Type *Ty)
Definition: Constants.cpp:420
static Constant * getNullValue(Type *Ty)
Constructor to create a '0' constant of arbitrary type.
Definition: Constants.cpp:373
DISubprogram * getSubprogram() const
Get the subprogram for this scope.
Debug location.
Subprogram description.
DISPFlags
Debug info subprogram flags.
Type array for a subprogram.
This class represents an Operation in the Expression.
A parsed version of the target data layout string in and methods for querying it.
Definition: DataLayout.h:63
unsigned getDefaultGlobalsAddressSpace() const
Definition: DataLayout.h:247
Align getABIIntegerTypeAlignment(unsigned BitWidth) const
Returns the minimum ABI-required alignment for an integer type of the specified bitwidth.
Definition: DataLayout.h:486
unsigned getAllocaAddrSpace() const
Definition: DataLayout.h:229
TypeSize getTypeAllocSize(Type *Ty) const
Returns the offset in bytes between successive objects of the specified type, including alignment pad...
Definition: DataLayout.h:457
unsigned getPointerSize(unsigned AS=0) const
Layout pointer size in bytes, rounded up to a whole number of bytes.
Definition: DataLayout.cpp:739
unsigned getIndexSizeInBits(unsigned AS) const
Size in bits of index used for address calculation in getelementptr.
Definition: DataLayout.h:369
TypeSize getTypeSizeInBits(Type *Ty) const
Size examples:
Definition: DataLayout.h:617
TypeSize getTypeStoreSize(Type *Ty) const
Returns the maximum number of bytes that may be overwritten by storing the specified type.
Definition: DataLayout.h:421
Align getPrefTypeAlign(Type *Ty) const
Returns the preferred stack/global alignment for the specified type.
Definition: DataLayout.cpp:847
A debug info location.
Definition: DebugLoc.h:33
Analysis pass which computes a DominatorTree.
Definition: Dominators.h:279
DominatorTree run(Function &F, FunctionAnalysisManager &)
Run the analysis pass over a function and produce a dominator tree.
Definition: Dominators.cpp:371
Concrete subclass of DominatorTreeBase that is used to compute a normal dominator tree.
Definition: Dominators.h:162
Lightweight error class with error context and mandatory checking.
Definition: Error.h:160
static ErrorSuccess success()
Create a success value.
Definition: Error.h:337
Tagged union holding either a T or a Error.
Definition: Error.h:481
Error takeError()
Take ownership of the stored error.
Definition: Error.h:608
A handy container for a FunctionType+Callee-pointer pair, which can be passed around as a single enti...
Definition: DerivedTypes.h:170
Class to represent function types.
Definition: DerivedTypes.h:105
static FunctionType * get(Type *Result, ArrayRef< Type * > Params, bool isVarArg)
This static method is the primary way of constructing a FunctionType.
void addFnAttr(Attribute::AttrKind Kind)
Add function attributes to this function.
Definition: Function.cpp:641
static Function * Create(FunctionType *Ty, LinkageTypes Linkage, unsigned AddrSpace, const Twine &N="", Module *M=nullptr)
Definition: Function.h:173
const BasicBlock & getEntryBlock() const
Definition: Function.h:809
bool empty() const
Definition: Function.h:859
FunctionType * getFunctionType() const
Returns the FunctionType for me.
Definition: Function.h:216
void removeFromParent()
removeFromParent - This method unlinks 'this' from the containing module, but does not delete it.
Definition: Function.cpp:454
const DataLayout & getDataLayout() const
Get the data layout of the module this function belongs to.
Definition: Function.cpp:373
Attribute getFnAttribute(Attribute::AttrKind Kind) const
Return the attribute for the given attribute kind.
Definition: Function.cpp:766
uint64_t getFnAttributeAsParsedInteger(StringRef Kind, uint64_t Default=0) const
For a string attribute Kind, parse attribute as an integer.
Definition: Function.cpp:778
AttributeList getAttributes() const
Return the attribute list for this Function.
Definition: Function.h:353
const Function & getFunction() const
Definition: Function.h:171
iterator begin()
Definition: Function.h:853
arg_iterator arg_begin()
Definition: Function.h:868
void setAttributes(AttributeList Attrs)
Set the attribute list for this Function.
Definition: Function.h:356
LLVMContext & getContext() const
getContext - Return a reference to the LLVMContext associated with this function.
Definition: Function.cpp:369
void addParamAttr(unsigned ArgNo, Attribute::AttrKind Kind)
adds the attribute to the list of attributes for the given arg.
Definition: Function.cpp:669
Function::iterator insert(Function::iterator Position, BasicBlock *BB)
Insert BB in the basic block list at Position.
Definition: Function.h:754
size_t arg_size() const
Definition: Function.h:901
Type * getReturnType() const
Returns the type of the ret val.
Definition: Function.h:221
iterator end()
Definition: Function.h:855
void setCallingConv(CallingConv::ID CC)
Definition: Function.h:281
Argument * getArg(unsigned i) const
Definition: Function.h:886
bool hasMetadata() const
Return true if this value has any metadata attached to it.
Definition: Value.h:589
void addMetadata(unsigned KindID, MDNode &MD)
Add a metadata attachment.
Definition: Metadata.cpp:1565
LinkageTypes getLinkage() const
Definition: GlobalValue.h:546
void setLinkage(LinkageTypes LT)
Definition: GlobalValue.h:537
Module * getParent()
Get the module that this global value is contained inside of...
Definition: GlobalValue.h:656
void setDSOLocal(bool Local)
Definition: GlobalValue.h:303
PointerType * getType() const
Global values are always pointers.
Definition: GlobalValue.h:294
@ HiddenVisibility
The GV is hidden.
Definition: GlobalValue.h:68
@ ProtectedVisibility
The GV is protected.
Definition: GlobalValue.h:69
void setVisibility(VisibilityTypes V)
Definition: GlobalValue.h:254
LinkageTypes
An enumeration for the kinds of linkage for global values.
Definition: GlobalValue.h:51
@ PrivateLinkage
Like Internal, but omit from symbol table.
Definition: GlobalValue.h:60
@ CommonLinkage
Tentative definitions.
Definition: GlobalValue.h:62
@ InternalLinkage
Rename collisions when linking (static functions).
Definition: GlobalValue.h:59
@ WeakODRLinkage
Same, but only replaced by something equivalent.
Definition: GlobalValue.h:57
@ WeakAnyLinkage
Keep one copy of named function when linking (weak)
Definition: GlobalValue.h:56
@ AppendingLinkage
Special purpose, only applies to global arrays.
Definition: GlobalValue.h:58
@ LinkOnceODRLinkage
Same, but only replaced by something equivalent.
Definition: GlobalValue.h:55
Type * getValueType() const
Definition: GlobalValue.h:296
InsertPoint - A saved insertion point.
Definition: IRBuilder.h:271
BasicBlock * getBlock() const
Definition: IRBuilder.h:286
bool isSet() const
Returns true if this insert point is set.
Definition: IRBuilder.h:284
BasicBlock::iterator getPoint() const
Definition: IRBuilder.h:287
Common base class shared among various IRBuilders.
Definition: IRBuilder.h:108
Value * CreateNUWMul(Value *LHS, Value *RHS, const Twine &Name="")
Definition: IRBuilder.h:1411
Value * CreatePtrDiff(Type *ElemTy, Value *LHS, Value *RHS, const Twine &Name="")
Return the i64 difference between two pointer values, dividing out the size of the pointed-to objects...
Definition: IRBuilder.cpp:1065
Value * CreateICmpULT(Value *LHS, Value *RHS, const Twine &Name="")
Definition: IRBuilder.h:2280
AtomicCmpXchgInst * CreateAtomicCmpXchg(Value *Ptr, Value *Cmp, Value *New, MaybeAlign Align, AtomicOrdering SuccessOrdering, AtomicOrdering FailureOrdering, SyncScope::ID SSID=SyncScope::System)
Definition: IRBuilder.h:1843
Value * CreateConstGEP1_32(Type *Ty, Value *Ptr, unsigned Idx0, const Twine &Name="")
Definition: IRBuilder.h:1881
AllocaInst * CreateAlloca(Type *Ty, unsigned AddrSpace, Value *ArraySize=nullptr, const Twine &Name="")
Definition: IRBuilder.h:1775
Value * CreateInsertValue(Value *Agg, Value *Val, ArrayRef< unsigned > Idxs, const Twine &Name="")
Definition: IRBuilder.h:2556
IntegerType * getIntNTy(unsigned N)
Fetch the type representing an N-bit integer.
Definition: IRBuilder.h:553
Value * CreateICmpSGT(Value *LHS, Value *RHS, const Twine &Name="")
Definition: IRBuilder.h:2288
LoadInst * CreateAlignedLoad(Type *Ty, Value *Ptr, MaybeAlign Align, const char *Name)
Definition: IRBuilder.h:1809
Value * CreateZExtOrTrunc(Value *V, Type *DestTy, const Twine &Name="")
Create a ZExt or Trunc from the integer value V to DestTy.
Definition: IRBuilder.h:2045
UnreachableInst * CreateUnreachable()
Definition: IRBuilder.h:1300
Value * CreatePointerCast(Value *V, Type *DestTy, const Twine &Name="")
Definition: IRBuilder.h:2193
Value * CreateExtractValue(Value *Agg, ArrayRef< unsigned > Idxs, const Twine &Name="")
Definition: IRBuilder.h:2549
CallInst * CreateAlignmentAssumption(const DataLayout &DL, Value *PtrValue, unsigned Alignment, Value *OffsetValue=nullptr)
Create an assume intrinsic call that represents an alignment assumption on the provided pointer.
Definition: IRBuilder.cpp:1255
Value * CreateSelect(Value *C, Value *True, Value *False, const Twine &Name="", Instruction *MDFrom=nullptr)
Definition: IRBuilder.cpp:1043
BasicBlock::iterator GetInsertPoint() const
Definition: IRBuilder.h:189
Value * CreateStructGEP(Type *Ty, Value *Ptr, unsigned Idx, const Twine &Name="")
Definition: IRBuilder.h:1974
IntegerType * getIndexTy(const DataLayout &DL, unsigned AddrSpace)
Fetch the type of an integer that should be used to index GEP operations within AddressSpace.
Definition: IRBuilder.h:595
Value * CreateSExt(Value *V, Type *DestTy, const Twine &Name="")
Definition: IRBuilder.h:2039
Value * CreateIntToPtr(Value *V, Type *DestTy, const Twine &Name="")
Definition: IRBuilder.h:2141
IntegerType * getInt32Ty()
Fetch the type representing a 32-bit integer.
Definition: IRBuilder.h:540
Value * CreateNSWAdd(Value *LHS, Value *RHS, const Twine &Name="")
Definition: IRBuilder.h:1373
BasicBlock * GetInsertBlock() const
Definition: IRBuilder.h:188
void SetCurrentDebugLocation(DebugLoc L)
Set location information used by debugging information.
Definition: IRBuilder.h:234
IntegerType * getInt64Ty()
Fetch the type representing a 64-bit integer.
Definition: IRBuilder.h:545
Value * CreateInBoundsGEP(Type *Ty, Value *Ptr, ArrayRef< Value * > IdxList, const Twine &Name="")
Definition: IRBuilder.h:1876
Value * CreatePointerBitCastOrAddrSpaceCast(Value *V, Type *DestTy, const Twine &Name="")
Definition: IRBuilder.h:2205
Value * CreateUDiv(Value *LHS, Value *RHS, const Twine &Name="", bool isExact=false)
Definition: IRBuilder.h:1415
Value * CreateICmpNE(Value *LHS, Value *RHS, const Twine &Name="")
Definition: IRBuilder.h:2268
Value * CreateNUWAdd(Value *LHS, Value *RHS, const Twine &Name="")
Definition: IRBuilder.h:1377
IntegerType * getInt16Ty()
Fetch the type representing a 16-bit integer.
Definition: IRBuilder.h:535
Value * CreateGEP(Type *Ty, Value *Ptr, ArrayRef< Value * > IdxList, const Twine &Name="", GEPNoWrapFlags NW=GEPNoWrapFlags::none())
Definition: IRBuilder.h:1868
ConstantInt * getInt64(uint64_t C)
Get a constant 64-bit value.
Definition: IRBuilder.h:505
Value * CreateNeg(Value *V, const Twine &Name="", bool HasNSW=false)
Definition: IRBuilder.h:1727
InsertPoint saveIP() const
Returns the current insert point.
Definition: IRBuilder.h:291
ConstantInt * getInt32(uint32_t C)
Get a constant 32-bit value.
Definition: IRBuilder.h:500
Value * CreateCmp(CmpInst::Predicate Pred, Value *LHS, Value *RHS, const Twine &Name="", MDNode *FPMathTag=nullptr)
Definition: IRBuilder.h:2398
PHINode * CreatePHI(Type *Ty, unsigned NumReservedValues, const Twine &Name="")
Definition: IRBuilder.h:2429
SwitchInst * CreateSwitch(Value *V, BasicBlock *Dest, unsigned NumCases=10, MDNode *BranchWeights=nullptr, MDNode *Unpredictable=nullptr)
Create a switch instruction with the specified value, default dest, and with a hint for the number of...
Definition: IRBuilder.h:1181
Value * CreateICmpEQ(Value *LHS, Value *RHS, const Twine &Name="")
Definition: IRBuilder.h:2264
InstTy * Insert(InstTy *I, const Twine &Name="") const
Insert and return the specified instruction.
Definition: IRBuilder.h:159
DebugLoc getCurrentDebugLocation() const
Get location information used by debugging information.
Definition: IRBuilder.cpp:64
Value * CreateSub(Value *LHS, Value *RHS, const Twine &Name="", bool HasNUW=false, bool HasNSW=false)
Definition: IRBuilder.h:1381
Value * CreateBitCast(Value *V, Type *DestTy, const Twine &Name="")
Definition: IRBuilder.h:2146
ConstantInt * getIntN(unsigned N, uint64_t C)
Get a constant N-bit value, zero extended or truncated from a 64-bit value.
Definition: IRBuilder.h:511
BranchInst * CreateCondBr(Value *Cond, BasicBlock *True, BasicBlock *False, MDNode *BranchWeights=nullptr, MDNode *Unpredictable=nullptr)
Create a conditional 'br Cond, TrueDest, FalseDest' instruction.
Definition: IRBuilder.h:1158
LoadInst * CreateLoad(Type *Ty, Value *Ptr, const char *Name)
Provided to resolve 'CreateLoad(Ty, Ptr, "...")' correctly, instead of converting the string to 'bool...
Definition: IRBuilder.h:1792
Value * CreateShl(Value *LHS, Value *RHS, const Twine &Name="", bool HasNUW=false, bool HasNSW=false)
Definition: IRBuilder.h:1453
Value * CreateZExt(Value *V, Type *DestTy, const Twine &Name="", bool IsNonNeg=false)
Definition: IRBuilder.h:2027
LLVMContext & getContext() const
Definition: IRBuilder.h:190
Value * CreateAnd(Value *LHS, Value *RHS, const Twine &Name="")
Definition: IRBuilder.h:1512
ReturnInst * CreateRetVoid()
Create a 'ret void' instruction.
Definition: IRBuilder.h:1128
Value * CreateConstInBoundsGEP2_32(Type *Ty, Value *Ptr, unsigned Idx0, unsigned Idx1, const Twine &Name="")
Definition: IRBuilder.h:1915
Value * CreateConstInBoundsGEP2_64(Type *Ty, Value *Ptr, uint64_t Idx0, uint64_t Idx1, const Twine &Name="")
Definition: IRBuilder.h:1961
StoreInst * CreateStore(Value *Val, Value *Ptr, bool isVolatile=false)
Definition: IRBuilder.h:1805
Value * CreateAdd(Value *LHS, Value *RHS, const Twine &Name="", bool HasNUW=false, bool HasNSW=false)
Definition: IRBuilder.h:1364
Value * CreatePtrToInt(Value *V, Type *DestTy, const Twine &Name="")
Definition: IRBuilder.h:2136
Value * CreateExactUDiv(Value *LHS, Value *RHS, const Twine &Name="")
Definition: IRBuilder.h:1424
Value * CreateIsNotNull(Value *Arg, const Twine &Name="")
Return a boolean value testing if Arg != 0.
Definition: IRBuilder.h:2582
CallInst * CreateCall(FunctionType *FTy, Value *Callee, ArrayRef< Value * > Args={}, const Twine &Name="", MDNode *FPMathTag=nullptr)
Definition: IRBuilder.h:2443
AtomicRMWInst * CreateAtomicRMW(AtomicRMWInst::BinOp Op, Value *Ptr, Value *Val, MaybeAlign Align, AtomicOrdering Ordering, SyncScope::ID SSID=SyncScope::System)
Definition: IRBuilder.h:1856
Value * CreateTrunc(Value *V, Type *DestTy, const Twine &Name="", bool IsNUW=false, bool IsNSW=false)
Definition: IRBuilder.h:2013
Value * CreateOr(Value *LHS, Value *RHS, const Twine &Name="")
Definition: IRBuilder.h:1534
PointerType * getPtrTy(unsigned AddrSpace=0)
Fetch the type representing a pointer.
Definition: IRBuilder.h:583
BranchInst * CreateBr(BasicBlock *Dest)
Create an unconditional 'br label X' instruction.
Definition: IRBuilder.h:1152
void ClearInsertionPoint()
Clear the insertion point: created instructions will not be inserted into a block.
Definition: IRBuilder.h:183
Value * CreateICmpSLT(Value *LHS, Value *RHS, const Twine &Name="")
Definition: IRBuilder.h:2296
ConstantInt * getInt16(uint16_t C)
Get a constant 16-bit value.
Definition: IRBuilder.h:495
Value * CreateICmpUGE(Value *LHS, Value *RHS, const Twine &Name="")
Definition: IRBuilder.h:2276
Value * CreateIntCast(Value *V, Type *DestTy, bool isSigned, const Twine &Name="")
Definition: IRBuilder.h:2219
void restoreIP(InsertPoint IP)
Sets the current insert point to a previously-saved location.
Definition: IRBuilder.h:303
Value * CreateIsNull(Value *Arg, const Twine &Name="")
Return a boolean value testing if Arg == 0.
Definition: IRBuilder.h:2577
void SetInsertPoint(BasicBlock *TheBB)
This specifies that created instructions should be appended to the end of the specified block.
Definition: IRBuilder.h:194
Type * getVoidTy()
Fetch the type representing void.
Definition: IRBuilder.h:578
StoreInst * CreateAlignedStore(Value *Val, Value *Ptr, MaybeAlign Align, bool isVolatile=false)
Definition: IRBuilder.h:1828
Value * CreateAShr(Value *LHS, Value *RHS, const Twine &Name="", bool isExact=false)
Definition: IRBuilder.h:1493
Value * CreateXor(Value *LHS, Value *RHS, const Twine &Name="")
Definition: IRBuilder.h:1556
Value * CreateICmp(CmpInst::Predicate P, Value *LHS, Value *RHS, const Twine &Name="")
Definition: IRBuilder.h:2374
IntegerType * getInt8Ty()
Fetch the type representing an 8-bit integer.
Definition: IRBuilder.h:530
Value * CreateURem(Value *LHS, Value *RHS, const Twine &Name="")
Definition: IRBuilder.h:1441
CallInst * CreateMemCpy(Value *Dst, MaybeAlign DstAlign, Value *Src, MaybeAlign SrcAlign, uint64_t Size, bool isVolatile=false, MDNode *TBAATag=nullptr, MDNode *TBAAStructTag=nullptr, MDNode *ScopeTag=nullptr, MDNode *NoAliasTag=nullptr)
Create and insert a memcpy between the specified pointers.
Definition: IRBuilder.h:672
Value * CreateSExtOrTrunc(Value *V, Type *DestTy, const Twine &Name="")
Create a SExt or Trunc from the integer value V to DestTy.
Definition: IRBuilder.h:2060
Value * CreateAddrSpaceCast(Value *V, Type *DestTy, const Twine &Name="")
Definition: IRBuilder.h:2151
Value * CreateMul(Value *LHS, Value *RHS, const Twine &Name="", bool HasNUW=false, bool HasNSW=false)
Definition: IRBuilder.h:1398
GlobalVariable * CreateGlobalString(StringRef Str, const Twine &Name="", unsigned AddressSpace=0, Module *M=nullptr, bool AddNull=true)
Make a new global variable with initializer type i8*.
Definition: IRBuilder.cpp:44
This provides a uniform API for creating instructions and inserting them into a basic block: either a...
Definition: IRBuilder.h:2699
void removeFromParent()
This method unlinks 'this' from the containing basic block, but does not delete it.
Definition: Instruction.cpp:80
void moveBeforePreserving(Instruction *MovePos)
Perform a moveBefore operation, while signalling that the caller intends to preserve the original ord...
unsigned getNumSuccessors() const LLVM_READONLY
Return the number of successors that this instruction has.
const DebugLoc & getDebugLoc() const
Return the debug location for this node as a DebugLoc.
Definition: Instruction.h:475
const Module * getModule() const
Return the module owning the function this instruction belongs to or nullptr it the function does not...
Definition: Instruction.cpp:68
InstListType::iterator eraseFromParent()
This method unlinks 'this' from the containing basic block and deletes it.
Definition: Instruction.cpp:94
static bool classof(const Value *V)
Methods for support type inquiry through isa, cast, and dyn_cast:
Definition: Instruction.h:951
MDNode * getMetadata(unsigned KindID) const
Get the metadata of given kind attached to this Instruction.
Definition: Instruction.h:390
BasicBlock * getSuccessor(unsigned Idx) const LLVM_READONLY
Return the specified successor. This instruction must be a terminator.
void setMetadata(unsigned KindID, MDNode *Node)
Set the metadata of the specified kind to the specified node.
Definition: Metadata.cpp:1679
void setDebugLoc(DebugLoc Loc)
Set the debug location information for this instruction.
Definition: Instruction.h:472
void insertAfter(Instruction *InsertPos)
Insert an unlinked instruction into a basic block immediately after the specified instruction.
Class to represent integer types.
Definition: DerivedTypes.h:42
static IntegerType * get(LLVMContext &C, unsigned NumBits)
This static method is the primary way of constructing an IntegerType.
Definition: Type.cpp:311
This is an important class for using LLVM in a threaded context.
Definition: LLVMContext.h:67
An instruction for reading from memory.
Definition: Instructions.h:176
Value * getPointerOperand()
Definition: Instructions.h:255
void setAtomic(AtomicOrdering Ordering, SyncScope::ID SSID=SyncScope::System)
Sets the ordering constraint and the synchronization scope ID of this load instruction.
Definition: Instructions.h:241
Align getAlign() const
Return the alignment of the access that is being performed.
Definition: Instructions.h:211
Analysis pass that exposes the LoopInfo for a function.
Definition: LoopInfo.h:566
LoopInfo run(Function &F, FunctionAnalysisManager &AM)
Definition: LoopInfo.cpp:969
LoopT * getLoopFor(const BlockT *BB) const
Return the inner most loop that BB lives in.
This class represents a loop nest and can be used to query its properties.
Represents a single loop in the control flow graph.
Definition: LoopInfo.h:39
MDNode * createCallbackEncoding(unsigned CalleeArgNo, ArrayRef< int > Arguments, bool VarArgsArePassed)
Return metadata describing a callback (see llvm::AbstractCallSite).
Definition: MDBuilder.cpp:118
Metadata node.
Definition: Metadata.h:1069
void replaceOperandWith(unsigned I, Metadata *New)
Replace a specific operand.
Definition: Metadata.cpp:1077
static MDTuple * getDistinct(LLVMContext &Context, ArrayRef< Metadata * > MDs)
Definition: Metadata.h:1553
const MDOperand & getOperand(unsigned I) const
Definition: Metadata.h:1430
ArrayRef< MDOperand > operands() const
Definition: Metadata.h:1428
static MDTuple * get(LLVMContext &Context, ArrayRef< Metadata * > MDs)
Definition: Metadata.h:1545
static MDString * get(LLVMContext &Context, StringRef Str)
Definition: Metadata.cpp:606
This class implements a map that also provides access to all stored values in a deterministic order.
Definition: MapVector.h:36
size_type size() const
Definition: MapVector.h:60
static ErrorOr< std::unique_ptr< MemoryBuffer > > getFile(const Twine &Filename, bool IsText=false, bool RequiresNullTerminator=true, bool IsVolatile=false, std::optional< Align > Alignment=std::nullopt)
Open the specified file as a MemoryBuffer, returning a new MemoryBuffer if successful,...
Root of the metadata hierarchy.
Definition: Metadata.h:62
A Module instance is used to store all the information related to an LLVM module.
Definition: Module.h:65
NamedMDNode * getNamedMetadata(StringRef Name) const
Return the first NamedMDNode in the module with the specified name.
Definition: Module.cpp:297
LLVMContext & getContext() const
Get the global data context.
Definition: Module.h:302
Function * getFunction(StringRef Name) const
Look up the specified function in the module symbol table.
Definition: Module.cpp:228
StringRef getName() const
Get a short "name" for the module.
Definition: Module.h:285
const std::string & getTargetTriple() const
Get the target triple which is a string describing the target host.
Definition: Module.h:298
iterator_range< global_iterator > globals()
Definition: Module.h:702
const FunctionListType & getFunctionList() const
Get the Module's list of functions (constant).
Definition: Module.h:614
GlobalVariable * getGlobalVariable(StringRef Name) const
Look up the specified global variable in the module symbol table.
Definition: Module.h:447
GlobalValue * getNamedValue(StringRef Name) const
Return the global value in the module with the specified name, of arbitrary type.
Definition: Module.cpp:170
NamedMDNode * getOrInsertNamedMetadata(StringRef Name)
Return the named MDNode in the module with the specified name.
Definition: Module.cpp:304
const GlobalVariable * getNamedGlobal(StringRef Name) const
Return the global variable in the module with the specified name, of arbitrary type.
Definition: Module.h:462
const DataLayout & getDataLayout() const
Get the data layout for the module's target platform.
Definition: Module.h:294
A tuple of MDNodes.
Definition: Metadata.h:1733
iterator_range< op_iterator > operands()
Definition: Metadata.h:1829
void addOperand(MDNode *M)
Definition: Metadata.cpp:1431
@ OffloadingEntryInfoTargetRegion
Entry is a target region.
Definition: OMPIRBuilder.h:244
@ OffloadingEntryInfoDeviceGlobalVar
Entry is a declare target variable.
Definition: OMPIRBuilder.h:246
OMPTargetDeviceClauseKind
Kind of device clause for declare target variables and functions NOTE: Currently not used as a part o...
Definition: OMPIRBuilder.h:377
@ OMPTargetDeviceClauseAny
The target is marked for all devices.
Definition: OMPIRBuilder.h:379
void registerDeviceGlobalVarEntryInfo(StringRef VarName, Constant *Addr, int64_t VarSize, OMPTargetGlobalVarEntryKind Flags, GlobalValue::LinkageTypes Linkage)
Register device global variable entry.
void initializeDeviceGlobalVarEntryInfo(StringRef Name, OMPTargetGlobalVarEntryKind Flags, unsigned Order)
Initialize device global variable entry.
void actOnDeviceGlobalVarEntriesInfo(const OffloadDeviceGlobalVarEntryInfoActTy &Action)
OMPTargetRegionEntryKind
Kind of the target registry entry.
Definition: OMPIRBuilder.h:297
@ OMPTargetRegionEntryTargetRegion
Mark the entry as target region.
Definition: OMPIRBuilder.h:299
void getTargetRegionEntryFnName(SmallVectorImpl< char > &Name, const TargetRegionEntryInfo &EntryInfo)
bool hasTargetRegionEntryInfo(TargetRegionEntryInfo EntryInfo, bool IgnoreAddressId=false) const
Return true if a target region entry with the provided information exists.
void registerTargetRegionEntryInfo(TargetRegionEntryInfo EntryInfo, Constant *Addr, Constant *ID, OMPTargetRegionEntryKind Flags)
Register target region entry.
void actOnTargetRegionEntriesInfo(const OffloadTargetRegionEntryInfoActTy &Action)
unsigned size() const
Return number of entries defined so far.
Definition: OMPIRBuilder.h:288
void initializeTargetRegionEntryInfo(const TargetRegionEntryInfo &EntryInfo, unsigned Order)
Initialize target region entry.
OMPTargetGlobalVarEntryKind
Kind of the global variable entry..
Definition: OMPIRBuilder.h:357
@ OMPTargetGlobalVarEntryEnter
Mark the entry as a declare target enter.
Definition: OMPIRBuilder.h:363
@ OMPTargetGlobalRegisterRequires
Mark the entry as a register requires global.
Definition: OMPIRBuilder.h:369
@ OMPTargetGlobalVarEntryIndirect
Mark the entry as a declare target indirect global.
Definition: OMPIRBuilder.h:367
@ OMPTargetGlobalVarEntryLink
Mark the entry as a to declare target link.
Definition: OMPIRBuilder.h:361
@ OMPTargetGlobalVarEntryTo
Mark the entry as a to declare target.
Definition: OMPIRBuilder.h:359
bool hasDeviceGlobalVarEntryInfo(StringRef VarName) const
Checks if the variable with the given name has been registered already.
Definition: OMPIRBuilder.h:433
bool empty() const
Return true if a there are no entries defined.
std::optional< bool > IsTargetDevice
Flag to define whether to generate code for the role of the OpenMP host (if set to false) or device (...
Definition: OMPIRBuilder.h:93
void setGridValue(omp::GV G)
Definition: OMPIRBuilder.h:189
StringRef separator() const
Definition: OMPIRBuilder.h:175
int64_t getRequiresFlags() const
Returns requires directive clauses as flags compatible with those expected by libomptarget.
StringRef firstSeparator() const
Definition: OMPIRBuilder.h:165
std::optional< bool > EmitLLVMUsedMetaInfo
Flag for specifying if LLVMUsed information should be emitted.
Definition: OMPIRBuilder.h:106
omp::GV getGridValue() const
Definition: OMPIRBuilder.h:148
void setHasRequiresReverseOffload(bool Value)
bool hasRequiresUnifiedSharedMemory() const
void setHasRequiresUnifiedSharedMemory(bool Value)
bool hasRequiresDynamicAllocators() const
bool openMPOffloadMandatory() const
Definition: OMPIRBuilder.h:142
void setHasRequiresUnifiedAddress(bool Value)
void setHasRequiresDynamicAllocators(bool Value)
void setEmitLLVMUsed(bool Value=true)
Definition: OMPIRBuilder.h:185
bool hasRequiresReverseOffload() const
bool hasRequiresUnifiedAddress() const
Struct that keeps the information that should be kept throughout a 'target data' region.
An interface to create LLVM-IR for OpenMP directives.
Definition: OMPIRBuilder.h:474
InsertPointOrErrorTy createOrderedThreadsSimd(const LocationDescription &Loc, BodyGenCallbackTy BodyGenCB, FinalizeCallbackTy FiniCB, bool IsThreads)
Generator for '#omp ordered [threads | simd]'.
Constant * getOrCreateIdent(Constant *SrcLocStr, uint32_t SrcLocStrSize, omp::IdentFlag Flags=omp::IdentFlag(0), unsigned Reserve2Flags=0)
Return an ident_t* encoding the source location SrcLocStr and Flags.
FunctionCallee getOrCreateRuntimeFunction(Module &M, omp::RuntimeFunction FnID)
Return the function declaration for the runtime function with FnID.
InsertPointOrErrorTy createCancel(const LocationDescription &Loc, Value *IfCondition, omp::Directive CanceledDirective)
Generator for '#omp cancel'.
InsertPointTy createTargetInit(const LocationDescription &Loc, bool IsSPMD, int32_t MinThreadsVal=0, int32_t MaxThreadsVal=0, int32_t MinTeamsVal=0, int32_t MaxTeamsVal=0)
The omp target interface.
ReductionGenCBKind
Enum class for the RedctionGen CallBack type to be used.
CanonicalLoopInfo * collapseLoops(DebugLoc DL, ArrayRef< CanonicalLoopInfo * > Loops, InsertPointTy ComputeIP)
Collapse a loop nest into a single loop.
void createTaskyield(const LocationDescription &Loc)
Generator for '#omp taskyield'.
std::function< Error(InsertPointTy CodeGenIP)> FinalizeCallbackTy
Callback type for variable finalization (think destructors).
Definition: OMPIRBuilder.h:543
void emitBranch(BasicBlock *Target)
InsertPointTy createAtomicWrite(const LocationDescription &Loc, AtomicOpValue &X, Value *Expr, AtomicOrdering AO)
Emit atomic write for : X = Expr — Only Scalar data types.
static void writeThreadBoundsForKernel(const Triple &T, Function &Kernel, int32_t LB, int32_t UB)
static TargetRegionEntryInfo getTargetEntryUniqueInfo(FileIdentifierInfoCallbackTy CallBack, StringRef ParentName="")
Creates a unique info for a target entry when provided a filename and line number from.
void emitTaskwaitImpl(const LocationDescription &Loc)
Generate a taskwait runtime call.
Constant * registerTargetRegionFunction(TargetRegionEntryInfo &EntryInfo, Function *OutlinedFunction, StringRef EntryFnName, StringRef EntryFnIDName)
Registers the given function and sets up the attribtues of the function Returns the FunctionID.
void initialize()
Initialize the internal state, this will put structures types and potentially other helpers into the ...
void createTargetDeinit(const LocationDescription &Loc, int32_t TeamsReductionDataSize=0, int32_t TeamsReductionBufferLength=1024)
Create a runtime call for kmpc_target_deinit.
InsertPointOrErrorTy createTaskgroup(const LocationDescription &Loc, InsertPointTy AllocaIP, BodyGenCallbackTy BodyGenCB)
Generator for the taskgroup construct.
void loadOffloadInfoMetadata(Module &M)
Loads all the offload entries information from the host IR metadata.
InsertPointOrErrorTy emitTargetTask(TargetTaskBodyCallbackTy TaskBodyCB, Value *DeviceID, Value *RTLoc, OpenMPIRBuilder::InsertPointTy AllocaIP, const SmallVector< llvm::OpenMPIRBuilder::DependData > &Dependencies, bool HasNoWait)
Generate a target-task for the target construct.
void unrollLoopFull(DebugLoc DL, CanonicalLoopInfo *Loop)
Fully unroll a loop.
void emitFlush(const LocationDescription &Loc)
Generate a flush runtime call.
static std::pair< int32_t, int32_t > readThreadBoundsForKernel(const Triple &T, Function &Kernel)
}
OpenMPIRBuilderConfig Config
The OpenMPIRBuilder Configuration.
CallInst * createOMPInteropDestroy(const LocationDescription &Loc, Value *InteropVar, Value *Device, Value *NumDependences, Value *DependenceAddress, bool HaveNowaitClause)
Create a runtime call for __tgt_interop_destroy.
InsertPointTy createAtomicRead(const LocationDescription &Loc, AtomicOpValue &X, AtomicOpValue &V, AtomicOrdering AO)
Emit atomic Read for : V = X — Only Scalar data types.
Error emitIfClause(Value *Cond, BodyGenCallbackTy ThenGen, BodyGenCallbackTy ElseGen, InsertPointTy AllocaIP={})
Emits code for OpenMP 'if' clause using specified BodyGenCallbackTy Here is the logic: if (Cond) { Th...
std::function< void(EmitMetadataErrorKind, TargetRegionEntryInfo)> EmitMetadataErrorReportFunctionTy
Callback function type.
InsertPointOrErrorTy createSingle(const LocationDescription &Loc, BodyGenCallbackTy BodyGenCB, FinalizeCallbackTy FiniCB, bool IsNowait, ArrayRef< llvm::Value * > CPVars={}, ArrayRef< llvm::Function * > CPFuncs={})
Generator for '#omp single'.
InsertPointOrErrorTy createTeams(const LocationDescription &Loc, BodyGenCallbackTy BodyGenCB, Value *NumTeamsLower=nullptr, Value *NumTeamsUpper=nullptr, Value *ThreadLimit=nullptr, Value *IfExpr=nullptr)
Generator for #omp teams
std::forward_list< CanonicalLoopInfo > LoopInfos
Collection of owned canonical loop objects that eventually need to be free'd.
void createTaskwait(const LocationDescription &Loc)
Generator for '#omp taskwait'.
CanonicalLoopInfo * createLoopSkeleton(DebugLoc DL, Value *TripCount, Function *F, BasicBlock *PreInsertBefore, BasicBlock *PostInsertBefore, const Twine &Name={})
Create the control flow structure of a canonical OpenMP loop.
std::string createPlatformSpecificName(ArrayRef< StringRef > Parts) const
Get the create a name using the platform specific separators.
FunctionCallee createDispatchNextFunction(unsigned IVSize, bool IVSigned)
Returns __kmpc_dispatch_next_* runtime function for the specified size IVSize and sign IVSigned.
static void getKernelArgsVector(TargetKernelArgs &KernelArgs, IRBuilderBase &Builder, SmallVector< Value * > &ArgsVector)
Create the kernel args vector used by emitTargetKernel.
void unrollLoopHeuristic(DebugLoc DL, CanonicalLoopInfo *Loop)
Fully or partially unroll a loop.
InsertPointOrErrorTy createParallel(const LocationDescription &Loc, InsertPointTy AllocaIP, BodyGenCallbackTy BodyGenCB, PrivatizeCallbackTy PrivCB, FinalizeCallbackTy FiniCB, Value *IfCondition, Value *NumThreads, omp::ProcBindKind ProcBind, bool IsCancellable)
Generator for '#omp parallel'.
omp::OpenMPOffloadMappingFlags getMemberOfFlag(unsigned Position)
Get OMP_MAP_MEMBER_OF flag with extra bits reserved based on the position given.
void addAttributes(omp::RuntimeFunction FnID, Function &Fn)
Add attributes known for FnID to Fn.
Module & M
The underlying LLVM-IR module.
StringMap< Constant * > SrcLocStrMap
Map to remember source location strings.
void createMapperAllocas(const LocationDescription &Loc, InsertPointTy AllocaIP, unsigned NumOperands, struct MapperAllocas &MapperAllocas)
Create the allocas instruction used in call to mapper functions.
Constant * getOrCreateSrcLocStr(StringRef LocStr, uint32_t &SrcLocStrSize)
Return the (LLVM-IR) string describing the source location LocStr.
void addOutlineInfo(OutlineInfo &&OI)
Add a new region that will be outlined later.
Error emitTargetRegionFunction(TargetRegionEntryInfo &EntryInfo, FunctionGenCallback &GenerateFunctionCallback, bool IsOffloadEntry, Function *&OutlinedFn, Constant *&OutlinedFnID)
Create a unique name for the entry function using the source location information of the current targ...
FunctionCallee createDispatchFiniFunction(unsigned IVSize, bool IVSigned)
Returns __kmpc_dispatch_fini_* runtime function for the specified size IVSize and sign IVSigned.
InsertPointOrErrorTy createTarget(const LocationDescription &Loc, bool IsOffloadEntry, OpenMPIRBuilder::InsertPointTy AllocaIP, OpenMPIRBuilder::InsertPointTy CodeGenIP, TargetRegionEntryInfo &EntryInfo, ArrayRef< int32_t > NumTeams, ArrayRef< int32_t > NumThreads, SmallVectorImpl< Value * > &Inputs, GenMapInfoCallbackTy GenMapInfoCB, TargetBodyGenCallbackTy BodyGenCB, TargetGenArgAccessorsCallbackTy ArgAccessorFuncCB, SmallVector< DependData > Dependencies={}, bool HasNowait=false)
Generator for '#omp target'.
void unrollLoopPartial(DebugLoc DL, CanonicalLoopInfo *Loop, int32_t Factor, CanonicalLoopInfo **UnrolledCLI)
Partially unroll a loop.
void emitTaskyieldImpl(const LocationDescription &Loc)
Generate a taskyield runtime call.
void emitMapperCall(const LocationDescription &Loc, Function *MapperFunc, Value *SrcLocInfo, Value *MaptypesArg, Value *MapnamesArg, struct MapperAllocas &MapperAllocas, int64_t DeviceID, unsigned NumOperands)
Create the call for the target mapper function.
InsertPointTy createAtomicCompare(const LocationDescription &Loc, AtomicOpValue &X, AtomicOpValue &V, AtomicOpValue &R, Value *E, Value *D, AtomicOrdering AO, omp::OMPAtomicCompareOp Op, bool IsXBinopExpr, bool IsPostfixUpdate, bool IsFailOnly)
Emit atomic compare for constructs: — Only scalar data types cond-expr-stmt: x = x ordop expr ?...
InsertPointOrErrorTy createAtomicCapture(const LocationDescription &Loc, InsertPointTy AllocaIP, AtomicOpValue &X, AtomicOpValue &V, Value *Expr, AtomicOrdering AO, AtomicRMWInst::BinOp RMWOp, AtomicUpdateCallbackTy &UpdateOp, bool UpdateExpr, bool IsPostfixUpdate, bool IsXBinopExpr)
Emit atomic update for constructs: — Only Scalar data types V = X; X = X BinOp Expr ,...
InsertPointTy createOrderedDepend(const LocationDescription &Loc, InsertPointTy AllocaIP, unsigned NumLoops, ArrayRef< llvm::Value * > StoreValues, const Twine &Name, bool IsDependSource)
Generator for '#omp ordered depend (source | sink)'.
InsertPointTy createCopyinClauseBlocks(InsertPointTy IP, Value *MasterAddr, Value *PrivateAddr, llvm::IntegerType *IntPtrTy, bool BranchtoEnd=true)
Generate conditional branch and relevant BasicBlocks through which private threads copy the 'copyin' ...
void emitOffloadingArrays(InsertPointTy AllocaIP, InsertPointTy CodeGenIP, MapInfosTy &CombinedInfo, TargetDataInfo &Info, bool IsNonContiguous=false, function_ref< void(unsigned int, Value *)> DeviceAddrCB=nullptr, function_ref< Value *(unsigned int)> CustomMapperCB=nullptr)
Emit the arrays used to pass the captures and map information to the offloading runtime library.
SmallVector< FinalizationInfo, 8 > FinalizationStack
The finalization stack made up of finalize callbacks currently in-flight, wrapped into FinalizationIn...
std::vector< CanonicalLoopInfo * > tileLoops(DebugLoc DL, ArrayRef< CanonicalLoopInfo * > Loops, ArrayRef< Value * > TileSizes)
Tile a loop nest.
CallInst * createOMPInteropInit(const LocationDescription &Loc, Value *InteropVar, omp::OMPInteropType InteropType, Value *Device, Value *NumDependences, Value *DependenceAddress, bool HaveNowaitClause)
Create a runtime call for __tgt_interop_init.
void finalize(Function *Fn=nullptr)
Finalize the underlying module, e.g., by outlining regions.
SmallVector< OutlineInfo, 16 > OutlineInfos
Collection of regions that need to be outlined during finalization.
Function * getOrCreateRuntimeFunctionPtr(omp::RuntimeFunction FnID)
const Triple T
The target triple of the underlying module.
DenseMap< std::pair< Constant *, uint64_t >, Constant * > IdentMap
Map to remember existing ident_t*.
CallInst * createOMPFree(const LocationDescription &Loc, Value *Addr, Value *Allocator, std::string Name="")
Create a runtime call for kmpc_free.
FunctionCallee createForStaticInitFunction(unsigned IVSize, bool IVSigned, bool IsGPUDistribute)
Returns __kmpc_for_static_init_* runtime function for the specified size IVSize and sign IVSigned.
CallInst * createOMPAlloc(const LocationDescription &Loc, Value *Size, Value *Allocator, std::string Name="")
Create a runtime call for kmpc_Alloc.
void emitNonContiguousDescriptor(InsertPointTy AllocaIP, InsertPointTy CodeGenIP, MapInfosTy &CombinedInfo, TargetDataInfo &Info)
Emit an array of struct descriptors to be assigned to the offload args.
InsertPointOrErrorTy createSection(const LocationDescription &Loc, BodyGenCallbackTy BodyGenCB, FinalizeCallbackTy FiniCB)
Generator for '#omp section'.
void emitBlock(BasicBlock *BB, Function *CurFn, bool IsFinished=false)
Value * getOrCreateThreadID(Value *Ident)
Return the current thread ID.
void emitOffloadingArraysAndArgs(InsertPointTy AllocaIP, InsertPointTy CodeGenIP, TargetDataInfo &Info, TargetDataRTArgs &RTArgs, MapInfosTy &CombinedInfo, bool IsNonContiguous=false, bool ForEndCall=false, function_ref< void(unsigned int, Value *)> DeviceAddrCB=nullptr, function_ref< Value *(unsigned int)> CustomMapperCB=nullptr)
Allocates memory for and populates the arrays required for offloading (offload_{baseptrs|ptrs|mappers...
InsertPointOrErrorTy createMaster(const LocationDescription &Loc, BodyGenCallbackTy BodyGenCB, FinalizeCallbackTy FiniCB)
Generator for '#omp master'.
Error emitCancelationCheckImpl(Value *CancelFlag, omp::Directive CanceledDirective, FinalizeCallbackTy ExitCB={})
Generate control flow and cleanup for cancellation.
InsertPointOrErrorTy emitKernelLaunch(const LocationDescription &Loc, Value *OutlinedFnID, EmitFallbackCallbackTy EmitTargetCallFallbackCB, TargetKernelArgs &Args, Value *DeviceID, Value *RTLoc, InsertPointTy AllocaIP)
Generate a target region entry call and host fallback call.
StringMap< GlobalVariable *, BumpPtrAllocator > InternalVars
An ordered map of auto-generated variables to their unique names.
GlobalVariable * getOrCreateInternalVariable(Type *Ty, const StringRef &Name, unsigned AddressSpace=0)
Gets (if variable with the given name already exist) or creates internal global variable with the spe...
InsertPointOrErrorTy createReductionsGPU(const LocationDescription &Loc, InsertPointTy AllocaIP, InsertPointTy CodeGenIP, ArrayRef< ReductionInfo > ReductionInfos, bool IsNoWait=false, bool IsTeamsReduction=false, bool HasDistribute=false, ReductionGenCBKind ReductionGenCBKind=ReductionGenCBKind::MLIR, std::optional< omp::GV > GridValue={}, unsigned ReductionBufNum=1024, Value *SrcLocInfo=nullptr)
Design of OpenMP reductions on the GPU.
FunctionCallee createDispatchInitFunction(unsigned IVSize, bool IVSigned)
Returns __kmpc_dispatch_init_* runtime function for the specified size IVSize and sign IVSigned.
Function * emitUserDefinedMapper(function_ref< MapInfosTy &(InsertPointTy CodeGenIP, llvm::Value *PtrPHI, llvm::Value *BeginArg)> PrivAndGenMapInfoCB, llvm::Type *ElemTy, StringRef FuncName, function_ref< bool(unsigned int, Function **)> CustomMapperCB=nullptr)
Emit the user-defined mapper function.
CallInst * createOMPInteropUse(const LocationDescription &Loc, Value *InteropVar, Value *Device, Value *NumDependences, Value *DependenceAddress, bool HaveNowaitClause)
Create a runtime call for __tgt_interop_use.
IRBuilder<>::InsertPoint InsertPointTy
Type used throughout for insertion points.
Definition: OMPIRBuilder.h:520
InsertPointOrErrorTy createReductions(const LocationDescription &Loc, InsertPointTy AllocaIP, ArrayRef< ReductionInfo > ReductionInfos, ArrayRef< bool > IsByRef, bool IsNoWait=false)
Generator for '#omp reduction'.
GlobalVariable * createOffloadMapnames(SmallVectorImpl< llvm::Constant * > &Names, std::string VarName)
Create the global variable holding the offload names information.
InsertPointOrErrorTy createTask(const LocationDescription &Loc, InsertPointTy AllocaIP, BodyGenCallbackTy BodyGenCB, bool Tied=true, Value *Final=nullptr, Value *IfCondition=nullptr, SmallVector< DependData > Dependencies={}, bool Mergeable=false, Value *EventHandle=nullptr)
Generator for #omp task
std::function< Expected< Function * >(StringRef FunctionName)> FunctionGenCallback
Functions used to generate a function with the given name.
static void writeTeamsForKernel(const Triple &T, Function &Kernel, int32_t LB, int32_t UB)
InsertPointOrErrorTy createBarrier(const LocationDescription &Loc, omp::Directive Kind, bool ForceSimpleCall=false, bool CheckCancelFlag=true)
Emitter methods for OpenMP directives.
void setCorrectMemberOfFlag(omp::OpenMPOffloadMappingFlags &Flags, omp::OpenMPOffloadMappingFlags MemberOfFlag)
Given an initial flag set, this function modifies it to contain the passed in MemberOfFlag generated ...
Constant * getOrCreateDefaultSrcLocStr(uint32_t &SrcLocStrSize)
Return the (LLVM-IR) string describing the default source location.
InsertPointOrErrorTy createCritical(const LocationDescription &Loc, BodyGenCallbackTy BodyGenCB, FinalizeCallbackTy FiniCB, StringRef CriticalName, Value *HintInst)
Generator for '#omp critical'.
void createOffloadEntry(Constant *ID, Constant *Addr, uint64_t Size, int32_t Flags, GlobalValue::LinkageTypes, StringRef Name="")
Creates offloading entry for the provided entry ID ID, address Addr, size Size, and flags Flags.
static unsigned getOpenMPDefaultSimdAlign(const Triple &TargetTriple, const StringMap< bool > &Features)
Get the default alignment value for given target.
unsigned getFlagMemberOffset()
Get the offset of the OMP_MAP_MEMBER_OF field.
void createOffloadEntriesAndInfoMetadata(EmitMetadataErrorReportFunctionTy &ErrorReportFunction)
void applySimd(CanonicalLoopInfo *Loop, MapVector< Value *, Value * > AlignedVars, Value *IfCond, omp::OrderKind Order, ConstantInt *Simdlen, ConstantInt *Safelen)
Add metadata to simd-ize a loop.
bool isLastFinalizationInfoCancellable(omp::Directive DK)
Return true if the last entry in the finalization stack is of kind DK and cancellable.
InsertPointTy emitTargetKernel(const LocationDescription &Loc, InsertPointTy AllocaIP, Value *&Return, Value *Ident, Value *DeviceID, Value *NumTeams, Value *NumThreads, Value *HostPtr, ArrayRef< Value * > KernelArgs)
Generate a target region entry call.
GlobalVariable * createOffloadMaptypes(SmallVectorImpl< uint64_t > &Mappings, std::string VarName)
Create the global variable holding the offload mappings information.
CallInst * createCachedThreadPrivate(const LocationDescription &Loc, llvm::Value *Pointer, llvm::ConstantInt *Size, const llvm::Twine &Name=Twine(""))
Create a runtime call for kmpc_threadprivate_cached.
IRBuilder Builder
The LLVM-IR Builder used to create IR.
GlobalValue * createGlobalFlag(unsigned Value, StringRef Name)
Create a hidden global flag Name in the module with initial value Value.
InsertPointOrErrorTy applyWorkshareLoop(DebugLoc DL, CanonicalLoopInfo *CLI, InsertPointTy AllocaIP, bool NeedsBarrier, llvm::omp::ScheduleKind SchedKind=llvm::omp::OMP_SCHEDULE_Default, Value *ChunkSize=nullptr, bool HasSimdModifier=false, bool HasMonotonicModifier=false, bool HasNonmonotonicModifier=false, bool HasOrderedClause=false, omp::WorksharingLoopType LoopType=omp::WorksharingLoopType::ForStaticLoop)
Modifies the canonical loop to be a workshare loop.
void emitOffloadingArraysArgument(IRBuilderBase &Builder, OpenMPIRBuilder::TargetDataRTArgs &RTArgs, OpenMPIRBuilder::TargetDataInfo &Info, bool ForEndCall=false)
Emit the arguments to be passed to the runtime library based on the arrays of base pointers,...
InsertPointOrErrorTy createMasked(const LocationDescription &Loc, BodyGenCallbackTy BodyGenCB, FinalizeCallbackTy FiniCB, Value *Filter)
Generator for '#omp masked'.
Expected< CanonicalLoopInfo * > createCanonicalLoop(const LocationDescription &Loc, LoopBodyGenCallbackTy BodyGenCB, Value *TripCount, const Twine &Name="loop")
Generator for the control flow structure of an OpenMP canonical loop.
Value * getSizeInBytes(Value *BasePtr)
Computes the size of type in bytes.
FunctionCallee createDispatchDeinitFunction()
Returns __kmpc_dispatch_deinit runtime function.
void registerTargetGlobalVariable(OffloadEntriesInfoManager::OMPTargetGlobalVarEntryKind CaptureClause, OffloadEntriesInfoManager::OMPTargetDeviceClauseKind DeviceClause, bool IsDeclaration, bool IsExternallyVisible, TargetRegionEntryInfo EntryInfo, StringRef MangledName, std::vector< GlobalVariable * > &GeneratedRefs, bool OpenMPSIMD, std::vector< Triple > TargetTriple, std::function< Constant *()> GlobalInitializer, std::function< GlobalValue::LinkageTypes()> VariableLinkage, Type *LlvmPtrTy, Constant *Addr)
Registers a target variable for device or host.
InsertPointOrErrorTy createTargetData(const LocationDescription &Loc, InsertPointTy AllocaIP, InsertPointTy CodeGenIP, Value *DeviceID, Value *IfCond, TargetDataInfo &Info, GenMapInfoCallbackTy GenMapInfoCB, omp::RuntimeFunction *MapperFunc=nullptr, function_ref< InsertPointOrErrorTy(InsertPointTy CodeGenIP, BodyGenTy BodyGenType)> BodyGenCB=nullptr, function_ref< void(unsigned int, Value *)> DeviceAddrCB=nullptr, function_ref< Value *(unsigned int)> CustomMapperCB=nullptr, Value *SrcLocInfo=nullptr)
Generator for '#omp target data'.
BodyGenTy
Type of BodyGen to use for region codegen.
InsertPointOrErrorTy createAtomicUpdate(const LocationDescription &Loc, InsertPointTy AllocaIP, AtomicOpValue &X, Value *Expr, AtomicOrdering AO, AtomicRMWInst::BinOp RMWOp, AtomicUpdateCallbackTy &UpdateOp, bool IsXBinopExpr)
Emit atomic update for constructs: X = X BinOp Expr ,or X = Expr BinOp X For complex Operations: X = ...
SmallVector< llvm::Function *, 16 > ConstantAllocaRaiseCandidates
A collection of candidate target functions that's constant allocas will attempt to be raised on a cal...
OffloadEntriesInfoManager OffloadInfoManager
Info manager to keep track of target regions.
static std::pair< int32_t, int32_t > readTeamBoundsForKernel(const Triple &T, Function &Kernel)
Read/write a bounds on teams for Kernel.
std::function< std::tuple< std::string, uint64_t >()> FileIdentifierInfoCallbackTy
const std::string ompOffloadInfoName
OMP Offload Info Metadata name string.
Expected< InsertPointTy > InsertPointOrErrorTy
Type used to represent an insertion point or an error value.
Definition: OMPIRBuilder.h:523
InsertPointTy createCopyPrivate(const LocationDescription &Loc, llvm::Value *BufSize, llvm::Value *CpyBuf, llvm::Value *CpyFn, llvm::Value *DidIt)
Generator for __kmpc_copyprivate.
InsertPointOrErrorTy createSections(const LocationDescription &Loc, InsertPointTy AllocaIP, ArrayRef< StorableBodyGenCallbackTy > SectionCBs, PrivatizeCallbackTy PrivCB, FinalizeCallbackTy FiniCB, bool IsCancellable, bool IsNowait)
Generator for '#omp sections'.
bool updateToLocation(const LocationDescription &Loc)
Update the internal location to Loc.
void createFlush(const LocationDescription &Loc)
Generator for '#omp flush'.
Constant * getAddrOfDeclareTargetVar(OffloadEntriesInfoManager::OMPTargetGlobalVarEntryKind CaptureClause, OffloadEntriesInfoManager::OMPTargetDeviceClauseKind DeviceClause, bool IsDeclaration, bool IsExternallyVisible, TargetRegionEntryInfo EntryInfo, StringRef MangledName, std::vector< GlobalVariable * > &GeneratedRefs, bool OpenMPSIMD, std::vector< Triple > TargetTriple, Type *LlvmPtrTy, std::function< Constant *()> GlobalInitializer, std::function< GlobalValue::LinkageTypes()> VariableLinkage)
Retrieve (or create if non-existent) the address of a declare target variable, used in conjunction wi...
EmitMetadataErrorKind
The kind of errors that can occur when emitting the offload entries and metadata.
The optimization diagnostic interface.
void addIncoming(Value *V, BasicBlock *BB)
Add an incoming value to the end of the PHI list.
Pseudo-analysis pass that exposes the PassInstrumentation to pass managers.
Class to represent pointers.
Definition: DerivedTypes.h:670
static PointerType * getUnqual(Type *ElementType)
This constructs a pointer to an object of the specified type in the default address space (address sp...
Definition: DerivedTypes.h:686
Analysis pass that exposes the ScalarEvolution for a function.
ScalarEvolution run(Function &F, FunctionAnalysisManager &AM)
The main scalar evolution driver.
A vector that has set insertion semantics.
Definition: SetVector.h:57
bool remove_if(UnaryPredicate P)
Remove items from the set vector based on a predicate function.
Definition: SetVector.h:237
bool empty() const
Determine if the SetVector is empty or not.
Definition: SetVector.h:93
This is a 'bitvector' (really, a variable-sized bit array), optimized for the case when the array is ...
SmallBitVector & set()
bool test(unsigned Idx) const
bool all() const
Returns true if all bits are set.
bool any() const
Returns true if any bit is set.
A templated base class for SmallPtrSet which provides the typesafe interface that is common across al...
Definition: SmallPtrSet.h:363
size_type count(ConstPtrType Ptr) const
count - Return 1 if the specified pointer is in the set, 0 otherwise.
Definition: SmallPtrSet.h:452
iterator end() const
Definition: SmallPtrSet.h:477
std::pair< iterator, bool > insert(PtrType Ptr)
Inserts Ptr if and only if there is no element in the container equal to Ptr.
Definition: SmallPtrSet.h:384
iterator begin() const
Definition: SmallPtrSet.h:472
SmallPtrSet - This class implements a set which is optimized for holding SmallSize or less elements.
Definition: SmallPtrSet.h:519
SmallSet - This maintains a set of unique values, optimizing for the case when the set is small (less...
Definition: SmallSet.h:132
std::pair< const_iterator, bool > insert(const T &V)
insert - Insert an element into the set if it isn't already there.
Definition: SmallSet.h:181
SmallString - A SmallString is just a SmallVector with methods and accessors that make it work better...
Definition: SmallString.h:26
void append(StringRef RHS)
Append from a StringRef.
Definition: SmallString.h:68
StringRef str() const
Explicit conversion to StringRef.
Definition: SmallString.h:254
bool empty() const
Definition: SmallVector.h:81
size_t size() const
Definition: SmallVector.h:78
This class consists of common code factored out of the SmallVector class to reduce code duplication b...
Definition: SmallVector.h:573
reference emplace_back(ArgTypes &&... Args)
Definition: SmallVector.h:937
void reserve(size_type N)
Definition: SmallVector.h:663
void append(ItTy in_start, ItTy in_end)
Add the specified range to the end of the SmallVector.
Definition: SmallVector.h:683
void resize(size_type N)
Definition: SmallVector.h:638
void push_back(const T &Elt)
Definition: SmallVector.h:413
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small.
Definition: SmallVector.h:1196
An instruction for storing to memory.
Definition: Instructions.h:292
void setAlignment(Align Align)
Definition: Instructions.h:337
void setAtomic(AtomicOrdering Ordering, SyncScope::ID SSID=SyncScope::System)
Sets the ordering constraint and the synchronization scope ID of this store instruction.
Definition: Instructions.h:364
StringMap - This is an unconventional map that is specialized for handling keys that are "strings",...
Definition: StringMap.h:128
ValueTy lookup(StringRef Key) const
lookup - Return the entry for the specified key, or a default constructed value if no such entry exis...
Definition: StringMap.h:253
StringRef - Represent a constant reference to a string, i.e.
Definition: StringRef.h:51
std::pair< StringRef, StringRef > split(char Separator) const
Split into two substrings around the first occurrence of a separator character.
Definition: StringRef.h:700
constexpr bool empty() const
empty - Check if the string is empty.
Definition: StringRef.h:147
constexpr size_t size() const
size - Get the string size.
Definition: StringRef.h:150
size_t count(char C) const
Return the number of occurrences of C in the string.
Definition: StringRef.h:451
bool ends_with(StringRef Suffix) const
Check if this string ends with the given Suffix.
Definition: StringRef.h:277
StringRef drop_back(size_t N=1) const
Return a StringRef equal to 'this' but with the last N elements dropped.
Definition: StringRef.h:616
Class to represent struct types.
Definition: DerivedTypes.h:218
static StructType * create(LLVMContext &Context, StringRef Name)
This creates an identified struct.
Definition: Type.cpp:612
Multiway switch.
void addCase(ConstantInt *OnVal, BasicBlock *Dest)
Add an entry to the switch instruction.
Analysis pass providing the TargetTransformInfo.
Result run(const Function &F, FunctionAnalysisManager &)
Analysis pass providing the TargetLibraryInfo.
This pass provides access to the codegen interfaces that are needed for IR-level transformations.
Target - Wrapper for Target specific information.
TargetMachine * createTargetMachine(StringRef TT, StringRef CPU, StringRef Features, const TargetOptions &Options, std::optional< Reloc::Model > RM, std::optional< CodeModel::Model > CM=std::nullopt, CodeGenOptLevel OL=CodeGenOptLevel::Default, bool JIT=false) const
createTargetMachine - Create a target specific machine implementation for the specified Triple.
Triple - Helper class for working with autoconf configuration names.
Definition: Triple.h:44
bool isPPC() const
Tests whether the target is PowerPC (32- or 64-bit LE or BE).
Definition: Triple.h:973
bool isX86() const
Tests whether the target is x86 (32- or 64-bit).
Definition: Triple.h:1031
bool isWasm() const
Tests whether the target is wasm (32- and 64-bit).
Definition: Triple.h:1041
Twine - A lightweight data structure for efficiently representing the concatenation of temporary valu...
Definition: Twine.h:81
std::string str() const
Return the twine contents as a std::string.
Definition: Twine.cpp:17
The instances of the Type class are immutable: once they are created, they are never changed.
Definition: Type.h:45
unsigned getIntegerBitWidth() const
Type * getStructElementType(unsigned N) const
bool isPointerTy() const
True if this is an instance of PointerType.
Definition: Type.h:264
static IntegerType * getInt1Ty(LLVMContext &C)
unsigned getScalarSizeInBits() const LLVM_READONLY
If this is a vector type, return the getPrimitiveSizeInBits value for the element type.
static Type * getVoidTy(LLVMContext &C)
bool isStructTy() const
True if this is an instance of StructType.
Definition: Type.h:258
bool isFloatingPointTy() const
Return true if this is one of the floating-point types.
Definition: Type.h:184
static IntegerType * getInt32Ty(LLVMContext &C)
static IntegerType * getInt64Ty(LLVMContext &C)
bool isIntegerTy() const
True if this is an instance of IntegerType.
Definition: Type.h:237
bool isVoidTy() const
Return true if this is 'void'.
Definition: Type.h:139
static UndefValue * get(Type *T)
Static factory methods - Return an 'undef' object of the specified type.
Definition: Constants.cpp:1859
This function has undefined behavior.
Produce an estimate of the unrolled cost of the specified loop.
Definition: UnrollLoop.h:128
bool canUnroll() const
Whether it is legal to unroll this loop.
uint64_t getRolledLoopSize() const
Definition: UnrollLoop.h:144
A Use represents the edge between a Value definition and its users.
Definition: Use.h:43
void setOperand(unsigned i, Value *Val)
Definition: User.h:233
Value * getOperand(unsigned i) const
Definition: User.h:228
ValueT lookup(const KeyT &Val) const
lookup - Return the entry for the specified key, or a default constructed value if no such entry exis...
Definition: ValueMap.h:164
LLVM Value Representation.
Definition: Value.h:74
Type * getType() const
All values are typed, get the type of this value.
Definition: Value.h:255
user_iterator user_begin()
Definition: Value.h:397
void setName(const Twine &Name)
Change the name of the value.
Definition: Value.cpp:377
void replaceAllUsesWith(Value *V)
Change all uses of this to point to a new Value.
Definition: Value.cpp:534
iterator_range< user_iterator > users()
Definition: Value.h:421
User * user_back()
Definition: Value.h:407
Align getPointerAlignment(const DataLayout &DL) const
Returns an alignment of the pointer value.
Definition: Value.cpp:927
void replaceUsesWithIf(Value *New, llvm::function_ref< bool(Use &U)> ShouldReplace)
Go through the uses list for this definition and make each use point to "V" if the callback ShouldRep...
Definition: Value.cpp:542
User * getUniqueUndroppableUser()
Return true if there is exactly one unique user of this value that cannot be dropped (that user can h...
Definition: Value.cpp:179
bool use_empty() const
Definition: Value.h:344
user_iterator user_end()
Definition: Value.h:405
unsigned getNumUses() const
This method computes the number of uses of this Value.
Definition: Value.cpp:255
iterator_range< use_iterator > uses()
Definition: Value.h:376
StringRef getName() const
Return a constant reference to the value's name.
Definition: Value.cpp:309
An efficient, type-erasing, non-owning reference to a callable.
const ParentTy * getParent() const
Definition: ilist_node.h:32
self_iterator getIterator()
Definition: ilist_node.h:132
NodeTy * getNextNode()
Get the next node, or nullptr for the list tail.
Definition: ilist_node.h:353
iterator insertAfter(iterator where, pointer New)
Definition: ilist.h:174
A raw_ostream that writes to an SmallVector or SmallString.
Definition: raw_ostream.h:691
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
@ Exit
Definition: COFF.h:845
@ AMDGPU_KERNEL
Used for AMDGPU code object kernels.
Definition: CallingConv.h:200
@ C
The default llvm calling convention, compatible with C.
Definition: CallingConv.h:34
unsigned ID
LLVM IR allows to use arbitrary numbers as calling convention identifiers.
Definition: CallingConv.h:24
initializer< Ty > init(const Ty &Val)
Definition: CommandLine.h:443
void emitOffloadingEntry(Module &M, Constant *Addr, StringRef Name, uint64_t Size, int32_t Flags, int32_t Data, StringRef SectionName)
Create an offloading section struct used to register this global at runtime.
Definition: Utility.cpp:77
OpenMPOffloadMappingFlags
Values for bit flags used to specify the mapping type for offloading.
Definition: OMPConstants.h:195
@ OMP_MAP_PTR_AND_OBJ
The element being mapped is a pointer-pointee pair; both the pointer and the pointee should be mapped...
@ OMP_MAP_MEMBER_OF
The 16 MSBs of the flags indicate whether the entry is member of some struct/class.
@ OMP_DEVICEID_UNDEF
Device ID if the device was not defined, runtime should get it from environment variables in the spec...
Definition: OMPConstants.h:252
IdentFlag
IDs for all omp runtime library ident_t flag encodings (see their defintion in openmp/runtime/src/kmp...
Definition: OMPConstants.h:65
RuntimeFunction
IDs for all omp runtime library (RTL) functions.
Definition: OMPConstants.h:45
static constexpr GV NVPTXGridValues
For Nvidia GPUs.
WorksharingLoopType
A type of worksharing loop construct.
Definition: OMPConstants.h:283
OMPAtomicCompareOp
Atomic compare operations. Currently OpenMP only supports ==, >, and <.
Definition: OMPConstants.h:267
NodeAddr< PhiNode * > Phi
Definition: RDFGraph.h:390
std::error_code getUniqueID(const Twine Path, UniqueID &Result)
Definition: Path.cpp:787
This is an optimization pass for GlobalISel generic memory operations.
Definition: AddressRanges.h:18
auto drop_begin(T &&RangeOrContainer, size_t N=1)
Return a range covering RangeOrContainer with the first N elements excluded.
Definition: STLExtras.h:329
BasicBlock * splitBBWithSuffix(IRBuilderBase &Builder, bool CreateBranch, llvm::Twine Suffix=".split")
Like splitBB, but reuses the current block's name for the new name.
@ Offset
Definition: DWP.cpp:480
detail::zippy< detail::zip_shortest, T, U, Args... > zip(T &&t, U &&u, Args &&...args)
zip iterator for two or more iteratable types.
Definition: STLExtras.h:854
bool all_of(R &&range, UnaryPredicate P)
Provide wrappers to std::all_of which take ranges instead of having to pass begin/end explicitly.
Definition: STLExtras.h:1739
auto size(R &&Range, std::enable_if_t< std::is_base_of< std::random_access_iterator_tag, typename std::iterator_traits< decltype(Range.begin())>::iterator_category >::value, void > *=nullptr)
Get the size of a range.
Definition: STLExtras.h:1697
Expected< std::unique_ptr< Module > > parseBitcodeFile(MemoryBufferRef Buffer, LLVMContext &Context, ParserCallbacks Callbacks={})
Read the specified bitcode file, returning the module.
auto enumerate(FirstRange &&First, RestRanges &&...Rest)
Given two or more input ranges, returns a new range whose values are tuples (A, B,...
Definition: STLExtras.h:2448
decltype(auto) dyn_cast(const From &Val)
dyn_cast<X> - Return the argument parameter cast to the specified type.
Definition: Casting.h:649
auto successors(const MachineBasicBlock *BB)
testing::Matcher< const detail::ErrorHolder & > Failed()
Definition: Error.h:198
iterator_range< T > make_range(T x, T y)
Convenience function for iterating over sub-ranges.
void append_range(Container &C, Range &&R)
Wrapper function to append range R to container C.
Definition: STLExtras.h:2115
iterator_range< early_inc_iterator_impl< detail::IterOfRange< RangeT > > > make_early_inc_range(RangeT &&Range)
Make a range that does early increment to allow mutation of the underlying range without disrupting i...
Definition: STLExtras.h:657
ErrorOr< T > expectedToErrorOrAndEmitErrors(LLVMContext &Ctx, Expected< T > Val)
Definition: BitcodeReader.h:66
bool convertUsersOfConstantsToInstructions(ArrayRef< Constant * > Consts, Function *RestrictToFunc=nullptr, bool RemoveDeadConstants=true, bool IncludeSelf=false)
Replace constant expressions users of the given constants with instructions.
unsigned Log2_32(uint32_t Value)
Return the floor log base 2 of the specified value, -1 if the value is zero.
Definition: MathExtras.h:340
auto reverse(ContainerTy &&C)
Definition: STLExtras.h:420
TargetTransformInfo::PeelingPreferences gatherPeelingPreferences(Loop *L, ScalarEvolution &SE, const TargetTransformInfo &TTI, std::optional< bool > UserAllowPeeling, std::optional< bool > UserAllowProfileBasedPeeling, bool UnrollingSpecficValues=false)
Definition: LoopPeel.cpp:870
void spliceBB(IRBuilderBase::InsertPoint IP, BasicBlock *New, bool CreateBranch)
Move the instruction after an InsertPoint to the beginning of another BasicBlock.
void SplitBlockAndInsertIfThenElse(Value *Cond, BasicBlock::iterator SplitBefore, Instruction **ThenTerm, Instruction **ElseTerm, MDNode *BranchWeights=nullptr, DomTreeUpdater *DTU=nullptr, LoopInfo *LI=nullptr)
SplitBlockAndInsertIfThenElse is similar to SplitBlockAndInsertIfThen, but also creates the ElseBlock...
raw_ostream & dbgs()
dbgs() - This returns a reference to a raw_ostream for debugging messages.
Definition: Debug.cpp:163
bool none_of(R &&Range, UnaryPredicate P)
Provide wrappers to std::none_of which take ranges instead of having to pass begin/end explicitly.
Definition: STLExtras.h:1753
void report_fatal_error(Error Err, bool gen_crash_diag=true)
Report a serious error, calling any installed error handler.
Definition: Error.cpp:167
BasicBlock * splitBB(IRBuilderBase::InsertPoint IP, bool CreateBranch, llvm::Twine Name={})
Split a BasicBlock at an InsertPoint, even if the block is degenerate (missing the terminator).
CodeGenOptLevel
Code generation optimization level.
Definition: CodeGen.h:54
bool computeUnrollCount(Loop *L, const TargetTransformInfo &TTI, DominatorTree &DT, LoopInfo *LI, AssumptionCache *AC, ScalarEvolution &SE, const SmallPtrSetImpl< const Value * > &EphValues, OptimizationRemarkEmitter *ORE, unsigned TripCount, unsigned MaxTripCount, bool MaxOrZero, unsigned TripMultiple, const UnrollCostEstimator &UCE, TargetTransformInfo::UnrollingPreferences &UP, TargetTransformInfo::PeelingPreferences &PP, bool &UseUpperBound)
format_object< Ts... > format(const char *Fmt, const Ts &... Vals)
These are helper functions used to produce formatted output.
Definition: Format.h:125
raw_fd_ostream & errs()
This returns a reference to a raw_ostream for standard error.
AtomicOrdering
Atomic ordering for LLVM's memory model.
constexpr T divideCeil(U Numerator, V Denominator)
Returns the integer ceil(Numerator / Denominator).
Definition: MathExtras.h:403
BasicBlock * CloneBasicBlock(const BasicBlock *BB, ValueToValueMapTy &VMap, const Twine &NameSuffix="", Function *F=nullptr, ClonedCodeInfo *CodeInfo=nullptr)
Return a copy of the specified basic block, but without embedding the block into a particular functio...
bool MergeBlockIntoPredecessor(BasicBlock *BB, DomTreeUpdater *DTU=nullptr, LoopInfo *LI=nullptr, MemorySSAUpdater *MSSAU=nullptr, MemoryDependenceResults *MemDep=nullptr, bool PredecessorWithTwoSuccessors=false, DominatorTree *DT=nullptr)
Attempts to merge a block into its predecessor, if possible.
DWARFExpression::Operation Op
void remapInstructionsInBlocks(ArrayRef< BasicBlock * > Blocks, ValueToValueMapTy &VMap)
Remaps instructions in Blocks using the mapping in VMap.
TargetTransformInfo::UnrollingPreferences gatherUnrollingPreferences(Loop *L, ScalarEvolution &SE, const TargetTransformInfo &TTI, BlockFrequencyInfo *BFI, ProfileSummaryInfo *PSI, llvm::OptimizationRemarkEmitter &ORE, int OptLevel, std::optional< unsigned > UserThreshold, std::optional< unsigned > UserCount, std::optional< bool > UserAllowPartial, std::optional< bool > UserRuntime, std::optional< bool > UserUpperBound, std::optional< unsigned > UserFullUnrollMaxCount)
Gather the various unrolling parameters based on the defaults, compiler flags, TTI overrides and user...
BasicBlock * SplitBlock(BasicBlock *Old, BasicBlock::iterator SplitPt, DominatorTree *DT, LoopInfo *LI=nullptr, MemorySSAUpdater *MSSAU=nullptr, const Twine &BBName="", bool Before=false)
Split the specified block at the specified instruction.
auto predecessors(const MachineBasicBlock *BB)
Constant * ConstantFoldInsertValueInstruction(Constant *Agg, Constant *Val, ArrayRef< unsigned > Idxs)
ConstantFoldInsertValueInstruction - Attempt to constant fold an insertvalue instruction with the spe...
@ Continue
Definition: DWP.h:21
void DeleteDeadBlocks(ArrayRef< BasicBlock * > BBs, DomTreeUpdater *DTU=nullptr, bool KeepOneInputPHIs=false)
Delete the specified blocks from BB.
This struct is a compact representation of a valid (non-zero power of two) alignment.
Definition: Alignment.h:39
static void collectEphemeralValues(const Loop *L, AssumptionCache *AC, SmallPtrSetImpl< const Value * > &EphValues)
Collect a loop's ephemeral values (those used only by an assume or similar intrinsics in the loop).
Definition: CodeMetrics.cpp:71
This struct is a compact representation of a valid (power of two) or undefined (0) alignment.
Definition: Alignment.h:117
a struct to pack relevant information while generating atomic Ops
A struct to pack the relevant information for an OpenMP depend clause.
Description of a LLVM-IR insertion point (IP) and a debug/source location (filename,...
Definition: OMPIRBuilder.h:645
This structure contains combined information generated for mappable clauses, including base pointers,...
MapDeviceInfoArrayTy DevicePointers
StructNonContiguousInfo NonContigInfo
Helper that contains information about regions we need to outline during finalization.
void collectBlocks(SmallPtrSetImpl< BasicBlock * > &BlockSet, SmallVectorImpl< BasicBlock * > &BlockVector)
Collect all blocks in between EntryBB and ExitBB in both the given vector and set.
SmallVector< Value *, 2 > ExcludeArgsFromAggregate
Information about an OpenMP reduction.
EvalKind EvaluationKind
Reduction evaluation kind - scalar, complex or aggregate.
ReductionGenAtomicCBTy AtomicReductionGen
Callback for generating the atomic reduction body, may be null.
ReductionGenCBTy ReductionGen
Callback for generating the reduction body.
Value * Variable
Reduction variable of pointer type.
Value * PrivateVariable
Thread-private partial reduction variable.
ReductionGenClangCBTy ReductionGenClang
Clang callback for generating the reduction body.
Type * ElementType
Reduction element type, must match pointee type of variable.
Container for the arguments used to pass data to the runtime library.
Value * SizesArray
The array of sizes passed to the runtime library.
Value * PointersArray
The array of section pointers passed to the runtime library.
Value * MappersArray
The array of user-defined mappers passed to the runtime library.
Value * BasePointersArray
The array of base pointer passed to the runtime library.
Value * MapTypesArray
The array of map types passed to the runtime library for the beginning of the region or for the entir...
Value * MapNamesArray
The array of original declaration names of mapped pointers sent to the runtime library for debugging.
Data structure that contains the needed information to construct the kernel args vector.
Value * DynCGGroupMem
The size of the dynamic shared memory.
ArrayRef< Value * > NumThreads
The number of threads.
TargetDataRTArgs RTArgs
Arguments passed to the runtime library.
Value * NumIterations
The number of iterations.
unsigned NumTargetItems
Number of arguments passed to the runtime library.
bool HasNoWait
True if the kernel has 'no wait' clause.
ArrayRef< Value * > NumTeams
The number of teams.
Data structure to contain the information needed to uniquely identify a target entry.
Definition: OMPIRBuilder.h:203
static void getTargetRegionEntryFnName(SmallVectorImpl< char > &Name, StringRef ParentName, unsigned DeviceID, unsigned FileID, unsigned Line, unsigned Count)
static const Target * lookupTarget(StringRef Triple, std::string &Error)
lookupTarget - Lookup a target based on a target triple.
Parameters that control the generic loop unrolling transformation.
unsigned Count
A forced unrolling factor (the number of concatenated bodies of the original loop in the unrolled loo...
unsigned Threshold
The cost threshold for the unrolled loop.
bool Force
Apply loop unroll on any kind of loop (mainly to loops that fail runtime unrolling).
unsigned PartialOptSizeThreshold
The cost threshold for the unrolled loop when optimizing for size, like OptSizeThreshold,...
unsigned PartialThreshold
The cost threshold for the unrolled loop, like Threshold, but used for partial/runtime unrolling (set...
unsigned OptSizeThreshold
The cost threshold for the unrolled loop when optimizing for size (set to UINT_MAX to disable).
Defines various target-specific GPU grid values that must be consistent between host RTL (plugin),...
Definition: OMPGridValues.h:57
unsigned GV_Warp_Size
The default value of maximum number of threads in a worker warp.
Definition: OMPGridValues.h:61